aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2018-09-17 05:04:26 +0700
committerShulhan <ms@kilabit.info>2018-09-18 01:50:21 +0700
commit1cae4ca316afa5d177fdbf7a98a0ec7fffe76a3e (patch)
tree5fa83fc0faa31e09cae82ac4d467cf8ba5f87fc2
parent446fef94cd712861221c0098dcdd9ae52aaed0eb (diff)
downloadpakakeh.go-1cae4ca316afa5d177fdbf7a98a0ec7fffe76a3e.tar.xz
Merge package "github.com/shuLhan/go-mining"
-rw-r--r--lib/mining/.gitignore26
-rw-r--r--lib/mining/LICENSE38
-rw-r--r--lib/mining/README.md26
-rw-r--r--lib/mining/classifier/cart/cart.go480
-rw-r--r--lib/mining/classifier/cart/cart_test.go62
-rw-r--r--lib/mining/classifier/cart/node.go44
-rw-r--r--lib/mining/classifier/cm.go442
-rw-r--r--lib/mining/classifier/cm_test.go69
-rw-r--r--lib/mining/classifier/crf/crf.go470
-rw-r--r--lib/mining/classifier/crf/crf_test.go92
-rw-r--r--lib/mining/classifier/rf/rf.go363
-rw-r--r--lib/mining/classifier/rf/rf_bench_test.go22
-rw-r--r--lib/mining/classifier/rf/rf_test.go190
-rw-r--r--lib/mining/classifier/runtime.go443
-rw-r--r--lib/mining/classifier/stat.go176
-rw-r--r--lib/mining/classifier/stats.go143
-rw-r--r--lib/mining/classifier/stats_interface.go68
-rw-r--r--lib/mining/cmd/cart/iris.dsv35
-rw-r--r--lib/mining/cmd/cart/main.go107
-rw-r--r--lib/mining/cmd/crf/main.go205
-rw-r--r--lib/mining/cmd/crf/phoneme.dsv45
-rw-r--r--lib/mining/cmd/lnsmote/main.go188
-rw-r--r--lib/mining/cmd/lnsmote/phoneme.dsv45
-rw-r--r--lib/mining/cmd/rf/iris.dsv41
-rw-r--r--lib/mining/cmd/rf/main.go188
-rw-r--r--lib/mining/cmd/rf/phoneme.dsv45
-rw-r--r--lib/mining/cmd/smote/main.go193
-rw-r--r--lib/mining/cmd/smote/phoneme.dsv40
-rw-r--r--lib/mining/gain/gini/gini.go461
-rw-r--r--lib/mining/gain/gini/gini_test.go59
-rw-r--r--lib/mining/gain/gini/ginifloat.go168
-rw-r--r--lib/mining/knn/knn.go109
-rw-r--r--lib/mining/knn/knn_test.go62
-rw-r--r--lib/mining/knn/neighbor.go150
-rw-r--r--lib/mining/knn/neighbor_test.go84
-rw-r--r--lib/mining/math/math.go68
-rw-r--r--lib/mining/math/math_test.go48
-rw-r--r--lib/mining/resampling/interface.go52
-rw-r--r--lib/mining/resampling/lnsmote/lnsmote.go287
-rw-r--r--lib/mining/resampling/lnsmote/lnsmote_test.go74
-rw-r--r--lib/mining/resampling/lnsmote/phoneme_lnsmote.dsv38
-rw-r--r--lib/mining/resampling/smote/phoneme_smote.dsv38
-rw-r--r--lib/mining/resampling/smote/smote.go183
-rw-r--r--lib/mining/resampling/smote/smote_test.go49
-rw-r--r--lib/mining/testdata/forensic_glass/fgl.data214
-rw-r--r--lib/mining/testdata/forensic_glass/fgl.dsv97
-rw-r--r--lib/mining/testdata/forensic_glass/glass.data214
-rw-r--r--lib/mining/testdata/forensic_glass/glass.dsv98
-rw-r--r--lib/mining/testdata/forensic_glass/glass.names90
-rw-r--r--lib/mining/testdata/forensic_glass/glass.tag27
-rw-r--r--lib/mining/testdata/iris/iris.dsv35
-rw-r--r--lib/mining/testdata/phoneme/phoneme.dsv59
-rw-r--r--lib/mining/testdata/wvc2010lnsmote/wvc2010_features.lnsmote.dsv129
-rw-r--r--lib/mining/tree/binary/binary.go64
-rw-r--r--lib/mining/tree/binary/binary_test.go47
-rw-r--r--lib/mining/tree/binary/btnode.go77
56 files changed, 7367 insertions, 0 deletions
diff --git a/lib/mining/.gitignore b/lib/mining/.gitignore
new file mode 100644
index 00000000..4b45a114
--- /dev/null
+++ b/lib/mining/.gitignore
@@ -0,0 +1,26 @@
+cmd/cart/cart
+cmd/crf/crf
+cmd/lnsmote/lnsmote
+cmd/rf/rf
+cmd/smote/smote
+resampling/lnsmote/lnsmote.outliers
+resampling/lnsmote/phoneme_lnsmote.csv
+resampling/lnsmote/synthetic.csv
+resampling/smote/phoneme_smote.csv
+resampling/smote/synthetic.csv
+resampling/smote/testdata/phoneme.csv
+testdata/phoneme/phoneme.csv
+*.bak
+*.bench
+*.cprof
+*.csv
+*.dat
+*.mprof
+*.oob
+*.perf
+*.prof
+*.rej
+*.sh
+*.stat
+*.stats
+*.test
diff --git a/lib/mining/LICENSE b/lib/mining/LICENSE
new file mode 100644
index 00000000..ec6ea686
--- /dev/null
+++ b/lib/mining/LICENSE
@@ -0,0 +1,38 @@
+Copyright (C) 2015, Mhd Sulhan (ms@kilabit.info). All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+* Neither the name of Kilabit nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY M.SHULHAN "AS IS" AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ --- --- --- --- --- --- ---
+
+ TT TT II BB AAAA LLLLLL II KKKKKKKK
+ TT TT II BB AA AA LL LL II KK
+ TTTT II BB AA AA LL LL II KK
+ TT TT II BB AAAAAAAA LLLLLL II KK
+ TT TT II BB AA AA LL LL II KK
+ TT TT II BBBBBBBB AA AA LLLLLL II KK
+
+Website: http://kilabit.info
+Contact: ms@kilabit.info
diff --git a/lib/mining/README.md b/lib/mining/README.md
new file mode 100644
index 00000000..821a6aa3
--- /dev/null
+++ b/lib/mining/README.md
@@ -0,0 +1,26 @@
+[![GoDoc](https://godoc.org/github.com/shuLhan/share/lib/mining?status.svg)](https://godoc.org/github.com/shuLhan/share/lib/mining)
+[![Go Report Card](https://goreportcard.com/badge/github.com/shuLhan/share/lib/mining)](https://goreportcard.com/report/github.com/shuLhan/share/lib/mining)
+
+# go-mining
+
+Go-mining is a small library for data mining.
+
+The library is written in [Go language](https://github.com/golang/go).
+
+## Features
+
+### Classifiers
+
+- CART
+- Random Forest
+- Cascaded Random Forest
+- K-Nearest Neighbourhood
+
+### Resampling
+
+- SMOTE
+- LN-SMOTE (Local Neighbourhood SMOTE)
+
+### Miscellaneous
+
+- Gini index
diff --git a/lib/mining/classifier/cart/cart.go b/lib/mining/classifier/cart/cart.go
new file mode 100644
index 00000000..449781d9
--- /dev/null
+++ b/lib/mining/classifier/cart/cart.go
@@ -0,0 +1,480 @@
+// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//
+// Package cart implements the Classification and Regression Tree by Breiman, et al.
+// CART is binary decision tree.
+//
+// Breiman, Leo, et al. Classification and regression trees. CRC press,
+// 1984.
+//
+// The implementation is based on Data Mining book,
+//
+// Han, Jiawei, Micheline Kamber, and Jian Pei. Data mining: concepts and
+// techniques: concepts and techniques. Elsevier, 2011.
+//
+package cart
+
+import (
+ "fmt"
+
+ "github.com/shuLhan/share/lib/debug"
+ "github.com/shuLhan/share/lib/mining/gain/gini"
+ "github.com/shuLhan/share/lib/mining/tree/binary"
+ "github.com/shuLhan/share/lib/numbers"
+ libstrings "github.com/shuLhan/share/lib/strings"
+ "github.com/shuLhan/share/lib/tabula"
+)
+
+const (
+ // SplitMethodGini if defined in Runtime, the dataset will be splitted
+ // using Gini gain for each possible value or partition.
+ //
+ // This option is used in Runtime.SplitMethod.
+ SplitMethodGini = "gini"
+)
+
+const (
+ // ColFlagParent denote that the column is parent/split node.
+ ColFlagParent = 1
+ // ColFlagSkip denote that the column would be skipped.
+ ColFlagSkip = 2
+)
+
+//
+// Runtime data for building CART.
+//
+type Runtime struct {
+ // SplitMethod define the criteria to used for splitting.
+ SplitMethod string `json:"SplitMethod"`
+ // NRandomFeature if less or equal to zero compute gain on all feature,
+ // otherwise select n random feature and compute gain only on selected
+ // features.
+ NRandomFeature int `json:"NRandomFeature"`
+ // OOBErrVal is the last out-of-bag error value in the tree.
+ OOBErrVal float64
+ // Tree in classification.
+ Tree binary.Tree
+}
+
+//
+// New create new Runtime object.
+//
+func New(D tabula.ClasetInterface, splitMethod string, nRandomFeature int) (
+ *Runtime, error,
+) {
+ runtime := &Runtime{
+ SplitMethod: splitMethod,
+ NRandomFeature: nRandomFeature,
+ Tree: binary.Tree{},
+ }
+
+ e := runtime.Build(D)
+ if e != nil {
+ return nil, e
+ }
+
+ return runtime, nil
+}
+
+//
+// Build will create a tree using CART algorithm.
+//
+func (runtime *Runtime) Build(D tabula.ClasetInterface) (e error) {
+ // Re-check input configuration.
+ switch runtime.SplitMethod {
+ case SplitMethodGini:
+ // Do nothing.
+ default:
+ // Set default split method to Gini index.
+ runtime.SplitMethod = SplitMethodGini
+ }
+
+ runtime.Tree.Root, e = runtime.splitTreeByGain(D)
+
+ return
+}
+
+//
+// splitTreeByGain calculate the gain in all dataset, and split into two node:
+// left and right.
+//
+// Return node with the split information.
+//
+func (runtime *Runtime) splitTreeByGain(D tabula.ClasetInterface) (
+ node *binary.BTNode,
+ e error,
+) {
+ node = &binary.BTNode{}
+
+ D.RecountMajorMinor()
+
+ // if dataset is empty return node labeled with majority classes in
+ // dataset.
+ nrow := D.GetNRow()
+
+ if nrow <= 0 {
+ if debug.Value >= 2 {
+ fmt.Printf("[cart] empty dataset (%s) : %v\n",
+ D.MajorityClass(), D)
+ }
+
+ node.Value = NodeValue{
+ IsLeaf: true,
+ Class: D.MajorityClass(),
+ Size: 0,
+ }
+ return node, nil
+ }
+
+ // if all dataset is in the same class, return node as leaf with class
+ // is set to that class.
+ single, name := D.IsInSingleClass()
+ if single {
+ if debug.Value >= 2 {
+ fmt.Printf("[cart] in single class (%s): %v\n", name,
+ D.GetColumns())
+ }
+
+ node.Value = NodeValue{
+ IsLeaf: true,
+ Class: name,
+ Size: nrow,
+ }
+ return node, nil
+ }
+
+ if debug.Value >= 2 {
+ fmt.Println("[cart] D:", D)
+ }
+
+ // calculate the Gini gain for each attribute.
+ gains := runtime.computeGain(D)
+
+ // get attribute with maximum Gini gain.
+ MaxGainIdx := gini.FindMaxGain(&gains)
+ MaxGain := gains[MaxGainIdx]
+
+ // if maxgain value is 0, use majority class as node and terminate
+ // the process
+ if MaxGain.GetMaxGainValue() == 0 {
+ if debug.Value >= 2 {
+ fmt.Println("[cart] max gain 0 with target",
+ D.GetClassAsStrings(),
+ " and majority class is ", D.MajorityClass())
+ }
+
+ node.Value = NodeValue{
+ IsLeaf: true,
+ Class: D.MajorityClass(),
+ Size: 0,
+ }
+ return node, nil
+ }
+
+ // using the sorted index in MaxGain, sort all field in dataset
+ tabula.SortColumnsByIndex(D, MaxGain.SortedIndex)
+
+ if debug.Value >= 2 {
+ fmt.Println("[cart] maxgain:", MaxGain)
+ }
+
+ // Now that we have attribute with max gain in MaxGainIdx, and their
+	// gain and partition value in Gains[MaxGainIdx] and
+ // GetMaxPartValue(), we split the dataset based on type of max-gain
+ // attribute.
+ // If its continuous, split the attribute using numeric value.
+ // If its discrete, split the attribute using subset (partition) of
+ // nominal values.
+ var splitV interface{}
+
+ if MaxGain.IsContinu {
+ splitV = MaxGain.GetMaxPartGainValue()
+ } else {
+ attrPartV := MaxGain.GetMaxPartGainValue()
+ attrSubV := attrPartV.(libstrings.Row)
+ splitV = attrSubV[0]
+ }
+
+ if debug.Value >= 2 {
+ fmt.Println("[cart] maxgainindex:", MaxGainIdx)
+ fmt.Println("[cart] split v:", splitV)
+ }
+
+ node.Value = NodeValue{
+ SplitAttrName: D.GetColumn(MaxGainIdx).GetName(),
+ IsLeaf: false,
+ IsContinu: MaxGain.IsContinu,
+ Size: nrow,
+ SplitAttrIdx: MaxGainIdx,
+ SplitV: splitV,
+ }
+
+ dsL, dsR, e := tabula.SplitRowsByValue(D, MaxGainIdx, splitV)
+
+ if e != nil {
+ return node, e
+ }
+
+ splitL := dsL.(tabula.ClasetInterface)
+ splitR := dsR.(tabula.ClasetInterface)
+
+ // Set the flag to parent in attribute referenced by
+ // MaxGainIdx, so it will not computed again in the next round.
+ cols := splitL.GetColumns()
+ for x := range *cols {
+ if x == MaxGainIdx {
+ (*cols)[x].Flag = ColFlagParent
+ } else {
+ (*cols)[x].Flag = 0
+ }
+ }
+
+ cols = splitR.GetColumns()
+ for x := range *cols {
+ if x == MaxGainIdx {
+ (*cols)[x].Flag = ColFlagParent
+ } else {
+ (*cols)[x].Flag = 0
+ }
+ }
+
+ nodeLeft, e := runtime.splitTreeByGain(splitL)
+ if e != nil {
+ return node, e
+ }
+
+ nodeRight, e := runtime.splitTreeByGain(splitR)
+ if e != nil {
+ return node, e
+ }
+
+ node.SetLeft(nodeLeft)
+ node.SetRight(nodeRight)
+
+ return node, nil
+}
+
+// SelectRandomFeature if NRandomFeature is greater than zero, select and
+// compute gain in n random features instead of in all features
+func (runtime *Runtime) SelectRandomFeature(D tabula.ClasetInterface) {
+ if runtime.NRandomFeature <= 0 {
+ // all features selected
+ return
+ }
+
+ ncols := D.GetNColumn()
+
+ // count all features minus class
+ nfeature := ncols - 1
+ if runtime.NRandomFeature >= nfeature {
+ // Do nothing if number of random feature equal or greater than
+ // number of feature in dataset.
+ return
+ }
+
+ // exclude class index and parent node index
+ excludeIdx := []int{D.GetClassIndex()}
+ cols := D.GetColumns()
+ for x, col := range *cols {
+ if (col.Flag & ColFlagParent) == ColFlagParent {
+ excludeIdx = append(excludeIdx, x)
+ } else {
+ (*cols)[x].Flag |= ColFlagSkip
+ }
+ }
+
+ // Select random features excluding feature in `excludeIdx`.
+ var pickedIdx []int
+ for x := 0; x < runtime.NRandomFeature; x++ {
+ idx := numbers.IntPickRandPositive(ncols, false, pickedIdx,
+ excludeIdx)
+ pickedIdx = append(pickedIdx, idx)
+
+ // Remove skip flag on selected column
+ col := D.GetColumn(idx)
+ col.Flag = col.Flag &^ ColFlagSkip
+ }
+
+ if debug.Value >= 1 {
+ fmt.Println("[cart] selected random features:", pickedIdx)
+ fmt.Println("[cart] selected columns :", D.GetColumns())
+ }
+}
+
+//
+// computeGain calculate the gini index for each value in each attribute.
+//
+func (runtime *Runtime) computeGain(D tabula.ClasetInterface) (
+ gains []gini.Gini,
+) {
+ switch runtime.SplitMethod {
+ case SplitMethodGini:
+ // create gains value for all attribute minus target class.
+ gains = make([]gini.Gini, D.GetNColumn())
+ }
+
+ runtime.SelectRandomFeature(D)
+
+ classVS := D.GetClassValueSpace()
+ classIdx := D.GetClassIndex()
+ classType := D.GetClassType()
+
+ for x, col := range *D.GetColumns() {
+ // skip class attribute.
+ if x == classIdx {
+ continue
+ }
+
+ // skip column flagged with parent
+ if (col.Flag & ColFlagParent) == ColFlagParent {
+ gains[x].Skip = true
+ continue
+ }
+
+ // ignore column flagged with skip
+ if (col.Flag & ColFlagSkip) == ColFlagSkip {
+ gains[x].Skip = true
+ continue
+ }
+
+ // compute gain.
+ if col.GetType() == tabula.TReal {
+ attr := col.ToFloatSlice()
+
+ if classType == tabula.TString {
+ target := D.GetClassAsStrings()
+ gains[x].ComputeContinu(&attr, &target,
+ &classVS)
+ } else {
+ targetReal := D.GetClassAsReals()
+ classVSReal := libstrings.ToFloat64(classVS)
+
+ gains[x].ComputeContinuFloat(&attr,
+ &targetReal, &classVSReal)
+ }
+ } else {
+ attr := col.ToStringSlice()
+ attrV := col.ValueSpace
+
+ if debug.Value >= 2 {
+ fmt.Println("[cart] attr :", attr)
+ fmt.Println("[cart] attrV:", attrV)
+ }
+
+ target := D.GetClassAsStrings()
+ gains[x].ComputeDiscrete(&attr, &attrV, &target,
+ &classVS)
+ }
+
+ if debug.Value >= 2 {
+ fmt.Println("[cart] gain :", gains[x])
+ }
+ }
+ return
+}
+
+//
+// Classify return the prediction of one sample.
+//
+func (runtime *Runtime) Classify(data *tabula.Row) (class string) {
+ node := runtime.Tree.Root
+ nodev := node.Value.(NodeValue)
+
+ for !nodev.IsLeaf {
+ if nodev.IsContinu {
+ splitV := nodev.SplitV.(float64)
+ attrV := (*data)[nodev.SplitAttrIdx].Float()
+
+ if attrV < splitV {
+ node = node.Left
+ } else {
+ node = node.Right
+ }
+ } else {
+ splitV := nodev.SplitV.([]string)
+ attrV := (*data)[nodev.SplitAttrIdx].String()
+
+ if libstrings.IsContain(splitV, attrV) {
+ node = node.Left
+ } else {
+ node = node.Right
+ }
+ }
+ nodev = node.Value.(NodeValue)
+ }
+
+ return nodev.Class
+}
+
+//
+// ClassifySet set the class attribute based on tree classification.
+//
+func (runtime *Runtime) ClassifySet(data tabula.ClasetInterface) (e error) {
+ nrow := data.GetNRow()
+ targetAttr := data.GetClassColumn()
+
+ for i := 0; i < nrow; i++ {
+ class := runtime.Classify(data.GetRow(i))
+
+ _ = (*targetAttr).Records[i].SetValue(class, tabula.TString)
+ }
+
+ return
+}
+
+//
+// CountOOBError process out-of-bag data on tree and return error value.
+//
+func (runtime *Runtime) CountOOBError(oob tabula.Claset) (
+ errval float64,
+ e error,
+) {
+ // save the original target to be compared later.
+ origTarget := oob.GetClassAsStrings()
+
+ if debug.Value >= 2 {
+ fmt.Println("[cart] OOB:", oob.Columns)
+ fmt.Println("[cart] TREE:", &runtime.Tree)
+ }
+
+ // reset the target.
+ oobtarget := oob.GetClassColumn()
+ oobtarget.ClearValues()
+
+ e = runtime.ClassifySet(&oob)
+
+ if e != nil {
+ // set original target values back.
+ oobtarget.SetValues(origTarget)
+ return
+ }
+
+ target := oobtarget.ToStringSlice()
+
+ if debug.Value >= 2 {
+ fmt.Println("[cart] original target:", origTarget)
+ fmt.Println("[cart] classify target:", target)
+ }
+
+ // count how many target value is miss-classified.
+ runtime.OOBErrVal, _, _ = libstrings.CountMissRate(origTarget, target)
+
+ // set original target values back.
+ oobtarget.SetValues(origTarget)
+
+ return runtime.OOBErrVal, nil
+}
+
+//
+// String yes, it will print it JSON like format.
+//
+func (runtime *Runtime) String() (s string) {
+ s = fmt.Sprintf("NRandomFeature: %d\n"+
+ " SplitMethod : %s\n"+
+ " Tree :\n%v", runtime.NRandomFeature,
+ runtime.SplitMethod,
+ runtime.Tree.String())
+ return s
+}
diff --git a/lib/mining/classifier/cart/cart_test.go b/lib/mining/classifier/cart/cart_test.go
new file mode 100644
index 00000000..14e89b12
--- /dev/null
+++ b/lib/mining/classifier/cart/cart_test.go
@@ -0,0 +1,62 @@
+// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cart
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/shuLhan/share/lib/dsv"
+ "github.com/shuLhan/share/lib/tabula"
+ "github.com/shuLhan/share/lib/test"
+)
+
+const (
+ NRows = 150
+)
+
+func TestCART(t *testing.T) {
+ fds := "../../testdata/iris/iris.dsv"
+
+ ds := tabula.Claset{}
+
+ _, e := dsv.SimpleRead(fds, &ds)
+ if nil != e {
+ t.Fatal(e)
+ }
+
+ fmt.Println("[cart_test] class index:", ds.GetClassIndex())
+
+ // copy target to be compared later.
+ targetv := ds.GetClassAsStrings()
+
+ test.Assert(t, "", NRows, ds.GetNRow(), true)
+
+ // Build CART tree.
+ CART, e := New(&ds, SplitMethodGini, 0)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ fmt.Println("[cart_test] CART Tree:\n", CART)
+
+ // Create test set
+ testset := tabula.Claset{}
+ _, e = dsv.SimpleRead(fds, &testset)
+
+ if nil != e {
+ t.Fatal(e)
+ }
+
+ testset.GetClassColumn().ClearValues()
+
+	// Classify test set
+ e = CART.ClassifySet(&testset)
+ if nil != e {
+ t.Fatal(e)
+ }
+
+ test.Assert(t, "", targetv, testset.GetClassAsStrings(), true)
+}
diff --git a/lib/mining/classifier/cart/node.go b/lib/mining/classifier/cart/node.go
new file mode 100644
index 00000000..b64dd13c
--- /dev/null
+++ b/lib/mining/classifier/cart/node.go
@@ -0,0 +1,44 @@
+// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cart
+
+import (
+ "fmt"
+ "reflect"
+)
+
+//
+// NodeValue of tree in CART.
+//
+type NodeValue struct {
+ // Class of leaf node.
+ Class string
+ // SplitAttrName define the name of attribute which cause the split.
+ SplitAttrName string
+ // IsLeaf define whether node is a leaf or not.
+ IsLeaf bool
+ // IsContinu define whether the node split is continuous or discrete.
+ IsContinu bool
+ // Size define number of sample that this node hold before splitting.
+ Size int
+ // SplitAttrIdx define the attribute which cause the split.
+ SplitAttrIdx int
+ // SplitV define the split value.
+ SplitV interface{}
+}
+
+//
+// String will return the value of node for printable.
+//
+func (nodev *NodeValue) String() (s string) {
+ if nodev.IsLeaf {
+ s = fmt.Sprintf("Class: %s", nodev.Class)
+ } else {
+ s = fmt.Sprintf("(SplitValue: %v)",
+ reflect.ValueOf(nodev.SplitV))
+ }
+
+ return s
+}
diff --git a/lib/mining/classifier/cm.go b/lib/mining/classifier/cm.go
new file mode 100644
index 00000000..0dc2ee05
--- /dev/null
+++ b/lib/mining/classifier/cm.go
@@ -0,0 +1,442 @@
+// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package classifier
+
+import (
+ "fmt"
+ "strconv"
+
+ "github.com/shuLhan/share/lib/tabula"
+)
+
+//
+// CM represent the matrix of classification.
+//
+type CM struct {
+ tabula.Dataset
+ // rowNames contain name in each row.
+ rowNames []string
+ // nSamples contain number of class.
+ nSamples int64
+ // nTrue contain number of true positive and negative.
+ nTrue int64
+ // nFalse contain number of false positive and negative.
+ nFalse int64
+
+ // tpIds contain index of true-positive samples.
+ tpIds []int
+ // fpIds contain index of false-positive samples.
+ fpIds []int
+ // tnIds contain index of true-negative samples.
+ tnIds []int
+ // fnIds contain index of false-negative samples.
+ fnIds []int
+}
+
+//
+// initByNumeric will initialize confusion matrix using numeric value space.
+//
+func (cm *CM) initByNumeric(vs []int64) {
+ var colTypes []int
+ var colNames []string
+
+ for _, v := range vs {
+ vstr := strconv.FormatInt(v, 10)
+ colTypes = append(colTypes, tabula.TInteger)
+ colNames = append(colNames, vstr)
+ cm.rowNames = append(cm.rowNames, vstr)
+ }
+
+ // class error column
+ colTypes = append(colTypes, tabula.TReal)
+ colNames = append(colNames, "class_error")
+
+ cm.Dataset.Init(tabula.DatasetModeMatrix, colTypes, colNames)
+}
+
+//
+// ComputeStrings will calculate confusion matrix using targets and predictions
+// class values.
+//
+func (cm *CM) ComputeStrings(valueSpace, targets, predictions []string) {
+ cm.init(valueSpace)
+
+ for x, target := range valueSpace {
+ col := cm.GetColumn(x)
+
+ for _, predict := range valueSpace {
+ cnt := cm.countTargetPrediction(target, predict,
+ targets, predictions)
+
+ col.PushBack(tabula.NewRecordInt(cnt))
+ }
+
+ cm.PushColumnToRows(*col)
+ }
+
+ cm.computeClassError()
+}
+
+//
+// ComputeNumeric will calculate confusion matrix using targets and predictions
+// values.
+//
+func (cm *CM) ComputeNumeric(vs, actuals, predictions []int64) {
+ cm.initByNumeric(vs)
+
+ for x, act := range vs {
+ col := cm.GetColumn(x)
+
+ for _, pred := range vs {
+ cnt := cm.countNumeric(act, pred, actuals, predictions)
+
+ rec := tabula.NewRecordInt(cnt)
+ col.PushBack(rec)
+ }
+
+ cm.PushColumnToRows(*col)
+ }
+
+ cm.computeClassError()
+}
+
+//
+// create will initialize confusion matrix using value space.
+//
+func (cm *CM) init(valueSpace []string) {
+ var colTypes []int
+ var colNames []string
+
+ for _, v := range valueSpace {
+ colTypes = append(colTypes, tabula.TInteger)
+ colNames = append(colNames, v)
+ cm.rowNames = append(cm.rowNames, v)
+ }
+
+ // class error column
+ colTypes = append(colTypes, tabula.TReal)
+ colNames = append(colNames, "class_error")
+
+ cm.Dataset.Init(tabula.DatasetModeMatrix, colTypes, colNames)
+}
+
+//
+// countTargetPrediction will count and return number of true positive or false
+// positive in predictions using targets values.
+//
+func (cm *CM) countTargetPrediction(target, predict string,
+ targets, predictions []string,
+) (
+ cnt int64,
+) {
+ predictslen := len(predictions)
+
+ for x, v := range targets {
+ // In case out of range, where predictions length less than
+ // targets length.
+ if x >= predictslen {
+ break
+ }
+ if v != target {
+ continue
+ }
+ if predict == predictions[x] {
+ cnt++
+ }
+ }
+ return
+}
+
+//
+// countNumeric will count and return number of `pred` in predictions where
+// actuals value is `act`.
+//
+func (cm *CM) countNumeric(act, pred int64, actuals, predictions []int64) (
+ cnt int64,
+) {
+ // Find minimum length to mitigate out-of-range loop.
+ minlen := len(actuals)
+ if len(predictions) < minlen {
+ minlen = len(predictions)
+ }
+
+ for x := 0; x < minlen; x++ {
+ if actuals[x] != act {
+ continue
+ }
+ if predictions[x] != pred {
+ continue
+ }
+ cnt++
+ }
+ return cnt
+}
+
+//
+// computeClassError will compute the classification error in matrix.
+//
+func (cm *CM) computeClassError() {
+ var tp, fp int64
+
+ cm.nSamples = 0
+ cm.nFalse = 0
+
+ classcol := cm.GetNColumn() - 1
+ col := cm.GetColumnClassError()
+ rows := cm.GetDataAsRows()
+ for x, row := range *rows {
+ for y, cell := range *row {
+ if y == classcol {
+ break
+ }
+ if x == y {
+ tp = cell.Integer()
+ } else {
+ fp += cell.Integer()
+ }
+ }
+
+ nSamplePerRow := tp + fp
+ errv := float64(fp) / float64(nSamplePerRow)
+ col.PushBack(tabula.NewRecordReal(errv))
+
+ cm.nSamples += nSamplePerRow
+ cm.nTrue += tp
+ cm.nFalse += fp
+ }
+
+ cm.PushColumnToRows(*col)
+}
+
+//
+// GroupIndexPredictions given index of samples, group the samples by their
+// class of prediction. For example,
+//
+// sampleIds: [0, 1, 2, 3, 4, 5]
+// actuals: [1, 1, 0, 0, 1, 0]
+// predictions: [1, 0, 1, 0, 1, 1]
+//
+// This function will group the index by true-positive, false-positive,
+// true-negative, and false-negative, which result in,
+//
+// true-positive indices: [0, 4]
+// false-positive indices: [2, 5]
+// true-negative indices: [3]
+// false-negative indices: [1]
+//
+// This function assume that positive value as "1" and negative value as "0".
+//
+func (cm *CM) GroupIndexPredictions(sampleIds []int,
+ actuals, predictions []int64,
+) {
+ // Reset indices.
+ cm.tpIds = nil
+ cm.fpIds = nil
+ cm.tnIds = nil
+ cm.fnIds = nil
+
+ // Make sure we are not out-of-range when looping, always pick the
+ // minimum length between the three parameters.
+ min := len(sampleIds)
+ if len(actuals) < min {
+ min = len(actuals)
+ }
+ if len(predictions) < min {
+ min = len(predictions)
+ }
+
+ for x := 0; x < min; x++ {
+ if actuals[x] == 1 {
+ if predictions[x] == 1 {
+ cm.tpIds = append(cm.tpIds, sampleIds[x])
+ } else {
+ cm.fnIds = append(cm.fnIds, sampleIds[x])
+ }
+ } else {
+ if predictions[x] == 1 {
+ cm.fpIds = append(cm.fpIds, sampleIds[x])
+ } else {
+ cm.tnIds = append(cm.tnIds, sampleIds[x])
+ }
+ }
+ }
+}
+
+//
+// GroupIndexPredictionsStrings is an alternative to GroupIndexPredictions
+// which work with string class.
+//
+func (cm *CM) GroupIndexPredictionsStrings(sampleIds []int,
+ actuals, predictions []string,
+) {
+ if len(sampleIds) <= 0 {
+ return
+ }
+
+ // Reset indices.
+ cm.tpIds = nil
+ cm.fpIds = nil
+ cm.tnIds = nil
+ cm.fnIds = nil
+
+ // Make sure we are not out-of-range when looping, always pick the
+ // minimum length between the three parameters.
+ min := len(sampleIds)
+ if len(actuals) < min {
+ min = len(actuals)
+ }
+ if len(predictions) < min {
+ min = len(predictions)
+ }
+
+ for x := 0; x < min; x++ {
+ if actuals[x] == "1" {
+ if predictions[x] == "1" {
+ cm.tpIds = append(cm.tpIds, sampleIds[x])
+ } else {
+ cm.fnIds = append(cm.fnIds, sampleIds[x])
+ }
+ } else {
+ if predictions[x] == "1" {
+ cm.fpIds = append(cm.fpIds, sampleIds[x])
+ } else {
+ cm.tnIds = append(cm.tnIds, sampleIds[x])
+ }
+ }
+ }
+}
+
+//
+// GetColumnClassError return the last column which is the column that contain
+// the error of classification.
+//
+func (cm *CM) GetColumnClassError() *tabula.Column {
+ return cm.GetColumn(cm.GetNColumn() - 1)
+}
+
+//
+// GetTrueRate return true-positive rate in term of
+//
+// true-positive / (true-positive + false-positive)
+//
+func (cm *CM) GetTrueRate() float64 {
+ return float64(cm.nTrue) / float64(cm.nTrue+cm.nFalse)
+}
+
+//
+// GetFalseRate return false-positive rate in term of,
+//
+// false-positive / (false-positive + true negative)
+//
+func (cm *CM) GetFalseRate() float64 {
+ return float64(cm.nFalse) / float64(cm.nTrue+cm.nFalse)
+}
+
+//
+// TP return number of true-positive in confusion matrix.
+//
+func (cm *CM) TP() int {
+ row := cm.GetRow(0)
+ if row == nil {
+ return 0
+ }
+
+ v, _ := row.GetIntAt(0)
+ return int(v)
+}
+
+//
+// FP return number of false-positive in confusion matrix.
+//
+func (cm *CM) FP() int {
+ row := cm.GetRow(0)
+ if row == nil {
+ return 0
+ }
+
+ v, _ := row.GetIntAt(1)
+ return int(v)
+}
+
+//
+// FN return number of false-negative.
+//
+func (cm *CM) FN() int {
+ row := cm.GetRow(1)
+ if row == nil {
+ return 0
+ }
+ v, _ := row.GetIntAt(0)
+ return int(v)
+}
+
+//
+// TN return number of true-negative.
+//
+func (cm *CM) TN() int {
+ row := cm.GetRow(1)
+ if row == nil {
+ return 0
+ }
+ v, _ := row.GetIntAt(1)
+ return int(v)
+}
+
+//
+// TPIndices return indices of all true-positive samples.
+//
+func (cm *CM) TPIndices() []int {
+ return cm.tpIds
+}
+
+//
+// FNIndices return indices of all false-negative samples.
+//
+func (cm *CM) FNIndices() []int {
+ return cm.fnIds
+}
+
+//
+// FPIndices return indices of all false-positive samples.
+//
+func (cm *CM) FPIndices() []int {
+ return cm.fpIds
+}
+
+//
+// TNIndices return indices of all true-negative samples.
+//
+func (cm *CM) TNIndices() []int {
+ return cm.tnIds
+}
+
+//
+// String will return the output of confusion matrix in table like format.
+//
+func (cm *CM) String() (s string) {
+ s += "Confusion Matrix:\n"
+
+ // Row header: column names.
+ s += "\t"
+ for _, col := range cm.GetColumnsName() {
+ s += col + "\t"
+ }
+ s += "\n"
+
+ rows := cm.GetDataAsRows()
+ for x, row := range *rows {
+ s += cm.rowNames[x] + "\t"
+
+ for _, v := range *row {
+ s += v.String() + "\t"
+ }
+ s += "\n"
+ }
+
+ s += fmt.Sprintf("TP-FP indices %d %d\n", len(cm.tpIds), len(cm.fpIds))
+ s += fmt.Sprintf("FN-TN indices %d %d\n", len(cm.fnIds), len(cm.tnIds))
+
+ return
+}
diff --git a/lib/mining/classifier/cm_test.go b/lib/mining/classifier/cm_test.go
new file mode 100644
index 00000000..6fd47d93
--- /dev/null
+++ b/lib/mining/classifier/cm_test.go
@@ -0,0 +1,69 @@
+// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package classifier
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/shuLhan/share/lib/test"
+)
+
+func TestComputeNumeric(t *testing.T) {
+ actuals := []int64{1, 1, 1, 0, 0, 0, 0}
+ predics := []int64{1, 1, 0, 0, 0, 0, 1}
+ vs := []int64{1, 0}
+ exp := []int{2, 1, 3, 1}
+
+ cm := &CM{}
+
+ cm.ComputeNumeric(vs, actuals, predics)
+
+ test.Assert(t, "", exp[0], cm.TP(), true)
+ test.Assert(t, "", exp[1], cm.FN(), true)
+ test.Assert(t, "", exp[2], cm.TN(), true)
+ test.Assert(t, "", exp[3], cm.FP(), true)
+
+ fmt.Println(cm)
+}
+
+func TestComputeStrings(t *testing.T) {
+ actuals := []string{"1", "1", "1", "0", "0", "0", "0"}
+ predics := []string{"1", "1", "0", "0", "0", "0", "1"}
+ vs := []string{"1", "0"}
+ exp := []int{2, 1, 3, 1}
+
+ cm := &CM{}
+
+ cm.ComputeStrings(vs, actuals, predics)
+
+ test.Assert(t, "", exp[0], cm.TP(), true)
+ test.Assert(t, "", exp[1], cm.FN(), true)
+ test.Assert(t, "", exp[2], cm.TN(), true)
+ test.Assert(t, "", exp[3], cm.FP(), true)
+
+ fmt.Println(cm)
+}
+
+func TestGroupIndexPredictions(t *testing.T) {
+ testIds := []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
+ actuals := []int64{1, 1, 1, 1, 0, 0, 0, 0, 0, 0}
+ predics := []int64{1, 1, 0, 1, 0, 0, 0, 0, 1, 0}
+ exp := [][]int{
+ {0, 1, 3}, // tp
+ {2}, // fn
+ {8}, // fp
+ {4, 5, 6, 7, 9}, // tn
+ }
+
+ cm := &CM{}
+
+ cm.GroupIndexPredictions(testIds, actuals, predics)
+
+ test.Assert(t, "", exp[0], cm.TPIndices(), true)
+ test.Assert(t, "", exp[1], cm.FNIndices(), true)
+ test.Assert(t, "", exp[2], cm.FPIndices(), true)
+ test.Assert(t, "", exp[3], cm.TNIndices(), true)
+}
diff --git a/lib/mining/classifier/crf/crf.go b/lib/mining/classifier/crf/crf.go
new file mode 100644
index 00000000..3df7dd83
--- /dev/null
+++ b/lib/mining/classifier/crf/crf.go
@@ -0,0 +1,470 @@
+// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//
+// Package crf implement the cascaded random forest algorithm, proposed by
+// Baumann et.al in their paper:
+//
+// Baumann, Florian, et al. "Cascaded Random Forest for Fast Object
+// Detection." Image Analysis. Springer Berlin Heidelberg, 2013. 131-142.
+//
+//
+package crf
+
+import (
+ "errors"
+ "fmt"
+ "math"
+ "sort"
+
+ "github.com/shuLhan/share/lib/debug"
+ "github.com/shuLhan/share/lib/mining/classifier"
+ "github.com/shuLhan/share/lib/mining/classifier/rf"
+ "github.com/shuLhan/share/lib/numbers"
+ libstrings "github.com/shuLhan/share/lib/strings"
+ "github.com/shuLhan/share/lib/tabula"
+)
+
+const (
+	tag = "[crf]"
+
+	// DefStage default number of stages.
+	DefStage = 200
+	// DefTPRate default threshold for true-positive rate.
+	DefTPRate = 0.9
+	// DefTNRate default threshold for true-negative rate.
+	DefTNRate = 0.7
+
+	// DefNumTree default number of tree.
+	DefNumTree = 1
+	// DefPercentBoot default percentage of sample that will be used for
+	// bootstrapping a tree.
+	DefPercentBoot = 66
+	// DefPerfFile default performance file output.
+	DefPerfFile = "crf.perf"
+	// DefStatFile default statistic file output.
+	DefStatFile = "crf.stat"
+)
+
+var (
+	// ErrNoInput is returned by Build when the input samples is nil.
+	// The message prefix is "crf", matching this package; it previously
+	// said "rf", a copy-paste from the random-forest package.
+	ErrNoInput = errors.New("crf: input samples is empty")
+)
+
+//
+// Runtime define the cascaded random forest runtime input and output.
+//
+type Runtime struct {
+	// Runtime embed common fields for classifier.
+	classifier.Runtime
+
+	// NStage number of stage.
+	NStage int `json:"NStage"`
+	// TPRate threshold for true positive rate per stage.
+	TPRate float64 `json:"TPRate"`
+	// TNRate threshold for true negative rate per stage.
+	TNRate float64 `json:"TNRate"`
+
+	// NTree number of tree in each stage.
+	NTree int `json:"NTree"`
+	// NRandomFeature number of features used to split the dataset.
+	NRandomFeature int `json:"NRandomFeature"`
+	// PercentBoot percentage of bootstrap.
+	PercentBoot int `json:"PercentBoot"`
+
+	// forests contain forest for each stage.
+	forests []*rf.Runtime
+	// weights contain weight for each stage, parallel with forests
+	// (weights[i] belongs to forests[i]).
+	weights []float64
+	// tnset contain sample of all true-negative in each iteration,
+	// filled by deleteTrueNegative and consumed by runTPSet.
+	tnset *tabula.Claset
+}
+
+//
+// New create and return new input for cascaded random-forest.
+//
+// NOTE(review): the `samples` parameter is accepted but not used here;
+// validation against the samples happens later in Initialize, which is
+// called from Build.
+//
+func New(nstage, ntree, percentboot, nfeature int,
+	tprate, tnrate float64,
+	samples tabula.ClasetInterface,
+) (
+	crf *Runtime,
+) {
+	crf = &Runtime{
+		NStage: nstage,
+		NTree: ntree,
+		PercentBoot: percentboot,
+		NRandomFeature: nfeature,
+		TPRate: tprate,
+		TNRate: tnrate,
+	}
+
+	return crf
+}
+
+//
+// AddForest will append new forest as the next stage of the cascade.
+//
+func (crf *Runtime) AddForest(forest *rf.Runtime) {
+	crf.forests = append(crf.forests, forest)
+}
+
+//
+// Initialize will check crf inputs and set it to default values if its
+// invalid.
+//
+func (crf *Runtime) Initialize(samples tabula.ClasetInterface) error {
+	if crf.NStage <= 0 {
+		crf.NStage = DefStage
+	}
+	// Rates are probabilities: force them into the open interval (0,1).
+	if crf.TPRate <= 0 || crf.TPRate >= 1 {
+		crf.TPRate = DefTPRate
+	}
+	if crf.TNRate <= 0 || crf.TNRate >= 1 {
+		crf.TNRate = DefTNRate
+	}
+	if crf.NTree <= 0 {
+		crf.NTree = DefNumTree
+	}
+	if crf.PercentBoot <= 0 {
+		crf.PercentBoot = DefPercentBoot
+	}
+	if crf.NRandomFeature <= 0 {
+		// Set default value to square-root of features.
+		ncol := samples.GetNColumn() - 1
+		crf.NRandomFeature = int(math.Sqrt(float64(ncol)))
+	}
+	if crf.PerfFile == "" {
+		crf.PerfFile = DefPerfFile
+	}
+	if crf.StatFile == "" {
+		crf.StatFile = DefStatFile
+	}
+	// NOTE(review): Clone presumably copies only the dataset structure
+	// (columns, class index) without rows, since TN rows are pushed into
+	// tnset later by deleteTrueNegative — verify against tabula.Claset.
+	crf.tnset = samples.Clone().(*tabula.Claset)
+
+	return crf.Runtime.Initialize()
+}
+
+//
+// Build given a sample dataset, build the stage with randomforest.
+// It returns ErrNoInput when samples is nil.
+//
+func (crf *Runtime) Build(samples tabula.ClasetInterface) (e error) {
+	if samples == nil {
+		return ErrNoInput
+	}
+
+	e = crf.Initialize(samples)
+	if e != nil {
+		return
+	}
+
+	fmt.Println(tag, "Training samples:", samples)
+	fmt.Println(tag, "Sample (one row):", samples.GetRow(0))
+	fmt.Println(tag, "Config:", crf)
+
+	// Build one random forest per stage; each stage mutates `samples`
+	// (true-negatives removed, false-positives refilled) before the next.
+	for x := 0; x < crf.NStage; x++ {
+		if debug.Value >= 1 {
+			fmt.Println(tag, "Stage #", x)
+		}
+
+		forest, e := crf.createForest(samples)
+		if e != nil {
+			return e
+		}
+
+		e = crf.finalizeStage(forest)
+		if e != nil {
+			return e
+		}
+	}
+
+	return crf.Finalize()
+}
+
+//
+// createForest will create and return a forest and run the training `samples`
+// on it.
+//
+// Algorithm,
+// (1) Initialize forest.
+// (2) For 0 to maximum number of tree in forest,
+// (2.1) grow one tree until success.
+// (2.2) If tree tp-rate and tn-rate greater than threshold, stop growing.
+// (3) Calculate weight.
+// (4) TODO: Move true-negative from samples. The collection of true-negative
+// will be used again to test the model and after test and the sample with FP
+// will be moved to training samples again.
+// (5) Refill samples with false-positive.
+//
+func (crf *Runtime) createForest(samples tabula.ClasetInterface) (
+	forest *rf.Runtime, e error,
+) {
+	var cm *classifier.CM
+	var stat *classifier.Stat
+
+	fmt.Println(tag, "Forest samples:", samples)
+
+	// (1)
+	forest = &rf.Runtime{
+		Runtime: classifier.Runtime{
+			RunOOB: true,
+		},
+		NTree: crf.NTree,
+		NRandomFeature: crf.NRandomFeature,
+	}
+
+	e = forest.Initialize(samples)
+	if e != nil {
+		return nil, e
+	}
+
+	// (2)
+	for t := 0; t < crf.NTree; t++ {
+		if debug.Value >= 2 {
+			fmt.Println(tag, "Tree #", t)
+		}
+
+		// (2.1)
+		// NOTE(review): this retries forever; if GrowTree fails
+		// persistently (e.g. samples too small) it will never
+		// terminate.  Consider a retry limit.
+		for {
+			cm, stat, e = forest.GrowTree(samples)
+			if e == nil {
+				break
+			}
+		}
+
+		// (2.2)
+		if stat.TPRate > crf.TPRate &&
+			stat.TNRate > crf.TNRate {
+			break
+		}
+	}
+
+	e = forest.Finalize()
+	if e != nil {
+		return nil, e
+	}
+
+	// (3)
+	crf.computeWeight(stat)
+
+	if debug.Value >= 1 {
+		// The actual stage weight is exp(FMeasure); only the raw
+		// F-measure is printed here.
+		fmt.Println(tag, "Weight:", stat.FMeasure)
+	}
+
+	// (4)
+	crf.deleteTrueNegative(samples, cm)
+
+	// (5)
+	crf.runTPSet(samples)
+
+	samples.RecountMajorMinor()
+
+	return forest, nil
+}
+
+//
+// finalizeStage save forest and write the forest statistic to file.
+// The stat ID is the zero-based index of the stage being finalized.
+//
+func (crf *Runtime) finalizeStage(forest *rf.Runtime) (e error) {
+	stat := forest.StatTotal()
+	stat.ID = int64(len(crf.forests))
+
+	e = crf.WriteOOBStat(stat)
+	if e != nil {
+		return e
+	}
+
+	crf.AddStat(stat)
+	crf.ComputeStatTotal(stat)
+
+	if debug.Value >= 1 {
+		crf.PrintStatTotal(nil)
+	}
+
+	// Keep the trained forest as the next stage of the cascade.
+	crf.AddForest(forest)
+
+	return nil
+}
+
+//
+// computeWeight will compute the weight of stage based on F-measure of the
+// last tree in forest.  The weight is exp(F-measure), so a stage with
+// F-measure 0 still gets weight 1.
+//
+func (crf *Runtime) computeWeight(stat *classifier.Stat) {
+	crf.weights = append(crf.weights, math.Exp(stat.FMeasure))
+}
+
+//
+// deleteTrueNegative will delete all samples data where their row index is in
+// true-negative values in confusion matrix and move it to TN-set.
+//
+// (1) Move true negative to tnset on the first iteration, on the next
+// iteration it will be full deleted.
+// (2) Delete TN from sample set one-by-one with offset, to make sure we
+// are not deleting with wrong index.
+//
+func (crf *Runtime) deleteTrueNegative(samples tabula.ClasetInterface,
+	cm *classifier.CM,
+) {
+	var row *tabula.Row
+
+	// Sorting ascending is required for the `i - x` offset trick below:
+	// after deleting x earlier rows, every later index shifts down by x.
+	tnids := cm.TNIndices()
+	sort.Ints(tnids)
+
+	// (1)
+	if len(crf.weights) <= 1 {
+		for _, i := range tnids {
+			crf.tnset.PushRow(samples.GetRow(i))
+		}
+	}
+
+	// (2)
+	c := 0
+	for x, i := range tnids {
+		row = samples.DeleteRow(i - x)
+		if row != nil {
+			c++
+		}
+	}
+
+	if debug.Value >= 1 {
+		fmt.Println(tag, "# TN", len(tnids), "# deleted", c)
+	}
+}
+
+//
+// refillWithFP will copy the false-positive rows in the TN-set `tnset`
+// and append them to training set `samples`, then remove them from `tnset`.
+//
+func (crf *Runtime) refillWithFP(samples, tnset tabula.ClasetInterface,
+	cm *classifier.CM,
+) {
+	// Get and sort FP.  Ascending order is required for the `i - x`
+	// offset deletion below.
+	fpids := cm.FPIndices()
+	sort.Ints(fpids)
+
+	// Move FP samples from TN-set to training set samples.
+	for _, i := range fpids {
+		samples.PushRow(tnset.GetRow(i))
+	}
+
+	// Delete FP from the TN-set (not from the training set).
+	var row *tabula.Row
+	c := 0
+	for x, i := range fpids {
+		row = tnset.DeleteRow(i - x)
+		if row != nil {
+			c++
+		}
+	}
+
+	if debug.Value >= 1 {
+		fmt.Println(tag, "# FP", len(fpids), "# refilled", c)
+	}
+}
+
+//
+// runTPSet will run true-positive set into trained stage, to get the
+// false-positive. The FP samples will be added to training set.
+//
+// NOTE(review): despite the name, this classifies crf.tnset (the
+// true-negative set); the FP found there are moved back into `samples`.
+//
+func (crf *Runtime) runTPSet(samples tabula.ClasetInterface) {
+	// Skip the first stage, because we just got tnset from them.
+	if len(crf.weights) <= 1 {
+		return
+	}
+
+	tnIds := numbers.IntCreateSeq(0, crf.tnset.Len()-1)
+	_, cm, _ := crf.ClassifySetByWeight(crf.tnset, tnIds)
+
+	crf.refillWithFP(samples, crf.tnset, cm)
+}
+
+//
+// ClassifySetByWeight will classify each instance in samples by weight
+// with respect to its single performance.
+//
+// Algorithm,
+// (1) For each instance in samples,
+// (1.1) for each stage,
+// (1.1.1) collect votes for instance in current stage.
+// (1.1.2) Compute probabilities of each classes in votes.
+//
+//	prob_class = count_of_class / total_votes
+//
+// (1.1.3) Compute total of probabilites times of stage weight.
+//
+//	stage_prob = prob_class * stage_weight
+//
+// (1.2) Divide each class stage probabilites with
+//
+//	stage_prob = stage_prob /
+//		(sum_of_all_weights * number_of_tree_in_forest)
+//
+// (1.3) Select class label with highest probabilites.
+// (1.4) Save stage probabilities for positive class.
+// (2) Compute confusion matrix.
+//
+func (crf *Runtime) ClassifySetByWeight(samples tabula.ClasetInterface,
+	sampleIds []int,
+) (
+	predicts []string, cm *classifier.CM, probs []float64,
+) {
+	stat := classifier.Stat{}
+	stat.Start()
+
+	vs := samples.GetClassValueSpace()
+	stageProbs := make([]float64, len(vs))
+	stageSumProbs := make([]float64, len(vs))
+	sumWeights := numbers.Floats64Sum(crf.weights)
+
+	// (1)
+	rows := samples.GetDataAsRows()
+	for _, row := range *rows {
+		// Reset both accumulators for each sample.  Previously only
+		// stageSumProbs was zeroed, so stageProbs leaked weighted
+		// probabilities from earlier rows into later predictions.
+		for y := range vs {
+			stageSumProbs[y] = 0
+			stageProbs[y] = 0
+		}
+
+		// (1.1)
+		for y, forest := range crf.forests {
+			// (1.1.1)
+			votes := forest.Votes(row, -1)
+
+			// (1.1.2)
+			probs := libstrings.FrequencyOfTokens(votes, vs, false)
+
+			// (1.1.3)
+			for z := range probs {
+				stageSumProbs[z] += probs[z]
+				stageProbs[z] += probs[z] * crf.weights[y]
+			}
+		}
+
+		// (1.2)
+		stageWeight := sumWeights * float64(crf.NTree)
+
+		for x := range stageProbs {
+			stageProbs[x] = stageProbs[x] / stageWeight
+		}
+
+		// (1.3)
+		_, maxi, ok := numbers.Floats64FindMax(stageProbs)
+		if ok {
+			predicts = append(predicts, vs[maxi])
+		}
+
+		// (1.4) Average un-weighted probability of the first
+		// (positive) class across all stages.
+		probs = append(probs, stageSumProbs[0]/
+			float64(len(crf.forests)))
+	}
+
+	// (2)
+	actuals := samples.GetClassAsStrings()
+	cm = crf.ComputeCM(sampleIds, vs, actuals, predicts)
+
+	crf.ComputeStatFromCM(&stat, cm)
+	stat.End()
+
+	_ = stat.Write(crf.StatFile)
+
+	return predicts, cm, probs
+}
diff --git a/lib/mining/classifier/crf/crf_test.go b/lib/mining/classifier/crf/crf_test.go
new file mode 100644
index 00000000..4e530591
--- /dev/null
+++ b/lib/mining/classifier/crf/crf_test.go
@@ -0,0 +1,92 @@
+// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package crf
+
+import (
+ "fmt"
+ "os"
+ "testing"
+
+ "github.com/shuLhan/share/lib/dsv"
+ "github.com/shuLhan/share/lib/mining/classifier"
+ "github.com/shuLhan/share/lib/tabula"
+)
+
+var (
+	// SampleFile is the DSV file containing the training samples.
+	SampleFile string
+	// PerfFile is where performance statistics are written.
+	PerfFile string
+	// StatFile is where classifier statistics are written.
+	StatFile string
+	// NStage and NTree configure the cascade; tests mutate them before
+	// calling runCRF, so test order matters.
+	NStage = 200
+	NTree = 1
+)
+
+// runCRF builds a cascaded random forest from SampleFile using the current
+// global options, classifies the held-out split, and writes performance
+// statistics to PerfFile.
+func runCRF(t *testing.T) {
+	// read trainingset.
+	samples := tabula.Claset{}
+	_, e := dsv.SimpleRead(SampleFile, &samples)
+	if e != nil {
+		t.Fatal(e)
+	}
+
+	// Use ~63% of rows for training; the remainder becomes the test set.
+	nbag := (samples.Len() * 63) / 100
+	train, test, _, testIds := tabula.RandomPickRows(&samples, nbag, false)
+
+	trainset := train.(tabula.ClasetInterface)
+	testset := test.(tabula.ClasetInterface)
+
+	crfRuntime := Runtime{
+		Runtime: classifier.Runtime{
+			StatFile: StatFile,
+			PerfFile: PerfFile,
+		},
+		NStage: NStage,
+		NTree: NTree,
+	}
+
+	e = crfRuntime.Build(trainset)
+	if e != nil {
+		t.Fatal(e)
+	}
+
+	testset.RecountMajorMinor()
+	fmt.Println("Testset:", testset)
+
+	predicts, cm, probs := crfRuntime.ClassifySetByWeight(testset, testIds)
+
+	fmt.Println("Confusion matrix:", cm)
+
+	crfRuntime.Performance(testset, predicts, probs)
+	e = crfRuntime.WritePerformance()
+	if e != nil {
+		t.Fatal(e)
+	}
+}
+
+// TestPhoneme200_1 runs the cascade on the phoneme dataset with the default
+// 200 stages and 1 tree per stage.
+func TestPhoneme200_1(t *testing.T) {
+	SampleFile = "../../testdata/phoneme/phoneme.dsv"
+	PerfFile = "phoneme_200_1.perf"
+	StatFile = "phoneme_200_1.stat"
+
+	runCRF(t)
+}
+
+// TestPhoneme200_10 runs the cascade with 10 trees per stage.
+// Note: it mutates the global NTree, which stays 10 for any test that runs
+// after this one.
+func TestPhoneme200_10(t *testing.T) {
+	SampleFile = "../../testdata/phoneme/phoneme.dsv"
+	PerfFile = "phoneme_200_10.perf"
+	StatFile = "phoneme_200_10.stat"
+	NTree = 10
+
+	runCRF(t)
+}
+
+// TestMain skips the whole package's tests unless the TEST_CRF environment
+// variable is set, because the cascade tests are slow.
+func TestMain(m *testing.M) {
+	envTestCRF := os.Getenv("TEST_CRF")
+
+	if len(envTestCRF) == 0 {
+		os.Exit(0)
+	}
+
+	os.Exit(m.Run())
+}
diff --git a/lib/mining/classifier/rf/rf.go b/lib/mining/classifier/rf/rf.go
new file mode 100644
index 00000000..86595efd
--- /dev/null
+++ b/lib/mining/classifier/rf/rf.go
@@ -0,0 +1,363 @@
+// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//
+// Package rf implement ensemble of classifiers using random forest
+// algorithm by Breiman and Cutler.
+//
+// Breiman, Leo. "Random forests." Machine learning 45.1 (2001): 5-32.
+//
+// The implementation is based on various sources and using author experience.
+//
+package rf
+
+import (
+ "errors"
+ "fmt"
+ "math"
+
+ "github.com/shuLhan/share/lib/debug"
+ "github.com/shuLhan/share/lib/mining/classifier"
+ "github.com/shuLhan/share/lib/mining/classifier/cart"
+ "github.com/shuLhan/share/lib/numbers"
+ libstrings "github.com/shuLhan/share/lib/strings"
+ "github.com/shuLhan/share/lib/tabula"
+)
+
+const (
+	tag = "[rf]"
+
+	// DefNumTree default number of tree.
+	DefNumTree = 100
+
+	// DefPercentBoot default percentage of sample that will be used for
+	// bootstrapping a tree.
+	DefPercentBoot = 66
+
+	// DefOOBStatsFile default statistic file output.
+	DefOOBStatsFile = "rf.oob.stat"
+
+	// DefPerfFile default performance file output.
+	DefPerfFile = "rf.perf"
+
+	// DefStatFile default statistic file.
+	DefStatFile = "rf.stat"
+)
+
+var (
+	// ErrNoInput is returned by Build when the input samples is nil.
+	ErrNoInput = errors.New("rf: input samples is empty")
+)
+
+//
+// Runtime contains input and output configuration when generating random forest.
+//
+type Runtime struct {
+	// Runtime embed common fields for classifier.
+	classifier.Runtime
+
+	// NTree number of tree in forest.
+	NTree int `json:"NTree"`
+	// NRandomFeature number of feature randomly selected for each tree.
+	NRandomFeature int `json:"NRandomFeature"`
+	// PercentBoot percentage of sample for bootstrapping.
+	PercentBoot int `json:"PercentBoot"`
+
+	// nSubsample number of samples used for bootstrapping, computed in
+	// Initialize from PercentBoot.
+	nSubsample int
+	// trees contain all tree in the forest.
+	trees []cart.Runtime
+	// bagIndices contain list of index of selected samples at bootstraping
+	// for book-keeping; bagIndices[i] belongs to trees[i].
+	bagIndices [][]int
+}
+
+//
+// Trees return all trees in the forest.
+//
+func (forest *Runtime) Trees() []cart.Runtime {
+	return forest.trees
+}
+
+//
+// AddCartTree add tree to forest.
+//
+func (forest *Runtime) AddCartTree(tree cart.Runtime) {
+	forest.trees = append(forest.trees, tree)
+}
+
+//
+// AddBagIndex add bagging index for book keeping.  It must be kept in sync
+// with AddCartTree so that bagIndices[i] matches trees[i].
+//
+func (forest *Runtime) AddBagIndex(bagIndex []int) {
+	forest.bagIndices = append(forest.bagIndices, bagIndex)
+}
+
+//
+// Initialize will check forest inputs and set it to default values if invalid.
+//
+// It will also calculate number of random samples for each tree using,
+//
+//	number-of-sample * percentage-of-bootstrap
+//
+//
+func (forest *Runtime) Initialize(samples tabula.ClasetInterface) error {
+	if forest.NTree <= 0 {
+		forest.NTree = DefNumTree
+	}
+	if forest.PercentBoot <= 0 {
+		forest.PercentBoot = DefPercentBoot
+	}
+	if forest.NRandomFeature <= 0 {
+		// Set default value to square-root of features.
+		ncol := samples.GetNColumn() - 1
+		forest.NRandomFeature = int(math.Sqrt(float64(ncol)))
+	}
+	if forest.OOBStatsFile == "" {
+		forest.OOBStatsFile = DefOOBStatsFile
+	}
+	if forest.PerfFile == "" {
+		forest.PerfFile = DefPerfFile
+	}
+	if forest.StatFile == "" {
+		forest.StatFile = DefStatFile
+	}
+
+	forest.nSubsample = int(float32(samples.GetNRow()) *
+		(float32(forest.PercentBoot) / 100.0))
+
+	return forest.Runtime.Initialize()
+}
+
+//
+// Build the forest using samples dataset.
+//
+// Algorithm,
+//
+// (0) Recheck input value: number of tree, percentage bootstrap, etc; and
+// Open statistic file output.
+// (1) For 0 to NTree,
+// (1.1) Create new tree, repeat until all trees has been build.
+// (2) Compute and write total statistic.
+//
+func (forest *Runtime) Build(samples tabula.ClasetInterface) (e error) {
+	// check input samples
+	if samples == nil {
+		return ErrNoInput
+	}
+
+	// (0)
+	e = forest.Initialize(samples)
+	if e != nil {
+		return
+	}
+
+	fmt.Println(tag, "Training set :", samples)
+	fmt.Println(tag, "Sample (one row):", samples.GetRow(0))
+	fmt.Println(tag, "Forest config :", forest)
+
+	// (1)
+	for t := 0; t < forest.NTree; t++ {
+		if debug.Value >= 1 {
+			fmt.Println(tag, "tree #", t)
+		}
+
+		// (1.1)
+		// NOTE(review): this retries forever; a persistent GrowTree
+		// error (e.g. degenerate samples) would loop endlessly while
+		// printing the error.
+		for {
+			_, _, e = forest.GrowTree(samples)
+			if e == nil {
+				break
+			}
+
+			fmt.Println(tag, "error:", e)
+		}
+	}
+
+	// (2)
+	return forest.Finalize()
+}
+
+//
+// GrowTree build a new tree in forest, return OOB error value or error if tree
+// can not grow.
+//
+// Algorithm,
+//
+// (1) Select random samples with replacement, also with OOB.
+// (2) Build tree using CART, without pruning.
+// (3) Add tree to forest.
+// (4) Save index of random samples for calculating error rate later.
+// (5) Run OOB on forest.
+// (6) Calculate OOB error rate and statistic values.
+//
+func (forest *Runtime) GrowTree(samples tabula.ClasetInterface) (
+	cm *classifier.CM, stat *classifier.Stat, e error,
+) {
+	stat = &classifier.Stat{}
+	stat.ID = int64(len(forest.trees))
+	stat.Start()
+
+	// (1)
+	bag, oob, bagIdx, oobIdx := tabula.RandomPickRows(
+		samples.(tabula.DatasetInterface),
+		forest.nSubsample, true)
+
+	bagset := bag.(tabula.ClasetInterface)
+
+	if debug.Value >= 2 {
+		bagset.RecountMajorMinor()
+		fmt.Println(tag, "Bagging:", bagset)
+	}
+
+	// (2)
+	// Named `tree` (not `cart`) so the local does not shadow the
+	// imported cart package.
+	tree, e := cart.New(bagset, cart.SplitMethodGini,
+		forest.NRandomFeature)
+	if e != nil {
+		return nil, nil, e
+	}
+
+	// (3)
+	forest.AddCartTree(*tree)
+
+	// (4)
+	forest.AddBagIndex(bagIdx)
+
+	// (5)
+	if forest.RunOOB {
+		oobset := oob.(tabula.ClasetInterface)
+		_, cm, _ = forest.ClassifySet(oobset, oobIdx)
+
+		forest.AddOOBCM(cm)
+	}
+
+	stat.End()
+
+	if debug.Value >= 3 && forest.RunOOB {
+		fmt.Println(tag, "Elapsed time (s):", stat.ElapsedTime)
+	}
+
+	forest.AddStat(stat)
+
+	// (6)
+	if forest.RunOOB {
+		forest.ComputeStatFromCM(stat, cm)
+
+		if debug.Value >= 2 {
+			fmt.Println(tag, "OOB stat:", stat)
+		}
+	}
+
+	forest.ComputeStatTotal(stat)
+	e = forest.WriteOOBStat(stat)
+
+	return cm, stat, e
+}
+
+//
+// ClassifySet given a samples predict their class by running each sample in
+// forest, and return their class prediction with confusion matrix.
+// `samples` is the sample that will be predicted, `sampleIds` is the index of
+// samples.
+// If `sampleIds` is not nil, then sample index will be checked in each tree,
+// if the sample is used for training, their vote is not counted.
+//
+// Algorithm,
+//
+// (0) Get value space (possible class values in dataset)
+// (1) For each row in test-set,
+// (1.1) collect votes in all trees,
+// (1.2) select majority class vote, and
+// (1.3) compute and save the actual class probabilities.
+// (2) Compute confusion matrix from predictions.
+// (3) Compute stat from confusion matrix.
+// (4) Write the stat to file only if sampleIds is empty, which mean its run
+// not from OOB set.
+//
+func (forest *Runtime) ClassifySet(samples tabula.ClasetInterface,
+	sampleIds []int,
+) (
+	predicts []string, cm *classifier.CM, probs []float64,
+) {
+	stat := classifier.Stat{}
+	stat.Start()
+
+	if len(sampleIds) <= 0 {
+		fmt.Println(tag, "Classify set:", samples)
+		fmt.Println(tag, "Classify set sample (one row):",
+			samples.GetRow(0))
+	}
+
+	// (0)
+	vs := samples.GetClassValueSpace()
+	actuals := samples.GetClassAsStrings()
+	sampleIdx := -1
+
+	// (1)
+	rows := samples.GetRows()
+	for x, row := range *rows {
+		// (1.1)
+		if len(sampleIds) > 0 {
+			sampleIdx = sampleIds[x]
+		}
+		votes := forest.Votes(row, sampleIdx)
+
+		// (1.2)
+		classProbs := libstrings.FrequencyOfTokens(votes, vs, false)
+
+		_, idx, ok := numbers.Floats64FindMax(classProbs)
+
+		if ok {
+			predicts = append(predicts, vs[idx])
+		}
+
+		// (1.3) Probability of the first class in the value space.
+		probs = append(probs, classProbs[0])
+	}
+
+	// (2)
+	cm = forest.ComputeCM(sampleIds, vs, actuals, predicts)
+
+	// (3)
+	forest.ComputeStatFromCM(&stat, cm)
+	stat.End()
+
+	if len(sampleIds) <= 0 {
+		fmt.Println(tag, "CM:", cm)
+		fmt.Println(tag, "Classifying stat:", stat)
+		_ = stat.Write(forest.StatFile)
+	}
+
+	return predicts, cm, probs
+}
+
+//
+// Votes will return votes, or classes, in each tree based on sample.
+// If `sampleIdx` is non-negative, any tree whose bootstrap bag contains
+// that index is skipped, so a sample never votes on a tree it trained.
+//
+// (1) If row is used to build the tree then skip it,
+// (2) classify row in tree,
+// (3) save tree class value.
+//
+func (forest *Runtime) Votes(sample *tabula.Row, sampleIdx int) (
+	votes []string,
+) {
+	for x, tree := range forest.trees {
+		// (1)
+		if sampleIdx >= 0 {
+			exist := numbers.IntsIsExist(forest.bagIndices[x],
+				sampleIdx)
+			if exist {
+				continue
+			}
+		}
+
+		// (2)
+		class := tree.Classify(sample)
+
+		// (3)
+		votes = append(votes, class)
+	}
+	return votes
+}
diff --git a/lib/mining/classifier/rf/rf_bench_test.go b/lib/mining/classifier/rf/rf_bench_test.go
new file mode 100644
index 00000000..eddd721c
--- /dev/null
+++ b/lib/mining/classifier/rf/rf_bench_test.go
@@ -0,0 +1,22 @@
+// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package rf
+
+import (
+ "testing"
+)
+
+// BenchmarkPhoneme measures building a random forest on the phoneme dataset
+// with 3 random features, reusing the test harness in rf_test.go.
+func BenchmarkPhoneme(b *testing.B) {
+	SampleDsvFile = "../../testdata/phoneme/phoneme.dsv"
+	OOBStatsFile = "phoneme.oob"
+	PerfFile = "phoneme.perf"
+
+	MinFeature = 3
+	MaxFeature = 4
+
+	for x := 0; x < b.N; x++ {
+		runRandomForest()
+	}
+}
diff --git a/lib/mining/classifier/rf/rf_test.go b/lib/mining/classifier/rf/rf_test.go
new file mode 100644
index 00000000..ba96125b
--- /dev/null
+++ b/lib/mining/classifier/rf/rf_test.go
@@ -0,0 +1,190 @@
+// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package rf
+
+import (
+ "fmt"
+ "log"
+ "os"
+ "testing"
+
+ "github.com/shuLhan/share/lib/dsv"
+ "github.com/shuLhan/share/lib/mining/classifier"
+ "github.com/shuLhan/share/lib/tabula"
+)
+
+// Global options to run for each test.  Tests mutate these before calling
+// runRandomForest, so test order matters.
+var (
+	// SampleDsvFile is the file that contain samples config.
+	SampleDsvFile string
+	// DoTest if its true then the dataset will be split into training and
+	// test set with random selection without replacement.
+	DoTest = false
+	// NTree number of tree to generate.
+	NTree = 100
+	// NBootstrap percentage of sample used as subsample.
+	NBootstrap = 66
+	// MinFeature number of feature to begin with.
+	MinFeature = 1
+	// MaxFeature maximum number of feature to test (exclusive).
+	MaxFeature = -1
+	// RunOOB if its true then the OOB samples will be used to test the
+	// model in each iteration.
+	RunOOB = true
+	// OOBStatsFile is the file where OOB statistic will be saved.
+	OOBStatsFile string
+	// PerfFile is the file where performance statistic will be saved.
+	PerfFile string
+	// StatFile is the file where classifying statistic will be saved.
+	StatFile string
+)
+
+// getSamples loads the dataset from SampleDsvFile.  When DoTest is false the
+// whole dataset is returned as `train` and `test` is nil; otherwise the rows
+// are split NBootstrap% / rest into train and test without replacement.
+func getSamples() (train, test tabula.ClasetInterface) {
+	samples := tabula.Claset{}
+	_, e := dsv.SimpleRead(SampleDsvFile, &samples)
+	if nil != e {
+		log.Fatal(e)
+	}
+
+	if !DoTest {
+		return &samples, nil
+	}
+
+	ntrain := int(float32(samples.Len()) * (float32(NBootstrap) / 100.0))
+
+	bag, oob, _, _ := tabula.RandomPickRows(&samples, ntrain, false)
+
+	train = bag.(tabula.ClasetInterface)
+	test = oob.(tabula.ClasetInterface)
+
+	train.SetClassIndex(samples.GetClassIndex())
+	test.SetClassIndex(samples.GetClassIndex())
+
+	return train, test
+}
+
+// runRandomForest builds one forest per feature count in the half-open range
+// [MinFeature, MaxFeature), writing stats into per-feature-count files.
+// Note: it mutates the global MaxFeature when it is negative.
+func runRandomForest() {
+	oobStatsFile := OOBStatsFile
+	perfFile := PerfFile
+	statFile := StatFile
+
+	trainset, testset := getSamples()
+
+	if MaxFeature < 0 {
+		MaxFeature = trainset.GetNColumn()
+	}
+
+	for nfeature := MinFeature; nfeature < MaxFeature; nfeature++ {
+		// Add prefix to OOB stats file.
+		oobStatsFile = fmt.Sprintf("N%d.%s", nfeature, OOBStatsFile)
+
+		// Add prefix to performance file.
+		perfFile = fmt.Sprintf("N%d.%s", nfeature, PerfFile)
+
+		// Add prefix to stat file.
+		statFile = fmt.Sprintf("N%d.%s", nfeature, StatFile)
+
+		// Create and build random forest.
+		forest := Runtime{
+			Runtime: classifier.Runtime{
+				RunOOB: RunOOB,
+				OOBStatsFile: oobStatsFile,
+				PerfFile: perfFile,
+				StatFile: statFile,
+			},
+			NTree: NTree,
+			NRandomFeature: nfeature,
+			PercentBoot: NBootstrap,
+		}
+
+		e := forest.Build(trainset)
+		if e != nil {
+			log.Fatal(e)
+		}
+
+		if DoTest {
+			predicts, _, probs := forest.ClassifySet(testset, nil)
+
+			forest.Performance(testset, predicts, probs)
+			e = forest.WritePerformance()
+			if e != nil {
+				log.Fatal(e)
+			}
+		}
+	}
+}
+
+// TestEnsemblingGlass runs the forest on the forensic glass dataset with a
+// train/test split and without OOB testing.
+func TestEnsemblingGlass(t *testing.T) {
+	SampleDsvFile = "../../testdata/forensic_glass/fgl.dsv"
+	RunOOB = false
+	OOBStatsFile = "glass.oob"
+	StatFile = "glass.stat"
+	PerfFile = "glass.perf"
+	DoTest = true
+
+	runRandomForest()
+}
+
+// TestEnsemblingIris runs the forest on the iris dataset with whatever
+// globals the previous test left behind.
+func TestEnsemblingIris(t *testing.T) {
+	SampleDsvFile = "../../testdata/iris/iris.dsv"
+	OOBStatsFile = "iris.oob"
+
+	runRandomForest()
+}
+
+// TestEnsemblingPhoneme runs 200 trees on the phoneme dataset with a
+// train/test split, 3 random features, and OOB disabled.
+func TestEnsemblingPhoneme(t *testing.T) {
+	SampleDsvFile = "../../testdata/phoneme/phoneme.dsv"
+	OOBStatsFile = "phoneme.oob.stat"
+	StatFile = "phoneme.stat"
+	PerfFile = "phoneme.perf"
+
+	NTree = 200
+	MinFeature = 3
+	MaxFeature = 4
+	RunOOB = false
+	DoTest = true
+
+	runRandomForest()
+}
+
+// TestEnsemblingSmotePhoneme runs the forest on the SMOTE-resampled phoneme
+// dataset with 3 random features.
+func TestEnsemblingSmotePhoneme(t *testing.T) {
+	SampleDsvFile = "../../resampling/smote/phoneme_smote.dsv"
+	OOBStatsFile = "phonemesmote.oob"
+
+	MinFeature = 3
+	MaxFeature = 4
+
+	runRandomForest()
+}
+
+// TestEnsemblingLnsmotePhoneme runs the forest on the LN-SMOTE-resampled
+// phoneme dataset with 3 random features.
+func TestEnsemblingLnsmotePhoneme(t *testing.T) {
+	SampleDsvFile = "../../resampling/lnsmote/phoneme_lnsmote.dsv"
+	OOBStatsFile = "phonemelnsmote.oob"
+
+	MinFeature = 3
+	MaxFeature = 4
+
+	runRandomForest()
+}
+
+// TestWvc2010Lnsmote runs a single-tree forest on the wvc2010 LN-SMOTE
+// dataset with 5 random features.
+func TestWvc2010Lnsmote(t *testing.T) {
+	SampleDsvFile = "../../testdata/wvc2010lnsmote/wvc2010_features.lnsmote.dsv"
+	OOBStatsFile = "wvc2010lnsmote.oob"
+
+	NTree = 1
+	MinFeature = 5
+	MaxFeature = 6
+
+	runRandomForest()
+}
+
+// TestMain skips the whole package's tests unless the TEST_RF environment
+// variable is set, because the forest tests are slow.
+func TestMain(m *testing.M) {
+	envTestRF := os.Getenv("TEST_RF")
+	if len(envTestRF) == 0 {
+		os.Exit(0)
+	}
+
+	os.Exit(m.Run())
+}
diff --git a/lib/mining/classifier/runtime.go b/lib/mining/classifier/runtime.go
new file mode 100644
index 00000000..0f7a7755
--- /dev/null
+++ b/lib/mining/classifier/runtime.go
@@ -0,0 +1,443 @@
+// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package classifier
+
+import (
+ "fmt"
+ "math"
+
+ "github.com/shuLhan/share/lib/debug"
+ "github.com/shuLhan/share/lib/dsv"
+ "github.com/shuLhan/share/lib/numbers"
+ libstrings "github.com/shuLhan/share/lib/strings"
+ "github.com/shuLhan/share/lib/tabula"
+)
+
+const (
+ tag = "[classifier.runtime]"
+)
+
+//
+// Runtime define a generic type which provide common fields that can be
+// embedded by the real classifier (e.g. RandomForest).
+//
+type Runtime struct {
+ // RunOOB if its true the OOB will be computed, default is false.
+ RunOOB bool `json:"RunOOB"`
+
+ // OOBStatsFile is the file where OOB statistic will be written.
+ OOBStatsFile string `json:"OOBStatsFile"`
+
+ // PerfFile is the file where statistic of performance will be written.
+ PerfFile string `json:"PerfFile"`
+
+ // StatFile is the file where statistic of classifying samples will be
+ // written.
+ StatFile string `json:"StatFile"`
+
+ // oobCms contain confusion matrix value for each OOB in iteration.
+ oobCms []CM
+
+ // oobStats contain statistic of classifier for each OOB in iteration.
+ oobStats Stats
+
+ // oobStatTotal contain total OOB statistic values.
+ oobStatTotal Stat
+
+ // oobWriter contain file writer for statistic.
+ oobWriter *dsv.Writer
+
+ // perfs contain performance statistic per sample, after classifying
+ // sample on classifier.
+ perfs Stats
+}
+
+//
+// Initialize will start the runtime for processing by saving start time and
+// opening stats file.
+//
+func (rt *Runtime) Initialize() error {
+ rt.oobStatTotal.Start()
+
+ return rt.OpenOOBStatsFile()
+}
+
+//
+// Finalize finish the runtime, compute total statistic, write it to file, and
+// close the file.
+//
+func (rt *Runtime) Finalize() (e error) {
+ st := &rt.oobStatTotal
+
+ st.End()
+ st.ID = int64(len(rt.oobStats))
+
+ e = rt.WriteOOBStat(st)
+ if e != nil {
+ return e
+ }
+
+ return rt.CloseOOBStatsFile()
+}
+
+//
+// OOBStats return all statistic objects.
+//
+func (rt *Runtime) OOBStats() *Stats {
+ return &rt.oobStats
+}
+
+//
+// StatTotal return total statistic.
+//
+func (rt *Runtime) StatTotal() *Stat {
+ return &rt.oobStatTotal
+}
+
+//
+// AddOOBCM will append new confusion matrix.
+//
+func (rt *Runtime) AddOOBCM(cm *CM) {
+ rt.oobCms = append(rt.oobCms, *cm)
+}
+
+//
+// AddStat will append new classifier statistic data.
+//
+func (rt *Runtime) AddStat(stat *Stat) {
+ rt.oobStats = append(rt.oobStats, stat)
+}
+
+//
+// ComputeCM will compute confusion matrix of sample using value space, actual
+// and prediction values.
+//
+func (rt *Runtime) ComputeCM(sampleIds []int,
+ vs, actuals, predicts []string,
+) (
+ cm *CM,
+) {
+ cm = &CM{}
+
+ cm.ComputeStrings(vs, actuals, predicts)
+ cm.GroupIndexPredictionsStrings(sampleIds, actuals, predicts)
+
+ if debug.Value >= 2 {
+ fmt.Println(tag, cm)
+ }
+
+ return cm
+}
+
+//
+// ComputeStatFromCM will compute statistic using confusion matrix.
+//
+func (rt *Runtime) ComputeStatFromCM(stat *Stat, cm *CM) {
+
+ stat.OobError = cm.GetFalseRate()
+
+ stat.OobErrorMean = rt.oobStatTotal.OobError /
+ float64(len(rt.oobStats)+1)
+
+ stat.TP = int64(cm.TP())
+ stat.FP = int64(cm.FP())
+ stat.TN = int64(cm.TN())
+ stat.FN = int64(cm.FN())
+
+ t := float64(stat.TP + stat.FN)
+ if t == 0 {
+ stat.TPRate = 0
+ } else {
+ stat.TPRate = float64(stat.TP) / t
+ }
+
+ t = float64(stat.FP + stat.TN)
+ if t == 0 {
+ stat.FPRate = 0
+ } else {
+ stat.FPRate = float64(stat.FP) / t
+ }
+
+ t = float64(stat.FP + stat.TN)
+ if t == 0 {
+ stat.TNRate = 0
+ } else {
+ stat.TNRate = float64(stat.TN) / t
+ }
+
+ t = float64(stat.TP + stat.FP)
+ if t == 0 {
+ stat.Precision = 0
+ } else {
+ stat.Precision = float64(stat.TP) / t
+ }
+
+ t = (1 / stat.Precision) + (1 / stat.TPRate)
+ if t == 0 {
+ stat.FMeasure = 0
+ } else {
+ stat.FMeasure = 2 / t
+ }
+
+ t = float64(stat.TP + stat.TN + stat.FP + stat.FN)
+ if t == 0 {
+ stat.Accuracy = 0
+ } else {
+ stat.Accuracy = float64(stat.TP+stat.TN) / t
+ }
+
+ if debug.Value >= 1 {
+ rt.PrintOobStat(stat, cm)
+ rt.PrintStat(stat)
+ }
+}
+
+//
+// ComputeStatTotal compute total statistic.
+//
+func (rt *Runtime) ComputeStatTotal(stat *Stat) {
+ if stat == nil {
+ return
+ }
+
+ nstat := len(rt.oobStats)
+ if nstat == 0 {
+ return
+ }
+
+ t := &rt.oobStatTotal
+
+ t.OobError += stat.OobError
+ t.OobErrorMean = t.OobError / float64(nstat)
+ t.TP += stat.TP
+ t.FP += stat.FP
+ t.TN += stat.TN
+ t.FN += stat.FN
+
+ total := float64(t.TP + t.FN)
+ if total == 0 {
+ t.TPRate = 0
+ } else {
+ t.TPRate = float64(t.TP) / total
+ }
+
+ total = float64(t.FP + t.TN)
+ if total == 0 {
+ t.FPRate = 0
+ } else {
+ t.FPRate = float64(t.FP) / total
+ }
+
+ total = float64(t.FP + t.TN)
+ if total == 0 {
+ t.TNRate = 0
+ } else {
+ t.TNRate = float64(t.TN) / total
+ }
+
+ total = float64(t.TP + t.FP)
+ if total == 0 {
+ t.Precision = 0
+ } else {
+ t.Precision = float64(t.TP) / total
+ }
+
+ total = (1 / t.Precision) + (1 / t.TPRate)
+ if total == 0 {
+ t.FMeasure = 0
+ } else {
+ t.FMeasure = 2 / total
+ }
+
+ total = float64(t.TP + t.TN + t.FP + t.FN)
+ if total == 0 {
+ t.Accuracy = 0
+ } else {
+ t.Accuracy = float64(t.TP+t.TN) / total
+ }
+}
+
+//
+// OpenOOBStatsFile will open statistic file for output.
+//
+func (rt *Runtime) OpenOOBStatsFile() error {
+ if rt.oobWriter != nil {
+ _ = rt.CloseOOBStatsFile()
+ }
+ rt.oobWriter = &dsv.Writer{}
+ return rt.oobWriter.OpenOutput(rt.OOBStatsFile)
+}
+
+//
+// WriteOOBStat will write statistic of process to file.
+//
+func (rt *Runtime) WriteOOBStat(stat *Stat) error {
+ if rt.oobWriter == nil {
+ return nil
+ }
+ if stat == nil {
+ return nil
+ }
+ return rt.oobWriter.WriteRawRow(stat.ToRow(), nil, nil)
+}
+
+//
+// CloseOOBStatsFile will close statistics file for writing.
+//
+func (rt *Runtime) CloseOOBStatsFile() (e error) {
+ if rt.oobWriter == nil {
+ return
+ }
+
+ e = rt.oobWriter.Close()
+ rt.oobWriter = nil
+
+ return
+}
+
+//
+// PrintOobStat will print the out-of-bag statistic to standard output.
+//
+func (rt *Runtime) PrintOobStat(stat *Stat, cm *CM) {
+ fmt.Printf("%s OOB error rate: %.4f,"+
+ " total: %.4f, mean %.4f, true rate: %.4f\n", tag,
+ stat.OobError, rt.oobStatTotal.OobError,
+ stat.OobErrorMean, cm.GetTrueRate())
+}
+
+//
+// PrintStat will print statistic value to standard output.
+//
+func (rt *Runtime) PrintStat(stat *Stat) {
+ if stat == nil {
+ statslen := len(rt.oobStats)
+ if statslen <= 0 {
+ return
+ }
+ stat = rt.oobStats[statslen-1]
+ }
+
+ fmt.Printf("%s TPRate: %.4f, FPRate: %.4f,"+
+ " TNRate: %.4f, precision: %.4f, f-measure: %.4f,"+
+ " accuracy: %.4f\n", tag, stat.TPRate, stat.FPRate,
+ stat.TNRate, stat.Precision, stat.FMeasure, stat.Accuracy)
+}
+
+//
+// PrintStatTotal will print total statistic to standard output.
+//
+func (rt *Runtime) PrintStatTotal(st *Stat) {
+ if st == nil {
+ st = &rt.oobStatTotal
+ }
+ rt.PrintStat(st)
+}
+
+//
+// Performance given an actuals class label and their probabilities, compute
+// the performance statistic of classifier.
+//
+// Algorithm,
+// (1) Sort the probabilities in descending order.
+// (2) Sort the actuals and predicts using sorted index from probs
+// (3) Compute tpr, fpr, precision
+// (4) Write performance to file.
+//
+func (rt *Runtime) Performance(samples tabula.ClasetInterface,
+ predicts []string, probs []float64,
+) (
+ perfs Stats,
+) {
+ // (1)
+ actuals := samples.GetClassAsStrings()
+ sortedIds := numbers.IntCreateSeq(0, len(probs)-1)
+ numbers.Floats64InplaceMergesort(probs, sortedIds, 0, len(probs),
+ false)
+
+ // (2)
+ libstrings.SortByIndex(&actuals, sortedIds)
+ libstrings.SortByIndex(&predicts, sortedIds)
+
+ // (3)
+ rt.computePerfByProbs(samples, actuals, probs)
+
+ return rt.perfs
+}
+
+func trapezoidArea(fp, fpprev, tp, tpprev int64) float64 {
+ base := math.Abs(float64(fp - fpprev))
+ heightAvg := float64(tp+tpprev) / float64(2.0)
+ return base * heightAvg
+}
+
+//
+// computePerfByProbs will compute classifier performance using probabilities
+// or score `probs`.
+//
+// This currently only work for two class problem.
+//
+func (rt *Runtime) computePerfByProbs(samples tabula.ClasetInterface,
+ actuals []string, probs []float64,
+) {
+ vs := samples.GetClassValueSpace()
+ nactuals := numbers.IntsTo64(samples.Counts())
+ nclass := libstrings.CountTokens(actuals, vs, false)
+
+ pprev := math.Inf(-1)
+ tp := int64(0)
+ fp := int64(0)
+ tpprev := int64(0)
+ fpprev := int64(0)
+
+ auc := float64(0)
+
+ for x, p := range probs {
+ if p != pprev {
+ stat := Stat{}
+ stat.SetTPRate(tp, nactuals[0])
+ stat.SetFPRate(fp, nactuals[1])
+ stat.SetPrecisionFromRate(nactuals[0], nactuals[1])
+
+ auc = auc + trapezoidArea(fp, fpprev, tp, tpprev)
+ stat.SetAUC(auc)
+
+ rt.perfs = append(rt.perfs, &stat)
+
+ pprev = p
+ tpprev = tp
+ fpprev = fp
+ }
+
+ if actuals[x] == vs[0] {
+ tp++
+ } else {
+ fp++
+ }
+ }
+
+ stat := Stat{}
+ stat.SetTPRate(tp, nactuals[0])
+ stat.SetFPRate(fp, nactuals[1])
+ stat.SetPrecisionFromRate(nactuals[0], nactuals[1])
+
+ auc = auc + trapezoidArea(fp, fpprev, tp, tpprev)
+ auc = auc / float64(nclass[0]*nclass[1])
+ stat.SetAUC(auc)
+
+ rt.perfs = append(rt.perfs, &stat)
+
+ if len(rt.perfs) >= 2 {
+ // Replace the first stat with second stat, because of NaN
+ // value on the first precision.
+ rt.perfs[0] = rt.perfs[1]
+ }
+}
+
+//
+// WritePerformance will write performance data to file.
+//
+func (rt *Runtime) WritePerformance() error {
+ return rt.perfs.Write(rt.PerfFile)
+}
diff --git a/lib/mining/classifier/stat.go b/lib/mining/classifier/stat.go
new file mode 100644
index 00000000..790aec10
--- /dev/null
+++ b/lib/mining/classifier/stat.go
@@ -0,0 +1,176 @@
+// Copyright 2015-2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package classifier
+
+import (
+ "time"
+
+ "github.com/shuLhan/share/lib/dsv"
+ "github.com/shuLhan/share/lib/tabula"
+)
+
+//
+// Stat hold statistic value of classifier, including TP rate, FP rate, precision,
+// and recall.
+//
+type Stat struct {
+ // ID unique id for this statistic (e.g. number of tree).
+ ID int64
+ // StartTime contain the start time of classifier in unix timestamp.
+ StartTime int64
+ // EndTime contain the end time of classifier in unix timestamp.
+ EndTime int64
+ // ElapsedTime contain actual time, in seconds, between end and start
+ // time.
+ ElapsedTime int64
+ // TP contain true-positive value.
+ TP int64
+ // FP contain false-positive value.
+ FP int64
+ // TN contain true-negative value.
+ TN int64
+ // FN contain false-negative value.
+ FN int64
+ // OobError contain out-of-bag error.
+ OobError float64
+ // OobErrorMean contain mean of out-of-bag error.
+ OobErrorMean float64
+ // TPRate contain true-positive rate (recall): tp/(tp+fn)
+ TPRate float64
+ // FPRate contain false-positive rate: fp/(fp+tn)
+ FPRate float64
+ // TNRate contain true-negative rate: tn/(tn+fp)
+ TNRate float64
+ // Precision contain: tp/(tp+fp)
+ Precision float64
+ // FMeasure contain value of F-measure or the harmonic mean of
+ // precision and recall.
+ FMeasure float64
+ // Accuracy contain the degree of closeness of measurements of a
+ // quantity to that quantity's true value.
+ Accuracy float64
+ // AUC contain the area under curve.
+ AUC float64
+}
+
+// SetAUC will set the AUC value.
+func (stat *Stat) SetAUC(v float64) {
+ stat.AUC = v
+}
+
+//
+// SetTPRate will set TP and TPRate using number of positive `p`.
+//
+func (stat *Stat) SetTPRate(tp, p int64) {
+ stat.TP = tp
+ stat.TPRate = float64(tp) / float64(p)
+}
+
+//
+// SetFPRate will set FP and FPRate using number of negative `n`.
+//
+func (stat *Stat) SetFPRate(fp, n int64) {
+ stat.FP = fp
+ stat.FPRate = float64(fp) / float64(n)
+}
+
+//
+// SetPrecisionFromRate will set Precision value using tprate and fprate.
+// `p` and `n` is the number of positive and negative class in samples.
+//
+func (stat *Stat) SetPrecisionFromRate(p, n int64) {
+ stat.Precision = (stat.TPRate * float64(p)) /
+ ((stat.TPRate * float64(p)) + (stat.FPRate * float64(n)))
+}
+
+//
+// Recall return value of recall.
+//
+func (stat *Stat) Recall() float64 {
+ return stat.TPRate
+}
+
+//
+// Sum will add statistic from other stat object to current stat, not including
+// the start and end time.
+//
+func (stat *Stat) Sum(other *Stat) {
+ stat.OobError += other.OobError
+ stat.OobErrorMean += other.OobErrorMean
+ stat.TP += other.TP
+ stat.FP += other.FP
+ stat.TN += other.TN
+ stat.FN += other.FN
+ stat.TPRate += other.TPRate
+ stat.FPRate += other.FPRate
+ stat.TNRate += other.TNRate
+ stat.Precision += other.Precision
+ stat.FMeasure += other.FMeasure
+ stat.Accuracy += other.Accuracy
+}
+
+//
+// ToRow will convert the stat to a tabula.Row in the order of Stat fields.
+//
+func (stat *Stat) ToRow() (row *tabula.Row) {
+ row = &tabula.Row{}
+
+ row.PushBack(tabula.NewRecordInt(stat.ID))
+ row.PushBack(tabula.NewRecordInt(stat.StartTime))
+ row.PushBack(tabula.NewRecordInt(stat.EndTime))
+ row.PushBack(tabula.NewRecordInt(stat.ElapsedTime))
+ row.PushBack(tabula.NewRecordReal(stat.OobError))
+ row.PushBack(tabula.NewRecordReal(stat.OobErrorMean))
+ row.PushBack(tabula.NewRecordInt(stat.TP))
+ row.PushBack(tabula.NewRecordInt(stat.FP))
+ row.PushBack(tabula.NewRecordInt(stat.TN))
+ row.PushBack(tabula.NewRecordInt(stat.FN))
+ row.PushBack(tabula.NewRecordReal(stat.TPRate))
+ row.PushBack(tabula.NewRecordReal(stat.FPRate))
+ row.PushBack(tabula.NewRecordReal(stat.TNRate))
+ row.PushBack(tabula.NewRecordReal(stat.Precision))
+ row.PushBack(tabula.NewRecordReal(stat.FMeasure))
+ row.PushBack(tabula.NewRecordReal(stat.Accuracy))
+ row.PushBack(tabula.NewRecordReal(stat.AUC))
+
+ return
+}
+
+//
+// Start will start the timer.
+//
+func (stat *Stat) Start() {
+ stat.StartTime = time.Now().Unix()
+}
+
+//
+// End will stop the timer and compute the elapsed time.
+//
+func (stat *Stat) End() {
+ stat.EndTime = time.Now().Unix()
+ stat.ElapsedTime = stat.EndTime - stat.StartTime
+}
+
+//
+// Write will write the content of stat to `file`.
+//
+func (stat *Stat) Write(file string) (e error) {
+ if file == "" {
+ return
+ }
+
+ writer := &dsv.Writer{}
+ e = writer.OpenOutput(file)
+ if e != nil {
+ return e
+ }
+
+ e = writer.WriteRawRow(stat.ToRow(), nil, nil)
+ if e != nil {
+ return e
+ }
+
+ return writer.Close()
+}
diff --git a/lib/mining/classifier/stats.go b/lib/mining/classifier/stats.go
new file mode 100644
index 00000000..b29d3510
--- /dev/null
+++ b/lib/mining/classifier/stats.go
@@ -0,0 +1,143 @@
+// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package classifier
+
+import (
+ "github.com/shuLhan/share/lib/dsv"
+)
+
+//
+// Stats define list of statistic values.
+//
+type Stats []*Stat
+
+//
+// Add will add other stat object to the slice.
+//
+func (stats *Stats) Add(stat *Stat) {
+ *stats = append(*stats, stat)
+}
+
+//
+// StartTimes return all start times in unix timestamp.
+//
+func (stats *Stats) StartTimes() (times []int64) {
+ for _, stat := range *stats {
+ times = append(times, stat.StartTime)
+ }
+ return
+}
+
+//
+// EndTimes return all end times in unix timestamp.
+//
+func (stats *Stats) EndTimes() (times []int64) {
+ for _, stat := range *stats {
+ times = append(times, stat.EndTime)
+ }
+ return
+}
+
+//
+// OobErrorMeans return all out-of-bag error mean values.
+//
+func (stats *Stats) OobErrorMeans() (oobmeans []float64) {
+ oobmeans = make([]float64, len(*stats))
+ for x, stat := range *stats {
+ oobmeans[x] = stat.OobErrorMean
+ }
+ return
+}
+
+//
+// TPRates return all true-positive rate values.
+//
+func (stats *Stats) TPRates() (tprates []float64) {
+ for _, stat := range *stats {
+ tprates = append(tprates, stat.TPRate)
+ }
+ return
+}
+
+//
+// FPRates return all false-positive rate values.
+//
+func (stats *Stats) FPRates() (fprates []float64) {
+ for _, stat := range *stats {
+ fprates = append(fprates, stat.FPRate)
+ }
+ return
+}
+
+//
+// TNRates will return all true-negative rate values.
+//
+func (stats *Stats) TNRates() (tnrates []float64) {
+ for _, stat := range *stats {
+ tnrates = append(tnrates, stat.TNRate)
+ }
+ return
+}
+
+//
+// Precisions return all precision values.
+//
+func (stats *Stats) Precisions() (precs []float64) {
+ for _, stat := range *stats {
+ precs = append(precs, stat.Precision)
+ }
+ return
+}
+
+//
+// Recalls return all recall values.
+//
+func (stats *Stats) Recalls() (recalls []float64) {
+ return stats.TPRates()
+}
+
+//
+// FMeasures return all F-measure values.
+//
+func (stats *Stats) FMeasures() (fmeasures []float64) {
+ for _, stat := range *stats {
+ fmeasures = append(fmeasures, stat.FMeasure)
+ }
+ return
+}
+
+//
+// Accuracies return all accuracy values.
+//
+func (stats *Stats) Accuracies() (accuracies []float64) {
+ for _, stat := range *stats {
+ accuracies = append(accuracies, stat.Accuracy)
+ }
+ return
+}
+
+//
+// Write will write all statistic data to `file`.
+//
+func (stats *Stats) Write(file string) (e error) {
+ if file == "" {
+ return
+ }
+
+ writer := &dsv.Writer{}
+ e = writer.OpenOutput(file)
+ if e != nil {
+ return e
+ }
+
+ for _, st := range *stats {
+ e = writer.WriteRawRow(st.ToRow(), nil, nil)
+ if e != nil {
+ return e
+ }
+ }
+
+ return writer.Close()
+}
diff --git a/lib/mining/classifier/stats_interface.go b/lib/mining/classifier/stats_interface.go
new file mode 100644
index 00000000..a9b60b9a
--- /dev/null
+++ b/lib/mining/classifier/stats_interface.go
@@ -0,0 +1,68 @@
+// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package classifier
+
+//
+// ComputeFMeasures given array of precisions and recalls, compute F-measure
+// of each instance and return it.
+//
+func ComputeFMeasures(precisions, recalls []float64) (fmeasures []float64) {
+ // Get the minimum length of precision and recall.
+ // This is to make sure that we are not looping out of range.
+ minlen := len(precisions)
+ recallslen := len(recalls)
+ if recallslen < minlen {
+ minlen = recallslen
+ }
+
+ for x := 0; x < minlen; x++ {
+ f := 2 / ((1 / precisions[x]) + (1 / recalls[x]))
+ fmeasures = append(fmeasures, f)
+ }
+ return
+}
+
+//
+// ComputeAccuracies will compute and return accuracy from array of
+// true-positive, false-positive, true-negative, and false-negative; using
+// formula,
+//
+//	(tp + tn) / (tp + fp + tn + fn)
+//
+func ComputeAccuracies(tp, fp, tn, fn []int64) (accuracies []float64) {
+	// Get minimum length of input. NOTE(review): len(tn) is never
+	// considered below; the comparison should use minlen, not len(tn).
+ minlen := len(tp)
+ if len(fp) < len(tn) {
+ minlen = len(fp)
+ }
+ if len(fn) < minlen {
+ minlen = len(fn)
+ }
+
+ for x := 0; x < minlen; x++ {
+ acc := float64(tp[x]+tn[x]) /
+ float64(tp[x]+fp[x]+tn[x]+fn[x])
+ accuracies = append(accuracies, acc)
+ }
+ return
+}
+
+//
+// ComputeElapsedTimes will compute and return elapsed time between `start`
+// and `end` timestamps.
+//
+func ComputeElapsedTimes(start, end []int64) (elaps []int64) {
+ // Get minimum length.
+ minlen := len(start)
+ if len(end) < minlen {
+ minlen = len(end)
+ }
+
+ for x := 0; x < minlen; x++ {
+ elaps = append(elaps, end[x]-start[x])
+ }
+ return
+}
diff --git a/lib/mining/cmd/cart/iris.dsv b/lib/mining/cmd/cart/iris.dsv
new file mode 100644
index 00000000..82a52d04
--- /dev/null
+++ b/lib/mining/cmd/cart/iris.dsv
@@ -0,0 +1,35 @@
+{
+ "Input" :"../../testdata/iris/iris.dat"
+, "Rejected" :"iris.rej"
+, "MaxRows" :-1
+, "ClassIndex" :4
+, "DatasetMode" :"matrix"
+, "SplitMethod" :"gini"
+, "InputMetadata" :
+ [{
+ "Name" :"sepal-length"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"sepal-width"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"petal-length"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"petal-width"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"class"
+ , "Type" :"string"
+ , "ValueSpace" :
+ [
+ "Iris-setosa"
+ , "Iris-versicolor"
+ , "Iris-virginica"
+ ]
+ }]
+}
diff --git a/lib/mining/cmd/cart/main.go b/lib/mining/cmd/cart/main.go
new file mode 100644
index 00000000..08510203
--- /dev/null
+++ b/lib/mining/cmd/cart/main.go
@@ -0,0 +1,107 @@
+// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "encoding/json"
+ "flag"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "time"
+
+ "github.com/shuLhan/share/lib/debug"
+ "github.com/shuLhan/share/lib/dsv"
+ "github.com/shuLhan/share/lib/mining/classifier/cart"
+ "github.com/shuLhan/share/lib/tabula"
+)
+
+var (
+ nRandomFeature = 0
+)
+
+var usage = func() {
+ cmd := os.Args[0]
+ fmt.Fprintf(os.Stderr, "Usage of %s: [-n number] [config.dsv]\n", cmd)
+ flag.PrintDefaults()
+}
+
+func init() {
+ flagUsage := []string{
+ "Number of random feature (default 0)",
+ }
+
+ flag.IntVar(&nRandomFeature, "n", 0, flagUsage[0])
+}
+
+func trace(s string) (string, time.Time) {
+ fmt.Println("[START]", s)
+ return s, time.Now()
+}
+
+func un(s string, startTime time.Time) {
+ endTime := time.Now()
+ fmt.Println("[END]", s, "with elapsed time",
+ endTime.Sub(startTime))
+}
+
+func createCart(fcfg string) (*cart.Runtime, error) {
+ cartrt := &cart.Runtime{}
+
+ config, e := ioutil.ReadFile(fcfg)
+ if e != nil {
+ return nil, e
+ }
+
+ e = json.Unmarshal(config, cartrt)
+ if e != nil {
+ return nil, e
+ }
+
+ if nRandomFeature > 0 {
+ cartrt.NRandomFeature = nRandomFeature
+ }
+
+ return cartrt, nil
+}
+
+func main() {
+ defer un(trace("cart"))
+
+ flag.Parse()
+
+ if len(flag.Args()) <= 0 {
+ usage()
+ os.Exit(1)
+ }
+
+ fcfg := flag.Arg(0)
+
+ // Parsing config file and check command parameter values.
+ cartrt, e := createCart(fcfg)
+ if e != nil {
+ panic(e)
+ }
+
+ // Get dataset
+ dataset := tabula.Claset{}
+ _, e = dsv.SimpleRead(fcfg, &dataset)
+ if e != nil {
+ panic(e)
+ }
+
+ if debug.Value >= 1 {
+ fmt.Printf("[cart] Class index: %v\n", dataset.GetClassIndex())
+ }
+
+ e = cartrt.Build(&dataset)
+ if e != nil {
+ panic(e)
+ }
+
+ if debug.Value >= 1 {
+ fmt.Println("[cart] CART tree:\n", cartrt)
+ }
+}
diff --git a/lib/mining/cmd/crf/main.go b/lib/mining/cmd/crf/main.go
new file mode 100644
index 00000000..2bdd698a
--- /dev/null
+++ b/lib/mining/cmd/crf/main.go
@@ -0,0 +1,205 @@
+// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "encoding/json"
+ "flag"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "time"
+
+ "github.com/shuLhan/share/lib/dsv"
+ "github.com/shuLhan/share/lib/mining/classifier/crf"
+ "github.com/shuLhan/share/lib/tabula"
+)
+
+const (
+ tag = "[crf]"
+)
+
+var (
+ // nStage number of stage.
+ nStage = 0
+ // nTree number of tree.
+ nTree = 0
+ // nRandomFeature number of feature to compute.
+ nRandomFeature = 0
+ // percentBoot percentage of sample for bootstraping.
+ percentBoot = 0
+ // oobStatsFile where statistic will be written.
+ oobStatsFile = ""
+ // perfFile where performance of classifier will be written.
+ perfFile = ""
+ // trainCfg point to the configuration file for training or creating
+ // a model
+ trainCfg = ""
+ // testCfg point to the configuration file for testing
+ testCfg = ""
+
+ // crforest the main object.
+ crforest crf.Runtime
+)
+
+var usage = func() {
+ flag.PrintDefaults()
+}
+
+func init() {
+ flagUsage := []string{
+ "Number of stage (default 200)",
+ "Number of tree in each stage (default 1)",
+ "Number of feature to compute (default 0, all features)",
+ "Percentage of bootstrap (default 64%)",
+ "OOB statistic file, where OOB data will be written",
+ "Performance file, where statistic of classifying data set will be written",
+ "Training configuration",
+ "Test configuration",
+ }
+
+ flag.IntVar(&nStage, "nstage", -1, flagUsage[0])
+ flag.IntVar(&nTree, "ntree", -1, flagUsage[1])
+ flag.IntVar(&nRandomFeature, "nrandomfeature", -1, flagUsage[2])
+ flag.IntVar(&percentBoot, "percentboot", -1, flagUsage[3])
+
+ flag.StringVar(&oobStatsFile, "oobstatsfile", "", flagUsage[4])
+ flag.StringVar(&perfFile, "perffile", "", flagUsage[5])
+
+ flag.StringVar(&trainCfg, "train", "", flagUsage[6])
+ flag.StringVar(&testCfg, "test", "", flagUsage[7])
+}
+
+func trace() (start time.Time) {
+ start = time.Now()
+ fmt.Println(tag, "start", start)
+ return
+}
+
+func un(startTime time.Time) {
+ endTime := time.Now()
+ fmt.Println(tag, "elapsed time", endTime.Sub(startTime))
+}
+
+//
+// createCRF will create cascaded random forest for training, with the
+// following steps,
+// (1) load training configuration.
+// (2) Overwrite configuration parameter if its set from command line.
+//
+func createCRF() error {
+ // (1)
+ config, e := ioutil.ReadFile(trainCfg)
+ if e != nil {
+ return e
+ }
+
+ crforest = crf.Runtime{}
+
+ e = json.Unmarshal(config, &crforest)
+ if e != nil {
+ return e
+ }
+
+ // (2)
+ if nStage > 0 {
+ crforest.NStage = nStage
+ }
+ if nTree > 0 {
+ crforest.NTree = nTree
+ }
+ if nRandomFeature > 0 {
+ crforest.NRandomFeature = nRandomFeature
+ }
+ if percentBoot > 0 {
+ crforest.PercentBoot = percentBoot
+ }
+ if oobStatsFile != "" {
+ crforest.OOBStatsFile = oobStatsFile
+ }
+ if perfFile != "" {
+ crforest.PerfFile = perfFile
+ }
+
+ crforest.RunOOB = true
+
+ return nil
+}
+
+func train() {
+ e := createCRF()
+ if e != nil {
+ panic(e)
+ }
+
+ trainset := tabula.Claset{}
+
+ _, e = dsv.SimpleRead(trainCfg, &trainset)
+ if e != nil {
+ panic(e)
+ }
+
+ e = crforest.Build(&trainset)
+ if e != nil {
+ panic(e)
+ }
+}
+
+func test() {
+ testset := tabula.Claset{}
+ _, e := dsv.SimpleRead(testCfg, &testset)
+ if e != nil {
+ panic(e)
+ }
+
+ fmt.Println(tag, "Test set:", &testset)
+ fmt.Println(tag, "Sample test set:", testset.GetRow(0))
+
+ predicts, cm, probs := crforest.ClassifySetByWeight(&testset, nil)
+
+ fmt.Println("[crf] Test set CM:", cm)
+
+ crforest.Performance(&testset, predicts, probs)
+
+ e = crforest.WritePerformance()
+ if e != nil {
+ panic(e)
+ }
+}
+
+//
+// (0) Parse and check command line parameters.
+// (1) If trainCfg parameter is set,
+// (1.1) train the model,
+// (1.2) TODO: load saved model.
+// (2) If testCfg parameter is set,
+// (2.1) Test the model using data from testCfg.
+//
+func main() {
+ defer un(trace())
+
+ // (0)
+ flag.Parse()
+
+ fmt.Println(tag, "Training config:", trainCfg)
+ fmt.Println(tag, "Test config:", testCfg)
+
+ // (1)
+ if trainCfg != "" {
+ // (1.1)
+ train()
+ } else {
+ // (1.2)
+ if len(flag.Args()) <= 0 {
+ usage()
+ os.Exit(1)
+ }
+ }
+
+ // (2)
+ if testCfg != "" {
+ test()
+ }
+}
diff --git a/lib/mining/cmd/crf/phoneme.dsv b/lib/mining/cmd/crf/phoneme.dsv
new file mode 100644
index 00000000..1219989f
--- /dev/null
+++ b/lib/mining/cmd/crf/phoneme.dsv
@@ -0,0 +1,45 @@
+{
+ "Input" :"../../testdata/phoneme/phoneme.dat"
+, "Rejected" :"phoneme.rej"
+, "MaxRows" :-1
+, "DatasetMode" :"matrix"
+, "ClassIndex" :5
+
+, "NTree" :50
+, "NRandomFeature" :2
+, "RunOOB" :true
+, "OOBStatsFile" :"phoneme.oob.stat"
+, "PerfFile" :"phoneme.perf"
+, "StatFile" :"phoneme.stat"
+
+, "InputMetadata" :
+ [{
+ "Name" :"f1"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"f2"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"f3"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"f4"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"f5"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"class"
+ , "Type" :"real"
+ , "ValueSpace" :
+ [
+ "1"
+ , "0"
+ ]
+ }]
+}
diff --git a/lib/mining/cmd/lnsmote/main.go b/lib/mining/cmd/lnsmote/main.go
new file mode 100644
index 00000000..692e9cf4
--- /dev/null
+++ b/lib/mining/cmd/lnsmote/main.go
@@ -0,0 +1,188 @@
+// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "encoding/json"
+ "flag"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "time"
+
+ "github.com/shuLhan/share/lib/debug"
+ "github.com/shuLhan/share/lib/dsv"
+ "github.com/shuLhan/share/lib/mining/resampling/lnsmote"
+ "github.com/shuLhan/share/lib/tabula"
+)
+
+var (
+ // percentOver contain percentage of over sampling.
+ percentOver = 100
+ // knn contain number of nearest neighbours considered when
+ // oversampling.
+ knn = 5
+ // synFile flag for synthetic file output.
+ synFile = ""
+ // merge flag, if its true the original and synthetic will be merged
+ // into `synFile`.
+ merge = false
+)
+
+var usage = func() {
+ cmd := os.Args[0]
+ fmt.Fprintf(os.Stderr, "Usage of %s:\n"+
+ "[-percentover number] "+
+ "[-knn number] "+
+ "[-syntheticfile string] "+
+ "[-merge bool] "+
+ "[config.dsv]\n", cmd)
+ flag.PrintDefaults()
+}
+
+func init() {
+ flagUsage := []string{
+ "Percentage of oversampling (default 100)",
+ "Number of nearest neighbours (default 5)",
+ "File where synthetic samples will be written (default '')",
+ "If true then original and synthetic will be merged when" +
+ " written to file (default false)",
+ }
+
+ flag.IntVar(&percentOver, "percentover", -1, flagUsage[0])
+ flag.IntVar(&knn, "knn", -1, flagUsage[1])
+ flag.StringVar(&synFile, "syntheticfile", "", flagUsage[2])
+ flag.BoolVar(&merge, "merge", false, flagUsage[3])
+}
+
+func trace(s string) (string, time.Time) {
+ fmt.Println("[START]", s)
+ return s, time.Now()
+}
+
+func un(s string, startTime time.Time) {
+ endTime := time.Now()
+ fmt.Println("[END]", s, "with elapsed time",
+ endTime.Sub(startTime))
+}
+
+//
+// createLnsmote will create and initialize SMOTE object from config file and
+// from command parameter.
+//
+func createLnsmote(fcfg string) (lnsmoteRun *lnsmote.Runtime, e error) {
+ lnsmoteRun = &lnsmote.Runtime{}
+
+ config, e := ioutil.ReadFile(fcfg)
+ if e != nil {
+ return nil, e
+ }
+
+ e = json.Unmarshal(config, lnsmoteRun)
+ if e != nil {
+ return nil, e
+ }
+
+ // Use option value from command parameter.
+ if percentOver > 0 {
+ lnsmoteRun.PercentOver = percentOver
+ }
+ if knn > 0 {
+ lnsmoteRun.K = knn
+ }
+
+ if debug.Value >= 1 {
+ fmt.Println("[lnsmote]", lnsmoteRun)
+ }
+
+ return
+}
+
+//
+// runLnsmote will select minority class from dataset and run oversampling.
+//
+func runLnsmote(lnsmoteRun *lnsmote.Runtime, dataset *tabula.Claset) (e error) {
+ e = lnsmoteRun.Resampling(dataset)
+ if e != nil {
+ return
+ }
+
+ if debug.Value >= 1 {
+ fmt.Println("[lnsmote] # synthetics:",
+ lnsmoteRun.GetSynthetics().Len())
+ }
+
+ return
+}
+
+// runMerge will append original dataset to synthetic file.
+func runMerge(lnsmoteRun *lnsmote.Runtime, dataset *tabula.Claset) (e error) {
+ writer, e := dsv.NewWriter("")
+ if e != nil {
+ return
+ }
+
+ e = writer.ReopenOutput(lnsmoteRun.SyntheticFile)
+ if e != nil {
+ return
+ }
+
+ sep := dsv.DefSeparator
+ n, e := writer.WriteRawDataset(dataset, &sep)
+ if e != nil {
+ return
+ }
+
+ if debug.Value >= 1 {
+ fmt.Println("[lnsmote] # appended:", n)
+ }
+
+ return writer.Close()
+}
+
+func main() {
+ defer un(trace("lnsmote"))
+
+ flag.Parse()
+
+ if len(flag.Args()) <= 0 {
+ usage()
+ os.Exit(1)
+ }
+
+ fcfg := flag.Arg(0)
+
+ // Parsing config file and parameter.
+ lnsmoteRun, e := createLnsmote(fcfg)
+ if e != nil {
+ panic(e)
+ }
+
+ // Get dataset.
+ dataset := tabula.Claset{}
+ _, e = dsv.SimpleRead(fcfg, &dataset)
+ if e != nil {
+ panic(e)
+ }
+
+ fmt.Println("[lnsmote] Dataset:", &dataset)
+
+ row := dataset.GetRow(0)
+ fmt.Println("[lnsmote] sample:", row)
+
+ e = runLnsmote(lnsmoteRun, &dataset)
+ if e != nil {
+ panic(e)
+ }
+
+ if !merge {
+ return
+ }
+
+ e = runMerge(lnsmoteRun, &dataset)
+ if e != nil {
+ panic(e)
+ }
+}
diff --git a/lib/mining/cmd/lnsmote/phoneme.dsv b/lib/mining/cmd/lnsmote/phoneme.dsv
new file mode 100644
index 00000000..6e179c50
--- /dev/null
+++ b/lib/mining/cmd/lnsmote/phoneme.dsv
@@ -0,0 +1,45 @@
+{
+ "Input" : "../../testdata/phoneme/phoneme.dat"
+, "Rejected" : "phoneme.rej"
+, "MaxRows" : -1
+, "ClassIndex" : 5
+, "DatasetMode" : "matrix"
+
+, "K" : 5
+
+, "PercentOver" : 100
+, "SyntheticFile" : "phoneme.lnsmote.dat"
+
+, "ClassMinor" : "1"
+
+, "InputMetadata" :
+ [{
+ "Name" :"f1"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"f2"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"f3"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"f4"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"f5"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"class"
+ , "Type" :"string"
+ , "ValueSpace" :
+ [
+ "0"
+ , "1"
+ ]
+ }]
+}
diff --git a/lib/mining/cmd/rf/iris.dsv b/lib/mining/cmd/rf/iris.dsv
new file mode 100644
index 00000000..e86af6d1
--- /dev/null
+++ b/lib/mining/cmd/rf/iris.dsv
@@ -0,0 +1,41 @@
+{
+ "Input" :"../../testdata/iris/iris.dat"
+, "Rejected" :"iris.rej"
+, "MaxRows" :-1
+, "ClassIndex" :4
+, "DatasetMode" :"matrix"
+, "SplitMethod" :"gini"
+, "NTree" :50
+, "NRandomFeature" :2
+, "RunOOB" :true
+, "OOBStatsFile" :"iris.oob.stat"
+, "PerfFile" :"iris.perf"
+, "StatFile" :"iris.stat"
+, "InputMetadata" :
+ [{
+ "Name" :"sepal-length"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"sepal-width"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"petal-length"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"petal-width"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"class"
+ , "Type" :"string"
+ , "ValueSpace" :
+ [
+ "Iris-setosa"
+ , "Iris-versicolor"
+ , "Iris-virginica"
+ ]
+ }]
+}
diff --git a/lib/mining/cmd/rf/main.go b/lib/mining/cmd/rf/main.go
new file mode 100644
index 00000000..2c029c37
--- /dev/null
+++ b/lib/mining/cmd/rf/main.go
@@ -0,0 +1,188 @@
+// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "encoding/json"
+ "flag"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "time"
+
+ "github.com/shuLhan/share/lib/dsv"
+ "github.com/shuLhan/share/lib/mining/classifier/rf"
+ "github.com/shuLhan/share/lib/tabula"
+)
+
const (
	// tag is the prefix prepended to every log line of this command.
	tag = "[rf]"
)
+
// Command line options. Numeric values <= 0 and empty strings mean "not
// set", in which case the value loaded from the training configuration
// file is kept (see createRandomForest).
var (
	// nTree number of tree.
	nTree = 0
	// nRandomFeature number of feature to compute.
	nRandomFeature = 0
	// percentBoot percentage of sample for bootstraping.
	percentBoot = 0
	// oobStatsFile where statistic will be written.
	oobStatsFile = ""
	// perfFile where performance of classifier will be written.
	perfFile = ""
	// trainCfg point to the configuration file for training or creating
	// a model
	trainCfg = ""
	// testCfg point to the configuration file for testing
	testCfg = ""

	// forest the main object.
	forest rf.Runtime
)
+
// usage prints the documentation of all registered command line flags.
var usage = func() {
	flag.PrintDefaults()
}
+
// init registers the command line flags. The numeric flags default to -1
// and the string flags to "", meaning "not set"; createRandomForest only
// overrides the configuration file when a flag holds a positive or
// non-empty value. The "(default ...)" text in the usage strings refers
// to the classifier's own internal defaults.
func init() {
	flagUsage := []string{
		"Number of tree in forest (default 100)",
		"Number of feature to compute (default 0)",
		"Percentage of bootstrap (default 64%)",
		"OOB statistic file, where OOB data will be written",
		"Performance file, where statistic of classifying data set will be written",
		"Training configuration",
		"Test configuration",
	}

	flag.IntVar(&nTree, "ntree", -1, flagUsage[0])
	flag.IntVar(&nRandomFeature, "nrandomfeature", -1, flagUsage[1])
	flag.IntVar(&percentBoot, "percentboot", -1, flagUsage[2])

	flag.StringVar(&oobStatsFile, "oobstatsfile", "", flagUsage[3])
	flag.StringVar(&perfFile, "perffile", "", flagUsage[4])

	flag.StringVar(&trainCfg, "train", "", flagUsage[5])
	flag.StringVar(&testCfg, "test", "", flagUsage[6])
}
+
+func trace() (start time.Time) {
+ start = time.Now()
+ fmt.Println(tag, "start", start)
+ return
+}
+
+func un(startTime time.Time) {
+ endTime := time.Now()
+ fmt.Println(tag, "elapsed time", endTime.Sub(startTime))
+}
+
//
// createRandomForest will create random forest for training, with the
// following steps,
// (1) load training configuration from the JSON content of trainCfg.
// (2) Overwrite configuration parameters that were set from the command
// line (flags default to -1/"" meaning "not set", so only positive or
// non-empty flag values win over the file).
//
func createRandomForest() error {
	// (1)
	config, e := ioutil.ReadFile(trainCfg)
	if e != nil {
		return e
	}

	forest = rf.Runtime{}

	e = json.Unmarshal(config, &forest)
	if e != nil {
		return e
	}

	// (2)
	if nTree > 0 {
		forest.NTree = nTree
	}
	if nRandomFeature > 0 {
		forest.NRandomFeature = nRandomFeature
	}
	if percentBoot > 0 {
		forest.PercentBoot = percentBoot
	}
	if oobStatsFile != "" {
		forest.OOBStatsFile = oobStatsFile
	}
	if perfFile != "" {
		forest.PerfFile = perfFile
	}

	return nil
}
+
// train creates the random forest runtime from the training configuration,
// reads the training set described by the same DSV file, and builds the
// forest. Any failure is fatal (panic) since no usable model results.
func train() {
	e := createRandomForest()
	if e != nil {
		panic(e)
	}

	trainset := tabula.Claset{}

	_, e = dsv.SimpleRead(trainCfg, &trainset)
	if e != nil {
		panic(e)
	}

	e = forest.Build(&trainset)
	if e != nil {
		panic(e)
	}
}
+
// test classifies the data set described by testCfg with the (already
// built) forest, computes its performance, and writes the performance
// statistics to the configured performance file.
func test() {
	testset := tabula.Claset{}
	_, e := dsv.SimpleRead(testCfg, &testset)
	if e != nil {
		panic(e)
	}

	predicts, _, probs := forest.ClassifySet(&testset, nil)

	forest.Performance(&testset, predicts, probs)

	e = forest.WritePerformance()
	if e != nil {
		panic(e)
	}
}
+
//
// main runs training and/or testing of the random forest classifier,
//
// (0) Parse and check command line parameters.
// (1) If trainCfg parameter is set,
// (1.1) train the model,
// (1.2) TODO: load saved model (currently it only prints usage and exits
// when no -train flag and no positional argument are given).
// (2) If testCfg parameter is set,
// (2.1) Test the model using data from testCfg.
//
func main() {
	defer un(trace())

	// (0)
	flag.Parse()

	// (1)
	if trainCfg != "" {
		// (1.1)
		train()
	} else {
		// (1.2)
		if len(flag.Args()) <= 0 {
			usage()
			os.Exit(1)
		}
	}

	// (2)
	if testCfg != "" {
		test()
	}
}
diff --git a/lib/mining/cmd/rf/phoneme.dsv b/lib/mining/cmd/rf/phoneme.dsv
new file mode 100644
index 00000000..1219989f
--- /dev/null
+++ b/lib/mining/cmd/rf/phoneme.dsv
@@ -0,0 +1,45 @@
+{
+ "Input" :"../../testdata/phoneme/phoneme.dat"
+, "Rejected" :"phoneme.rej"
+, "MaxRows" :-1
+, "DatasetMode" :"matrix"
+, "ClassIndex" :5
+
+, "NTree" :50
+, "NRandomFeature" :2
+, "RunOOB" :true
+, "OOBStatsFile" :"phoneme.oob.stat"
+, "PerfFile" :"phoneme.perf"
+, "StatFile" :"phoneme.stat"
+
+, "InputMetadata" :
+ [{
+ "Name" :"f1"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"f2"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"f3"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"f4"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"f5"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"class"
+ , "Type" :"real"
+ , "ValueSpace" :
+ [
+ "1"
+ , "0"
+ ]
+ }]
+}
diff --git a/lib/mining/cmd/smote/main.go b/lib/mining/cmd/smote/main.go
new file mode 100644
index 00000000..48a53f51
--- /dev/null
+++ b/lib/mining/cmd/smote/main.go
@@ -0,0 +1,193 @@
+// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "encoding/json"
+ "flag"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "time"
+
+ "github.com/shuLhan/share/lib/debug"
+ "github.com/shuLhan/share/lib/dsv"
+ "github.com/shuLhan/share/lib/mining/resampling/smote"
+ "github.com/shuLhan/share/lib/tabula"
+)
+
// Command line options. These defaults may be overridden by the DSV
// configuration file and then again by explicit command line flags (see
// createSmote).
var (
	// percentOver contain percentage of over sampling.
	percentOver = 100
	// knn contain number of nearest neighbours considered when
	// oversampling.
	knn = 5
	// synFile flag for synthetic file output.
	synFile = ""
	// merge flag, if its true the original and synthetic will be merged
	// into `synFile`.
	merge = false
)
+
// usage prints a one-line synopsis of the command followed by the
// documentation of all registered flags.
var usage = func() {
	cmd := os.Args[0]
	fmt.Fprintf(os.Stderr, "Usage of %s:\n"+
		"[-percentover number] "+
		"[-knn number] "+
		"[-syntheticfile string] "+
		"[-merge bool] "+
		"[config.dsv]\n", cmd)
	flag.PrintDefaults()
}
+
// init registers the command line flags. The numeric flags default to -1
// ("not set"); createSmote only overrides the configuration file when a
// flag holds a positive value.
func init() {
	flagUsage := []string{
		"Percentage of oversampling (default 100)",
		"Number of nearest neighbours (default 5)",
		"File where synthetic samples will be written (default '')",
		"If true then original and synthetic will be merged when" +
			" written to file (default false)",
	}

	flag.IntVar(&percentOver, "percentover", -1, flagUsage[0])
	flag.IntVar(&knn, "knn", -1, flagUsage[1])
	flag.StringVar(&synFile, "syntheticfile", "", flagUsage[2])
	flag.BoolVar(&merge, "merge", false, flagUsage[3])
}
+
// trace prints a start marker for the section named name and returns the
// name together with the current time, for use with un() via defer.
func trace(name string) (s string, startTime time.Time) {
	fmt.Println("[START]", name)
	return name, time.Now()
}
+
+func un(s string, startTime time.Time) {
+ endTime := time.Now()
+ fmt.Println("[END]", s, "with elapsed time",
+ endTime.Sub(startTime))
+}
+
//
// createSmote will create and initialize SMOTE object from config file and
// from command parameter.
//
// The JSON content of fcfg provides the base values; positive -percentover
// and -knn flag values then override PercentOver and K respectively.
//
func createSmote(fcfg string) (smoteRun *smote.Runtime, e error) {
	smoteRun = &smote.Runtime{}

	config, e := ioutil.ReadFile(fcfg)
	if e != nil {
		return nil, e
	}

	e = json.Unmarshal(config, smoteRun)
	if e != nil {
		return nil, e
	}

	// Use option value from command parameter.
	if percentOver > 0 {
		smoteRun.PercentOver = percentOver
	}
	if knn > 0 {
		smoteRun.K = knn
	}

	if debug.Value >= 1 {
		fmt.Println("[smote]", smoteRun)
	}

	return
}
+
+//
+// runSmote will select minority class from dataset and run oversampling.
+//
+func runSmote(smote *smote.Runtime, dataset *tabula.Claset) (e error) {
+ minorset := dataset.GetMinorityRows()
+
+ if debug.Value >= 1 {
+ fmt.Println("[smote] # minority samples:", minorset.Len())
+ }
+
+ e = smote.Resampling(*minorset)
+ if e != nil {
+ return
+ }
+
+ if debug.Value >= 1 {
+ fmt.Println("[smote] # synthetics:", smote.Synthetics.Len())
+ }
+
+ return
+}
+
+// runMerge will append original dataset to synthetic file.
+func runMerge(smote *smote.Runtime, dataset *tabula.Claset) (e error) {
+ writer, e := dsv.NewWriter("")
+ if e != nil {
+ return
+ }
+
+ e = writer.ReopenOutput(smote.SyntheticFile)
+ if e != nil {
+ return
+ }
+
+ sep := dsv.DefSeparator
+ n, e := writer.WriteRawDataset(dataset, &sep)
+ if e != nil {
+ return
+ }
+
+ if debug.Value >= 1 {
+ fmt.Println("[smote] # appended:", n)
+ }
+
+ return writer.Close()
+}
+
+func main() {
+ defer un(trace("smote"))
+
+ flag.Parse()
+
+ if len(flag.Args()) <= 0 {
+ usage()
+ os.Exit(1)
+ }
+
+ fcfg := flag.Arg(0)
+
+ // Parsing config file and parameter.
+ smote, e := createSmote(fcfg)
+ if e != nil {
+ panic(e)
+ }
+
+ // Get dataset.
+ dataset := tabula.Claset{}
+ _, e = dsv.SimpleRead(fcfg, &dataset)
+ if e != nil {
+ panic(e)
+ }
+
+ fmt.Println("[smote] Dataset:", &dataset)
+
+ row := dataset.GetRow(0)
+ fmt.Println("[smote] sample:", row)
+
+ e = runSmote(smote, &dataset)
+ if e != nil {
+ panic(e)
+ }
+
+ if !merge {
+ return
+ }
+
+ e = runMerge(smote, &dataset)
+ if e != nil {
+ panic(e)
+ }
+}
diff --git a/lib/mining/cmd/smote/phoneme.dsv b/lib/mining/cmd/smote/phoneme.dsv
new file mode 100644
index 00000000..8cba121c
--- /dev/null
+++ b/lib/mining/cmd/smote/phoneme.dsv
@@ -0,0 +1,40 @@
+{
+ "Input" : "../../testdata/phoneme/phoneme.dat"
+, "Rejected" : "phoneme.rej"
+, "MaxRows" : -1
+, "ClassIndex" : 5
+, "DatasetMode" : "matrix"
+, "PercentOver" : 100
+, "K" : 5
+, "SyntheticFile" : "phoneme_smote.dat"
+, "InputMetadata" :
+ [{
+ "Name" :"f1"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"f2"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"f3"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"f4"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"f5"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"class"
+ , "Type" :"string"
+ , "ValueSpace" :
+ [
+ "0"
+ , "1"
+ ]
+ }]
+}
diff --git a/lib/mining/gain/gini/gini.go b/lib/mining/gain/gini/gini.go
new file mode 100644
index 00000000..2fcc2f9d
--- /dev/null
+++ b/lib/mining/gain/gini/gini.go
@@ -0,0 +1,461 @@
+// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//
+// Package gini contain function to calculating Gini gain.
+//
+// Gini gain, which is an impurity-based criterion that measures the divergences
+// between the probability distribution of the target attributes' values.
+//
+package gini
+
+import (
+ "fmt"
+
+ "github.com/shuLhan/share/lib/debug"
+ "github.com/shuLhan/share/lib/numbers"
+ libstrings "github.com/shuLhan/share/lib/strings"
+)
+
//
// Gini contain slice of sorted index, slice of partition values, slice of Gini
// index, Gini value for all samples.
//
type Gini struct {
	// Skip if its true, the gain value would not be searched on this
	// instance.
	Skip bool
	// IsContinu define whether the Gini index came from continuous
	// attribute or not.
	IsContinu bool
	// Value of Gini index for all value in attribute.
	Value float64
	// MaxPartGain contain the index of partition which have the maximum
	// gain.
	MaxPartGain int
	// MaxGainValue contain maximum gain of index.
	MaxGainValue float64
	// MinIndexPart contain the index of partition which have the minimum
	// Gini index.
	MinIndexPart int
	// MinIndexValue contain the minimum Gini index value.
	MinIndexValue float64
	// SortedIndex of attribute, sorted by values of attribute. This will
	// be used to reference the unsorted target attribute.
	SortedIndex []int
	// ContinuPart contain list of partition value for continuous attribute.
	ContinuPart []float64
	// DiscretePart contain the possible combination of discrete values.
	DiscretePart libstrings.Table
	// Index contain list of Gini Index for each partition.
	Index []float64
	// Gain contain information gain for each partition.
	Gain []float64
}
+
//
// ComputeDiscrete Given an attribute A with discrete values 'discval', and the
// target attribute T which contain N classes in C, compute the information
// gain of A.
//
// The result is saved as gain value in MaxGainValue for each partition.
//
func (gini *Gini) ComputeDiscrete(A *[]string, discval *[]string, T *[]string,
	C *[]string) {
	gini.IsContinu = false

	// create partition for possible combination of discrete values.
	gini.createDiscretePartition((*discval))

	if debug.Value >= 2 {
		fmt.Println("[gini] part :", gini.DiscretePart)
	}

	// One index/gain slot per candidate partition; MinIndexValue starts
	// above any reachable Gini index (index is in [0,1]).
	gini.Index = make([]float64, len(gini.DiscretePart))
	gini.Gain = make([]float64, len(gini.DiscretePart))
	gini.MinIndexValue = 1.0

	// compute gini index for all samples
	gini.Value = gini.compute(T, C)

	gini.computeDiscreteGain(A, T, C)
}
+
//
// computeDiscreteGain will compute Gini index and Gain for each partition
// in DiscretePart, updating Index, Gain, and the running minimum-index /
// maximum-gain trackers.
//
func (gini *Gini) computeDiscreteGain(A *[]string, T *[]string, C *[]string) {
	// number of samples
	nsample := float64(len(*A))

	if debug.Value >= 3 {
		fmt.Println("[gini] sample:", T)
		fmt.Printf("[gini] Gini(a=%s) = %f\n", (*A), gini.Value)
	}

	// compute gini index for each discrete values
	for i, subPart := range gini.DiscretePart {
		// check if sub partition has at least an element
		if len(subPart) <= 0 {
			continue
		}

		sumGI := 0.0
		for _, part := range subPart {
			ndisc := 0.0
			var subT []string

			for _, el := range part {
				for t, a := range *A {
					if a != el {
						continue
					}

					// count how many sample with this discrete value
					ndisc++
					// split the target by discrete value
					subT = append(subT, (*T)[t])
				}
			}

			// compute gini index for subtarget
			giniIndex := gini.compute(&subT, C)

			// compute probabilities of discrete value through all samples
			p := ndisc / nsample

			probIndex := p * giniIndex

			// sum all probabilities times gini index.
			sumGI += probIndex

			if debug.Value >= 3 {
				fmt.Printf("[gini] subsample: %v\n", subT)
				fmt.Printf("[gini] Gini(a=%s) = %f/%f * %f = %f\n",
					part, ndisc, nsample,
					giniIndex, probIndex)
			}
		}

		gini.Index[i] = sumGI
		gini.Gain[i] = gini.Value - sumGI

		if debug.Value >= 3 {
			fmt.Printf("[gini] sample: %v\n", subPart)
			fmt.Printf("[gini] Gain(a=%s) = %f - %f = %f\n",
				subPart, gini.Value, sumGI,
				gini.Gain[i])
		}

		// Zero index values are excluded from the minimum search.
		if gini.MinIndexValue > gini.Index[i] && gini.Index[i] != 0 {
			gini.MinIndexValue = gini.Index[i]
			gini.MinIndexPart = i
		}

		if gini.MaxGainValue < gini.Gain[i] {
			gini.MaxGainValue = gini.Gain[i]
			gini.MaxPartGain = i
		}
	}
}
+
+//
+// createDiscretePartition will create possible combination for discrete value
+// in DiscretePart.
+//
+func (gini *Gini) createDiscretePartition(discval []string) {
+ // no discrete values ?
+ if len(discval) <= 0 {
+ return
+ }
+
+ // use set partition function to group the discrete values into two
+ // subset.
+ gini.DiscretePart = libstrings.Partition(discval, 2)
+}
+
//
// ComputeContinu Given an attribute A and the target attribute T which contain
// N classes in C, compute the information gain of A.
//
// The result of Gini partitions value, Gini Index, and Gini Gain is saved in
// ContinuPart, Index, and Gain.
//
// A and T are copied first, so the caller's slices are left untouched.
//
func (gini *Gini) ComputeContinu(A *[]float64, T *[]string, C *[]string) {
	gini.IsContinu = true

	// make a copy of attribute and target.
	A2 := make([]float64, len(*A))
	copy(A2, *A)

	T2 := make([]string, len(*T))
	copy(T2, *T)

	gini.SortedIndex = numbers.Floats64IndirectSort(A2, true)

	if debug.Value >= 1 {
		fmt.Println("[gini] attr sorted :", A2)
	}

	// sort the target attribute using sorted index.
	libstrings.SortByIndex(&T2, gini.SortedIndex)

	// create partition
	gini.createContinuPartition(&A2)

	// create holder for gini index and gini gain
	gini.Index = make([]float64, len(gini.ContinuPart))
	gini.Gain = make([]float64, len(gini.ContinuPart))
	gini.MinIndexValue = 1.0

	// compute gini index for all samples
	gini.Value = gini.compute(&T2, C)

	gini.computeContinuGain(&A2, &T2, C)
}
+
+//
+// createContinuPartition for dividing class and computing Gini index.
+//
+// This is assuming that the data `A` has been sorted in ascending order.
+//
+func (gini *Gini) createContinuPartition(A *[]float64) {
+ l := len(*A)
+ gini.ContinuPart = make([]float64, 0)
+
+ // loop from first index until last index - 1
+ for i := 0; i < l-1; i++ {
+ sum := (*A)[i] + (*A)[i+1]
+ med := sum / 2.0
+
+ // If median is zero, its mean both left and right value is
+ // zero. We are not allowing this, because it will result the
+ // minimum Gini Index or maximum Gain value.
+ if med == 0 {
+ continue
+ }
+
+ // Reject if median is contained in attribute's value.
+ // We use equality because if both A[i] and A[i+1] value is
+ // equal, the median is equal to both of them.
+ exist := false
+ for j := 0; j <= i; j++ {
+ if (*A)[j] == med {
+ exist = true
+ break
+ }
+ }
+ if !exist {
+ gini.ContinuPart = append(gini.ContinuPart, med)
+ }
+ }
+}
+
//
// compute value for attribute T.
//
// Return Gini value in the form of,
//
//	1 - sum (probability of each classes in T)
//
// An empty target yields 0 (perfectly pure by convention).
//
func (gini *Gini) compute(T *[]string, C *[]string) float64 {
	n := float64(len(*T))
	if n == 0 {
		return 0
	}

	// classCount[x] is the number of samples in T with class (*C)[x].
	classCount := libstrings.CountTokens(*T, *C, true)

	var sump2 float64

	for x, v := range classCount {
		p := float64(v) / n
		sump2 += (p * p)

		if debug.Value >= 3 {
			fmt.Printf("[gini] compute (%s): (%d/%f)^2 = %f\n",
				(*C)[x], v, n, p*p)
		}

	}

	return 1 - sump2
}
+
//
// computeContinuGain for each partition.
//
// The Gini gain formula we used here is,
//
//	Gain(part,S) = Gini(S) - ((count(left)/S * Gini(left))
//			+ (count(right)/S * Gini(right)))
//
// where,
// - left is sub-sample from S that is less than part value.
// - right is sub-sample from S that is greater than part value.
//
// Assumes A (and the matching T) are already sorted ascending by A.
//
func (gini *Gini) computeContinuGain(A *[]float64, T *[]string, C *[]string) {
	var gleft, gright float64
	var tleft, tright []string

	nsample := len(*A)

	if debug.Value >= 2 {
		fmt.Println("[gini] sorted data:", A)
		fmt.Println("[gini] Gini.Value:", gini.Value)
	}

	for p, contVal := range gini.ContinuPart {

		// find the split of samples between partition based on
		// partition value: partidx is the first index with a value
		// greater than the split point (nsample if none).
		partidx := nsample
		for x, attrVal := range *A {
			if attrVal > contVal {
				partidx = x
				break
			}
		}

		nleft := partidx
		nright := nsample - partidx
		pleft := float64(nleft) / float64(nsample)
		pright := float64(nright) / float64(nsample)

		if partidx > 0 {
			tleft = (*T)[0:partidx]
			tright = (*T)[partidx:]

			gleft = gini.compute(&tleft, C)
			gright = gini.compute(&tright, C)
		} else {
			// Empty left side: all samples fall to the right.
			tleft = nil
			tright = (*T)[0:]

			gleft = 0
			gright = gini.compute(&tright, C)
		}

		// count class in partition
		gini.Index[p] = ((pleft * gleft) + (pright * gright))
		gini.Gain[p] = gini.Value - gini.Index[p]

		if debug.Value >= 3 {
			fmt.Println("[gini] tleft:", tleft)
			fmt.Println("[gini] tright:", tright)

			fmt.Printf("[gini] GiniGain(%v) = %f - (%f * %f) + (%f * %f) = %f\n",
				contVal, gini.Value, pleft, gleft,
				pright, gright, gini.Gain[p])
		}

		// Zero index values are excluded from the minimum search.
		if gini.MinIndexValue > gini.Index[p] && gini.Index[p] != 0 {
			gini.MinIndexValue = gini.Index[p]
			gini.MinIndexPart = p
		}

		if gini.MaxGainValue < gini.Gain[p] {
			gini.MaxGainValue = gini.Gain[p]
			gini.MaxPartGain = p
		}
	}
}
+
+//
+// GetMaxPartGainValue return the partition that have the maximum Gini gain.
+//
+func (gini *Gini) GetMaxPartGainValue() interface{} {
+ if gini.IsContinu {
+ return gini.ContinuPart[gini.MaxPartGain]
+ }
+
+ return gini.DiscretePart[gini.MaxPartGain]
+}
+
//
// GetMaxGainValue return the value of partition which contain the maximum Gini
// gain (see MaxPartGain for the partition itself).
//
func (gini *Gini) GetMaxGainValue() float64 {
	return gini.MaxGainValue
}
+
+//
+// GetMinIndexPartValue return the partition that have the minimum Gini index.
+//
+func (gini *Gini) GetMinIndexPartValue() interface{} {
+ if gini.IsContinu {
+ return gini.ContinuPart[gini.MinIndexPart]
+ }
+
+ return gini.DiscretePart[gini.MinIndexPart]
+}
+
//
// GetMinIndexValue return the minimum Gini index value (see MinIndexPart
// for the partition that produced it).
//
func (gini *Gini) GetMinIndexValue() float64 {
	return gini.MinIndexValue
}
+
+//
+// FindMaxGain find the attribute and value that have the maximum gain.
+// The returned value is index of attribute.
+//
+func FindMaxGain(gains *[]Gini) (MaxGainIdx int) {
+ var gainValue = 0.0
+ var maxGainValue = 0.0
+
+ for i := range *gains {
+ if (*gains)[i].Skip {
+ continue
+ }
+ gainValue = (*gains)[i].GetMaxGainValue()
+ if gainValue > maxGainValue {
+ maxGainValue = gainValue
+ MaxGainIdx = i
+ }
+ }
+
+ return
+}
+
+//
+// FindMinGiniIndex return the index of attribute that have the minimum Gini index.
+//
+func FindMinGiniIndex(ginis *[]Gini) (MinIndexIdx int) {
+ var indexV = 0.0
+ var minIndexV = 1.0
+
+ for i := range *ginis {
+ indexV = (*ginis)[i].GetMinIndexValue()
+ if indexV > minIndexV {
+ minIndexV = indexV
+ MinIndexIdx = i
+ }
+ }
+
+ return
+}
+
//
// String yes, it will print the Gini fields in a JSON-like format, mainly
// for debugging output.
//
func (gini Gini) String() (s string) {
	s = fmt.Sprint("{\n",
		" Skip          :", gini.Skip, "\n",
		" IsContinu     :", gini.IsContinu, "\n",
		" Index         :", gini.Index, "\n",
		" Value         :", gini.Value, "\n",
		" Gain          :", gini.Gain, "\n",
		" MaxPartGain   :", gini.MaxPartGain, "\n",
		" MaxGainValue  :", gini.MaxGainValue, "\n",
		" MinIndexPart  :", gini.MinIndexPart, "\n",
		" MinIndexValue :", gini.MinIndexValue, "\n",
		" SortedIndex   :", gini.SortedIndex, "\n",
		" ContinuPart   :", gini.ContinuPart, "\n",
		" DiscretePart  :", gini.DiscretePart, "\n",
		"}")
	return
}
diff --git a/lib/mining/gain/gini/gini_test.go b/lib/mining/gain/gini/gini_test.go
new file mode 100644
index 00000000..682d5159
--- /dev/null
+++ b/lib/mining/gain/gini/gini_test.go
@@ -0,0 +1,59 @@
+// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gini
+
+import (
+ "fmt"
+ "testing"
+)
+
// data holds two continuous attribute samples: the first with mixed
// values, the second all zeros (a degenerate case with no valid split).
var data = [][]float64{
	{1.0, 6.0, 5.0, 4.0, 7.0, 3.0, 8.0, 7.0, 5.0},
	{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
}

// targetValues is the class label of each sample; classes is the class
// value space.
var targetValues = []string{"P", "P", "N", "P", "N", "N", "N", "P", "N"}
var classes = []string{"P", "N"}
+
// TestComputeContinu exercises ComputeContinu on both attribute samples
// and prints the result. NOTE(review): it makes no assertions — it only
// checks that the computation runs; confirm whether expected values should
// be pinned.
func TestComputeContinu(t *testing.T) {
	target := make([]string, len(targetValues))

	copy(target, targetValues)

	fmt.Println("target:", target)

	fmt.Println("data:", data[0])
	GINI := Gini{}
	GINI.ComputeContinu(&data[0], &target, &classes)
	fmt.Println(">>> gini:", GINI)

	fmt.Println("data:", data[1])
	GINI = Gini{}
	GINI.ComputeContinu(&data[1], &target, &classes)
	fmt.Println(">>> gini:", GINI)
}
+
// discreteSamples holds three discrete attributes over the same nine
// samples as targetValues; discreteValues is their value space.
var discreteSamples = [][]string{
	{"T", "T", "T", "F", "F", "F", "F", "T", "F"},
	{"T", "T", "F", "F", "T", "T", "F", "F", "T"},
	{"T", "T", "F", "T", "F", "F", "F", "T", "F"},
}
var discreteValues = []string{"T", "F"}
+
// TestComputeDiscrete exercises ComputeDiscrete on each discrete sample
// and prints the result. NOTE(review): the same gini instance is reused
// across iterations, so Max/Min tracking fields carry state from previous
// samples — confirm whether a fresh Gini{} per sample was intended.
func TestComputeDiscrete(t *testing.T) {
	gini := Gini{}
	target := make([]string, len(targetValues))

	for _, sample := range discreteSamples {
		copy(target, targetValues)

		fmt.Println("target:", target)
		fmt.Println("data:", sample)

		gini.ComputeDiscrete(&sample, &discreteValues, &target,
			&classes)

		fmt.Println(gini)
	}
}
diff --git a/lib/mining/gain/gini/ginifloat.go b/lib/mining/gain/gini/ginifloat.go
new file mode 100644
index 00000000..b890b39d
--- /dev/null
+++ b/lib/mining/gain/gini/ginifloat.go
@@ -0,0 +1,168 @@
+// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gini
+
+import (
+ "fmt"
+
+ "github.com/shuLhan/share/lib/debug"
+ "github.com/shuLhan/share/lib/numbers"
+)
+
//
// ComputeContinuFloat Given an attribute A and the target attribute T which contain
// N classes in C, compute the information gain of A.
//
// The result of Gini partitions value, Gini Index, and Gini Gain is saved in
// ContinuPart, Index, and Gain.
//
// Unlike ComputeContinu, A and T are sorted IN PLACE (no copies are made),
// so the caller's slices are reordered.
//
// Algorithm,
// (0) Sort the attribute.
// (1) Sort the target attribute using sorted index.
// (2) Create continu partition.
// (3) Create temporary space for gini index and gini gain.
// (4) Compute gini index for all target.
//
func (gini *Gini) ComputeContinuFloat(A, T, C *[]float64) {
	gini.IsContinu = true

	gini.SortedIndex = numbers.Floats64IndirectSort(*A, true)

	if debug.Value >= 1 {
		fmt.Println("[gini] attr sorted :", A)
	}

	// (1)
	numbers.Floats64SortByIndex(T, gini.SortedIndex)

	// (2)
	gini.createContinuPartition(A)

	// (3)
	gini.Index = make([]float64, len(gini.ContinuPart))
	gini.Gain = make([]float64, len(gini.ContinuPart))
	gini.MinIndexValue = 1.0

	// (4)
	gini.Value = gini.computeFloat(T, C)

	gini.computeContinuGainFloat(A, T, C)
}
+
//
// computeFloat will compute Gini value for attribute T.
//
// Gini value is computed using formula,
//
//	1 - sum (probability of each classes in T)
//
// An empty target yields 0 (perfectly pure by convention).
//
func (gini *Gini) computeFloat(T, C *[]float64) float64 {
	n := float64(len(*T))
	if n == 0 {
		return 0
	}

	// classCount[x] is the number of samples in T with class (*C)[x].
	classCount := numbers.Floats64Counts(*T, *C)

	var sump2 float64

	for x, v := range classCount {
		p := float64(v) / n
		sump2 += (p * p)

		if debug.Value >= 3 {
			fmt.Printf("[gini] compute (%f): (%d/%f)^2 = %f\n",
				(*C)[x], v, n, p*p)
		}

	}

	return 1 - sump2
}
+
//
// computeContinuGainFloat will compute gain for each partition.
//
// The Gini gain formula we used here is,
//
//	Gain(part,S) = Gini(S) - ((count(left)/S * Gini(left))
//			+ (count(right)/S * Gini(right)))
//
// where,
// - left is sub-sample from S that is less than part value.
// - right is sub-sample from S that is greater than part value.
//
// Algorithm,
// (0) For each partition value,
// (0.1) Find the split of samples between partition based on partition value.
// (0.2) Count class in partition.
//
// Assumes A (and the matching T) are already sorted ascending by A.
//
func (gini *Gini) computeContinuGainFloat(A, T, C *[]float64) {
	var gainLeft, gainRight float64
	var tleft, tright []float64

	nsample := len(*A)

	if debug.Value >= 2 {
		fmt.Println("[gini] sorted data:", A)
		fmt.Println("[gini] Gini.Value:", gini.Value)
	}

	// (0)
	for p, contVal := range gini.ContinuPart {

		// (0.1) partidx is the first index whose value is greater
		// than the split point (nsample if none).
		partidx := nsample
		for x, attrVal := range *A {
			if attrVal > contVal {
				partidx = x
				break
			}
		}

		nleft := float64(partidx)
		nright := float64(nsample - partidx)
		probLeft := nleft / float64(nsample)
		probRight := nright / float64(nsample)

		if partidx > 0 {
			tleft = (*T)[0:partidx]
			tright = (*T)[partidx:]

			gainLeft = gini.computeFloat(&tleft, C)
			gainRight = gini.computeFloat(&tright, C)
		} else {
			// Empty left side: all samples fall to the right.
			tleft = nil
			tright = (*T)[0:]

			gainLeft = 0
			gainRight = gini.computeFloat(&tright, C)
		}

		// (0.2)
		gini.Index[p] = ((probLeft * gainLeft) +
			(probRight * gainRight))
		gini.Gain[p] = gini.Value - gini.Index[p]

		if debug.Value >= 3 {
			fmt.Println("[gini] tleft:", tleft)
			fmt.Println("[gini] tright:", tright)

			fmt.Printf("[gini] GiniGain(%v) = %f - (%f * %f) + (%f * %f) = %f\n",
				contVal, gini.Value, probLeft, gainLeft,
				probRight, gainRight, gini.Gain[p])
		}

		// Zero index values are excluded from the minimum search.
		if gini.MinIndexValue > gini.Index[p] && gini.Index[p] != 0 {
			gini.MinIndexValue = gini.Index[p]
			gini.MinIndexPart = p
		}

		if gini.MaxGainValue < gini.Gain[p] {
			gini.MaxGainValue = gini.Gain[p]
			gini.MaxPartGain = p
		}
	}
}
diff --git a/lib/mining/knn/knn.go b/lib/mining/knn/knn.go
new file mode 100644
index 00000000..0fa38f9c
--- /dev/null
+++ b/lib/mining/knn/knn.go
@@ -0,0 +1,109 @@
+// Copyright 2015-2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//
+// Package knn implement the K Nearest Neighbor using Euclidian to compute the
+// distance between samples.
+//
+package knn
+
+import (
+ "fmt"
+ "math"
+ "sort"
+
+ "github.com/shuLhan/share/lib/debug"
+ "github.com/shuLhan/share/lib/tabula"
+)
+
const (
	// TEuclidianDistance used in Runtime.DistanceMethod. It is the
	// only method handled by FindNeighbors.
	TEuclidianDistance = 0
)
+
//
// Runtime parameters for KNN processing.
//
type Runtime struct {
	// DistanceMethod define how the distance between sample will be
	// measured; see TEuclidianDistance.
	DistanceMethod int
	// ClassIndex define index of class in dataset; this column is
	// skipped when computing distances.
	ClassIndex int `json:"ClassIndex"`
	// K define number of nearest neighbors that will be searched.
	K int `json:"K"`

	// AllNeighbors contain all neighbours, sorted by distance after a
	// call to ComputeEuclidianDistance.
	AllNeighbors Neighbors
}
+
//
// ComputeEuclidianDistance compute the distance of instance with each sample in
// dataset `samples`, adding the non-zero distances to AllNeighbors and
// sorting them ascending.
//
// NOTE(review): the accumulator adds math.Abs(diff) rather than diff*diff,
// so the result is sqrt(sum|xi-yi|), not a true Euclidean distance —
// confirm whether this is intended (the expected values in knn_test.go are
// pinned to the current formula).
//
func (in *Runtime) ComputeEuclidianDistance(samples *tabula.Rows,
	instance *tabula.Row,
) {
	for x := range *samples {
		row := (*samples)[x]

		// compute euclidian distance
		d := 0.0
		for y, rec := range *row {
			if y == in.ClassIndex {
				// skip class attribute
				continue
			}

			ir := (*instance)[y]
			diff := 0.0

			diff = ir.Float() - rec.Float()

			d += math.Abs(diff)
		}

		// only add sample distance which is not zero (its probably
		// we calculating with the instance itself)
		if d != 0 {
			in.AllNeighbors.Add(row, math.Sqrt(d))
		}
	}

	sort.Sort(&in.AllNeighbors)
}
+
//
// FindNeighbors Given sample set and an instance, return the nearest neighbors as
// a slice of neighbors, containing at most K entries (fewer when the
// sample set is smaller than K).
//
func (in *Runtime) FindNeighbors(samples *tabula.Rows, instance *tabula.Row) (
	kneighbors Neighbors,
) {
	// Reset current input neighbours
	in.AllNeighbors = Neighbors{}

	switch in.DistanceMethod {
	case TEuclidianDistance:
		in.ComputeEuclidianDistance(samples, instance)
	}

	// Make sure number of neighbors is greater than request.
	minK := in.AllNeighbors.Len()
	if minK > in.K {
		minK = in.K
	}

	if debug.Value >= 2 {
		fmt.Println("[knn] all neighbors:", in.AllNeighbors.Len())
	}

	kneighbors = in.AllNeighbors.SelectRange(0, minK)

	if debug.Value >= 2 {
		fmt.Println("[knn] k neighbors:", kneighbors.Len())
	}

	return
}
diff --git a/lib/mining/knn/knn_test.go b/lib/mining/knn/knn_test.go
new file mode 100644
index 00000000..86ef75c0
--- /dev/null
+++ b/lib/mining/knn/knn_test.go
@@ -0,0 +1,62 @@
+// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package knn
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/shuLhan/share/lib/dsv"
+ "github.com/shuLhan/share/lib/tabula"
+ "github.com/shuLhan/share/lib/test"
+)
+
+func TestComputeEuclidianDistance(t *testing.T) {
+ var exp = []string{
+ `[0.302891 0.608544 0.47413 1.42718 -0.811085 1]`,
+ `[0.243474 0.505146 0.472892 1.34802 -0.844252 1]` +
+ `[0.202343 0.485983 0.527533 1.47307 -0.809672 1]` +
+ `[0.215496 0.523418 0.51719 1.43548 -0.933981 1]` +
+ `[0.214331 0.546086 0.414773 1.38542 -0.702336 1]` +
+ `[0.301676 0.554505 0.594757 1.21258 -0.873084 1]`,
+ }
+ var expDistances = "[0.5257185558832786" +
+ " 0.5690474496911485" +
+ " 0.5888777462258191" +
+ " 0.6007362149895741" +
+ " 0.672666336306493]"
+
+ // Reading data
+ dataset := tabula.Dataset{}
+ _, e := dsv.SimpleRead("../testdata/phoneme/phoneme.dsv", &dataset)
+ if nil != e {
+ return
+ }
+
+ // Processing
+ knnIn := Runtime{
+ DistanceMethod: TEuclidianDistance,
+ ClassIndex: 5,
+ K: 5,
+ }
+
+ classes := dataset.GetRows().GroupByValue(knnIn.ClassIndex)
+
+ _, minoritySet := classes.GetMinority()
+
+ kneighbors := knnIn.FindNeighbors(&minoritySet, minoritySet[0])
+
+ var got string
+ rows := kneighbors.Rows()
+ for _, row := range *rows {
+ got += fmt.Sprint(*row)
+ }
+
+ test.Assert(t, "", exp[1], got, true)
+
+ distances := kneighbors.Distances()
+ got = fmt.Sprint(*distances)
+ test.Assert(t, "", expDistances, got, true)
+}
diff --git a/lib/mining/knn/neighbor.go b/lib/mining/knn/neighbor.go
new file mode 100644
index 00000000..e472d09f
--- /dev/null
+++ b/lib/mining/knn/neighbor.go
@@ -0,0 +1,150 @@
+// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package knn
+
+import (
+ "github.com/shuLhan/share/lib/tabula"
+)
+
//
// Neighbors is a mapping between sample and their distance.
// This type implement the sort interface; sorting orders both backing
// slices by ascending distance.
//
type Neighbors struct {
	// rows contain pointer to rows.
	rows []*tabula.Row
	// distances contain the distance of each row, kept in the same
	// order as `rows`.
	distances []float64
}
+
+//
+// Rows return all rows.
+//
+func (neighbors *Neighbors) Rows() *[]*tabula.Row {
+ return &neighbors.rows
+}
+
+//
+// Row return pointer to row at index `idx`.
+//
+func (neighbors *Neighbors) Row(idx int) *tabula.Row {
+ return neighbors.rows[idx]
+}
+
+//
+// Distances return slice of distance of each neighbours.
+//
+func (neighbors *Neighbors) Distances() *[]float64 {
+ return &neighbors.distances
+}
+
+//
+// Distance return distance value at index `idx`.
+//
+func (neighbors *Neighbors) Distance(idx int) float64 {
+ return neighbors.distances[idx]
+}
+
+//
+// Len return the number of neighbors.
+// This is for sort interface.
+//
+func (neighbors *Neighbors) Len() int {
+ return len(neighbors.distances)
+}
+
+//
+// Less return true if i < j.
+// This is for sort interface.
+//
+func (neighbors *Neighbors) Less(i, j int) bool {
+ if neighbors.distances[i] < neighbors.distances[j] {
+ return true
+ }
+ return false
+}
+
+//
+// Swap content of object in index i with index j.
+// This is for sort interface.
+//
+func (neighbors *Neighbors) Swap(i, j int) {
+ row := neighbors.rows[i]
+ distance := neighbors.distances[i]
+
+ neighbors.rows[i] = neighbors.rows[j]
+ neighbors.distances[i] = neighbors.distances[j]
+
+ neighbors.rows[j] = row
+ neighbors.distances[j] = distance
+}
+
+//
+// Add new neighbor.
+//
+func (neighbors *Neighbors) Add(row *tabula.Row, distance float64) {
+ neighbors.rows = append(neighbors.rows, row)
+ neighbors.distances = append(neighbors.distances, distance)
+}
+
+//
+// SelectRange select all neighbors from index `start` to `end`.
+// Return an empty set if start or end is out of range.
+//
+func (neighbors *Neighbors) SelectRange(start, end int) (newn Neighbors) {
+ if start < 0 {
+ return
+ }
+
+ if end > neighbors.Len() {
+ return
+ }
+
+ for x := start; x < end; x++ {
+ row := neighbors.rows[x]
+ newn.Add(row, neighbors.distances[x])
+ }
+ return
+}
+
+//
+// SelectWhere return all neighbors where row value at index `idx` is equal
+// to string `val`.
+//
+func (neighbors *Neighbors) SelectWhere(idx int, val string) (newn Neighbors) {
+ for x, row := range neighbors.rows {
+ colval := (*row)[idx].String()
+
+ if colval == val {
+ newn.Add(row, neighbors.Distance(x))
+ }
+ }
+ return
+}
+
+//
+// Contain return true if `row` is in neighbors and their index, otherwise
+// return false and -1.
+//
+func (neighbors *Neighbors) Contain(row *tabula.Row) (bool, int) {
+ for x, xrow := range neighbors.rows {
+ if xrow.IsEqual(row) {
+ return true, x
+ }
+ }
+ return false, -1
+}
+
+//
+// Replace neighbor at index `idx` with new row and distance value.
+//
+func (neighbors *Neighbors) Replace(idx int, row *tabula.Row, distance float64) {
+ if idx > len(neighbors.rows) {
+ return
+ }
+
+ neighbors.rows[idx] = row
+ neighbors.distances[idx] = distance
+}
diff --git a/lib/mining/knn/neighbor_test.go b/lib/mining/knn/neighbor_test.go
new file mode 100644
index 00000000..047d9a32
--- /dev/null
+++ b/lib/mining/knn/neighbor_test.go
@@ -0,0 +1,84 @@
+// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package knn
+
+import (
+ "math/rand"
+ "sort"
+ "testing"
+ "time"
+
+ "github.com/shuLhan/share/lib/tabula"
+ "github.com/shuLhan/share/lib/test"
+)
+
// dataFloat64 is the fixture sample matrix; the last column holds the class
// value.
var dataFloat64 = [][]float64{
	{0.243474, 0.505146, 0.472892, 1.34802, -0.844252, 1},
	{0.202343, 0.485983, 0.527533, 1.47307, -0.809672, 1},
	{0.215496, 0.523418, 0.517190, 1.43548, -0.933981, 1},
	{0.214331, 0.546086, 0.414773, 1.38542, -0.702336, 1},
	{0.301676, 0.554505, 0.594757, 1.21258, -0.873084, 1},
}

// distances assigns a descending distance to each sample above, so sorting
// by distance reverses the insertion order.
var distances = []int{4, 3, 2, 1, 0}
+
+func createNeigbours() (neighbors Neighbors) {
+ for x, d := range dataFloat64 {
+ row := tabula.Row{}
+
+ for _, v := range d {
+ rec := tabula.NewRecordReal(v)
+ row.PushBack(rec)
+ }
+
+ neighbors.Add(&row, float64(distances[x]))
+ }
+ return
+}
+
+func createNeigboursByIdx(indices []int) (neighbors Neighbors) {
+ for x, idx := range indices {
+ row := tabula.Row{}
+
+ for _, v := range dataFloat64[idx] {
+ rec := tabula.NewRecordReal(v)
+ row.PushBack(rec)
+ }
+
+ neighbors.Add(&row, float64(distances[x]))
+ }
+ return
+}
+
+func TestContain(t *testing.T) {
+ rand.Seed(time.Now().UnixNano())
+
+ neighbors := createNeigbours()
+
+ // pick random sample from neighbors
+ pickIdx := rand.Intn(neighbors.Len())
+ randSample := neighbors.Row(pickIdx).Clone()
+
+ isin, idx := neighbors.Contain(randSample)
+
+ test.Assert(t, "", true, isin, true)
+ test.Assert(t, "", pickIdx, idx, true)
+
+ // change one of record value to check for false.
+ (*randSample)[0].SetFloat(0)
+
+ isin, _ = neighbors.Contain(randSample)
+
+ test.Assert(t, "", false, isin, true)
+}
+
+func TestSort(t *testing.T) {
+ neighbors := createNeigbours()
+ exp := createNeigboursByIdx(distances)
+
+ sort.Sort(&neighbors)
+
+ test.Assert(t, "", exp.Rows(), neighbors.Rows(), true)
+}
diff --git a/lib/mining/math/math.go b/lib/mining/math/math.go
new file mode 100644
index 00000000..944890de
--- /dev/null
+++ b/lib/mining/math/math.go
@@ -0,0 +1,68 @@
+// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//
+// Package math provide generic functions working with math.
+//
+package math
+
+import (
+ "math"
+)
+
//
// Factorial compute the factorial of n.
// For negative input it returns the negated factorial of the absolute
// value, mirroring the original behaviour (e.g. Factorial(-3) == -6).
//
func Factorial(n int) int {
	sign := 1
	if n < 0 {
		sign = -1
		n = -n
	}

	result := 1
	for x := 2; x <= n; x++ {
		result *= x
	}

	return sign * result
}
+
+//
+// BinomialCoefficient or combination, compute number of picking k from
+// n possibilities.
+//
+// Result is n! / ((n - k)! * k!)
+//
+func BinomialCoefficient(n int, k int) int {
+ if k > n {
+ return 0
+ }
+ return Factorial(n) / (Factorial(n-k) * Factorial(k))
+}
+
+//
+// StirlingS2 The number of ways of partitioning a set of n elements into
+// k nonempty sets (i.e., k set blocks), also called a Stirling set number.
+//
+// For example, the set {1,2,3} can be partitioned into three subsets in one way:
+// {{1},{2},{3}}; into two subsets in three ways: {{1,2},{3}}, {{1,3},{2}},
+// and {{1},{2,3}}; and into one subset in one way: {{1,2,3}}.
+//
+// Ref: http://mathworld.wolfram.com/StirlingNumberoftheSecondKind.html
+//
+func StirlingS2(n int, k int) int {
+ if k == 1 || n == k {
+ return 1
+ }
+ var sum int
+
+ for i := 0; i <= k; i++ {
+ sum += int(math.Pow(-1, float64(i))) *
+ BinomialCoefficient(k, i) *
+ int(math.Pow(float64(k-i), float64(n)))
+ }
+
+ return sum / Factorial(k)
+}
diff --git a/lib/mining/math/math_test.go b/lib/mining/math/math_test.go
new file mode 100644
index 00000000..ae7bf563
--- /dev/null
+++ b/lib/mining/math/math_test.go
@@ -0,0 +1,48 @@
+// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package math
+
+import (
+ "testing"
+)
+
+func TestFactorial(t *testing.T) {
+ in := []int{-3, -2, -1, 0, 1, 2, 3}
+ exp := []int{-6, -2, -1, 1, 1, 2, 6}
+
+ for i := range in {
+ res := Factorial(in[i])
+
+ if res != exp[i] {
+ t.Fatal("Expecting ", exp[i], ", got ", res)
+ }
+ }
+}
+
+func TestBinomialCoefficient(t *testing.T) {
+ in := [][]int{{1, 2}, {1, 1}, {3, 2}, {5, 3}}
+ exp := []int{0, 1, 3, 10}
+
+ for i := range in {
+ res := BinomialCoefficient(in[i][0], in[i][1])
+
+ if res != exp[i] {
+ t.Fatal("Expecting ", exp[i], ", got ", res)
+ }
+ }
+}
+
+func TestStirlingS2(t *testing.T) {
+ in := [][]int{{3, 1}, {3, 2}, {3, 3}, {5, 3}}
+ exp := []int{1, 3, 1, 25}
+
+ for i := range in {
+ res := StirlingS2(in[i][0], in[i][1])
+
+ if res != exp[i] {
+ t.Fatal("Expecting ", exp[i], ", got ", res)
+ }
+ }
+}
diff --git a/lib/mining/resampling/interface.go b/lib/mining/resampling/interface.go
new file mode 100644
index 00000000..58c333ce
--- /dev/null
+++ b/lib/mining/resampling/interface.go
@@ -0,0 +1,52 @@
+// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//
+// Package resampling provide common interface, constants, and methods for
+// resampling modules.
+//
+package resampling
+
+import (
+ "github.com/shuLhan/share/lib/dsv"
+ "github.com/shuLhan/share/lib/tabula"
+)
+
const (
	// DefaultK default number of nearest neighbors used by resampling.
	DefaultK = 5
	// DefaultPercentOver default percentage of oversampling.
	DefaultPercentOver = 100
)

//
// Interface define common methods used by resampling module.
//
type Interface interface {
	// GetSynthetics return the generated synthetic samples.
	GetSynthetics() tabula.DatasetInterface
}
+
+//
+// WriteSynthetics will write synthetic samples in resampling module `ri` into
+// `file`.
+//
+func WriteSynthetics(ri Interface, file string) (e error) {
+ writer, e := dsv.NewWriter("")
+ if nil != e {
+ return
+ }
+
+ e = writer.OpenOutput(file)
+ if e != nil {
+ return
+ }
+
+ sep := dsv.DefSeparator
+ _, e = writer.WriteRawDataset(ri.GetSynthetics(), &sep)
+ if e != nil {
+ return
+ }
+
+ return writer.Close()
+}
diff --git a/lib/mining/resampling/lnsmote/lnsmote.go b/lib/mining/resampling/lnsmote/lnsmote.go
new file mode 100644
index 00000000..ce6f26e9
--- /dev/null
+++ b/lib/mining/resampling/lnsmote/lnsmote.go
@@ -0,0 +1,287 @@
+// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//
+// Package lnsmote implement the Local-Neighborhood algorithm from the paper,
+//
+// Maciejewski, Tomasz, and Jerzy Stefanowski. "Local neighbourhood
+// extension of SMOTE for mining imbalanced data." Computational
+// Intelligence and Data Mining (CIDM), 2011 IEEE Symposium on. IEEE,
+// 2011.
+//
+package lnsmote
+
+import (
+ "fmt"
+ "math/rand"
+
+ "github.com/shuLhan/share/lib/debug"
+ "github.com/shuLhan/share/lib/dsv"
+ "github.com/shuLhan/share/lib/mining/knn"
+ "github.com/shuLhan/share/lib/mining/resampling/smote"
+ "github.com/shuLhan/share/lib/tabula"
+)
+
//
// Runtime parameters for input and output.
//
type Runtime struct {
	// Runtime of SMOTE, since this module extend the SMOTE method.
	smote.Runtime

	// ClassMinor the minority sample in dataset that we want to
	// oversampling.
	ClassMinor string `json:"ClassMinor"`

	// minorset contain minor class in samples.
	minorset tabula.DatasetInterface

	// datasetRows contain all rows in dataset.
	datasetRows *tabula.Rows

	// outliers contain all sample that is detected as outliers.
	outliers tabula.Rows

	// OutliersFile if its not empty then outliers will be saved in file
	// specified by this option.
	OutliersFile string `json:"OutliersFile"`
}
+
+//
+// New create and return new LnSmote object.
+//
+func New(percentOver, k, classIndex int, classMinor, outliers string) (
+ lnsmoteRun *Runtime,
+) {
+ lnsmoteRun = &Runtime{
+ Runtime: smote.Runtime{
+ Runtime: knn.Runtime{
+ DistanceMethod: knn.TEuclidianDistance,
+ ClassIndex: classIndex,
+ K: k,
+ },
+ PercentOver: percentOver,
+ },
+ ClassMinor: classMinor,
+ OutliersFile: outliers,
+ }
+
+ return
+}
+
//
// Init will initialize LNSmote runtime by checking input values and set it to
// default if not set or invalid.
//
func (in *Runtime) Init(dataset tabula.ClasetInterface) {
	in.Runtime.Init()

	// NOTE(review): integer division -- a PercentOver below 100 yields
	// zero synthetics per sample here, and e.g. 150 truncates to 1.
	// Confirm whether fractional percentages are meant to be supported.
	in.NSynthetic = in.PercentOver / 100.0
	in.datasetRows = dataset.GetDataAsRows()

	// Collect the minority-class rows that will be oversampled.
	in.minorset = tabula.SelectRowsWhere(dataset, in.ClassIndex,
		in.ClassMinor)

	in.outliers = make(tabula.Rows, 0)

	if debug.Value >= 1 {
		fmt.Println("[lnsmote] n:", in.NSynthetic)
		fmt.Println("[lnsmote] n minority:", in.minorset.Len())
	}
}
+
+//
+// Resampling will run resampling process on dataset and return the synthetic
+// samples.
+//
+func (in *Runtime) Resampling(dataset tabula.ClasetInterface) (
+ e error,
+) {
+ in.Init(dataset)
+
+ minorRows := in.minorset.GetDataAsRows()
+
+ for x := range *minorRows {
+ p := (*minorRows)[x]
+
+ neighbors := in.FindNeighbors(in.datasetRows, p)
+
+ if debug.Value >= 3 {
+ fmt.Println("[lnsmote] neighbors:", neighbors.Rows())
+ }
+
+ for y := 0; y < in.NSynthetic; y++ {
+ syn := in.createSynthetic(p, neighbors)
+
+ if syn != nil {
+ in.Synthetics.PushRow(syn)
+ }
+ }
+
+ if debug.Value >= 1 {
+ fmt.Printf("[lnsmote] %-4d n synthetics: %v\n", x,
+ in.Synthetics.Len())
+ }
+ }
+
+ if in.SyntheticFile != "" {
+ e = in.Write(in.SyntheticFile)
+ }
+ if in.OutliersFile != "" && in.outliers.Len() > 0 {
+ e = in.writeOutliers()
+ }
+
+ return
+}
+
//
// createSynthetic will create synthetics row from original row `p` and their
// `neighbors`.
//
// It returns nil when no synthetic can be created, i.e. when the safe
// levels of both `p` and the randomly chosen neighbor are zero; in that
// case `p` is also recorded as an outlier when its own safe level is zero.
//
func (in *Runtime) createSynthetic(p *tabula.Row, neighbors knn.Neighbors) (
	synthetic *tabula.Row,
) {
	// choose one of the K nearest neighbors
	randIdx := rand.Intn(neighbors.Len())
	n := neighbors.Row(randIdx)

	// Check if synthetic sample can be created from p and n.
	canit, slp, sln := in.canCreate(p, n)
	if !canit {
		if debug.Value >= 2 {
			fmt.Println("[lnsmote] can not create synthetic")
		}

		if slp.Len() <= 0 {
			// p has no minority neighbors at all: treat it as an
			// outlier.
			in.outliers.PushBack(p)
		}

		// we can not create from p and synthetic.
		return nil
	}

	synthetic = p.Clone()

	for x, srec := range *synthetic {
		// Skip class attribute.
		if x == in.ClassIndex {
			continue
		}

		// Interpolate each attribute between p and n, weighted by
		// the safe-level gap `delta`.
		delta := in.randomGap(p, n, slp.Len(), sln.Len())
		pv := (*p)[x].Float()
		diff := (*n)[x].Float() - pv
		srec.SetFloat(pv + delta*diff)
	}

	return
}
+
+//
+// canCreate return true if synthetic can be created between two sample `p` and
+// `n`. Otherwise it will return false.
+//
+func (in *Runtime) canCreate(p, n *tabula.Row) (bool, knn.Neighbors,
+ knn.Neighbors,
+) {
+ slp := in.safeLevel(p)
+ sln := in.safeLevel2(p, n)
+
+ if debug.Value >= 2 {
+ fmt.Println("[lnsmote] slp : ", slp.Len())
+ fmt.Println("[lnsmote] sln : ", sln.Len())
+ }
+
+ return slp.Len() != 0 || sln.Len() != 0, slp, sln
+}
+
+//
+// safeLevel return the minority neighbors in sample `p`.
+//
+func (in *Runtime) safeLevel(p *tabula.Row) knn.Neighbors {
+ neighbors := in.FindNeighbors(in.datasetRows, p)
+ minorNeighbors := neighbors.SelectWhere(in.ClassIndex, in.ClassMinor)
+
+ return minorNeighbors
+}
+
//
// safeLevel2 return the minority neighbors between sample `p` and `n`.
//
func (in *Runtime) safeLevel2(p, n *tabula.Row) knn.Neighbors {
	neighbors := in.FindNeighbors(in.datasetRows, n)

	// check if n is in minority class.
	nIsMinor := (*n)[in.ClassIndex].IsEqualToString(in.ClassMinor)

	// check if p is in neighbors.
	pInNeighbors, pidx := neighbors.Contain(p)

	// if p in neighbors, replace it with neighbours in K+1
	if nIsMinor && pInNeighbors {
		if debug.Value >= 1 {
			fmt.Println("[lnsmote] Replacing ", pidx)
		}
		if debug.Value >= 2 {
			fmt.Println("[lnsmote] Replacing ", pidx, " in ", neighbors)
		}

		// NOTE(review): this indexes AllNeighbors at K+1 without
		// checking AllNeighbors.Len(); if fewer than K+2 neighbors
		// were found this panics -- confirm the dataset always
		// provides enough samples.
		row := in.AllNeighbors.Row(in.K + 1)
		dist := in.AllNeighbors.Distance(in.K + 1)
		neighbors.Replace(pidx, row, dist)

		if debug.Value >= 2 {
			fmt.Println("[lnsmote] Replacement ", neighbors)
		}
	}

	minorNeighbors := neighbors.SelectWhere(in.ClassIndex, in.ClassMinor)

	return minorNeighbors
}
+
//
// randomGap return the neighbors gap between sample `p` and `n` using safe
// level (number of minority neighbors) of p in `lenslp` and `n` in `lensln`.
//
// The parameters `p` and `n` are currently unused; presumably kept for
// symmetry with the paper's definition -- confirm before removing.
//
// NOTE(review): when both lenslp and lensln are zero, slratio is NaN and the
// returned delta is NaN too; callers must guarantee at least one non-zero
// safe level (canCreate does this).
//
func (in *Runtime) randomGap(p, n *tabula.Row, lenslp, lensln int) (
	delta float64,
) {
	// n has no minority neighbors but p does: keep the synthetic at p
	// (delta stays zero).
	if lensln == 0 && lenslp > 0 {
		return
	}

	slratio := float64(lenslp) / float64(lensln)
	if slratio == 1 {
		// Equal safe levels: uniform gap.
		delta = rand.Float64()
	} else if slratio > 1 {
		// p is safer than n: bias the synthetic toward p.
		delta = rand.Float64() * (1 / slratio)
	} else {
		// n is safer than p: bias the synthetic toward n.
		delta = 1 - rand.Float64()*slratio
	}

	return delta
}
+
+// writeOutliers will save the `outliers` to file specified by
+// `OutliersFile`.
+func (in *Runtime) writeOutliers() (e error) {
+ writer, e := dsv.NewWriter("")
+ if nil != e {
+ return
+ }
+
+ e = writer.OpenOutput(in.OutliersFile)
+ if e != nil {
+ return
+ }
+
+ sep := dsv.DefSeparator
+ _, e = writer.WriteRawRows(&in.outliers, &sep)
+ if e != nil {
+ return
+ }
+
+ return writer.Close()
+}
diff --git a/lib/mining/resampling/lnsmote/lnsmote_test.go b/lib/mining/resampling/lnsmote/lnsmote_test.go
new file mode 100644
index 00000000..1c8e45e4
--- /dev/null
+++ b/lib/mining/resampling/lnsmote/lnsmote_test.go
@@ -0,0 +1,74 @@
+// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package lnsmote
+
+import (
+ "fmt"
+ "os"
+ "testing"
+
+ "github.com/shuLhan/share/lib/dsv"
+ "github.com/shuLhan/share/lib/tabula"
+)
+
const (
	// fcfg point to the DSV configuration of the phoneme test dataset.
	fcfg = "../../testdata/phoneme/phoneme.dsv"
)
+
+func TestLNSmote(t *testing.T) {
+ // Read sample dataset.
+ dataset := tabula.Claset{}
+ _, e := dsv.SimpleRead(fcfg, &dataset)
+ if nil != e {
+ t.Fatal(e)
+ }
+
+ fmt.Println("[lnsmote_test] Total samples:", dataset.GetNRow())
+
+ // Write original samples.
+ writer, e := dsv.NewWriter("")
+
+ if nil != e {
+ t.Fatal(e)
+ }
+
+ e = writer.OpenOutput("phoneme_lnsmote.csv")
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ sep := dsv.DefSeparator
+ _, e = writer.WriteRawRows(dataset.GetRows(), &sep)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ // Initialize LN-SMOTE.
+ lnsmoteRun := New(100, 5, 5, "1", "lnsmote.outliers")
+
+ e = lnsmoteRun.Resampling(&dataset)
+
+ fmt.Println("[lnsmote_test] # synthetic:", lnsmoteRun.Synthetics.Len())
+
+ sep = dsv.DefSeparator
+ _, e = writer.WriteRawRows(lnsmoteRun.Synthetics.GetRows(), &sep)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ e = writer.Close()
+ if e != nil {
+ t.Fatal(e)
+ }
+}
+
func TestMain(m *testing.M) {
	// Run this package's tests only when TEST_LNSMOTE is set, because
	// they read the full phoneme dataset and write output files.
	if os.Getenv("TEST_LNSMOTE") == "" {
		os.Exit(0)
	}

	os.Exit(m.Run())
}
diff --git a/lib/mining/resampling/lnsmote/phoneme_lnsmote.dsv b/lib/mining/resampling/lnsmote/phoneme_lnsmote.dsv
new file mode 100644
index 00000000..cd1fef8b
--- /dev/null
+++ b/lib/mining/resampling/lnsmote/phoneme_lnsmote.dsv
@@ -0,0 +1,38 @@
+{
+ "Input" :"phoneme_lnsmote.csv"
+, "Rejected" :"phoneme.rej"
+, "MaxRows" :-1
+, "ClassMetadataIndex" :5
+, "ClassIndex" :5
+, "DatasetMode" :"matrix"
+, "InputMetadata" :
+ [{
+ "Name" :"f1"
+ , "Type" :"real"
+ , "Separator" :","
+ },{
+ "Name" :"f2"
+ , "Type" :"real"
+ , "Separator" :","
+ },{
+ "Name" :"f3"
+ , "Type" :"real"
+ , "Separator" :","
+ },{
+ "Name" :"f4"
+ , "Type" :"real"
+ , "Separator" :","
+ },{
+ "Name" :"f5"
+ , "Type" :"real"
+ , "Separator" :","
+ },{
+ "Name" :"class"
+ , "Type" :"string"
+ , "ValueSpace" :
+ [
+ "0"
+ , "1"
+ ]
+ }]
+}
diff --git a/lib/mining/resampling/smote/phoneme_smote.dsv b/lib/mining/resampling/smote/phoneme_smote.dsv
new file mode 100644
index 00000000..63180379
--- /dev/null
+++ b/lib/mining/resampling/smote/phoneme_smote.dsv
@@ -0,0 +1,38 @@
+{
+ "Input" :"phoneme_smote.csv"
+, "Rejected" :"phoneme.rej"
+, "MaxRows" :-1
+, "ClassMetadataIndex" :5
+, "ClassIndex" :5
+, "DatasetMode" :"matrix"
+, "InputMetadata" :
+ [{
+ "Name" :"f1"
+ , "Type" :"real"
+ , "Separator" :","
+ },{
+ "Name" :"f2"
+ , "Type" :"real"
+ , "Separator" :","
+ },{
+ "Name" :"f3"
+ , "Type" :"real"
+ , "Separator" :","
+ },{
+ "Name" :"f4"
+ , "Type" :"real"
+ , "Separator" :","
+ },{
+ "Name" :"f5"
+ , "Type" :"real"
+ , "Separator" :","
+ },{
+ "Name" :"class"
+ , "Type" :"string"
+ , "ValueSpace" :
+ [
+ "0"
+ , "1"
+ ]
+ }]
+}
diff --git a/lib/mining/resampling/smote/smote.go b/lib/mining/resampling/smote/smote.go
new file mode 100644
index 00000000..32f21b7e
--- /dev/null
+++ b/lib/mining/resampling/smote/smote.go
@@ -0,0 +1,183 @@
+// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//
+// Package smote resamples a dataset by applying the Synthetic Minority
+// Oversampling TEchnique (SMOTE). The original dataset must fit entirely in
+// memory. The amount of SMOTE and number of nearest neighbors may be specified.
+// For more information, see
+//
+// Nitesh V. Chawla et. al. (2002). Synthetic Minority Over-sampling
+// Technique. Journal of Artificial Intelligence Research. 16:321-357.
+//
+package smote
+
+import (
+ "fmt"
+ "math/rand"
+ "time"
+
+ "github.com/shuLhan/share/lib/mining/knn"
+ "github.com/shuLhan/share/lib/mining/resampling"
+ "github.com/shuLhan/share/lib/tabula"
+)
+
//
// Runtime for input and output.
//
type Runtime struct {
	// Runtime the K-Nearest-Neighbourhood parameters.
	knn.Runtime
	// PercentOver input for oversampling percentage.
	PercentOver int `json:"PercentOver"`
	// SyntheticFile is a filename where synthetic samples will be written.
	SyntheticFile string `json:"SyntheticFile"`
	// NSynthetic input for number of new synthetic per sample; it is
	// derived from PercentOver inside Resampling.
	NSynthetic int
	// Synthetics contain output of resampling as synthetic samples.
	Synthetics tabula.Dataset
}
+
+//
+// New create and return new smote runtime.
+//
+func New(percentOver, k, classIndex int) (smoteRun *Runtime) {
+ smoteRun = &Runtime{
+ Runtime: knn.Runtime{
+ DistanceMethod: knn.TEuclidianDistance,
+ ClassIndex: classIndex,
+ K: k,
+ },
+ PercentOver: percentOver,
+ }
+ return
+}
+
+//
+// Init will recheck input and set to default value if its not valid.
+//
+func (smote *Runtime) Init() {
+ rand.Seed(time.Now().UnixNano())
+
+ if smote.K <= 0 {
+ smote.K = resampling.DefaultK
+ }
+ if smote.PercentOver <= 0 {
+ smote.PercentOver = resampling.DefaultPercentOver
+ }
+}
+
+//
+// GetSynthetics return synthetic samples.
+//
+func (smote *Runtime) GetSynthetics() tabula.DatasetInterface {
+ return &smote.Synthetics
+}
+
//
// populate will generate new synthetic sample using nearest neighbors.
//
// For each of the NSynthetic iterations it picks one random neighbor and
// interpolates every non-class attribute between `instance` and that
// neighbor using a random gap in [0,1). The resulting rows are appended to
// smote.Synthetics.
//
func (smote *Runtime) populate(instance *tabula.Row, neighbors knn.Neighbors) {
	lenAttr := len(*instance)

	for x := 0; x < smote.NSynthetic; x++ {
		// choose one of the K nearest neighbors
		n := rand.Intn(neighbors.Len())
		sample := neighbors.Row(n)

		newSynt := make(tabula.Row, lenAttr)

		// Compute new synthetic attributes.
		for attr, sr := range *sample {
			if attr == smote.ClassIndex {
				continue
			}

			ir := (*instance)[attr]

			iv := ir.Float()
			sv := sr.Float()

			// synthetic = instance + gap * (sample - instance)
			dif := sv - iv
			gap := rand.Float64()
			newAttr := iv + (gap * dif)

			record := &tabula.Record{}
			record.SetFloat(newAttr)
			newSynt[attr] = record
		}

		// NOTE(review): the class record is shared (not cloned) with
		// the original instance.
		newSynt[smote.ClassIndex] = (*instance)[smote.ClassIndex]

		smote.Synthetics.PushRow(&newSynt)
	}
}
+
+//
+// Resampling will run resampling algorithm using values that has been defined
+// in `Runtime` and return list of synthetic samples.
+//
+// The `dataset` must be samples of minority class not the whole dataset.
+//
+// Algorithms,
+//
+// (0) If oversampling percentage less than 100, then
+// (0.1) replace the input dataset by selecting n random sample from dataset
+// without replacement, where n is
+//
+// (percentage-oversampling / 100) * number-of-sample
+//
+// (1) For each `sample` in dataset,
+// (1.1) find k-nearest-neighbors of `sample`,
+// (1.2) generate synthetic sample in neighbors.
+// (2) Write synthetic samples to file, only if `SyntheticFile` is not empty.
+//
+func (smote *Runtime) Resampling(dataset tabula.Rows) (e error) {
+ smote.Init()
+
+ if smote.PercentOver < 100 {
+ // (0.1)
+ smote.NSynthetic = (smote.PercentOver / 100.0) * len(dataset)
+ dataset, _, _, _ = dataset.RandomPick(smote.NSynthetic, false)
+ } else {
+ smote.NSynthetic = smote.PercentOver / 100.0
+ }
+
+ // (1)
+ for x := range dataset {
+ sample := dataset[x]
+
+ // (1.1)
+ neighbors := smote.FindNeighbors(&dataset, sample)
+
+ // (1.2)
+ smote.populate(sample, neighbors)
+ }
+
+ // (2)
+ if smote.SyntheticFile != "" {
+ e = resampling.WriteSynthetics(smote, smote.SyntheticFile)
+ }
+
+ return
+}
+
+//
+// Write will write synthetic samples to file defined in `file`.
+//
+func (smote *Runtime) Write(file string) error {
+ return resampling.WriteSynthetics(smote, file)
+}
+
// String return the smote runtime parameters formatted as a JSON-like
// string, for debugging.
func (smote *Runtime) String() (s string) {
	s = fmt.Sprintf("'smote' : {\n"+
		"	'ClassIndex'	:%d\n"+
		"	, 'K'	:%d\n"+
		"	, 'PercentOver'	:%d\n"+
		"	, 'DistanceMethod'	:%d\n"+
		"}", smote.ClassIndex, smote.K, smote.PercentOver,
		smote.DistanceMethod)

	return
}
diff --git a/lib/mining/resampling/smote/smote_test.go b/lib/mining/resampling/smote/smote_test.go
new file mode 100644
index 00000000..c3a633cb
--- /dev/null
+++ b/lib/mining/resampling/smote/smote_test.go
@@ -0,0 +1,49 @@
+// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package smote
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/shuLhan/share/lib/dsv"
+ "github.com/shuLhan/share/lib/tabula"
+)
+
var (
	// fcfg point to the DSV configuration of the phoneme test dataset.
	fcfg = "../../testdata/phoneme/phoneme.dsv"
	// PercentOver is the oversampling percentage used by the test.
	PercentOver = 100
	// K is the number of nearest neighbours used by the test.
	K = 5
)
+
+func TestSmote(t *testing.T) {
+ smot := New(PercentOver, K, 5)
+
+ // Read samples.
+ dataset := tabula.Claset{}
+
+ _, e := dsv.SimpleRead(fcfg, &dataset)
+ if nil != e {
+ t.Fatal(e)
+ }
+
+ fmt.Println("[smote_test] Total samples:", dataset.Len())
+
+ minorset := dataset.GetMinorityRows()
+
+ fmt.Println("[smote_test] # minority samples:", minorset.Len())
+
+ e = smot.Resampling(*minorset)
+ if e != nil {
+ t.Fatal(e)
+ }
+
+ fmt.Println("[smote_test] # synthetic:", smot.GetSynthetics().Len())
+
+ e = smot.Write("phoneme_smote.csv")
+ if e != nil {
+ t.Fatal(e)
+ }
+}
diff --git a/lib/mining/testdata/forensic_glass/fgl.data b/lib/mining/testdata/forensic_glass/fgl.data
new file mode 100644
index 00000000..8dc6fc0c
--- /dev/null
+++ b/lib/mining/testdata/forensic_glass/fgl.data
@@ -0,0 +1,214 @@
+1 3.01 13.64 4.49 1.10 71.78 0.06 8.75 0.00 0.00 WinF
+2 -0.39 13.89 3.60 1.36 72.73 0.48 7.83 0.00 0.00 WinF
+3 -1.82 13.53 3.55 1.54 72.99 0.39 7.78 0.00 0.00 WinF
+4 -0.34 13.21 3.69 1.29 72.61 0.57 8.22 0.00 0.00 WinF
+5 -0.58 13.27 3.62 1.24 73.08 0.55 8.07 0.00 0.00 WinF
+6 -2.04 12.79 3.61 1.62 72.97 0.64 8.07 0.00 0.26 WinF
+7 -0.57 13.30 3.60 1.14 73.09 0.58 8.17 0.00 0.00 WinF
+8 -0.44 13.15 3.61 1.05 73.24 0.57 8.24 0.00 0.00 WinF
+9 1.18 14.04 3.58 1.37 72.08 0.56 8.30 0.00 0.00 WinF
+10 -0.45 13.00 3.60 1.36 72.99 0.57 8.40 0.00 0.11 WinF
+11 -2.29 12.72 3.46 1.56 73.20 0.67 8.09 0.00 0.24 WinF
+12 -0.37 12.80 3.66 1.27 73.01 0.60 8.56 0.00 0.00 WinF
+13 -2.11 12.88 3.43 1.40 73.28 0.69 8.05 0.00 0.24 WinF
+14 -0.52 12.86 3.56 1.27 73.21 0.54 8.38 0.00 0.17 WinF
+15 -0.37 12.61 3.59 1.31 73.29 0.58 8.50 0.00 0.00 WinF
+16 -0.39 12.81 3.54 1.23 73.24 0.58 8.39 0.00 0.00 WinF
+17 -0.16 12.68 3.67 1.16 73.11 0.61 8.70 0.00 0.00 WinF
+18 3.96 14.36 3.85 0.89 71.36 0.15 9.15 0.00 0.00 WinF
+19 1.11 13.90 3.73 1.18 72.12 0.06 8.89 0.00 0.00 WinF
+20 -0.65 13.02 3.54 1.69 72.73 0.54 8.44 0.00 0.07 WinF
+21 -0.50 12.82 3.55 1.49 72.75 0.54 8.52 0.00 0.19 WinF
+22 1.66 14.77 3.75 0.29 72.02 0.03 9.00 0.00 0.00 WinF
+23 -0.64 12.78 3.62 1.29 72.79 0.59 8.70 0.00 0.00 WinF
+24 -0.49 12.81 3.57 1.35 73.02 0.62 8.59 0.00 0.00 WinF
+25 -0.80 13.38 3.50 1.15 72.85 0.50 8.43 0.00 0.00 WinF
+26 -0.36 12.98 3.54 1.21 73.00 0.65 8.53 0.00 0.00 WinF
+27 -0.07 13.21 3.48 1.41 72.64 0.59 8.43 0.00 0.00 WinF
+28 -0.79 12.87 3.48 1.33 73.04 0.56 8.43 0.00 0.00 WinF
+29 -0.32 12.56 3.52 1.43 73.15 0.57 8.54 0.00 0.00 WinF
+30 -0.16 13.08 3.49 1.28 72.86 0.60 8.49 0.00 0.00 WinF
+31 -0.32 12.65 3.56 1.30 73.08 0.61 8.69 0.00 0.14 WinF
+32 -0.53 12.84 3.50 1.14 73.27 0.56 8.55 0.00 0.00 WinF
+33 -0.25 12.85 3.48 1.23 72.97 0.61 8.56 0.09 0.22 WinF
+34 -0.47 12.57 3.47 1.38 73.39 0.60 8.55 0.00 0.06 WinF
+35 -0.17 12.69 3.54 1.34 72.95 0.57 8.75 0.00 0.00 WinF
+36 -2.33 13.29 3.45 1.21 72.74 0.56 8.57 0.00 0.00 WinF
+37 1.09 13.89 3.53 1.32 71.81 0.51 8.78 0.11 0.00 WinF
+38 -0.03 12.74 3.48 1.35 72.96 0.64 8.68 0.00 0.00 WinF
+39 4.13 14.21 3.82 0.47 71.77 0.11 9.57 0.00 0.00 WinF
+40 4.13 14.21 3.82 0.47 71.77 0.11 9.57 0.00 0.00 WinF
+41 -0.07 12.79 3.50 1.12 73.03 0.64 8.77 0.00 0.00 WinF
+42 -0.45 12.71 3.42 1.20 73.20 0.59 8.64 0.00 0.00 WinF
+43 -0.21 13.21 3.39 1.33 72.76 0.59 8.59 0.00 0.00 WinF
+44 4.10 13.73 3.84 0.72 71.76 0.17 9.74 0.00 0.00 WinF
+45 -0.14 12.73 3.43 1.19 72.95 0.62 8.76 0.00 0.30 WinF
+46 1.00 13.49 3.48 1.35 71.95 0.55 9.00 0.00 0.00 WinF
+47 0.69 13.19 3.37 1.18 72.72 0.57 8.83 0.00 0.16 WinF
+48 8.67 13.99 3.70 0.71 71.57 0.02 9.82 0.00 0.10 WinF
+49 4.23 13.21 3.77 0.79 71.99 0.13 10.02 0.00 0.00 WinF
+50 0.98 13.58 3.35 1.23 72.08 0.59 8.91 0.00 0.00 WinF
+51 5.20 13.72 3.72 0.51 71.75 0.09 10.06 0.00 0.16 WinF
+52 1.26 13.20 3.33 1.28 72.36 0.60 9.14 0.00 0.11 WinF
+53 0.08 13.43 2.87 1.19 72.84 0.55 9.03 0.00 0.00 WinF
+54 0.37 13.14 2.84 1.28 72.85 0.55 9.07 0.00 0.00 WinF
+55 -0.22 13.21 2.81 1.29 72.98 0.51 9.02 0.00 0.09 WinF
+56 -0.31 12.45 2.71 1.29 73.70 0.56 9.06 0.00 0.24 WinF
+57 -5.85 12.99 3.47 1.12 72.98 0.62 8.35 0.00 0.31 WinF
+58 0.24 12.87 3.48 1.29 72.95 0.60 8.43 0.00 0.00 WinF
+59 -0.46 13.48 3.74 1.17 72.99 0.59 8.03 0.00 0.00 WinF
+60 -0.46 13.39 3.66 1.19 72.79 0.57 8.27 0.00 0.11 WinF
+61 1.05 13.60 3.62 1.11 72.64 0.14 8.76 0.00 0.00 WinF
+62 1.77 13.81 3.58 1.32 71.72 0.12 8.67 0.69 0.00 WinF
+63 3.72 13.51 3.86 0.88 71.79 0.23 9.54 0.00 0.11 WinF
+64 4.27 14.17 3.81 0.78 71.35 0.00 9.69 0.00 0.00 WinF
+65 3.72 13.48 3.74 0.90 72.01 0.18 9.61 0.00 0.07 WinF
+66 2.99 13.69 3.59 1.12 71.96 0.09 9.40 0.00 0.00 WinF
+67 3.52 13.05 3.65 0.87 72.22 0.19 9.85 0.00 0.17 WinF
+68 3.52 13.05 3.65 0.87 72.32 0.19 9.85 0.00 0.17 WinF
+69 3.52 13.12 3.58 0.90 72.20 0.23 9.82 0.00 0.16 WinF
+70 5.00 13.31 3.58 0.82 71.99 0.12 10.17 0.00 0.03 WinF
+71 -2.26 14.86 3.67 1.74 71.87 0.16 7.36 0.00 0.12 WinNF
+72 0.48 13.64 3.87 1.27 71.96 0.54 8.32 0.00 0.32 WinNF
+73 -2.07 13.09 3.59 1.52 73.10 0.67 7.83 0.00 0.00 WinNF
+74 -1.69 13.34 3.57 1.57 72.87 0.61 7.89 0.00 0.00 WinNF
+75 -2.04 13.02 3.56 1.54 73.11 0.72 7.90 0.00 0.00 WinNF
+76 -2.10 13.02 3.58 1.51 73.12 0.69 7.96 0.00 0.00 WinNF
+77 -1.55 13.44 3.61 1.54 72.39 0.66 8.03 0.00 0.00 WinNF
+78 -1.73 13.00 3.58 1.54 72.83 0.61 8.04 0.00 0.00 WinNF
+79 -1.87 13.92 3.52 1.25 72.88 0.37 7.94 0.00 0.14 WinNF
+80 -2.10 12.82 3.52 1.90 72.86 0.69 7.97 0.00 0.00 WinNF
+81 -2.08 12.86 3.52 2.12 72.66 0.69 7.97 0.00 0.00 WinNF
+82 -2.07 13.25 3.45 1.43 73.17 0.61 7.86 0.00 0.00 WinNF
+83 -1.54 13.41 3.55 1.25 72.81 0.68 8.10 0.00 0.00 WinNF
+84 -2.06 13.09 3.52 1.55 72.87 0.68 8.05 0.00 0.09 WinNF
+85 -3.91 14.25 3.09 2.08 72.28 1.10 7.08 0.00 0.00 WinNF
+86 -1.75 13.36 3.58 1.49 72.72 0.45 8.21 0.00 0.00 WinNF
+87 -2.31 13.24 3.49 1.47 73.25 0.38 8.03 0.00 0.00 WinNF
+88 -1.55 13.40 3.49 1.52 72.65 0.67 8.08 0.00 0.10 WinNF
+89 -1.82 13.01 3.50 1.48 72.89 0.60 8.12 0.00 0.00 WinNF
+90 -1.60 12.55 3.48 1.87 73.23 0.63 8.08 0.00 0.09 WinNF
+91 0.41 12.93 3.74 1.11 72.28 0.64 8.96 0.00 0.22 WinNF
+92 -1.95 12.90 3.44 1.45 73.06 0.44 8.27 0.00 0.00 WinNF
+93 -2.12 13.12 3.41 1.58 73.26 0.07 8.39 0.00 0.19 WinNF
+94 -2.10 13.24 3.34 1.47 73.10 0.39 8.22 0.00 0.00 WinNF
+95 -1.71 12.71 3.33 1.49 73.28 0.67 8.24 0.00 0.00 WinNF
+96 0.60 13.36 3.43 1.43 72.26 0.51 8.60 0.00 0.00 WinNF
+97 0.41 13.02 3.62 1.06 72.34 0.64 9.13 0.00 0.15 WinNF
+98 -0.57 12.20 3.25 1.16 73.55 0.62 8.90 0.00 0.24 WinNF
+99 -1.11 12.67 2.88 1.71 73.21 0.73 8.54 0.00 0.00 WinNF
+100 0.11 12.96 2.96 1.43 72.92 0.60 8.79 0.14 0.00 WinNF
+101 -1.45 12.75 2.85 1.44 73.27 0.57 8.79 0.11 0.22 WinNF
+102 -0.70 12.35 2.72 1.63 72.87 0.70 9.23 0.00 0.00 WinNF
+103 0.20 12.62 2.76 0.83 73.81 0.35 9.42 0.00 0.20 WinNF
+104 9.25 13.80 3.15 0.66 70.57 0.08 11.64 0.00 0.00 WinNF
+105 6.10 13.83 2.90 1.17 71.15 0.08 10.79 0.00 0.00 WinNF
+106 6.75 11.45 0.00 1.88 72.19 0.81 13.24 0.00 0.34 WinNF
+107 13.25 10.73 0.00 2.10 69.81 0.58 13.30 3.15 0.28 WinNF
+108 15.93 12.30 0.00 1.00 70.16 0.12 16.19 0.00 0.24 WinNF
+109 4.22 14.43 0.00 1.00 72.67 0.10 11.52 0.00 0.08 WinNF
+110 0.18 13.72 0.00 0.56 74.45 0.00 10.99 0.00 0.00 WinNF
+111 8.64 11.23 0.00 0.77 73.21 0.00 14.68 0.00 0.00 WinNF
+112 9.39 11.02 0.00 0.75 73.08 0.00 14.96 0.00 0.00 WinNF
+113 9.77 12.64 0.00 0.67 72.02 0.06 14.40 0.00 0.00 WinNF
+114 0.92 13.46 3.83 1.26 72.55 0.57 8.21 0.00 0.14 WinNF
+115 0.47 13.10 3.97 1.19 72.44 0.60 8.43 0.00 0.00 WinNF
+116 0.46 13.41 3.89 1.33 72.38 0.51 8.28 0.00 0.00 WinNF
+117 0.29 13.24 3.90 1.41 72.33 0.55 8.31 0.00 0.10 WinNF
+118 -0.92 13.72 3.68 1.81 72.06 0.64 7.88 0.00 0.00 WinNF
+119 -1.27 13.30 3.64 1.53 72.53 0.65 8.03 0.00 0.29 WinNF
+120 -1.48 13.56 3.57 1.47 72.45 0.64 7.96 0.00 0.00 WinNF
+121 0.44 13.25 3.76 1.32 72.40 0.58 8.42 0.00 0.00 WinNF
+122 -1.37 12.93 3.54 1.62 72.96 0.64 8.03 0.00 0.21 WinNF
+123 -1.13 13.23 3.54 1.48 72.84 0.56 8.10 0.00 0.00 WinNF
+124 -0.93 13.48 3.48 1.71 72.52 0.62 7.99 0.00 0.00 WinNF
+125 3.77 13.20 3.68 1.15 72.75 0.54 8.52 0.00 0.00 WinNF
+126 0.72 12.93 3.66 1.56 72.51 0.58 8.55 0.00 0.12 WinNF
+127 -1.33 12.94 3.61 1.26 72.75 0.56 8.60 0.00 0.00 WinNF
+128 2.81 13.78 2.28 1.43 71.99 0.49 9.85 0.00 0.17 WinNF
+129 2.68 13.55 2.09 1.67 72.18 0.53 9.57 0.27 0.17 WinNF
+130 2.20 13.98 1.35 1.63 71.76 0.39 10.56 0.00 0.18 WinNF
+131 3.77 13.75 1.01 1.36 72.19 0.33 11.14 0.00 0.00 WinNF
+132 8.14 13.70 0.00 1.36 71.24 0.19 13.44 0.00 0.10 WinNF
+133 0.13 13.43 3.98 1.18 72.49 0.58 8.15 0.00 0.00 WinNF
+134 0.00 13.71 3.93 1.54 71.81 0.54 8.21 0.00 0.15 WinNF
+135 0.11 13.33 3.85 1.25 72.78 0.52 8.12 0.00 0.00 WinNF
+136 -0.11 13.19 3.90 1.30 72.33 0.55 8.44 0.00 0.28 WinNF
+137 0.06 13.00 3.80 1.08 73.07 0.56 8.38 0.00 0.12 WinNF
+138 -0.89 12.89 3.62 1.57 72.96 0.61 8.11 0.00 0.00 WinNF
+139 -1.26 12.79 3.52 1.54 73.36 0.66 7.90 0.00 0.00 WinNF
+140 -1.26 12.87 3.56 1.64 73.14 0.65 7.99 0.00 0.00 WinNF
+141 -1.10 13.33 3.54 1.61 72.54 0.68 8.11 0.00 0.00 WinNF
+142 0.51 13.20 3.63 1.07 72.83 0.57 8.41 0.09 0.17 WinNF
+143 -1.38 12.85 3.51 1.44 73.01 0.68 8.23 0.06 0.25 WinNF
+144 -0.91 13.00 3.47 1.79 72.72 0.66 8.18 0.00 0.00 WinNF
+145 -1.40 12.99 3.18 1.23 72.97 0.58 8.81 0.00 0.24 WinNF
+146 0.39 12.85 3.67 1.24 72.57 0.62 8.68 0.00 0.35 WinNF
+147 -0.31 13.65 3.66 1.11 72.77 0.11 8.60 0.00 0.00 Veh
+148 -1.90 13.33 3.53 1.34 72.67 0.56 8.33 0.00 0.00 Veh
+149 -1.30 13.24 3.57 1.38 72.70 0.56 8.44 0.00 0.10 Veh
+150 -1.57 12.16 3.52 1.35 72.89 0.57 8.53 0.00 0.00 Veh
+151 -1.35 13.14 3.45 1.76 72.48 0.60 8.38 0.00 0.17 Veh
+152 3.27 14.32 3.90 0.83 71.50 0.00 9.49 0.00 0.00 Veh
+153 -0.21 13.64 3.65 0.65 73.00 0.06 8.93 0.00 0.00 Veh
+154 -1.90 13.42 3.40 1.22 72.69 0.59 8.32 0.00 0.00 Veh
+155 -1.06 12.86 3.58 1.31 72.61 0.61 8.79 0.00 0.00 Veh
+156 -1.54 13.04 3.40 1.26 73.01 0.52 8.58 0.00 0.00 Veh
+157 -1.45 13.41 3.39 1.28 72.64 0.52 8.65 0.00 0.00 Veh
+158 3.21 14.03 3.76 0.58 71.79 0.11 9.65 0.00 0.00 Veh
+159 -0.24 13.53 3.41 1.52 72.04 0.58 8.79 0.00 0.00 Veh
+160 -0.04 13.50 3.36 1.63 71.94 0.57 8.81 0.00 0.09 Veh
+161 0.32 13.33 3.34 1.54 72.14 0.56 8.99 0.00 0.00 Veh
+162 1.34 13.64 3.54 0.75 72.65 0.16 8.89 0.15 0.24 Veh
+163 4.11 14.19 3.78 0.91 71.36 0.23 9.14 0.00 0.37 Veh
+164 -2.86 14.01 2.68 3.50 69.89 1.68 5.87 2.20 0.00 Con
+165 1.15 12.73 1.85 1.86 72.69 0.60 10.09 0.00 0.00 Con
+166 3.71 11.56 1.88 1.56 72.86 0.47 11.41 0.00 0.00 Con
+167 3.51 11.03 1.71 1.56 73.44 0.58 11.62 0.00 0.00 Con
+168 1.69 12.64 0.00 1.65 73.75 0.38 11.53 0.00 0.00 Con
+169 -1.34 12.86 0.00 1.83 73.88 0.97 10.17 0.00 0.00 Con
+170 1.94 13.27 0.00 1.76 73.03 0.47 11.32 0.00 0.00 Con
+171 5.69 13.44 0.00 1.58 72.22 0.32 12.24 0.00 0.00 Con
+172 -4.84 13.02 0.00 3.04 70.48 6.21 6.96 0.00 0.00 Con
+173 -4.79 13.00 0.00 3.02 70.70 6.21 6.93 0.00 0.00 Con
+174 2.43 13.38 0.00 1.40 72.25 0.33 12.50 0.00 0.00 Con
+175 2.58 12.85 1.61 2.17 72.18 0.76 9.70 0.24 0.51 Con
+176 3.19 12.97 0.33 1.51 73.39 0.13 11.27 0.00 0.28 Con
+177 1.05 14.00 2.39 1.56 72.37 0.00 9.57 0.00 0.00 Tabl
+178 1.37 13.79 2.41 1.19 72.76 0.00 9.77 0.00 0.00 Tabl
+179 0.29 14.46 2.24 1.62 72.38 0.00 9.26 0.00 0.00 Tabl
+180 0.52 14.09 2.19 1.66 72.67 0.00 9.32 0.00 0.00 Tabl
+181 -5.01 14.40 1.74 1.54 74.55 0.00 7.59 0.00 0.00 Tabl
+182 0.88 14.99 0.78 1.74 72.50 0.00 9.95 0.00 0.00 Tabl
+183 1.16 14.15 0.00 2.09 72.74 0.00 10.88 0.00 0.00 Tabl
+184 1.69 14.56 0.00 0.56 73.48 0.00 11.22 0.00 0.00 Tabl
+185 -6.85 17.38 0.00 0.34 75.41 0.00 6.65 0.00 0.00 Tabl
+186 -6.69 13.69 3.20 1.81 72.81 1.76 5.43 1.19 0.00 Head
+187 0.38 14.32 3.26 2.22 71.25 1.46 5.79 1.63 0.00 Head
+188 5.15 13.44 3.34 1.23 72.38 0.60 8.83 0.00 0.00 Head
+189 4.47 14.86 2.20 2.06 70.26 0.76 9.76 0.00 0.00 Head
+190 5.65 15.79 1.83 1.31 70.43 0.31 8.61 1.68 0.00 Head
+191 -1.87 13.88 1.78 1.79 73.10 0.00 8.67 0.76 0.00 Head
+192 -1.98 14.85 0.00 2.38 73.28 0.00 8.76 0.64 0.09 Head
+193 -1.77 14.20 0.00 2.79 73.46 0.04 9.04 0.40 0.09 Head
+194 -0.81 14.75 0.00 2.00 73.02 0.00 8.53 1.59 0.08 Head
+195 -1.17 14.56 0.00 1.98 73.29 0.00 8.52 1.57 0.07 Head
+196 -2.55 14.14 0.00 2.68 73.39 0.08 9.07 0.61 0.05 Head
+197 -2.44 13.87 0.00 2.54 73.23 0.14 9.41 0.81 0.01 Head
+198 -0.73 14.70 0.00 2.34 73.28 0.00 8.95 0.66 0.00 Head
+199 -2.69 14.38 0.00 2.66 73.10 0.04 9.08 0.64 0.00 Head
+200 -1.91 15.01 0.00 2.51 73.05 0.05 8.83 0.53 0.00 Head
+201 -2.92 15.15 0.00 2.25 73.50 0.00 8.34 0.63 0.00 Head
+202 -1.47 11.95 0.00 1.19 75.18 2.70 8.93 0.00 0.00 Head
+203 -2.86 14.85 0.00 2.42 73.72 0.00 8.39 0.56 0.00 Head
+204 -1.42 14.80 0.00 1.99 73.11 0.00 8.28 1.71 0.00 Head
+205 -1.83 14.95 0.00 2.27 73.30 0.00 8.71 0.67 0.00 Head
+206 -0.68 14.95 0.00 1.80 72.99 0.00 8.61 1.55 0.00 Head
+207 -1.55 14.94 0.00 1.87 73.11 0.00 8.67 1.38 0.00 Head
+208 0.31 14.39 0.00 1.82 72.86 1.41 6.47 2.88 0.00 Head
+209 -1.60 14.37 0.00 2.74 72.85 0.00 9.45 0.54 0.00 Head
+210 -1.77 14.14 0.00 2.88 72.61 0.08 9.18 1.06 0.00 Head
+211 -1.15 14.92 0.00 1.99 73.06 0.00 8.40 1.59 0.00 Head
+212 2.65 14.36 0.00 2.02 73.42 0.00 8.44 1.64 0.00 Head
+213 -1.49 14.38 0.00 1.94 73.61 0.00 8.48 1.57 0.00 Head
+214 -0.89 14.23 0.00 2.08 73.36 0.00 8.62 1.67 0.00 Head
diff --git a/lib/mining/testdata/forensic_glass/fgl.dsv b/lib/mining/testdata/forensic_glass/fgl.dsv
new file mode 100644
index 00000000..fd33e0d5
--- /dev/null
+++ b/lib/mining/testdata/forensic_glass/fgl.dsv
@@ -0,0 +1,97 @@
+{
+ "Input" :"fgl.data"
+, "Rejected" :"glass.rej"
+, "MaxRows" :-1
+, "ClassMetadataIndex" :10
+, "ClassIndex" :9
+, "DatasetMode" :"matrix"
+, "InputMetadata" :
+ [{
+ "Name" :"ID"
+ , "Separator" :" "
+ , "Type" :"integer"
+ , "Skip" :true
+ },{
+ "Name" :"RI"
+ , "Separator" :" "
+ , "Type" :"real"
+ },{
+ "Name" :"NA20"
+ , "Separator" :" "
+ , "Type" :"real"
+ },{
+ "Name" :"MGO"
+ , "Separator" :" "
+ , "Type" :"real"
+ },{
+ "Name" :"AL203"
+ , "Separator" :" "
+ , "Type" :"real"
+ },{
+ "Name" :"SI02"
+ , "Separator" :" "
+ , "Type" :"real"
+ },{
+ "Name" :"K20"
+ , "Separator" :" "
+ , "Type" :"real"
+ },{
+ "Name" :"CAO"
+ , "Separator" :" "
+ , "Type" :"real"
+ },{
+ "Name" :"BAO"
+ , "Separator" :" "
+ , "Type" :"real"
+ },{
+ "Name" :"FE203"
+ , "Separator" :" "
+ , "Type" :"real"
+ },{
+ "Name" :"TYPE"
+ , "Type" :"string"
+ , "ValueSpace" :
+ [
+ "WinF"
+ , "WinNF"
+ , "Veh"
+ , "Con"
+ , "Tabl"
+ , "Head"
+ ]
+ }]
+
+, "Output" :"glass.out"
+, "OutputMetadata" :
+ [{
+ "Name" :"RI"
+ , "Separator" :","
+ },{
+ "Name" :"NA20"
+ , "Separator" :","
+ },{
+ "Name" :"MGO"
+ , "Separator" :","
+ },{
+ "Name" :"AL203"
+ , "Separator" :","
+ },{
+ "Name" :"SI02"
+ , "Separator" :","
+ },{
+ "Name" :"K20"
+ , "Separator" :","
+ },{
+ "Name" :"CAO"
+ , "Separator" :","
+ },{
+ "Name" :"BAO"
+ , "Separator" :","
+ },{
+ "Name" :"FE203"
+ , "Separator" :","
+ },{
+ "Name" :"TYPE"
+ , "Type" :"integer"
+ }]
+}
diff --git a/lib/mining/testdata/forensic_glass/glass.data b/lib/mining/testdata/forensic_glass/glass.data
new file mode 100644
index 00000000..ab21361c
--- /dev/null
+++ b/lib/mining/testdata/forensic_glass/glass.data
@@ -0,0 +1,214 @@
+1,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.00,1
+2,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.00,1
+3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.00,1
+4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.00,1
+5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.00,1
+6,1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0.00,0.26,1
+7,1.51743,13.30,3.60,1.14,73.09,0.58,8.17,0.00,0.00,1
+8,1.51756,13.15,3.61,1.05,73.24,0.57,8.24,0.00,0.00,1
+9,1.51918,14.04,3.58,1.37,72.08,0.56,8.30,0.00,0.00,1
+10,1.51755,13.00,3.60,1.36,72.99,0.57,8.40,0.00,0.11,1
+11,1.51571,12.72,3.46,1.56,73.20,0.67,8.09,0.00,0.24,1
+12,1.51763,12.80,3.66,1.27,73.01,0.60,8.56,0.00,0.00,1
+13,1.51589,12.88,3.43,1.40,73.28,0.69,8.05,0.00,0.24,1
+14,1.51748,12.86,3.56,1.27,73.21,0.54,8.38,0.00,0.17,1
+15,1.51763,12.61,3.59,1.31,73.29,0.58,8.50,0.00,0.00,1
+16,1.51761,12.81,3.54,1.23,73.24,0.58,8.39,0.00,0.00,1
+17,1.51784,12.68,3.67,1.16,73.11,0.61,8.70,0.00,0.00,1
+18,1.52196,14.36,3.85,0.89,71.36,0.15,9.15,0.00,0.00,1
+19,1.51911,13.90,3.73,1.18,72.12,0.06,8.89,0.00,0.00,1
+20,1.51735,13.02,3.54,1.69,72.73,0.54,8.44,0.00,0.07,1
+21,1.51750,12.82,3.55,1.49,72.75,0.54,8.52,0.00,0.19,1
+22,1.51966,14.77,3.75,0.29,72.02,0.03,9.00,0.00,0.00,1
+23,1.51736,12.78,3.62,1.29,72.79,0.59,8.70,0.00,0.00,1
+24,1.51751,12.81,3.57,1.35,73.02,0.62,8.59,0.00,0.00,1
+25,1.51720,13.38,3.50,1.15,72.85,0.50,8.43,0.00,0.00,1
+26,1.51764,12.98,3.54,1.21,73.00,0.65,8.53,0.00,0.00,1
+27,1.51793,13.21,3.48,1.41,72.64,0.59,8.43,0.00,0.00,1
+28,1.51721,12.87,3.48,1.33,73.04,0.56,8.43,0.00,0.00,1
+29,1.51768,12.56,3.52,1.43,73.15,0.57,8.54,0.00,0.00,1
+30,1.51784,13.08,3.49,1.28,72.86,0.60,8.49,0.00,0.00,1
+31,1.51768,12.65,3.56,1.30,73.08,0.61,8.69,0.00,0.14,1
+32,1.51747,12.84,3.50,1.14,73.27,0.56,8.55,0.00,0.00,1
+33,1.51775,12.85,3.48,1.23,72.97,0.61,8.56,0.09,0.22,1
+34,1.51753,12.57,3.47,1.38,73.39,0.60,8.55,0.00,0.06,1
+35,1.51783,12.69,3.54,1.34,72.95,0.57,8.75,0.00,0.00,1
+36,1.51567,13.29,3.45,1.21,72.74,0.56,8.57,0.00,0.00,1
+37,1.51909,13.89,3.53,1.32,71.81,0.51,8.78,0.11,0.00,1
+38,1.51797,12.74,3.48,1.35,72.96,0.64,8.68,0.00,0.00,1
+39,1.52213,14.21,3.82,0.47,71.77,0.11,9.57,0.00,0.00,1
+40,1.52213,14.21,3.82,0.47,71.77,0.11,9.57,0.00,0.00,1
+41,1.51793,12.79,3.50,1.12,73.03,0.64,8.77,0.00,0.00,1
+42,1.51755,12.71,3.42,1.20,73.20,0.59,8.64,0.00,0.00,1
+43,1.51779,13.21,3.39,1.33,72.76,0.59,8.59,0.00,0.00,1
+44,1.52210,13.73,3.84,0.72,71.76,0.17,9.74,0.00,0.00,1
+45,1.51786,12.73,3.43,1.19,72.95,0.62,8.76,0.00,0.30,1
+46,1.51900,13.49,3.48,1.35,71.95,0.55,9.00,0.00,0.00,1
+47,1.51869,13.19,3.37,1.18,72.72,0.57,8.83,0.00,0.16,1
+48,1.52667,13.99,3.70,0.71,71.57,0.02,9.82,0.00,0.10,1
+49,1.52223,13.21,3.77,0.79,71.99,0.13,10.02,0.00,0.00,1
+50,1.51898,13.58,3.35,1.23,72.08,0.59,8.91,0.00,0.00,1
+51,1.52320,13.72,3.72,0.51,71.75,0.09,10.06,0.00,0.16,1
+52,1.51926,13.20,3.33,1.28,72.36,0.60,9.14,0.00,0.11,1
+53,1.51808,13.43,2.87,1.19,72.84,0.55,9.03,0.00,0.00,1
+54,1.51837,13.14,2.84,1.28,72.85,0.55,9.07,0.00,0.00,1
+55,1.51778,13.21,2.81,1.29,72.98,0.51,9.02,0.00,0.09,1
+56,1.51769,12.45,2.71,1.29,73.70,0.56,9.06,0.00,0.24,1
+57,1.51215,12.99,3.47,1.12,72.98,0.62,8.35,0.00,0.31,1
+58,1.51824,12.87,3.48,1.29,72.95,0.60,8.43,0.00,0.00,1
+59,1.51754,13.48,3.74,1.17,72.99,0.59,8.03,0.00,0.00,1
+60,1.51754,13.39,3.66,1.19,72.79,0.57,8.27,0.00,0.11,1
+61,1.51905,13.60,3.62,1.11,72.64,0.14,8.76,0.00,0.00,1
+62,1.51977,13.81,3.58,1.32,71.72,0.12,8.67,0.69,0.00,1
+63,1.52172,13.51,3.86,0.88,71.79,0.23,9.54,0.00,0.11,1
+64,1.52227,14.17,3.81,0.78,71.35,0.00,9.69,0.00,0.00,1
+65,1.52172,13.48,3.74,0.90,72.01,0.18,9.61,0.00,0.07,1
+66,1.52099,13.69,3.59,1.12,71.96,0.09,9.40,0.00,0.00,1
+67,1.52152,13.05,3.65,0.87,72.22,0.19,9.85,0.00,0.17,1
+68,1.52152,13.05,3.65,0.87,72.32,0.19,9.85,0.00,0.17,1
+69,1.52152,13.12,3.58,0.90,72.20,0.23,9.82,0.00,0.16,1
+70,1.52300,13.31,3.58,0.82,71.99,0.12,10.17,0.00,0.03,1
+71,1.51574,14.86,3.67,1.74,71.87,0.16,7.36,0.00,0.12,2
+72,1.51848,13.64,3.87,1.27,71.96,0.54,8.32,0.00,0.32,2
+73,1.51593,13.09,3.59,1.52,73.10,0.67,7.83,0.00,0.00,2
+74,1.51631,13.34,3.57,1.57,72.87,0.61,7.89,0.00,0.00,2
+75,1.51596,13.02,3.56,1.54,73.11,0.72,7.90,0.00,0.00,2
+76,1.51590,13.02,3.58,1.51,73.12,0.69,7.96,0.00,0.00,2
+77,1.51645,13.44,3.61,1.54,72.39,0.66,8.03,0.00,0.00,2
+78,1.51627,13.00,3.58,1.54,72.83,0.61,8.04,0.00,0.00,2
+79,1.51613,13.92,3.52,1.25,72.88,0.37,7.94,0.00,0.14,2
+80,1.51590,12.82,3.52,1.90,72.86,0.69,7.97,0.00,0.00,2
+81,1.51592,12.86,3.52,2.12,72.66,0.69,7.97,0.00,0.00,2
+82,1.51593,13.25,3.45,1.43,73.17,0.61,7.86,0.00,0.00,2
+83,1.51646,13.41,3.55,1.25,72.81,0.68,8.10,0.00,0.00,2
+84,1.51594,13.09,3.52,1.55,72.87,0.68,8.05,0.00,0.09,2
+85,1.51409,14.25,3.09,2.08,72.28,1.10,7.08,0.00,0.00,2
+86,1.51625,13.36,3.58,1.49,72.72,0.45,8.21,0.00,0.00,2
+87,1.51569,13.24,3.49,1.47,73.25,0.38,8.03,0.00,0.00,2
+88,1.51645,13.40,3.49,1.52,72.65,0.67,8.08,0.00,0.10,2
+89,1.51618,13.01,3.50,1.48,72.89,0.60,8.12,0.00,0.00,2
+90,1.51640,12.55,3.48,1.87,73.23,0.63,8.08,0.00,0.09,2
+91,1.51841,12.93,3.74,1.11,72.28,0.64,8.96,0.00,0.22,2
+92,1.51605,12.90,3.44,1.45,73.06,0.44,8.27,0.00,0.00,2
+93,1.51588,13.12,3.41,1.58,73.26,0.07,8.39,0.00,0.19,2
+94,1.51590,13.24,3.34,1.47,73.10,0.39,8.22,0.00,0.00,2
+95,1.51629,12.71,3.33,1.49,73.28,0.67,8.24,0.00,0.00,2
+96,1.51860,13.36,3.43,1.43,72.26,0.51,8.60,0.00,0.00,2
+97,1.51841,13.02,3.62,1.06,72.34,0.64,9.13,0.00,0.15,2
+98,1.51743,12.20,3.25,1.16,73.55,0.62,8.90,0.00,0.24,2
+99,1.51689,12.67,2.88,1.71,73.21,0.73,8.54,0.00,0.00,2
+100,1.51811,12.96,2.96,1.43,72.92,0.60,8.79,0.14,0.00,2
+101,1.51655,12.75,2.85,1.44,73.27,0.57,8.79,0.11,0.22,2
+102,1.51730,12.35,2.72,1.63,72.87,0.70,9.23,0.00,0.00,2
+103,1.51820,12.62,2.76,0.83,73.81,0.35,9.42,0.00,0.20,2
+104,1.52725,13.80,3.15,0.66,70.57,0.08,11.64,0.00,0.00,2
+105,1.52410,13.83,2.90,1.17,71.15,0.08,10.79,0.00,0.00,2
+106,1.52475,11.45,0.00,1.88,72.19,0.81,13.24,0.00,0.34,2
+107,1.53125,10.73,0.00,2.10,69.81,0.58,13.30,3.15,0.28,2
+108,1.53393,12.30,0.00,1.00,70.16,0.12,16.19,0.00,0.24,2
+109,1.52222,14.43,0.00,1.00,72.67,0.10,11.52,0.00,0.08,2
+110,1.51818,13.72,0.00,0.56,74.45,0.00,10.99,0.00,0.00,2
+111,1.52664,11.23,0.00,0.77,73.21,0.00,14.68,0.00,0.00,2
+112,1.52739,11.02,0.00,0.75,73.08,0.00,14.96,0.00,0.00,2
+113,1.52777,12.64,0.00,0.67,72.02,0.06,14.40,0.00,0.00,2
+114,1.51892,13.46,3.83,1.26,72.55,0.57,8.21,0.00,0.14,2
+115,1.51847,13.10,3.97,1.19,72.44,0.60,8.43,0.00,0.00,2
+116,1.51846,13.41,3.89,1.33,72.38,0.51,8.28,0.00,0.00,2
+117,1.51829,13.24,3.90,1.41,72.33,0.55,8.31,0.00,0.10,2
+118,1.51708,13.72,3.68,1.81,72.06,0.64,7.88,0.00,0.00,2
+119,1.51673,13.30,3.64,1.53,72.53,0.65,8.03,0.00,0.29,2
+120,1.51652,13.56,3.57,1.47,72.45,0.64,7.96,0.00,0.00,2
+121,1.51844,13.25,3.76,1.32,72.40,0.58,8.42,0.00,0.00,2
+122,1.51663,12.93,3.54,1.62,72.96,0.64,8.03,0.00,0.21,2
+123,1.51687,13.23,3.54,1.48,72.84,0.56,8.10,0.00,0.00,2
+124,1.51707,13.48,3.48,1.71,72.52,0.62,7.99,0.00,0.00,2
+125,1.52177,13.20,3.68,1.15,72.75,0.54,8.52,0.00,0.00,2
+126,1.51872,12.93,3.66,1.56,72.51,0.58,8.55,0.00,0.12,2
+127,1.51667,12.94,3.61,1.26,72.75,0.56,8.60,0.00,0.00,2
+128,1.52081,13.78,2.28,1.43,71.99,0.49,9.85,0.00,0.17,2
+129,1.52068,13.55,2.09,1.67,72.18,0.53,9.57,0.27,0.17,2
+130,1.52020,13.98,1.35,1.63,71.76,0.39,10.56,0.00,0.18,2
+131,1.52177,13.75,1.01,1.36,72.19,0.33,11.14,0.00,0.00,2
+132,1.52614,13.70,0.00,1.36,71.24,0.19,13.44,0.00,0.10,2
+133,1.51813,13.43,3.98,1.18,72.49,0.58,8.15,0.00,0.00,2
+134,1.51800,13.71,3.93,1.54,71.81,0.54,8.21,0.00,0.15,2
+135,1.51811,13.33,3.85,1.25,72.78,0.52,8.12,0.00,0.00,2
+136,1.51789,13.19,3.90,1.30,72.33,0.55,8.44,0.00,0.28,2
+137,1.51806,13.00,3.80,1.08,73.07,0.56,8.38,0.00,0.12,2
+138,1.51711,12.89,3.62,1.57,72.96,0.61,8.11,0.00,0.00,2
+139,1.51674,12.79,3.52,1.54,73.36,0.66,7.90,0.00,0.00,2
+140,1.51674,12.87,3.56,1.64,73.14,0.65,7.99,0.00,0.00,2
+141,1.51690,13.33,3.54,1.61,72.54,0.68,8.11,0.00,0.00,2
+142,1.51851,13.20,3.63,1.07,72.83,0.57,8.41,0.09,0.17,2
+143,1.51662,12.85,3.51,1.44,73.01,0.68,8.23,0.06,0.25,2
+144,1.51709,13.00,3.47,1.79,72.72,0.66,8.18,0.00,0.00,2
+145,1.51660,12.99,3.18,1.23,72.97,0.58,8.81,0.00,0.24,2
+146,1.51839,12.85,3.67,1.24,72.57,0.62,8.68,0.00,0.35,2
+147,1.51769,13.65,3.66,1.11,72.77,0.11,8.60,0.00,0.00,3
+148,1.51610,13.33,3.53,1.34,72.67,0.56,8.33,0.00,0.00,3
+149,1.51670,13.24,3.57,1.38,72.70,0.56,8.44,0.00,0.10,3
+150,1.51643,12.16,3.52,1.35,72.89,0.57,8.53,0.00,0.00,3
+151,1.51665,13.14,3.45,1.76,72.48,0.60,8.38,0.00,0.17,3
+152,1.52127,14.32,3.90,0.83,71.50,0.00,9.49,0.00,0.00,3
+153,1.51779,13.64,3.65,0.65,73.00,0.06,8.93,0.00,0.00,3
+154,1.51610,13.42,3.40,1.22,72.69,0.59,8.32,0.00,0.00,3
+155,1.51694,12.86,3.58,1.31,72.61,0.61,8.79,0.00,0.00,3
+156,1.51646,13.04,3.40,1.26,73.01,0.52,8.58,0.00,0.00,3
+157,1.51655,13.41,3.39,1.28,72.64,0.52,8.65,0.00,0.00,3
+158,1.52121,14.03,3.76,0.58,71.79,0.11,9.65,0.00,0.00,3
+159,1.51776,13.53,3.41,1.52,72.04,0.58,8.79,0.00,0.00,3
+160,1.51796,13.50,3.36,1.63,71.94,0.57,8.81,0.00,0.09,3
+161,1.51832,13.33,3.34,1.54,72.14,0.56,8.99,0.00,0.00,3
+162,1.51934,13.64,3.54,0.75,72.65,0.16,8.89,0.15,0.24,3
+163,1.52211,14.19,3.78,0.91,71.36,0.23,9.14,0.00,0.37,3
+164,1.51514,14.01,2.68,3.50,69.89,1.68,5.87,2.20,0.00,5
+165,1.51915,12.73,1.85,1.86,72.69,0.60,10.09,0.00,0.00,5
+166,1.52171,11.56,1.88,1.56,72.86,0.47,11.41,0.00,0.00,5
+167,1.52151,11.03,1.71,1.56,73.44,0.58,11.62,0.00,0.00,5
+168,1.51969,12.64,0.00,1.65,73.75,0.38,11.53,0.00,0.00,5
+169,1.51666,12.86,0.00,1.83,73.88,0.97,10.17,0.00,0.00,5
+170,1.51994,13.27,0.00,1.76,73.03,0.47,11.32,0.00,0.00,5
+171,1.52369,13.44,0.00,1.58,72.22,0.32,12.24,0.00,0.00,5
+172,1.51316,13.02,0.00,3.04,70.48,6.21,6.96,0.00,0.00,5
+173,1.51321,13.00,0.00,3.02,70.70,6.21,6.93,0.00,0.00,5
+174,1.52043,13.38,0.00,1.40,72.25,0.33,12.50,0.00,0.00,5
+175,1.52058,12.85,1.61,2.17,72.18,0.76,9.70,0.24,0.51,5
+176,1.52119,12.97,0.33,1.51,73.39,0.13,11.27,0.00,0.28,5
+177,1.51905,14.00,2.39,1.56,72.37,0.00,9.57,0.00,0.00,6
+178,1.51937,13.79,2.41,1.19,72.76,0.00,9.77,0.00,0.00,6
+179,1.51829,14.46,2.24,1.62,72.38,0.00,9.26,0.00,0.00,6
+180,1.51852,14.09,2.19,1.66,72.67,0.00,9.32,0.00,0.00,6
+181,1.51299,14.40,1.74,1.54,74.55,0.00,7.59,0.00,0.00,6
+182,1.51888,14.99,0.78,1.74,72.50,0.00,9.95,0.00,0.00,6
+183,1.51916,14.15,0.00,2.09,72.74,0.00,10.88,0.00,0.00,6
+184,1.51969,14.56,0.00,0.56,73.48,0.00,11.22,0.00,0.00,6
+185,1.51115,17.38,0.00,0.34,75.41,0.00,6.65,0.00,0.00,6
+186,1.51131,13.69,3.20,1.81,72.81,1.76,5.43,1.19,0.00,7
+187,1.51838,14.32,3.26,2.22,71.25,1.46,5.79,1.63,0.00,7
+188,1.52315,13.44,3.34,1.23,72.38,0.60,8.83,0.00,0.00,7
+189,1.52247,14.86,2.20,2.06,70.26,0.76,9.76,0.00,0.00,7
+190,1.52365,15.79,1.83,1.31,70.43,0.31,8.61,1.68,0.00,7
+191,1.51613,13.88,1.78,1.79,73.10,0.00,8.67,0.76,0.00,7
+192,1.51602,14.85,0.00,2.38,73.28,0.00,8.76,0.64,0.09,7
+193,1.51623,14.20,0.00,2.79,73.46,0.04,9.04,0.40,0.09,7
+194,1.51719,14.75,0.00,2.00,73.02,0.00,8.53,1.59,0.08,7
+195,1.51683,14.56,0.00,1.98,73.29,0.00,8.52,1.57,0.07,7
+196,1.51545,14.14,0.00,2.68,73.39,0.08,9.07,0.61,0.05,7
+197,1.51556,13.87,0.00,2.54,73.23,0.14,9.41,0.81,0.01,7
+198,1.51727,14.70,0.00,2.34,73.28,0.00,8.95,0.66,0.00,7
+199,1.51531,14.38,0.00,2.66,73.10,0.04,9.08,0.64,0.00,7
+200,1.51609,15.01,0.00,2.51,73.05,0.05,8.83,0.53,0.00,7
+201,1.51508,15.15,0.00,2.25,73.50,0.00,8.34,0.63,0.00,7
+202,1.51653,11.95,0.00,1.19,75.18,2.70,8.93,0.00,0.00,7
+203,1.51514,14.85,0.00,2.42,73.72,0.00,8.39,0.56,0.00,7
+204,1.51658,14.80,0.00,1.99,73.11,0.00,8.28,1.71,0.00,7
+205,1.51617,14.95,0.00,2.27,73.30,0.00,8.71,0.67,0.00,7
+206,1.51732,14.95,0.00,1.80,72.99,0.00,8.61,1.55,0.00,7
+207,1.51645,14.94,0.00,1.87,73.11,0.00,8.67,1.38,0.00,7
+208,1.51831,14.39,0.00,1.82,72.86,1.41,6.47,2.88,0.00,7
+209,1.51640,14.37,0.00,2.74,72.85,0.00,9.45,0.54,0.00,7
+210,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.00,7
+211,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.00,7
+212,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.00,7
+213,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.00,7
+214,1.51711,14.23,0.00,2.08,73.36,0.00,8.62,1.67,0.00,7
diff --git a/lib/mining/testdata/forensic_glass/glass.dsv b/lib/mining/testdata/forensic_glass/glass.dsv
new file mode 100644
index 00000000..10e9ba98
--- /dev/null
+++ b/lib/mining/testdata/forensic_glass/glass.dsv
@@ -0,0 +1,98 @@
+{
+ "Input" :"glass.data"
+, "Rejected" :"glass.rej"
+, "MaxRows" :-1
+, "ClassMetadataIndex" :10
+, "ClassIndex" :9
+, "DatasetMode" :"matrix"
+, "InputMetadata" :
+ [{
+ "Name" :"ID"
+ , "Separator" :","
+ , "Type" :"integer"
+ , "Skip" :true
+ },{
+ "Name" :"RI"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"NA20"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"MGO"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"AL203"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"SI02"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"K20"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"CAO"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"BAO"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"FE203"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"TYPE"
+ , "Type" :"real"
+ , "ValueSpace" :
+ [
+ "1"
+ , "2"
+ , "3"
+ , "4"
+ , "5"
+ , "6"
+ , "7"
+ ]
+ }]
+
+, "Output" :"glass.out"
+, "OutputMetadata" :
+ [{
+ "Name" :"RI"
+ , "Separator" :","
+ },{
+ "Name" :"NA20"
+ , "Separator" :","
+ },{
+ "Name" :"MGO"
+ , "Separator" :","
+ },{
+ "Name" :"AL203"
+ , "Separator" :","
+ },{
+ "Name" :"SI02"
+ , "Separator" :","
+ },{
+ "Name" :"K20"
+ , "Separator" :","
+ },{
+ "Name" :"CAO"
+ , "Separator" :","
+ },{
+ "Name" :"BAO"
+ , "Separator" :","
+ },{
+ "Name" :"FE203"
+ , "Separator" :","
+ },{
+ "Name" :"TYPE"
+ , "Type" :"integer"
+ }]
+}
diff --git a/lib/mining/testdata/forensic_glass/glass.names b/lib/mining/testdata/forensic_glass/glass.names
new file mode 100644
index 00000000..caff715c
--- /dev/null
+++ b/lib/mining/testdata/forensic_glass/glass.names
@@ -0,0 +1,90 @@
+1. Title: Glass Identification Database
+
+2. Sources:
+ (a) Creator: B. German
+ -- Central Research Establishment
+ Home Office Forensic Science Service
+ Aldermaston, Reading, Berkshire RG7 4PN
+ (b) Donor: Vina Spiehler, Ph.D., DABFT
+ Diagnostic Products Corporation
+ (213) 776-0180 (ext 3014)
+ (c) Date: September, 1987
+
+3. Past Usage:
+ -- Rule Induction in Forensic Science
+ -- Ian W. Evett and Ernest J. Spiehler
+ -- Central Research Establishment
+ Home Office Forensic Science Service
+ Aldermaston, Reading, Berkshire RG7 4PN
+ -- Unknown technical note number (sorry, not listed here)
+ -- General Results: nearest neighbor held its own with respect to the
+ rule-based system
+
+4. Relevant Information:
+ Vina conducted a comparison test of her rule-based system, BEAGLE, the
+ nearest-neighbor algorithm, and discriminant analysis. BEAGLE is
+ a product available through VRS Consulting, Inc.; 4676 Admiralty Way,
+ Suite 206; Marina Del Ray, CA 90292 (213) 827-7890 and FAX: -3189.
+ In determining whether the glass was a type of "float" glass or not,
+ the following results were obtained (# incorrect answers):
+
+ Type of Sample Beagle NN DA
+ Windows that were float processed (87) 10 12 21
+ Windows that were not: (76) 19 16 22
+
+ The study of classification of types of glass was motivated by
+ criminological investigation. At the scene of the crime, the glass left
+ can be used as evidence...if it is correctly identified!
+
+5. Number of Instances: 214
+
+6. Number of Attributes: 10 (including an Id#) plus the class attribute
+ -- all attributes are continuously valued
+
+7. Attribute Information:
+ 1. Id number: 1 to 214
+ 2. RI: refractive index
+ 3. Na: Sodium (unit measurement: weight percent in corresponding oxide, as
+ are attributes 4-10)
+ 4. Mg: Magnesium
+ 5. Al: Aluminum
+ 6. Si: Silicon
+ 7. K: Potassium
+ 8. Ca: Calcium
+ 9. Ba: Barium
+ 10. Fe: Iron
+ 11. Type of glass: (class attribute)
+ -- 1 building_windows_float_processed
+ -- 2 building_windows_non_float_processed
+ -- 3 vehicle_windows_float_processed
+ -- 4 vehicle_windows_non_float_processed (none in this database)
+ -- 5 containers
+ -- 6 tableware
+ -- 7 headlamps
+
+8. Missing Attribute Values: None
+
+Summary Statistics:
+Attribute: Min Max Mean SD Correlation with class
+ 2. RI: 1.5112 1.5339 1.5184 0.0030 -0.1642
+ 3. Na: 10.73 17.38 13.4079 0.8166 0.5030
+ 4. Mg: 0 4.49 2.6845 1.4424 -0.7447
+ 5. Al: 0.29 3.5 1.4449 0.4993 0.5988
+ 6. Si: 69.81 75.41 72.6509 0.7745 0.1515
+ 7. K: 0 6.21 0.4971 0.6522 -0.0100
+ 8. Ca: 5.43 16.19 8.9570 1.4232 0.0007
+ 9. Ba: 0 3.15 0.1750 0.4972 0.5751
+10. Fe: 0 0.51 0.0570 0.0974 -0.1879
+
+9. Class Distribution: (out of 214 total instances)
+ -- 163 Window glass (building windows and vehicle windows)
+ -- 87 float processed
+ -- 70 building windows
+ -- 17 vehicle windows
+ -- 76 non-float processed
+ -- 76 building windows
+ -- 0 vehicle windows
+ -- 51 Non-window glass
+ -- 13 containers
+ -- 9 tableware
+ -- 29 headlamps
diff --git a/lib/mining/testdata/forensic_glass/glass.tag b/lib/mining/testdata/forensic_glass/glass.tag
new file mode 100644
index 00000000..8bf70515
--- /dev/null
+++ b/lib/mining/testdata/forensic_glass/glass.tag
@@ -0,0 +1,27 @@
+An original file donated by Vina Spiehler
+
+ID, N -- numeric identifier of the instance
+RI, N -- refractive index
+NA2O, N -- Sodium oxide
+MGO, N -- magnesium oxide
+AL2O3, N -- aluminum oxide
+SIO2, N -- silicon oxide
+K2O, N -- potassium oxide
+CAO, N -- calcium oxide
+BAO, N -- barium oxide
+FE2O3, N -- iron oxide
+TYPE, N -- An unknown, but must correspond to the types in the paper
+CAMG, N -- Unsure
+
+Types include:
+ 1. WF (Float Window)
+ 2. WNF (Non-float Window)
+ 3. C (Container)
+ 4. T (Tableware)
+ 5. H (Headlamp) 214 2568 14127 glass.dat
+ 19 92 518 glass.tag
+ 62 742 4775 glassx.dat
+ 51 610 3928 nonwindo.dat
+ 6 14 120 phones
+ 163 1955 12552 window.dat
+ 515 5981 36020 total
diff --git a/lib/mining/testdata/iris/iris.dsv b/lib/mining/testdata/iris/iris.dsv
new file mode 100644
index 00000000..9839b12b
--- /dev/null
+++ b/lib/mining/testdata/iris/iris.dsv
@@ -0,0 +1,35 @@
+{
+ "Input" :"iris.dat"
+, "Rejected" :"iris.rej"
+, "MaxRows" :-1
+, "ClassMetadataIndex" :4
+, "ClassIndex" :4
+, "DatasetMode" :"matrix"
+, "InputMetadata" :
+ [{
+ "Name" :"sepal-length"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"sepal-width"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"petal-length"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"petal-width"
+ , "Separator" :","
+ , "Type" :"real"
+ },{
+ "Name" :"class"
+ , "Type" :"string"
+ , "ValueSpace" :
+ [
+ "Iris-setosa"
+ , "Iris-versicolor"
+ , "Iris-virginica"
+ ]
+ }]
+}
diff --git a/lib/mining/testdata/phoneme/phoneme.dsv b/lib/mining/testdata/phoneme/phoneme.dsv
new file mode 100644
index 00000000..595576b5
--- /dev/null
+++ b/lib/mining/testdata/phoneme/phoneme.dsv
@@ -0,0 +1,59 @@
+{
+ "Input" :"phoneme.dat"
+, "Rejected" :"phoneme.rej"
+, "MaxRows" :-1
+, "ClassMetadataIndex" :5
+, "ClassIndex" :5
+, "DatasetMode" :"matrix"
+, "OutliersFile" :"phoneme.outliers"
+, "InputMetadata" :
+ [{
+ "Name" :"f1"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"f2"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"f3"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"f4"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"f5"
+ , "Type" :"real"
+ , "Separator" :" "
+ },{
+ "Name" :"class"
+ , "Type" :"real"
+ , "ValueSpace" :
+ [
+ "1"
+ , "0"
+ ]
+ }]
+, "Output" :"phoneme.csv"
+, "OutputMetadata":
+ [{
+ "Name" :"f1"
+ , "Separator" :","
+ },{
+ "Name" :"f2"
+ , "Separator" :","
+ },{
+ "Name" :"f3"
+ , "Separator" :","
+ },{
+ "Name" :"f4"
+ , "Separator" :","
+ },{
+ "Name" :"f5"
+ , "Separator" :","
+ },{
+ "Name" :"class"
+ }]
+}
diff --git a/lib/mining/testdata/wvc2010lnsmote/wvc2010_features.lnsmote.dsv b/lib/mining/testdata/wvc2010lnsmote/wvc2010_features.lnsmote.dsv
new file mode 100644
index 00000000..3c88ca38
--- /dev/null
+++ b/lib/mining/testdata/wvc2010lnsmote/wvc2010_features.lnsmote.dsv
@@ -0,0 +1,129 @@
+{
+ "Input" : "wvc2010_features.lnsmote.dat"
+, "Rejected" : "wvc2010_features.lnsmote.rej"
+, "DatasetMode" : "matrix"
+, "MaxRows" : -1
+, "ClassIndex" : 27
+
+, "NTree" : 500
+, "PercentBoot" : 66
+, "StatsFile" : "randomforest.lnsmote.stats.dat"
+
+, "InputMetadata" : [{
+ "Name" : "anonim"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "comment_length"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "size_increment"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "size_ratio"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "upper_lower_ratio"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "upper_to_all_ratio"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "digit_ratio"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "non_alnum_ratio"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "char_diversity"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "char_distribution_insert"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "compress_rate"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "good_token"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "term_frequency"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "longest_word"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "longest_char_sequence"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "words_vulgar_frequency"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "words_vulgar_impact"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "words_pronoun_frequency"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "words_pronoun_impact"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "words_bias_frequency"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "words_bias_impact"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "words_sex_frequency"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "words_sex_impact"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "words_bad_frequency"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "words_bad_impact"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "words_all_frequency"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "words_all_impact"
+ , "Type" : "real"
+ , "Separator" : ","
+ },{
+ "Name" : "class"
+ , "Type" : "real"
+ , "ValueSpace" :
+ [
+ "1"
+ , "0"
+ ]
+ }]
+}
diff --git a/lib/mining/tree/binary/binary.go b/lib/mining/tree/binary/binary.go
new file mode 100644
index 00000000..57c459b0
--- /dev/null
+++ b/lib/mining/tree/binary/binary.go
@@ -0,0 +1,64 @@
+// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package binary contain implementation of binary tree.
+package binary
+
+import (
+ "fmt"
+)
+
+//
+// Tree is a abstract data type for tree with only two branch: left and right.
+//
+type Tree struct {
+ // Root is pointer to root of tree.
+ Root *BTNode
+}
+
+//
+// NewTree will create new binary tree with empty root.
+//
+func NewTree() *Tree {
+ return &Tree{
+ Root: nil,
+ }
+}
+
+//
+// String will print all the branch and leaf in tree.
+//
+func (btree *Tree) String() (s string) {
+ var parent, node *BTNode
+
+ parent = btree.Root
+
+ s = fmt.Sprint(parent)
+
+ node = parent.Right
+
+ for parent != nil {
+ // Print right node down to the leaf.
+ for node.Right != nil {
+ s += fmt.Sprint(node)
+
+ parent = node
+ node = node.Right
+ }
+ s += fmt.Sprint(node)
+
+ // crawling to the stop one at a time ...
+ for parent != nil && node.Parent == parent {
+ if parent.Right == node {
+ node = parent.Left
+ break
+ } else if parent.Left == node {
+ node = parent
+ parent = parent.Parent
+ }
+ }
+ }
+
+ return s
+}
diff --git a/lib/mining/tree/binary/binary_test.go b/lib/mining/tree/binary/binary_test.go
new file mode 100644
index 00000000..4f61deb0
--- /dev/null
+++ b/lib/mining/tree/binary/binary_test.go
@@ -0,0 +1,47 @@
+// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package binary
+
+import (
+ "fmt"
+ "testing"
+)
+
// TestTree builds a full three-level tree and verifies that formatting
// the tree lists every node in node-right-left order, indented one tab
// per depth level (the indentation comes from BTNode.String).
func TestTree(t *testing.T) {
	exp := `1
	12
		24
			34
			33
		23
	11
		22
			32
		31
	21
`

	btree := NewTree()

	// Build the tree bottom-up; NewBTNode links each non-nil child's
	// Parent pointer back to the newly created node.
	root := NewBTNode(1,
		NewBTNode(11,
			NewBTNode(21, nil, nil),
			NewBTNode(22,
				NewBTNode(31, nil, nil),
				NewBTNode(32, nil, nil))),
		NewBTNode(12,
			NewBTNode(23, nil, nil),
			NewBTNode(24,
				NewBTNode(33, nil, nil),
				NewBTNode(34, nil, nil))))

	btree.Root = root

	// fmt.Sprint uses Tree's String method to render the whole tree.
	res := fmt.Sprint(btree)

	if res != exp {
		t.Fatal("error, expecting:\n", exp, "\n got:\n", res)
	}
}
diff --git a/lib/mining/tree/binary/btnode.go b/lib/mining/tree/binary/btnode.go
new file mode 100644
index 00000000..57524265
--- /dev/null
+++ b/lib/mining/tree/binary/btnode.go
@@ -0,0 +1,77 @@
+// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package binary
+
+import (
+ "fmt"
+ "reflect"
+)
+
// BTNode is a data type for a single node in a binary tree.
type BTNode struct {
	// Left branch of node.
	Left *BTNode
	// Right branch of node.
	Right *BTNode
	// Parent of node; nil for the root.
	Parent *BTNode
	// Value of node; may hold any type.
	Value interface{}
}

// NewBTNode create new node for binary tree with value v and children
// l and r. Either child may be nil; non-nil children get their Parent
// pointer set to the new node.
func NewBTNode(v interface{}, l *BTNode, r *BTNode) (p *BTNode) {
	p = &BTNode{
		Left:   l,
		Right:  r,
		Parent: nil,
		Value:  v,
	}
	if l != nil {
		l.Parent = p
	}
	if r != nil {
		r.Parent = p
	}

	return p
}

// SetLeft will set left branch of node to 'c'. A nil c clears the left
// branch; a non-nil c also gets its Parent pointer set to n. The nil
// guard keeps SetLeft consistent with NewBTNode, which already accepts
// nil children.
func (n *BTNode) SetLeft(c *BTNode) {
	n.Left = c
	if c != nil {
		c.Parent = n
	}
}

// SetRight will set right branch of node to 'c'. A nil c clears the
// right branch; a non-nil c also gets its Parent pointer set to n.
func (n *BTNode) SetRight(c *BTNode) {
	n.Right = c
	if c != nil {
		c.Parent = n
	}
}

// String will convert the node to string: one tab per ancestor (so the
// indentation reflects the node's depth) followed by the node's value
// and a newline.
func (n *BTNode) String() (s string) {
	// Walk up the Parent chain to count this node's depth.
	for p := n.Parent; p != nil; p = p.Parent {
		s += "\t"
	}

	// fmt formats a reflect.Value as the concrete value it holds.
	s += fmt.Sprintln(reflect.ValueOf(n.Value))

	return s
}