diff options
| author | Shulhan <ms@kilabit.info> | 2018-09-17 05:04:26 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2018-09-18 01:50:21 +0700 |
| commit | 1cae4ca316afa5d177fdbf7a98a0ec7fffe76a3e (patch) | |
| tree | 5fa83fc0faa31e09cae82ac4d467cf8ba5f87fc2 /lib/mining | |
| parent | 446fef94cd712861221c0098dcdd9ae52aaed0eb (diff) | |
| download | pakakeh.go-1cae4ca316afa5d177fdbf7a98a0ec7fffe76a3e.tar.xz | |
Merge package "github.com/shuLhan/go-mining"
Diffstat (limited to 'lib/mining')
56 files changed, 7367 insertions, 0 deletions
diff --git a/lib/mining/.gitignore b/lib/mining/.gitignore new file mode 100644 index 00000000..4b45a114 --- /dev/null +++ b/lib/mining/.gitignore @@ -0,0 +1,26 @@ +cmd/cart/cart +cmd/crf/crf +cmd/lnsmote/lnsmote +cmd/rf/rf +cmd/smote/smote +resampling/lnsmote/lnsmote.outliers +resampling/lnsmote/phoneme_lnsmote.csv +resampling/lnsmote/synthetic.csv +resampling/smote/phoneme_smote.csv +resampling/smote/synthetic.csv +resampling/smote/testdata/phoneme.csv +testdata/phoneme/phoneme.csv +*.bak +*.bench +*.cprof +*.csv +*.dat +*.mprof +*.oob +*.perf +*.prof +*.rej +*.sh +*.stat +*.stats +*.test diff --git a/lib/mining/LICENSE b/lib/mining/LICENSE new file mode 100644 index 00000000..ec6ea686 --- /dev/null +++ b/lib/mining/LICENSE @@ -0,0 +1,38 @@ +Copyright (C) 2015, Mhd Sulhan (ms@kilabit.info). All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of Kilabit nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY M.SHULHAN "AS IS" AND ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO +EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + --- --- --- --- --- --- --- + + TT TT II BB AAAA LLLLLL II KKKKKKKK + TT TT II BB AA AA LL LL II KK + TTTT II BB AA AA LL LL II KK + TT TT II BB AAAAAAAA LLLLLL II KK + TT TT II BB AA AA LL LL II KK + TT TT II BBBBBBBB AA AA LLLLLL II KK + +Website: http://kilabit.info +Contact: ms@kilabit.info diff --git a/lib/mining/README.md b/lib/mining/README.md new file mode 100644 index 00000000..821a6aa3 --- /dev/null +++ b/lib/mining/README.md @@ -0,0 +1,26 @@ +[](https://godoc.org/github.com/shuLhan/share/lib/mining) +[](https://goreportcard.com/report/github.com/shuLhan/share/lib/mining) + +# go-mining + +Go-mining is a small library for data mining. + +The library is written in [Go language](golang/go). + +## Features + +### Classifiers + +- CART +- Random Forest +- Cascaded Random Forest +- K-Nearest Neighbourhood + +### Resampling + +- SMOTE +- LN-SMOTE (Local Neigbourhood SMOTE) + +### Miscellaneous + +- Gini index diff --git a/lib/mining/classifier/cart/cart.go b/lib/mining/classifier/cart/cart.go new file mode 100644 index 00000000..449781d9 --- /dev/null +++ b/lib/mining/classifier/cart/cart.go @@ -0,0 +1,480 @@ +// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +// Package cart implement the Classification and Regression Tree by Breiman, et al. +// CART is binary decision tree. +// +// Breiman, Leo, et al. 
Classification and regression trees. CRC press, +// 1984. +// +// The implementation is based on Data Mining book, +// +// Han, Jiawei, Micheline Kamber, and Jian Pei. Data mining: concepts and +// techniques: concepts and techniques. Elsevier, 2011. +// +package cart + +import ( + "fmt" + + "github.com/shuLhan/share/lib/debug" + "github.com/shuLhan/share/lib/mining/gain/gini" + "github.com/shuLhan/share/lib/mining/tree/binary" + "github.com/shuLhan/share/lib/numbers" + libstrings "github.com/shuLhan/share/lib/strings" + "github.com/shuLhan/share/lib/tabula" +) + +const ( + // SplitMethodGini if defined in Runtime, the dataset will be splitted + // using Gini gain for each possible value or partition. + // + // This option is used in Runtime.SplitMethod. + SplitMethodGini = "gini" +) + +const ( + // ColFlagParent denote that the column is parent/split node. + ColFlagParent = 1 + // ColFlagSkip denote that the column would be skipped. + ColFlagSkip = 2 +) + +// +// Runtime data for building CART. +// +type Runtime struct { + // SplitMethod define the criteria to used for splitting. + SplitMethod string `json:"SplitMethod"` + // NRandomFeature if less or equal to zero compute gain on all feature, + // otherwise select n random feature and compute gain only on selected + // features. + NRandomFeature int `json:"NRandomFeature"` + // OOBErrVal is the last out-of-bag error value in the tree. + OOBErrVal float64 + // Tree in classification. + Tree binary.Tree +} + +// +// New create new Runtime object. +// +func New(D tabula.ClasetInterface, splitMethod string, nRandomFeature int) ( + *Runtime, error, +) { + runtime := &Runtime{ + SplitMethod: splitMethod, + NRandomFeature: nRandomFeature, + Tree: binary.Tree{}, + } + + e := runtime.Build(D) + if e != nil { + return nil, e + } + + return runtime, nil +} + +// +// Build will create a tree using CART algorithm. +// +func (runtime *Runtime) Build(D tabula.ClasetInterface) (e error) { + // Re-check input configuration. 
+ switch runtime.SplitMethod { + case SplitMethodGini: + // Do nothing. + default: + // Set default split method to Gini index. + runtime.SplitMethod = SplitMethodGini + } + + runtime.Tree.Root, e = runtime.splitTreeByGain(D) + + return +} + +// +// splitTreeByGain calculate the gain in all dataset, and split into two node: +// left and right. +// +// Return node with the split information. +// +func (runtime *Runtime) splitTreeByGain(D tabula.ClasetInterface) ( + node *binary.BTNode, + e error, +) { + node = &binary.BTNode{} + + D.RecountMajorMinor() + + // if dataset is empty return node labeled with majority classes in + // dataset. + nrow := D.GetNRow() + + if nrow <= 0 { + if debug.Value >= 2 { + fmt.Printf("[cart] empty dataset (%s) : %v\n", + D.MajorityClass(), D) + } + + node.Value = NodeValue{ + IsLeaf: true, + Class: D.MajorityClass(), + Size: 0, + } + return node, nil + } + + // if all dataset is in the same class, return node as leaf with class + // is set to that class. + single, name := D.IsInSingleClass() + if single { + if debug.Value >= 2 { + fmt.Printf("[cart] in single class (%s): %v\n", name, + D.GetColumns()) + } + + node.Value = NodeValue{ + IsLeaf: true, + Class: name, + Size: nrow, + } + return node, nil + } + + if debug.Value >= 2 { + fmt.Println("[cart] D:", D) + } + + // calculate the Gini gain for each attribute. + gains := runtime.computeGain(D) + + // get attribute with maximum Gini gain. 
+ MaxGainIdx := gini.FindMaxGain(&gains) + MaxGain := gains[MaxGainIdx] + + // if maxgain value is 0, use majority class as node and terminate + // the process + if MaxGain.GetMaxGainValue() == 0 { + if debug.Value >= 2 { + fmt.Println("[cart] max gain 0 with target", + D.GetClassAsStrings(), + " and majority class is ", D.MajorityClass()) + } + + node.Value = NodeValue{ + IsLeaf: true, + Class: D.MajorityClass(), + Size: 0, + } + return node, nil + } + + // using the sorted index in MaxGain, sort all field in dataset + tabula.SortColumnsByIndex(D, MaxGain.SortedIndex) + + if debug.Value >= 2 { + fmt.Println("[cart] maxgain:", MaxGain) + } + + // Now that we have attribute with max gain in MaxGainIdx, and their + // gain dan partition value in Gains[MaxGainIdx] and + // GetMaxPartValue(), we split the dataset based on type of max-gain + // attribute. + // If its continuous, split the attribute using numeric value. + // If its discrete, split the attribute using subset (partition) of + // nominal values. + var splitV interface{} + + if MaxGain.IsContinu { + splitV = MaxGain.GetMaxPartGainValue() + } else { + attrPartV := MaxGain.GetMaxPartGainValue() + attrSubV := attrPartV.(libstrings.Row) + splitV = attrSubV[0] + } + + if debug.Value >= 2 { + fmt.Println("[cart] maxgainindex:", MaxGainIdx) + fmt.Println("[cart] split v:", splitV) + } + + node.Value = NodeValue{ + SplitAttrName: D.GetColumn(MaxGainIdx).GetName(), + IsLeaf: false, + IsContinu: MaxGain.IsContinu, + Size: nrow, + SplitAttrIdx: MaxGainIdx, + SplitV: splitV, + } + + dsL, dsR, e := tabula.SplitRowsByValue(D, MaxGainIdx, splitV) + + if e != nil { + return node, e + } + + splitL := dsL.(tabula.ClasetInterface) + splitR := dsR.(tabula.ClasetInterface) + + // Set the flag to parent in attribute referenced by + // MaxGainIdx, so it will not computed again in the next round. 
+ cols := splitL.GetColumns() + for x := range *cols { + if x == MaxGainIdx { + (*cols)[x].Flag = ColFlagParent + } else { + (*cols)[x].Flag = 0 + } + } + + cols = splitR.GetColumns() + for x := range *cols { + if x == MaxGainIdx { + (*cols)[x].Flag = ColFlagParent + } else { + (*cols)[x].Flag = 0 + } + } + + nodeLeft, e := runtime.splitTreeByGain(splitL) + if e != nil { + return node, e + } + + nodeRight, e := runtime.splitTreeByGain(splitR) + if e != nil { + return node, e + } + + node.SetLeft(nodeLeft) + node.SetRight(nodeRight) + + return node, nil +} + +// SelectRandomFeature if NRandomFeature is greater than zero, select and +// compute gain in n random features instead of in all features +func (runtime *Runtime) SelectRandomFeature(D tabula.ClasetInterface) { + if runtime.NRandomFeature <= 0 { + // all features selected + return + } + + ncols := D.GetNColumn() + + // count all features minus class + nfeature := ncols - 1 + if runtime.NRandomFeature >= nfeature { + // Do nothing if number of random feature equal or greater than + // number of feature in dataset. + return + } + + // exclude class index and parent node index + excludeIdx := []int{D.GetClassIndex()} + cols := D.GetColumns() + for x, col := range *cols { + if (col.Flag & ColFlagParent) == ColFlagParent { + excludeIdx = append(excludeIdx, x) + } else { + (*cols)[x].Flag |= ColFlagSkip + } + } + + // Select random features excluding feature in `excludeIdx`. + var pickedIdx []int + for x := 0; x < runtime.NRandomFeature; x++ { + idx := numbers.IntPickRandPositive(ncols, false, pickedIdx, + excludeIdx) + pickedIdx = append(pickedIdx, idx) + + // Remove skip flag on selected column + col := D.GetColumn(idx) + col.Flag = col.Flag &^ ColFlagSkip + } + + if debug.Value >= 1 { + fmt.Println("[cart] selected random features:", pickedIdx) + fmt.Println("[cart] selected columns :", D.GetColumns()) + } +} + +// +// computeGain calculate the gini index for each value in each attribute. 
+// +func (runtime *Runtime) computeGain(D tabula.ClasetInterface) ( + gains []gini.Gini, +) { + switch runtime.SplitMethod { + case SplitMethodGini: + // create gains value for all attribute minus target class. + gains = make([]gini.Gini, D.GetNColumn()) + } + + runtime.SelectRandomFeature(D) + + classVS := D.GetClassValueSpace() + classIdx := D.GetClassIndex() + classType := D.GetClassType() + + for x, col := range *D.GetColumns() { + // skip class attribute. + if x == classIdx { + continue + } + + // skip column flagged with parent + if (col.Flag & ColFlagParent) == ColFlagParent { + gains[x].Skip = true + continue + } + + // ignore column flagged with skip + if (col.Flag & ColFlagSkip) == ColFlagSkip { + gains[x].Skip = true + continue + } + + // compute gain. + if col.GetType() == tabula.TReal { + attr := col.ToFloatSlice() + + if classType == tabula.TString { + target := D.GetClassAsStrings() + gains[x].ComputeContinu(&attr, &target, + &classVS) + } else { + targetReal := D.GetClassAsReals() + classVSReal := libstrings.ToFloat64(classVS) + + gains[x].ComputeContinuFloat(&attr, + &targetReal, &classVSReal) + } + } else { + attr := col.ToStringSlice() + attrV := col.ValueSpace + + if debug.Value >= 2 { + fmt.Println("[cart] attr :", attr) + fmt.Println("[cart] attrV:", attrV) + } + + target := D.GetClassAsStrings() + gains[x].ComputeDiscrete(&attr, &attrV, &target, + &classVS) + } + + if debug.Value >= 2 { + fmt.Println("[cart] gain :", gains[x]) + } + } + return +} + +// +// Classify return the prediction of one sample. 
+// +func (runtime *Runtime) Classify(data *tabula.Row) (class string) { + node := runtime.Tree.Root + nodev := node.Value.(NodeValue) + + for !nodev.IsLeaf { + if nodev.IsContinu { + splitV := nodev.SplitV.(float64) + attrV := (*data)[nodev.SplitAttrIdx].Float() + + if attrV < splitV { + node = node.Left + } else { + node = node.Right + } + } else { + splitV := nodev.SplitV.([]string) + attrV := (*data)[nodev.SplitAttrIdx].String() + + if libstrings.IsContain(splitV, attrV) { + node = node.Left + } else { + node = node.Right + } + } + nodev = node.Value.(NodeValue) + } + + return nodev.Class +} + +// +// ClassifySet set the class attribute based on tree classification. +// +func (runtime *Runtime) ClassifySet(data tabula.ClasetInterface) (e error) { + nrow := data.GetNRow() + targetAttr := data.GetClassColumn() + + for i := 0; i < nrow; i++ { + class := runtime.Classify(data.GetRow(i)) + + _ = (*targetAttr).Records[i].SetValue(class, tabula.TString) + } + + return +} + +// +// CountOOBError process out-of-bag data on tree and return error value. +// +func (runtime *Runtime) CountOOBError(oob tabula.Claset) ( + errval float64, + e error, +) { + // save the original target to be compared later. + origTarget := oob.GetClassAsStrings() + + if debug.Value >= 2 { + fmt.Println("[cart] OOB:", oob.Columns) + fmt.Println("[cart] TREE:", &runtime.Tree) + } + + // reset the target. + oobtarget := oob.GetClassColumn() + oobtarget.ClearValues() + + e = runtime.ClassifySet(&oob) + + if e != nil { + // set original target values back. + oobtarget.SetValues(origTarget) + return + } + + target := oobtarget.ToStringSlice() + + if debug.Value >= 2 { + fmt.Println("[cart] original target:", origTarget) + fmt.Println("[cart] classify target:", target) + } + + // count how many target value is miss-classified. + runtime.OOBErrVal, _, _ = libstrings.CountMissRate(origTarget, target) + + // set original target values back. 
+ oobtarget.SetValues(origTarget) + + return runtime.OOBErrVal, nil +} + +// +// String yes, it will print it JSON like format. +// +func (runtime *Runtime) String() (s string) { + s = fmt.Sprintf("NRandomFeature: %d\n"+ + " SplitMethod : %s\n"+ + " Tree :\n%v", runtime.NRandomFeature, + runtime.SplitMethod, + runtime.Tree.String()) + return s +} diff --git a/lib/mining/classifier/cart/cart_test.go b/lib/mining/classifier/cart/cart_test.go new file mode 100644 index 00000000..14e89b12 --- /dev/null +++ b/lib/mining/classifier/cart/cart_test.go @@ -0,0 +1,62 @@ +// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cart + +import ( + "fmt" + "testing" + + "github.com/shuLhan/share/lib/dsv" + "github.com/shuLhan/share/lib/tabula" + "github.com/shuLhan/share/lib/test" +) + +const ( + NRows = 150 +) + +func TestCART(t *testing.T) { + fds := "../../testdata/iris/iris.dsv" + + ds := tabula.Claset{} + + _, e := dsv.SimpleRead(fds, &ds) + if nil != e { + t.Fatal(e) + } + + fmt.Println("[cart_test] class index:", ds.GetClassIndex()) + + // copy target to be compared later. + targetv := ds.GetClassAsStrings() + + test.Assert(t, "", NRows, ds.GetNRow(), true) + + // Build CART tree. 
+ CART, e := New(&ds, SplitMethodGini, 0) + if e != nil { + t.Fatal(e) + } + + fmt.Println("[cart_test] CART Tree:\n", CART) + + // Create test set + testset := tabula.Claset{} + _, e = dsv.SimpleRead(fds, &testset) + + if nil != e { + t.Fatal(e) + } + + testset.GetClassColumn().ClearValues() + + // Classifiy test set + e = CART.ClassifySet(&testset) + if nil != e { + t.Fatal(e) + } + + test.Assert(t, "", targetv, testset.GetClassAsStrings(), true) +} diff --git a/lib/mining/classifier/cart/node.go b/lib/mining/classifier/cart/node.go new file mode 100644 index 00000000..b64dd13c --- /dev/null +++ b/lib/mining/classifier/cart/node.go @@ -0,0 +1,44 @@ +// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cart + +import ( + "fmt" + "reflect" +) + +// +// NodeValue of tree in CART. +// +type NodeValue struct { + // Class of leaf node. + Class string + // SplitAttrName define the name of attribute which cause the split. + SplitAttrName string + // IsLeaf define whether node is a leaf or not. + IsLeaf bool + // IsContinu define whether the node split is continuous or discrete. + IsContinu bool + // Size define number of sample that this node hold before splitting. + Size int + // SplitAttrIdx define the attribute which cause the split. + SplitAttrIdx int + // SplitV define the split value. + SplitV interface{} +} + +// +// String will return the value of node for printable. +// +func (nodev *NodeValue) String() (s string) { + if nodev.IsLeaf { + s = fmt.Sprintf("Class: %s", nodev.Class) + } else { + s = fmt.Sprintf("(SplitValue: %v)", + reflect.ValueOf(nodev.SplitV)) + } + + return s +} diff --git a/lib/mining/classifier/cm.go b/lib/mining/classifier/cm.go new file mode 100644 index 00000000..0dc2ee05 --- /dev/null +++ b/lib/mining/classifier/cm.go @@ -0,0 +1,442 @@ +// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package classifier + +import ( + "fmt" + "strconv" + + "github.com/shuLhan/share/lib/tabula" +) + +// +// CM represent the matrix of classification. +// +type CM struct { + tabula.Dataset + // rowNames contain name in each row. + rowNames []string + // nSamples contain number of class. + nSamples int64 + // nTrue contain number of true positive and negative. + nTrue int64 + // nFalse contain number of false positive and negative. + nFalse int64 + + // tpIds contain index of true-positive samples. + tpIds []int + // fpIds contain index of false-positive samples. + fpIds []int + // tnIds contain index of true-negative samples. + tnIds []int + // fnIds contain index of false-negative samples. + fnIds []int +} + +// +// initByNumeric will initialize confusion matrix using numeric value space. +// +func (cm *CM) initByNumeric(vs []int64) { + var colTypes []int + var colNames []string + + for _, v := range vs { + vstr := strconv.FormatInt(v, 10) + colTypes = append(colTypes, tabula.TInteger) + colNames = append(colNames, vstr) + cm.rowNames = append(cm.rowNames, vstr) + } + + // class error column + colTypes = append(colTypes, tabula.TReal) + colNames = append(colNames, "class_error") + + cm.Dataset.Init(tabula.DatasetModeMatrix, colTypes, colNames) +} + +// +// ComputeStrings will calculate confusion matrix using targets and predictions +// class values. +// +func (cm *CM) ComputeStrings(valueSpace, targets, predictions []string) { + cm.init(valueSpace) + + for x, target := range valueSpace { + col := cm.GetColumn(x) + + for _, predict := range valueSpace { + cnt := cm.countTargetPrediction(target, predict, + targets, predictions) + + col.PushBack(tabula.NewRecordInt(cnt)) + } + + cm.PushColumnToRows(*col) + } + + cm.computeClassError() +} + +// +// ComputeNumeric will calculate confusion matrix using targets and predictions +// values. 
+// +func (cm *CM) ComputeNumeric(vs, actuals, predictions []int64) { + cm.initByNumeric(vs) + + for x, act := range vs { + col := cm.GetColumn(x) + + for _, pred := range vs { + cnt := cm.countNumeric(act, pred, actuals, predictions) + + rec := tabula.NewRecordInt(cnt) + col.PushBack(rec) + } + + cm.PushColumnToRows(*col) + } + + cm.computeClassError() +} + +// +// create will initialize confusion matrix using value space. +// +func (cm *CM) init(valueSpace []string) { + var colTypes []int + var colNames []string + + for _, v := range valueSpace { + colTypes = append(colTypes, tabula.TInteger) + colNames = append(colNames, v) + cm.rowNames = append(cm.rowNames, v) + } + + // class error column + colTypes = append(colTypes, tabula.TReal) + colNames = append(colNames, "class_error") + + cm.Dataset.Init(tabula.DatasetModeMatrix, colTypes, colNames) +} + +// +// countTargetPrediction will count and return number of true positive or false +// positive in predictions using targets values. +// +func (cm *CM) countTargetPrediction(target, predict string, + targets, predictions []string, +) ( + cnt int64, +) { + predictslen := len(predictions) + + for x, v := range targets { + // In case out of range, where predictions length less than + // targets length. + if x >= predictslen { + break + } + if v != target { + continue + } + if predict == predictions[x] { + cnt++ + } + } + return +} + +// +// countNumeric will count and return number of `pred` in predictions where +// actuals value is `act`. +// +func (cm *CM) countNumeric(act, pred int64, actuals, predictions []int64) ( + cnt int64, +) { + // Find minimum length to mitigate out-of-range loop. + minlen := len(actuals) + if len(predictions) < minlen { + minlen = len(predictions) + } + + for x := 0; x < minlen; x++ { + if actuals[x] != act { + continue + } + if predictions[x] != pred { + continue + } + cnt++ + } + return cnt +} + +// +// computeClassError will compute the classification error in matrix. 
+// +func (cm *CM) computeClassError() { + var tp, fp int64 + + cm.nSamples = 0 + cm.nFalse = 0 + + classcol := cm.GetNColumn() - 1 + col := cm.GetColumnClassError() + rows := cm.GetDataAsRows() + for x, row := range *rows { + for y, cell := range *row { + if y == classcol { + break + } + if x == y { + tp = cell.Integer() + } else { + fp += cell.Integer() + } + } + + nSamplePerRow := tp + fp + errv := float64(fp) / float64(nSamplePerRow) + col.PushBack(tabula.NewRecordReal(errv)) + + cm.nSamples += nSamplePerRow + cm.nTrue += tp + cm.nFalse += fp + } + + cm.PushColumnToRows(*col) +} + +// +// GroupIndexPredictions given index of samples, group the samples by their +// class of prediction. For example, +// +// sampleIds: [0, 1, 2, 3, 4, 5] +// actuals: [1, 1, 0, 0, 1, 0] +// predictions: [1, 0, 1, 0, 1, 1] +// +// This function will group the index by true-positive, false-positive, +// true-negative, and false-negative, which result in, +// +// true-positive indices: [0, 4] +// false-positive indices: [2, 5] +// true-negative indices: [3] +// false-negative indices: [1] +// +// This function assume that positive value as "1" and negative value as "0". +// +func (cm *CM) GroupIndexPredictions(sampleIds []int, + actuals, predictions []int64, +) { + // Reset indices. + cm.tpIds = nil + cm.fpIds = nil + cm.tnIds = nil + cm.fnIds = nil + + // Make sure we are not out-of-range when looping, always pick the + // minimum length between the three parameters. 
+ min := len(sampleIds) + if len(actuals) < min { + min = len(actuals) + } + if len(predictions) < min { + min = len(predictions) + } + + for x := 0; x < min; x++ { + if actuals[x] == 1 { + if predictions[x] == 1 { + cm.tpIds = append(cm.tpIds, sampleIds[x]) + } else { + cm.fnIds = append(cm.fnIds, sampleIds[x]) + } + } else { + if predictions[x] == 1 { + cm.fpIds = append(cm.fpIds, sampleIds[x]) + } else { + cm.tnIds = append(cm.tnIds, sampleIds[x]) + } + } + } +} + +// +// GroupIndexPredictionsStrings is an alternative to GroupIndexPredictions +// which work with string class. +// +func (cm *CM) GroupIndexPredictionsStrings(sampleIds []int, + actuals, predictions []string, +) { + if len(sampleIds) <= 0 { + return + } + + // Reset indices. + cm.tpIds = nil + cm.fpIds = nil + cm.tnIds = nil + cm.fnIds = nil + + // Make sure we are not out-of-range when looping, always pick the + // minimum length between the three parameters. + min := len(sampleIds) + if len(actuals) < min { + min = len(actuals) + } + if len(predictions) < min { + min = len(predictions) + } + + for x := 0; x < min; x++ { + if actuals[x] == "1" { + if predictions[x] == "1" { + cm.tpIds = append(cm.tpIds, sampleIds[x]) + } else { + cm.fnIds = append(cm.fnIds, sampleIds[x]) + } + } else { + if predictions[x] == "1" { + cm.fpIds = append(cm.fpIds, sampleIds[x]) + } else { + cm.tnIds = append(cm.tnIds, sampleIds[x]) + } + } + } +} + +// +// GetColumnClassError return the last column which is the column that contain +// the error of classification. 
+// +func (cm *CM) GetColumnClassError() *tabula.Column { + return cm.GetColumn(cm.GetNColumn() - 1) +} + +// +// GetTrueRate return true-positive rate in term of +// +// true-positive / (true-positive + false-positive) +// +func (cm *CM) GetTrueRate() float64 { + return float64(cm.nTrue) / float64(cm.nTrue+cm.nFalse) +} + +// +// GetFalseRate return false-positive rate in term of, +// +// false-positive / (false-positive + true negative) +// +func (cm *CM) GetFalseRate() float64 { + return float64(cm.nFalse) / float64(cm.nTrue+cm.nFalse) +} + +// +// TP return number of true-positive in confusion matrix. +// +func (cm *CM) TP() int { + row := cm.GetRow(0) + if row == nil { + return 0 + } + + v, _ := row.GetIntAt(0) + return int(v) +} + +// +// FP return number of false-positive in confusion matrix. +// +func (cm *CM) FP() int { + row := cm.GetRow(0) + if row == nil { + return 0 + } + + v, _ := row.GetIntAt(1) + return int(v) +} + +// +// FN return number of false-negative. +// +func (cm *CM) FN() int { + row := cm.GetRow(1) + if row == nil { + return 0 + } + v, _ := row.GetIntAt(0) + return int(v) +} + +// +// TN return number of true-negative. +// +func (cm *CM) TN() int { + row := cm.GetRow(1) + if row == nil { + return 0 + } + v, _ := row.GetIntAt(1) + return int(v) +} + +// +// TPIndices return indices of all true-positive samples. +// +func (cm *CM) TPIndices() []int { + return cm.tpIds +} + +// +// FNIndices return indices of all false-negative samples. +// +func (cm *CM) FNIndices() []int { + return cm.fnIds +} + +// +// FPIndices return indices of all false-positive samples. +// +func (cm *CM) FPIndices() []int { + return cm.fpIds +} + +// +// TNIndices return indices of all true-negative samples. +// +func (cm *CM) TNIndices() []int { + return cm.tnIds +} + +// +// String will return the output of confusion matrix in table like format. +// +func (cm *CM) String() (s string) { + s += "Confusion Matrix:\n" + + // Row header: column names. 
+ s += "\t" + for _, col := range cm.GetColumnsName() { + s += col + "\t" + } + s += "\n" + + rows := cm.GetDataAsRows() + for x, row := range *rows { + s += cm.rowNames[x] + "\t" + + for _, v := range *row { + s += v.String() + "\t" + } + s += "\n" + } + + s += fmt.Sprintf("TP-FP indices %d %d\n", len(cm.tpIds), len(cm.fpIds)) + s += fmt.Sprintf("FN-TN indices %d %d\n", len(cm.fnIds), len(cm.tnIds)) + + return +} diff --git a/lib/mining/classifier/cm_test.go b/lib/mining/classifier/cm_test.go new file mode 100644 index 00000000..6fd47d93 --- /dev/null +++ b/lib/mining/classifier/cm_test.go @@ -0,0 +1,69 @@ +// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package classifier + +import ( + "fmt" + "testing" + + "github.com/shuLhan/share/lib/test" +) + +func TestComputeNumeric(t *testing.T) { + actuals := []int64{1, 1, 1, 0, 0, 0, 0} + predics := []int64{1, 1, 0, 0, 0, 0, 1} + vs := []int64{1, 0} + exp := []int{2, 1, 3, 1} + + cm := &CM{} + + cm.ComputeNumeric(vs, actuals, predics) + + test.Assert(t, "", exp[0], cm.TP(), true) + test.Assert(t, "", exp[1], cm.FN(), true) + test.Assert(t, "", exp[2], cm.TN(), true) + test.Assert(t, "", exp[3], cm.FP(), true) + + fmt.Println(cm) +} + +func TestComputeStrings(t *testing.T) { + actuals := []string{"1", "1", "1", "0", "0", "0", "0"} + predics := []string{"1", "1", "0", "0", "0", "0", "1"} + vs := []string{"1", "0"} + exp := []int{2, 1, 3, 1} + + cm := &CM{} + + cm.ComputeStrings(vs, actuals, predics) + + test.Assert(t, "", exp[0], cm.TP(), true) + test.Assert(t, "", exp[1], cm.FN(), true) + test.Assert(t, "", exp[2], cm.TN(), true) + test.Assert(t, "", exp[3], cm.FP(), true) + + fmt.Println(cm) +} + +func TestGroupIndexPredictions(t *testing.T) { + testIds := []int{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} + actuals := []int64{1, 1, 1, 1, 0, 0, 0, 0, 0, 0} + predics := []int64{1, 1, 0, 1, 0, 0, 0, 0, 1, 0} + 
exp := [][]int{ + {0, 1, 3}, // tp + {2}, // fn + {8}, // fp + {4, 5, 6, 7, 9}, // tn + } + + cm := &CM{} + + cm.GroupIndexPredictions(testIds, actuals, predics) + + test.Assert(t, "", exp[0], cm.TPIndices(), true) + test.Assert(t, "", exp[1], cm.FNIndices(), true) + test.Assert(t, "", exp[2], cm.FPIndices(), true) + test.Assert(t, "", exp[3], cm.TNIndices(), true) +} diff --git a/lib/mining/classifier/crf/crf.go b/lib/mining/classifier/crf/crf.go new file mode 100644 index 00000000..3df7dd83 --- /dev/null +++ b/lib/mining/classifier/crf/crf.go @@ -0,0 +1,470 @@ +// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +// Package crf implement the cascaded random forest algorithm, proposed by +// Baumann et.al in their paper: +// +// Baumann, Florian, et al. "Cascaded Random Forest for Fast Object +// Detection." Image Analysis. Springer Berlin Heidelberg, 2013. 131-142. +// +// +package crf + +import ( + "errors" + "fmt" + "math" + "sort" + + "github.com/shuLhan/share/lib/debug" + "github.com/shuLhan/share/lib/mining/classifier" + "github.com/shuLhan/share/lib/mining/classifier/rf" + "github.com/shuLhan/share/lib/numbers" + libstrings "github.com/shuLhan/share/lib/strings" + "github.com/shuLhan/share/lib/tabula" +) + +const ( + tag = "[crf]" + + // DefStage default number of stage + DefStage = 200 + // DefTPRate default threshold for true-positive rate. + DefTPRate = 0.9 + // DefTNRate default threshold for true-negative rate. + DefTNRate = 0.7 + + // DefNumTree default number of tree. + DefNumTree = 1 + // DefPercentBoot default percentage of sample that will be used for + // bootstraping a tree. + DefPercentBoot = 66 + // DefPerfFile default performance file output. + DefPerfFile = "crf.perf" + // DefStatFile default statistic file output. + DefStatFile = "crf.stat" +) + +var ( + // ErrNoInput will tell you when no input is given. 
+ ErrNoInput = errors.New("rf: input samples is empty") +) + +// +// Runtime define the cascaded random forest runtime input and output. +// +type Runtime struct { + // Runtime embed common fields for classifier. + classifier.Runtime + + // NStage number of stage. + NStage int `json:"NStage"` + // TPRate threshold for true positive rate per stage. + TPRate float64 `json:"TPRate"` + // TNRate threshold for true negative rate per stage. + TNRate float64 `json:"TNRate"` + + // NTree number of tree in each stage. + NTree int `json:"NTree"` + // NRandomFeature number of features used to split the dataset. + NRandomFeature int `json:"NRandomFeature"` + // PercentBoot percentage of bootstrap. + PercentBoot int `json:"PercentBoot"` + + // forests contain forest for each stage. + forests []*rf.Runtime + // weights contain weight for each stage. + weights []float64 + // tnset contain sample of all true-negative in each iteration. + tnset *tabula.Claset +} + +// +// New create and return new input for cascaded random-forest. +// +func New(nstage, ntree, percentboot, nfeature int, + tprate, tnrate float64, + samples tabula.ClasetInterface, +) ( + crf *Runtime, +) { + crf = &Runtime{ + NStage: nstage, + NTree: ntree, + PercentBoot: percentboot, + NRandomFeature: nfeature, + TPRate: tprate, + TNRate: tnrate, + } + + return crf +} + +// +// AddForest will append new forest. +// +func (crf *Runtime) AddForest(forest *rf.Runtime) { + crf.forests = append(crf.forests, forest) +} + +// +// Initialize will check crf inputs and set it to default values if its +// invalid. 
//
func (crf *Runtime) Initialize(samples tabula.ClasetInterface) error {
	if crf.NStage <= 0 {
		crf.NStage = DefStage
	}
	if crf.TPRate <= 0 || crf.TPRate >= 1 {
		crf.TPRate = DefTPRate
	}
	if crf.TNRate <= 0 || crf.TNRate >= 1 {
		crf.TNRate = DefTNRate
	}
	if crf.NTree <= 0 {
		crf.NTree = DefNumTree
	}
	if crf.PercentBoot <= 0 {
		crf.PercentBoot = DefPercentBoot
	}
	if crf.NRandomFeature <= 0 {
		// Default to square-root of the number of non-class columns.
		ncol := samples.GetNColumn() - 1
		crf.NRandomFeature = int(math.Sqrt(float64(ncol)))
	}
	if crf.PerfFile == "" {
		crf.PerfFile = DefPerfFile
	}
	if crf.StatFile == "" {
		crf.StatFile = DefStatFile
	}
	// tnset starts as a clone of the whole sample set; it is later
	// filled only with true-negative rows by deleteTrueNegative.
	crf.tnset = samples.Clone().(*tabula.Claset)

	return crf.Runtime.Initialize()
}

//
// Build given a sample dataset, build the stage with randomforest.
//
// It returns ErrNoInput if `samples` is nil.
//
func (crf *Runtime) Build(samples tabula.ClasetInterface) (e error) {
	if samples == nil {
		return ErrNoInput
	}

	e = crf.Initialize(samples)
	if e != nil {
		return
	}

	fmt.Println(tag, "Training samples:", samples)
	fmt.Println(tag, "Sample (one row):", samples.GetRow(0))
	fmt.Println(tag, "Config:", crf)

	for x := 0; x < crf.NStage; x++ {
		if debug.Value >= 1 {
			fmt.Println(tag, "Stage #", x)
		}

		forest, e := crf.createForest(samples)
		if e != nil {
			return e
		}

		e = crf.finalizeStage(forest)
		if e != nil {
			return e
		}
	}

	return crf.Finalize()
}

//
// createForest will create and return a forest and run the training `samples`
// on it.
//
// Algorithm,
// (1) Initialize forest.
// (2) For 0 to maximum number of tree in forest,
// (2.1) grow one tree until success.
// (2.2) If tree tp-rate and tn-rate greater than threshold, stop growing.
// (3) Calculate weight.
// (4) TODO: Move true-negative from samples. The collection of true-negative
// will be used again to test the model and after test and the sample with FP
// will be moved to training samples again.
// (5) Refill samples with false-positive taken from the TN-set.
//
func (crf *Runtime) createForest(samples tabula.ClasetInterface) (
	forest *rf.Runtime, e error,
) {
	var cm *classifier.CM
	var stat *classifier.Stat

	fmt.Println(tag, "Forest samples:", samples)

	// (1)
	forest = &rf.Runtime{
		Runtime: classifier.Runtime{
			RunOOB: true,
		},
		NTree:          crf.NTree,
		NRandomFeature: crf.NRandomFeature,
	}

	e = forest.Initialize(samples)
	if e != nil {
		return nil, e
	}

	// (2)
	for t := 0; t < crf.NTree; t++ {
		if debug.Value >= 2 {
			fmt.Println(tag, "Tree #", t)
		}

		// (2.1)
		// NOTE(review): this retries GrowTree until it succeeds; if
		// growing can never succeed on these samples this loop will
		// not terminate — confirm GrowTree errors are transient.
		for {
			cm, stat, e = forest.GrowTree(samples)
			if e == nil {
				break
			}
		}

		// (2.2)
		if stat.TPRate > crf.TPRate &&
			stat.TNRate > crf.TNRate {
			break
		}
	}

	e = forest.Finalize()
	if e != nil {
		return nil, e
	}

	// (3)
	crf.computeWeight(stat)

	if debug.Value >= 1 {
		fmt.Println(tag, "Weight:", stat.FMeasure)
	}

	// (4)
	crf.deleteTrueNegative(samples, cm)

	// (5)
	crf.runTPSet(samples)

	samples.RecountMajorMinor()

	return forest, nil
}

//
// finalizeStage save forest and write the forest statistic to file.
//
func (crf *Runtime) finalizeStage(forest *rf.Runtime) (e error) {
	stat := forest.StatTotal()
	stat.ID = int64(len(crf.forests))

	e = crf.WriteOOBStat(stat)
	if e != nil {
		return e
	}

	crf.AddStat(stat)
	crf.ComputeStatTotal(stat)

	if debug.Value >= 1 {
		crf.PrintStatTotal(nil)
	}

	// Keep the trained forest as the next stage of the cascade.
	crf.AddForest(forest)

	return nil
}

//
// computeWeight will compute the weight of stage based on F-measure of the
// last tree in forest.
//
func (crf *Runtime) computeWeight(stat *classifier.Stat) {
	crf.weights = append(crf.weights, math.Exp(stat.FMeasure))
}

//
// deleteTrueNegative will delete all samples data where their row index is in
// true-negative values in confusion matrix and move it to TN-set.
//
// (1) Move true negative to tnset on the first iteration, on the next
// iteration it will be full deleted.
// (2) Delete TN from sample set one-by-one with offset, to make sure we
// are not deleting with wrong index.
//
func (crf *Runtime) deleteTrueNegative(samples tabula.ClasetInterface,
	cm *classifier.CM,
) {
	var row *tabula.Row

	tnids := cm.TNIndices()
	sort.Ints(tnids)

	// (1) weights has exactly one entry only after the first stage.
	if len(crf.weights) <= 1 {
		for _, i := range tnids {
			crf.tnset.PushRow(samples.GetRow(i))
		}
	}

	// (2) `i - x` compensates for the rows already removed before
	// index i (tnids is sorted ascending).
	c := 0
	for x, i := range tnids {
		row = samples.DeleteRow(i - x)
		if row != nil {
			c++
		}
	}

	if debug.Value >= 1 {
		fmt.Println(tag, "# TN", len(tnids), "# deleted", c)
	}
}

//
// refillWithFP will copy the false-positive data in training set `tnset`
// and append it to `samples`.
//
func (crf *Runtime) refillWithFP(samples, tnset tabula.ClasetInterface,
	cm *classifier.CM,
) {
	// Get and sort FP.
	fpids := cm.FPIndices()
	sort.Ints(fpids)

	// Move FP samples from TN-set to training set samples.
	for _, i := range fpids {
		samples.PushRow(tnset.GetRow(i))
	}

	// Delete FP from training set; `i - x` compensates for earlier
	// deletions (fpids is sorted ascending).
	var row *tabula.Row
	c := 0
	for x, i := range fpids {
		row = tnset.DeleteRow(i - x)
		if row != nil {
			c++
		}
	}

	if debug.Value >= 1 {
		fmt.Println(tag, "# FP", len(fpids), "# refilled", c)
	}
}

//
// runTPSet will run true-positive set into trained stage, to get the
// false-positive. The FP samples will be added to training set.
//
func (crf *Runtime) runTPSet(samples tabula.ClasetInterface) {
	// Skip the first stage, because we just got tnset from them.
	if len(crf.weights) <= 1 {
		return
	}

	tnIds := numbers.IntCreateSeq(0, crf.tnset.Len()-1)
	_, cm, _ := crf.ClassifySetByWeight(crf.tnset, tnIds)

	crf.refillWithFP(samples, crf.tnset, cm)
}

//
// ClassifySetByWeight will classify each instance in samples by weight
// with respect to its single performance.
+// +// Algorithm, +// (1) For each instance in samples, +// (1.1) for each stage, +// (1.1.1) collect votes for instance in current stage. +// (1.1.2) Compute probabilities of each classes in votes. +// +// prob_class = count_of_class / total_votes +// +// (1.1.3) Compute total of probabilites times of stage weight. +// +// stage_prob = prob_class * stage_weight +// +// (1.2) Divide each class stage probabilites with +// +// stage_prob = stage_prob / +// (sum_of_all_weights * number_of_tree_in_forest) +// +// (1.3) Select class label with highest probabilites. +// (1.4) Save stage probabilities for positive class. +// (2) Compute confusion matrix. +// +func (crf *Runtime) ClassifySetByWeight(samples tabula.ClasetInterface, + sampleIds []int, +) ( + predicts []string, cm *classifier.CM, probs []float64, +) { + stat := classifier.Stat{} + stat.Start() + + vs := samples.GetClassValueSpace() + stageProbs := make([]float64, len(vs)) + stageSumProbs := make([]float64, len(vs)) + sumWeights := numbers.Floats64Sum(crf.weights) + + // (1) + rows := samples.GetDataAsRows() + for _, row := range *rows { + for y := range stageSumProbs { + stageSumProbs[y] = 0 + } + + // (1.1) + for y, forest := range crf.forests { + // (1.1.1) + votes := forest.Votes(row, -1) + + // (1.1.2) + probs := libstrings.FrequencyOfTokens(votes, vs, false) + + // (1.1.3) + for z := range probs { + stageSumProbs[z] += probs[z] + stageProbs[z] += probs[z] * crf.weights[y] + } + } + + // (1.2) + stageWeight := sumWeights * float64(crf.NTree) + + for x := range stageProbs { + stageProbs[x] = stageProbs[x] / stageWeight + } + + // (1.3) + _, maxi, ok := numbers.Floats64FindMax(stageProbs) + if ok { + predicts = append(predicts, vs[maxi]) + } + + probs = append(probs, stageSumProbs[0]/ + float64(len(crf.forests))) + } + + // (2) + actuals := samples.GetClassAsStrings() + cm = crf.ComputeCM(sampleIds, vs, actuals, predicts) + + crf.ComputeStatFromCM(&stat, cm) + stat.End() + + _ = stat.Write(crf.StatFile) + 
+ return predicts, cm, probs +} diff --git a/lib/mining/classifier/crf/crf_test.go b/lib/mining/classifier/crf/crf_test.go new file mode 100644 index 00000000..4e530591 --- /dev/null +++ b/lib/mining/classifier/crf/crf_test.go @@ -0,0 +1,92 @@ +// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package crf + +import ( + "fmt" + "os" + "testing" + + "github.com/shuLhan/share/lib/dsv" + "github.com/shuLhan/share/lib/mining/classifier" + "github.com/shuLhan/share/lib/tabula" +) + +var ( + SampleFile string + PerfFile string + StatFile string + NStage = 200 + NTree = 1 +) + +func runCRF(t *testing.T) { + // read trainingset. + samples := tabula.Claset{} + _, e := dsv.SimpleRead(SampleFile, &samples) + if e != nil { + t.Fatal(e) + } + + nbag := (samples.Len() * 63) / 100 + train, test, _, testIds := tabula.RandomPickRows(&samples, nbag, false) + + trainset := train.(tabula.ClasetInterface) + testset := test.(tabula.ClasetInterface) + + crfRuntime := Runtime{ + Runtime: classifier.Runtime{ + StatFile: StatFile, + PerfFile: PerfFile, + }, + NStage: NStage, + NTree: NTree, + } + + e = crfRuntime.Build(trainset) + if e != nil { + t.Fatal(e) + } + + testset.RecountMajorMinor() + fmt.Println("Testset:", testset) + + predicts, cm, probs := crfRuntime.ClassifySetByWeight(testset, testIds) + + fmt.Println("Confusion matrix:", cm) + + crfRuntime.Performance(testset, predicts, probs) + e = crfRuntime.WritePerformance() + if e != nil { + t.Fatal(e) + } +} + +func TestPhoneme200_1(t *testing.T) { + SampleFile = "../../testdata/phoneme/phoneme.dsv" + PerfFile = "phoneme_200_1.perf" + StatFile = "phoneme_200_1.stat" + + runCRF(t) +} + +func TestPhoneme200_10(t *testing.T) { + SampleFile = "../../testdata/phoneme/phoneme.dsv" + PerfFile = "phoneme_200_10.perf" + StatFile = "phoneme_200_10.stat" + NTree = 10 + + runCRF(t) +} + +func TestMain(m *testing.M) { + 
envTestCRF := os.Getenv("TEST_CRF") + + if len(envTestCRF) == 0 { + os.Exit(0) + } + + os.Exit(m.Run()) +} diff --git a/lib/mining/classifier/rf/rf.go b/lib/mining/classifier/rf/rf.go new file mode 100644 index 00000000..86595efd --- /dev/null +++ b/lib/mining/classifier/rf/rf.go @@ -0,0 +1,363 @@ +// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +// Package rf implement ensemble of classifiers using random forest +// algorithm by Breiman and Cutler. +// +// Breiman, Leo. "Random forests." Machine learning 45.1 (2001): 5-32. +// +// The implementation is based on various sources and using author experience. +// +package rf + +import ( + "errors" + "fmt" + "math" + + "github.com/shuLhan/share/lib/debug" + "github.com/shuLhan/share/lib/mining/classifier" + "github.com/shuLhan/share/lib/mining/classifier/cart" + "github.com/shuLhan/share/lib/numbers" + libstrings "github.com/shuLhan/share/lib/strings" + "github.com/shuLhan/share/lib/tabula" +) + +const ( + tag = "[rf]" + + // DefNumTree default number of tree. + DefNumTree = 100 + + // DefPercentBoot default percentage of sample that will be used for + // bootstraping a tree. + DefPercentBoot = 66 + + // DefOOBStatsFile default statistic file output. + DefOOBStatsFile = "rf.oob.stat" + + // DefPerfFile default performance file output. + DefPerfFile = "rf.perf" + + // DefStatFile default statistic file. + DefStatFile = "rf.stat" +) + +var ( + // ErrNoInput will tell you when no input is given. + ErrNoInput = errors.New("rf: input samples is empty") +) + +// +// Runtime contains input and output configuration when generating random forest. +// +type Runtime struct { + // Runtime embed common fields for classifier. + classifier.Runtime + + // NTree number of tree in forest. + NTree int `json:"NTree"` + // NRandomFeature number of feature randomly selected for each tree. 
+ NRandomFeature int `json:"NRandomFeature"` + // PercentBoot percentage of sample for bootstraping. + PercentBoot int `json:"PercentBoot"` + + // nSubsample number of samples used for bootstraping. + nSubsample int + // trees contain all tree in the forest. + trees []cart.Runtime + // bagIndices contain list of index of selected samples at bootstraping + // for book-keeping. + bagIndices [][]int +} + +// +// Trees return all tree in forest. +// +func (forest *Runtime) Trees() []cart.Runtime { + return forest.trees +} + +// +// AddCartTree add tree to forest +// +func (forest *Runtime) AddCartTree(tree cart.Runtime) { + forest.trees = append(forest.trees, tree) +} + +// +// AddBagIndex add bagging index for book keeping. +// +func (forest *Runtime) AddBagIndex(bagIndex []int) { + forest.bagIndices = append(forest.bagIndices, bagIndex) +} + +// +// Initialize will check forest inputs and set it to default values if invalid. +// +// It will also calculate number of random samples for each tree using, +// +// number-of-sample * percentage-of-bootstrap +// +// +func (forest *Runtime) Initialize(samples tabula.ClasetInterface) error { + if forest.NTree <= 0 { + forest.NTree = DefNumTree + } + if forest.PercentBoot <= 0 { + forest.PercentBoot = DefPercentBoot + } + if forest.NRandomFeature <= 0 { + // Set default value to square-root of features. + ncol := samples.GetNColumn() - 1 + forest.NRandomFeature = int(math.Sqrt(float64(ncol))) + } + if forest.OOBStatsFile == "" { + forest.OOBStatsFile = DefOOBStatsFile + } + if forest.PerfFile == "" { + forest.PerfFile = DefPerfFile + } + if forest.StatFile == "" { + forest.StatFile = DefStatFile + } + + forest.nSubsample = int(float32(samples.GetNRow()) * + (float32(forest.PercentBoot) / 100.0)) + + return forest.Runtime.Initialize() +} + +// +// Build the forest using samples dataset. +// +// Algorithm, +// +// (0) Recheck input value: number of tree, percentage bootstrap, etc; and +// Open statistic file output. 
// (1) For 0 to NTree,
// (1.1) create a new tree, repeating until the tree has been grown
// successfully.
// (2) Compute and write the total statistic.
//
func (forest *Runtime) Build(samples tabula.ClasetInterface) (e error) {
	// check input samples
	if samples == nil {
		return ErrNoInput
	}

	// (0)
	e = forest.Initialize(samples)
	if e != nil {
		return
	}

	fmt.Println(tag, "Training set    :", samples)
	fmt.Println(tag, "Sample (one row):", samples.GetRow(0))
	fmt.Println(tag, "Forest config   :", forest)

	// (1)
	for t := 0; t < forest.NTree; t++ {
		if debug.Value >= 1 {
			fmt.Println(tag, "tree #", t)
		}

		// (1.1)
		// NOTE(review): retries until GrowTree succeeds; if growing
		// always fails on these samples this loop never terminates.
		for {
			_, _, e = forest.GrowTree(samples)
			if e == nil {
				break
			}

			fmt.Println(tag, "error:", e)
		}
	}

	// (2)
	return forest.Finalize()
}

//
// GrowTree build a new tree in forest, return OOB error value or error if tree
// can not grow.
//
// Algorithm,
//
// (1) Select random samples with replacement, also with OOB.
// (2) Build tree using CART, without pruning.
// (3) Add tree to forest.
// (4) Save index of random samples for calculating error rate later.
// (5) Run OOB on forest.
// (6) Calculate OOB error rate and statistic values.
//
func (forest *Runtime) GrowTree(samples tabula.ClasetInterface) (
	cm *classifier.CM, stat *classifier.Stat, e error,
) {
	stat = &classifier.Stat{}
	stat.ID = int64(len(forest.trees))
	stat.Start()

	// (1)
	bag, oob, bagIdx, oobIdx := tabula.RandomPickRows(
		samples.(tabula.DatasetInterface),
		forest.nSubsample, true)

	bagset := bag.(tabula.ClasetInterface)

	if debug.Value >= 2 {
		bagset.RecountMajorMinor()
		fmt.Println(tag, "Bagging:", bagset)
	}

	// (2) NOTE: the local `cart` shadows the imported package name from
	// here on; intentional but easy to misread.
	cart, e := cart.New(bagset, cart.SplitMethodGini,
		forest.NRandomFeature)
	if e != nil {
		return nil, nil, e
	}

	// (3)
	forest.AddCartTree(*cart)

	// (4)
	forest.AddBagIndex(bagIdx)

	// (5)
	if forest.RunOOB {
		oobset := oob.(tabula.ClasetInterface)
		_, cm, _ = forest.ClassifySet(oobset, oobIdx)

		forest.AddOOBCM(cm)
	}

	stat.End()

	if debug.Value >= 3 && forest.RunOOB {
		fmt.Println(tag, "Elapsed time (s):", stat.ElapsedTime)
	}

	forest.AddStat(stat)

	// (6)
	if forest.RunOOB {
		forest.ComputeStatFromCM(stat, cm)

		if debug.Value >= 2 {
			fmt.Println(tag, "OOB stat:", stat)
		}
	}

	forest.ComputeStatTotal(stat)
	e = forest.WriteOOBStat(stat)

	return cm, stat, e
}

//
// ClassifySet given a samples predict their class by running each sample in
// forest, and return their class prediction with confusion matrix.
// `samples` is the sample that will be predicted, `sampleIds` is the index of
// samples.
// If `sampleIds` is not nil, then sample index will be checked in each tree,
// if the sample is used for training, their vote is not counted.
//
// Algorithm,
//
// (0) Get value space (possible class values in dataset)
// (1) For each row in test-set,
// (1.1) collect votes in all trees,
// (1.2) select majority class vote, and
// (1.3) compute and save the actual class probabilities.
// (2) Compute confusion matrix from predictions.
// (3) Compute stat from confusion matrix.
// (4) Write the stat to file only if sampleIds is empty, which mean its run
// not from OOB set.
//
func (forest *Runtime) ClassifySet(samples tabula.ClasetInterface,
	sampleIds []int,
) (
	predicts []string, cm *classifier.CM, probs []float64,
) {
	stat := classifier.Stat{}
	stat.Start()

	if len(sampleIds) <= 0 {
		fmt.Println(tag, "Classify set:", samples)
		fmt.Println(tag, "Classify set sample (one row):",
			samples.GetRow(0))
	}

	// (0)
	vs := samples.GetClassValueSpace()
	actuals := samples.GetClassAsStrings()
	sampleIdx := -1

	// (1)
	rows := samples.GetRows()
	for x, row := range *rows {
		// (1.1)
		if len(sampleIds) > 0 {
			sampleIdx = sampleIds[x]
		}
		votes := forest.Votes(row, sampleIdx)

		// (1.2)
		classProbs := libstrings.FrequencyOfTokens(votes, vs, false)

		_, idx, ok := numbers.Floats64FindMax(classProbs)

		if ok {
			predicts = append(predicts, vs[idx])
		}

		// (1.3) probability of the first class in the value space.
		probs = append(probs, classProbs[0])
	}

	// (2)
	cm = forest.ComputeCM(sampleIds, vs, actuals, predicts)

	// (3)
	forest.ComputeStatFromCM(&stat, cm)
	stat.End()

	if len(sampleIds) <= 0 {
		fmt.Println(tag, "CM:", cm)
		fmt.Println(tag, "Classifying stat:", stat)
		_ = stat.Write(forest.StatFile)
	}

	return predicts, cm, probs
}

//
// Votes will return votes, or classes, in each tree based on sample.
// If `sampleIdx` is non-negative, it will be checked against each tree's bag
// indices; when the sample was used for training that tree, the tree's vote
// is skipped.
//
// (1) If row is used to build the tree then skip it,
// (2) classify row in tree,
// (3) save tree class value.
+// +func (forest *Runtime) Votes(sample *tabula.Row, sampleIdx int) ( + votes []string, +) { + for x, tree := range forest.trees { + // (1) + if sampleIdx >= 0 { + exist := numbers.IntsIsExist(forest.bagIndices[x], + sampleIdx) + if exist { + continue + } + } + + // (2) + class := tree.Classify(sample) + + // (3) + votes = append(votes, class) + } + return votes +} diff --git a/lib/mining/classifier/rf/rf_bench_test.go b/lib/mining/classifier/rf/rf_bench_test.go new file mode 100644 index 00000000..eddd721c --- /dev/null +++ b/lib/mining/classifier/rf/rf_bench_test.go @@ -0,0 +1,22 @@ +// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package rf + +import ( + "testing" +) + +func BenchmarkPhoneme(b *testing.B) { + SampleDsvFile = "../../testdata/phoneme/phoneme.dsv" + OOBStatsFile = "phoneme.oob" + PerfFile = "phoneme.perf" + + MinFeature = 3 + MaxFeature = 4 + + for x := 0; x < b.N; x++ { + runRandomForest() + } +} diff --git a/lib/mining/classifier/rf/rf_test.go b/lib/mining/classifier/rf/rf_test.go new file mode 100644 index 00000000..ba96125b --- /dev/null +++ b/lib/mining/classifier/rf/rf_test.go @@ -0,0 +1,190 @@ +// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package rf + +import ( + "fmt" + "log" + "os" + "testing" + + "github.com/shuLhan/share/lib/dsv" + "github.com/shuLhan/share/lib/mining/classifier" + "github.com/shuLhan/share/lib/tabula" +) + +// Global options to run for each test. +var ( + // SampleDsvFile is the file that contain samples config. + SampleDsvFile string + // DoTest if its true then the dataset will splited into training and + // test set with random selection without replacement. + DoTest = false + // NTree number of tree to generate. 
+ NTree = 100 + // NBootstrap percentage of sample used as subsample. + NBootstrap = 66 + // MinFeature number of feature to begin with. + MinFeature = 1 + // MaxFeature maximum number of feature to test + MaxFeature = -1 + // RunOOB if its true the the OOB samples will be used to test the + // model in each iteration. + RunOOB = true + // OOBStatsFile is the file where OOB statistic will be saved. + OOBStatsFile string + // PerfFile is the file where performance statistic will be saved. + PerfFile string + // StatFile is the file where classifying statistic will be saved. + StatFile string +) + +func getSamples() (train, test tabula.ClasetInterface) { + samples := tabula.Claset{} + _, e := dsv.SimpleRead(SampleDsvFile, &samples) + if nil != e { + log.Fatal(e) + } + + if !DoTest { + return &samples, nil + } + + ntrain := int(float32(samples.Len()) * (float32(NBootstrap) / 100.0)) + + bag, oob, _, _ := tabula.RandomPickRows(&samples, ntrain, false) + + train = bag.(tabula.ClasetInterface) + test = oob.(tabula.ClasetInterface) + + train.SetClassIndex(samples.GetClassIndex()) + test.SetClassIndex(samples.GetClassIndex()) + + return train, test +} + +func runRandomForest() { + oobStatsFile := OOBStatsFile + perfFile := PerfFile + statFile := StatFile + + trainset, testset := getSamples() + + if MaxFeature < 0 { + MaxFeature = trainset.GetNColumn() + } + + for nfeature := MinFeature; nfeature < MaxFeature; nfeature++ { + // Add prefix to OOB stats file. + oobStatsFile = fmt.Sprintf("N%d.%s", nfeature, OOBStatsFile) + + // Add prefix to performance file. + perfFile = fmt.Sprintf("N%d.%s", nfeature, PerfFile) + + // Add prefix to stat file. + statFile = fmt.Sprintf("N%d.%s", nfeature, StatFile) + + // Create and build random forest. 
+ forest := Runtime{ + Runtime: classifier.Runtime{ + RunOOB: RunOOB, + OOBStatsFile: oobStatsFile, + PerfFile: perfFile, + StatFile: statFile, + }, + NTree: NTree, + NRandomFeature: nfeature, + PercentBoot: NBootstrap, + } + + e := forest.Build(trainset) + if e != nil { + log.Fatal(e) + } + + if DoTest { + predicts, _, probs := forest.ClassifySet(testset, nil) + + forest.Performance(testset, predicts, probs) + e = forest.WritePerformance() + if e != nil { + log.Fatal(e) + } + } + } +} + +func TestEnsemblingGlass(t *testing.T) { + SampleDsvFile = "../../testdata/forensic_glass/fgl.dsv" + RunOOB = false + OOBStatsFile = "glass.oob" + StatFile = "glass.stat" + PerfFile = "glass.perf" + DoTest = true + + runRandomForest() +} + +func TestEnsemblingIris(t *testing.T) { + SampleDsvFile = "../../testdata/iris/iris.dsv" + OOBStatsFile = "iris.oob" + + runRandomForest() +} + +func TestEnsemblingPhoneme(t *testing.T) { + SampleDsvFile = "../../testdata/phoneme/phoneme.dsv" + OOBStatsFile = "phoneme.oob.stat" + StatFile = "phoneme.stat" + PerfFile = "phoneme.perf" + + NTree = 200 + MinFeature = 3 + MaxFeature = 4 + RunOOB = false + DoTest = true + + runRandomForest() +} + +func TestEnsemblingSmotePhoneme(t *testing.T) { + SampleDsvFile = "../../resampling/smote/phoneme_smote.dsv" + OOBStatsFile = "phonemesmote.oob" + + MinFeature = 3 + MaxFeature = 4 + + runRandomForest() +} + +func TestEnsemblingLnsmotePhoneme(t *testing.T) { + SampleDsvFile = "../../resampling/lnsmote/phoneme_lnsmote.dsv" + OOBStatsFile = "phonemelnsmote.oob" + + MinFeature = 3 + MaxFeature = 4 + + runRandomForest() +} + +func TestWvc2010Lnsmote(t *testing.T) { + SampleDsvFile = "../../testdata/wvc2010lnsmote/wvc2010_features.lnsmote.dsv" + OOBStatsFile = "wvc2010lnsmote.oob" + + NTree = 1 + MinFeature = 5 + MaxFeature = 6 + + runRandomForest() +} + +func TestMain(m *testing.M) { + envTestRF := os.Getenv("TEST_RF") + if len(envTestRF) == 0 { + os.Exit(0) + } + + os.Exit(m.Run()) +} diff --git 
a/lib/mining/classifier/runtime.go b/lib/mining/classifier/runtime.go new file mode 100644 index 00000000..0f7a7755 --- /dev/null +++ b/lib/mining/classifier/runtime.go @@ -0,0 +1,443 @@ +// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package classifier + +import ( + "fmt" + "math" + + "github.com/shuLhan/share/lib/debug" + "github.com/shuLhan/share/lib/dsv" + "github.com/shuLhan/share/lib/numbers" + libstrings "github.com/shuLhan/share/lib/strings" + "github.com/shuLhan/share/lib/tabula" +) + +const ( + tag = "[classifier.runtime]" +) + +// +// Runtime define a generic type which provide common fields that can be +// embedded by the real classifier (e.g. RandomForest). +// +type Runtime struct { + // RunOOB if its true the OOB will be computed, default is false. + RunOOB bool `json:"RunOOB"` + + // OOBStatsFile is the file where OOB statistic will be written. + OOBStatsFile string `json:"OOBStatsFile"` + + // PerfFile is the file where statistic of performance will be written. + PerfFile string `json:"PerfFile"` + + // StatFile is the file where statistic of classifying samples will be + // written. + StatFile string `json:"StatFile"` + + // oobCms contain confusion matrix value for each OOB in iteration. + oobCms []CM + + // oobStats contain statistic of classifier for each OOB in iteration. + oobStats Stats + + // oobStatTotal contain total OOB statistic values. + oobStatTotal Stat + + // oobWriter contain file writer for statistic. + oobWriter *dsv.Writer + + // perfs contain performance statistic per sample, after classifying + // sample on classifier. + perfs Stats +} + +// +// Initialize will start the runtime for processing by saving start time and +// opening stats file. 
//
func (rt *Runtime) Initialize() error {
	rt.oobStatTotal.Start()

	return rt.OpenOOBStatsFile()
}

//
// Finalize finish the runtime, compute total statistic, write it to file, and
// close the file.
//
func (rt *Runtime) Finalize() (e error) {
	st := &rt.oobStatTotal

	st.End()
	st.ID = int64(len(rt.oobStats))

	e = rt.WriteOOBStat(st)
	if e != nil {
		return e
	}

	return rt.CloseOOBStatsFile()
}

//
// OOBStats return all statistic objects.
//
func (rt *Runtime) OOBStats() *Stats {
	return &rt.oobStats
}

//
// StatTotal return total statistic.
//
func (rt *Runtime) StatTotal() *Stat {
	return &rt.oobStatTotal
}

//
// AddOOBCM will append new confusion matrix.
//
func (rt *Runtime) AddOOBCM(cm *CM) {
	rt.oobCms = append(rt.oobCms, *cm)
}

//
// AddStat will append new classifier statistic data.
//
func (rt *Runtime) AddStat(stat *Stat) {
	rt.oobStats = append(rt.oobStats, stat)
}

//
// ComputeCM will compute confusion matrix of sample using value space, actual
// and prediction values.
//
func (rt *Runtime) ComputeCM(sampleIds []int,
	vs, actuals, predicts []string,
) (
	cm *CM,
) {
	cm = &CM{}

	cm.ComputeStrings(vs, actuals, predicts)
	cm.GroupIndexPredictionsStrings(sampleIds, actuals, predicts)

	if debug.Value >= 2 {
		fmt.Println(tag, cm)
	}

	return cm
}

//
// ComputeStatFromCM will compute statistic using confusion matrix.
//
// Rates are guarded against zero denominators; F-measure is computed as the
// harmonic mean of precision and recall (TPRate).
//
func (rt *Runtime) ComputeStatFromCM(stat *Stat, cm *CM) {

	stat.OobError = cm.GetFalseRate()

	// NOTE(review): the mean is taken from the running TOTAL error over
	// (n+1) stats, not from this stat's own error — confirm intended.
	stat.OobErrorMean = rt.oobStatTotal.OobError /
		float64(len(rt.oobStats)+1)

	stat.TP = int64(cm.TP())
	stat.FP = int64(cm.FP())
	stat.TN = int64(cm.TN())
	stat.FN = int64(cm.FN())

	t := float64(stat.TP + stat.FN)
	if t == 0 {
		stat.TPRate = 0
	} else {
		stat.TPRate = float64(stat.TP) / t
	}

	t = float64(stat.FP + stat.TN)
	if t == 0 {
		stat.FPRate = 0
	} else {
		stat.FPRate = float64(stat.FP) / t
	}

	t = float64(stat.FP + stat.TN)
	if t == 0 {
		stat.TNRate = 0
	} else {
		stat.TNRate = float64(stat.TN) / t
	}

	t = float64(stat.TP + stat.FP)
	if t == 0 {
		stat.Precision = 0
	} else {
		stat.Precision = float64(stat.TP) / t
	}

	// Harmonic mean; a zero precision or recall yields +Inf here and a
	// zero F-measure below.
	t = (1 / stat.Precision) + (1 / stat.TPRate)
	if t == 0 {
		stat.FMeasure = 0
	} else {
		stat.FMeasure = 2 / t
	}

	t = float64(stat.TP + stat.TN + stat.FP + stat.FN)
	if t == 0 {
		stat.Accuracy = 0
	} else {
		stat.Accuracy = float64(stat.TP+stat.TN) / t
	}

	if debug.Value >= 1 {
		rt.PrintOobStat(stat, cm)
		rt.PrintStat(stat)
	}
}

//
// ComputeStatTotal compute total statistic by accumulating `stat` into the
// running totals and re-deriving the rate fields.
//
func (rt *Runtime) ComputeStatTotal(stat *Stat) {
	if stat == nil {
		return
	}

	nstat := len(rt.oobStats)
	if nstat == 0 {
		return
	}

	t := &rt.oobStatTotal

	t.OobError += stat.OobError
	t.OobErrorMean = t.OobError / float64(nstat)
	t.TP += stat.TP
	t.FP += stat.FP
	t.TN += stat.TN
	t.FN += stat.FN

	total := float64(t.TP + t.FN)
	if total == 0 {
		t.TPRate = 0
	} else {
		t.TPRate = float64(t.TP) / total
	}

	total = float64(t.FP + t.TN)
	if total == 0 {
		t.FPRate = 0
	} else {
		t.FPRate = float64(t.FP) / total
	}

	total = float64(t.FP + t.TN)
	if total == 0 {
		t.TNRate = 0
	} else {
		t.TNRate = float64(t.TN) / total
	}

	total = float64(t.TP + t.FP)
	if total == 0 {
		t.Precision = 0
	} else {
		t.Precision = float64(t.TP) / total
	}

	total = (1 / t.Precision) + (1 / t.TPRate)
	if total == 0 {
		t.FMeasure = 0
	} else {
		t.FMeasure = 2 / total
	}

	total = float64(t.TP + t.TN + t.FP + t.FN)
	if total == 0 {
		t.Accuracy = 0
	} else {
		t.Accuracy = float64(t.TP+t.TN) / total
	}
}

//
// OpenOOBStatsFile will open statistic file for output, closing any
// previously opened writer first.
//
func (rt *Runtime) OpenOOBStatsFile() error {
	if rt.oobWriter != nil {
		_ = rt.CloseOOBStatsFile()
	}
	rt.oobWriter = &dsv.Writer{}
	return rt.oobWriter.OpenOutput(rt.OOBStatsFile)
}

//
// WriteOOBStat will write statistic of process to file. It is a no-op when
// the writer is not open or `stat` is nil.
//
func (rt *Runtime) WriteOOBStat(stat *Stat) error {
	if rt.oobWriter == nil {
		return nil
	}
	if stat == nil {
		return nil
	}
	return rt.oobWriter.WriteRawRow(stat.ToRow(), nil, nil)
}

//
// CloseOOBStatsFile will close statistics file for writing.
//
func (rt *Runtime) CloseOOBStatsFile() (e error) {
	if rt.oobWriter == nil {
		return
	}

	e = rt.oobWriter.Close()
	rt.oobWriter = nil

	return
}

//
// PrintOobStat will print the out-of-bag statistic to standard output.
//
func (rt *Runtime) PrintOobStat(stat *Stat, cm *CM) {
	fmt.Printf("%s OOB error rate: %.4f,"+
		" total: %.4f, mean %.4f, true rate: %.4f\n", tag,
		stat.OobError, rt.oobStatTotal.OobError,
		stat.OobErrorMean, cm.GetTrueRate())
}

//
// PrintStat will print statistic value to standard output. When `stat` is
// nil the most recent OOB statistic is printed instead.
//
func (rt *Runtime) PrintStat(stat *Stat) {
	if stat == nil {
		statslen := len(rt.oobStats)
		if statslen <= 0 {
			return
		}
		stat = rt.oobStats[statslen-1]
	}

	fmt.Printf("%s TPRate: %.4f, FPRate: %.4f,"+
		" TNRate: %.4f, precision: %.4f, f-measure: %.4f,"+
		" accuracy: %.4f\n", tag, stat.TPRate, stat.FPRate,
		stat.TNRate, stat.Precision, stat.FMeasure, stat.Accuracy)
}

//
// PrintStatTotal will print total statistic to standard output.
//
func (rt *Runtime) PrintStatTotal(st *Stat) {
	if st == nil {
		st = &rt.oobStatTotal
	}
	rt.PrintStat(st)
}

//
// Performance given an actuals class label and their probabilities, compute
// the performance statistic of classifier.
//
// Algorithm,
// (1) Sort the probabilities in descending order.
// (2) Sort the actuals and predicts using sorted index from probs
// (3) Compute tpr, fpr, precision
// (4) Write performance to file.
//
func (rt *Runtime) Performance(samples tabula.ClasetInterface,
	predicts []string, probs []float64,
) (
	perfs Stats,
) {
	// (1) Sort probs descending; sortedIds records the permutation so
	// actuals/predicts can be reordered to match.
	actuals := samples.GetClassAsStrings()
	sortedIds := numbers.IntCreateSeq(0, len(probs)-1)
	numbers.Floats64InplaceMergesort(probs, sortedIds, 0, len(probs),
		false)

	// (2)
	libstrings.SortByIndex(&actuals, sortedIds)
	libstrings.SortByIndex(&predicts, sortedIds)

	// (3) Results accumulate in rt.perfs, which is also returned.
	rt.computePerfByProbs(samples, actuals, probs)

	return rt.perfs
}

// trapezoidArea returns the area of the trapezoid spanned by two successive
// (fp, tp) ROC points: base |fp - fpprev| times the average height.
func trapezoidArea(fp, fpprev, tp, tpprev int64) float64 {
	base := math.Abs(float64(fp - fpprev))
	heightAvg := float64(tp+tpprev) / float64(2.0)
	return base * heightAvg
}

//
// computePerfByProbs will compute classifier performance using probabilities
// or score `probs`.
//
// This currently only work for two class problem: vs[0] is treated as the
// positive class, vs[1] as the negative class.
//
func (rt *Runtime) computePerfByProbs(samples tabula.ClasetInterface,
	actuals []string, probs []float64,
) {
	vs := samples.GetClassValueSpace()
	nactuals := numbers.IntsTo64(samples.Counts())
	nclass := libstrings.CountTokens(actuals, vs, false)

	pprev := math.Inf(-1)
	tp := int64(0)
	fp := int64(0)
	tpprev := int64(0)
	fpprev := int64(0)

	auc := float64(0)

	// Emit one Stat per distinct probability threshold, accumulating
	// AUC with the trapezoid rule along the way.
	for x, p := range probs {
		if p != pprev {
			stat := Stat{}
			stat.SetTPRate(tp, nactuals[0])
			stat.SetFPRate(fp, nactuals[1])
			stat.SetPrecisionFromRate(nactuals[0], nactuals[1])

			auc = auc + trapezoidArea(fp, fpprev, tp, tpprev)
			stat.SetAUC(auc)

			rt.perfs = append(rt.perfs, &stat)

			pprev = p
			tpprev = tp
			fpprev = fp
		}

		if actuals[x] == vs[0] {
			tp++
		} else {
			fp++
		}
	}

	// Final point: include the last counts and normalize AUC by the
	// number of positive times negative samples.
	stat := Stat{}
	stat.SetTPRate(tp, nactuals[0])
	stat.SetFPRate(fp, nactuals[1])
	stat.SetPrecisionFromRate(nactuals[0], nactuals[1])

	auc = auc + trapezoidArea(fp, fpprev, tp, tpprev)
	auc = auc / float64(nclass[0]*nclass[1])
	stat.SetAUC(auc)

	rt.perfs = append(rt.perfs, &stat)

	if len(rt.perfs) >= 2 {
		// Replace the first stat with second stat, because of NaN
		// value on the first precision (0/0 at the initial point).
		rt.perfs[0] = rt.perfs[1]
	}
}

//
// WritePerformance will write performance data to file.
//
func (rt *Runtime) WritePerformance() error {
	return rt.perfs.Write(rt.PerfFile)
}

// Copyright 2015-2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package classifier

import (
	"time"

	"github.com/shuLhan/share/lib/dsv"
	"github.com/shuLhan/share/lib/tabula"
)

//
// Stat hold statistic value of classifier, including TP rate, FP rate, precision,
// and recall.
//
type Stat struct {
	// ID unique id for this statistic (e.g. number of tree).
	ID int64
	// StartTime contain the start time of classifier in unix timestamp.
	StartTime int64
	// EndTime contain the end time of classifier in unix timestamp.
	EndTime int64
	// ElapsedTime contain actual time, in seconds, between end and start
	// time.
	ElapsedTime int64
	// TP contain true-positive value.
	TP int64
	// FP contain false-positive value.
	FP int64
	// TN contain true-negative value.
	TN int64
	// FN contain false-negative value.
	FN int64
	// OobError contain out-of-bag error.
	OobError float64
	// OobErrorMean contain mean of out-of-bag error.
	OobErrorMean float64
	// TPRate contain true-positive rate (recall): tp/(tp+fn)
	TPRate float64
	// FPRate contain false-positive rate: fp/(fp+tn)
	FPRate float64
	// TNRate contain true-negative rate: tn/(tn+fp)
	TNRate float64
	// Precision contain: tp/(tp+fp)
	Precision float64
	// FMeasure contain value of F-measure or the harmonic mean of
	// precision and recall.
	FMeasure float64
	// Accuracy contain the degree of closeness of measurements of a
	// quantity to that quantity's true value.
	Accuracy float64
	// AUC contain the area under curve.
	AUC float64
}

// SetAUC will set the AUC value.
func (stat *Stat) SetAUC(v float64) {
	stat.AUC = v
}

//
// SetTPRate will set TP and TPRate using number of positive `p`.
//
func (stat *Stat) SetTPRate(tp, p int64) {
	stat.TP = tp
	stat.TPRate = float64(tp) / float64(p)
}

//
// SetFPRate will set FP and FPRate using number of negative `n`.
//
func (stat *Stat) SetFPRate(fp, n int64) {
	stat.FP = fp
	stat.FPRate = float64(fp) / float64(n)
}

//
// SetPrecisionFromRate will set Precision value using tprate and fprate.
// `p` and `n` is the number of positive and negative class in samples.
// Note: yields NaN when both rates are zero (0/0).
//
func (stat *Stat) SetPrecisionFromRate(p, n int64) {
	stat.Precision = (stat.TPRate * float64(p)) /
		((stat.TPRate * float64(p)) + (stat.FPRate * float64(n)))
}

//
// Recall return value of recall, which is the same as the true-positive rate.
//
func (stat *Stat) Recall() float64 {
	return stat.TPRate
}

//
// Sum will add statistic from other stat object to current stat, not including
// the start and end time.
//
func (stat *Stat) Sum(other *Stat) {
	stat.OobError += other.OobError
	stat.OobErrorMean += other.OobErrorMean
	stat.TP += other.TP
	stat.FP += other.FP
	stat.TN += other.TN
	stat.FN += other.FN
	stat.TPRate += other.TPRate
	stat.FPRate += other.FPRate
	stat.TNRate += other.TNRate
	stat.Precision += other.Precision
	stat.FMeasure += other.FMeasure
	stat.Accuracy += other.Accuracy
}

//
// ToRow will convert the stat to tabula.row in the order of Stat field.
+// +func (stat *Stat) ToRow() (row *tabula.Row) { + row = &tabula.Row{} + + row.PushBack(tabula.NewRecordInt(stat.ID)) + row.PushBack(tabula.NewRecordInt(stat.StartTime)) + row.PushBack(tabula.NewRecordInt(stat.EndTime)) + row.PushBack(tabula.NewRecordInt(stat.ElapsedTime)) + row.PushBack(tabula.NewRecordReal(stat.OobError)) + row.PushBack(tabula.NewRecordReal(stat.OobErrorMean)) + row.PushBack(tabula.NewRecordInt(stat.TP)) + row.PushBack(tabula.NewRecordInt(stat.FP)) + row.PushBack(tabula.NewRecordInt(stat.TN)) + row.PushBack(tabula.NewRecordInt(stat.FN)) + row.PushBack(tabula.NewRecordReal(stat.TPRate)) + row.PushBack(tabula.NewRecordReal(stat.FPRate)) + row.PushBack(tabula.NewRecordReal(stat.TNRate)) + row.PushBack(tabula.NewRecordReal(stat.Precision)) + row.PushBack(tabula.NewRecordReal(stat.FMeasure)) + row.PushBack(tabula.NewRecordReal(stat.Accuracy)) + row.PushBack(tabula.NewRecordReal(stat.AUC)) + + return +} + +// +// Start will start the timer. +// +func (stat *Stat) Start() { + stat.StartTime = time.Now().Unix() +} + +// +// End will stop the timer and compute the elapsed time. +// +func (stat *Stat) End() { + stat.EndTime = time.Now().Unix() + stat.ElapsedTime = stat.EndTime - stat.StartTime +} + +// +// Write will write the content of stat to `file`. +// +func (stat *Stat) Write(file string) (e error) { + if file == "" { + return + } + + writer := &dsv.Writer{} + e = writer.OpenOutput(file) + if e != nil { + return e + } + + e = writer.WriteRawRow(stat.ToRow(), nil, nil) + if e != nil { + return e + } + + return writer.Close() +} diff --git a/lib/mining/classifier/stats.go b/lib/mining/classifier/stats.go new file mode 100644 index 00000000..b29d3510 --- /dev/null +++ b/lib/mining/classifier/stats.go @@ -0,0 +1,143 @@ +// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package classifier + +import ( + "github.com/shuLhan/share/lib/dsv" +) + +// +// Stats define list of statistic values. +// +type Stats []*Stat + +// +// Add will add other stat object to the slice. +// +func (stats *Stats) Add(stat *Stat) { + *stats = append(*stats, stat) +} + +// +// StartTimes return all start times in unix timestamp. +// +func (stats *Stats) StartTimes() (times []int64) { + for _, stat := range *stats { + times = append(times, stat.StartTime) + } + return +} + +// +// EndTimes return all end times in unix timestamp. +// +func (stats *Stats) EndTimes() (times []int64) { + for _, stat := range *stats { + times = append(times, stat.EndTime) + } + return +} + +// +// OobErrorMeans return all out-of-bag error mean values. +// +func (stats *Stats) OobErrorMeans() (oobmeans []float64) { + oobmeans = make([]float64, len(*stats)) + for x, stat := range *stats { + oobmeans[x] = stat.OobErrorMean + } + return +} + +// +// TPRates return all true-positive rate values. +// +func (stats *Stats) TPRates() (tprates []float64) { + for _, stat := range *stats { + tprates = append(tprates, stat.TPRate) + } + return +} + +// +// FPRates return all false-positive rate values. +// +func (stats *Stats) FPRates() (fprates []float64) { + for _, stat := range *stats { + fprates = append(fprates, stat.FPRate) + } + return +} + +// +// TNRates will return all true-negative rate values. +// +func (stats *Stats) TNRates() (tnrates []float64) { + for _, stat := range *stats { + tnrates = append(tnrates, stat.TNRate) + } + return +} + +// +// Precisions return all precision values. +// +func (stats *Stats) Precisions() (precs []float64) { + for _, stat := range *stats { + precs = append(precs, stat.Precision) + } + return +} + +// +// Recalls return all recall values. +// +func (stats *Stats) Recalls() (recalls []float64) { + return stats.TPRates() +} + +// +// FMeasures return all F-measure values. 
+// +func (stats *Stats) FMeasures() (fmeasures []float64) { + for _, stat := range *stats { + fmeasures = append(fmeasures, stat.FMeasure) + } + return +} + +// +// Accuracies return all accuracy values. +// +func (stats *Stats) Accuracies() (accuracies []float64) { + for _, stat := range *stats { + accuracies = append(accuracies, stat.Accuracy) + } + return +} + +// +// Write will write all statistic data to `file`. +// +func (stats *Stats) Write(file string) (e error) { + if file == "" { + return + } + + writer := &dsv.Writer{} + e = writer.OpenOutput(file) + if e != nil { + return e + } + + for _, st := range *stats { + e = writer.WriteRawRow(st.ToRow(), nil, nil) + if e != nil { + return e + } + } + + return writer.Close() +} diff --git a/lib/mining/classifier/stats_interface.go b/lib/mining/classifier/stats_interface.go new file mode 100644 index 00000000..a9b60b9a --- /dev/null +++ b/lib/mining/classifier/stats_interface.go @@ -0,0 +1,68 @@ +// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package classifier + +// +// ComputeFMeasures given array of precisions and recalls, compute F-measure +// of each instance and return it. +// +func ComputeFMeasures(precisions, recalls []float64) (fmeasures []float64) { + // Get the minimum length of precision and recall. + // This is to make sure that we are not looping out of range. 
+	minlen := len(precisions)
+	recallslen := len(recalls)
+	if recallslen < minlen {
+		minlen = recallslen
+	}
+
+	for x := 0; x < minlen; x++ {
+		f := 2 / ((1 / precisions[x]) + (1 / recalls[x]))
+		fmeasures = append(fmeasures, f)
+	}
+	return
+}
+
+//
+// ComputeAccuracies will compute and return accuracy from array of
+// true-positive, false-positive, true-negative, and false-negative; using
+// formula,
+//
+//	(tp + tn) / (tp + fp + tn + fn)
+//
+func ComputeAccuracies(tp, fp, tn, fn []int64) (accuracies []float64) {
+	// Get minimum length of input, just to make sure we are not looping
+	// out of range.
+	minlen := len(tp)
+	if len(fp) < len(tn) {
+		minlen = len(fp)
+	}
+	if len(fn) < minlen {
+		minlen = len(fn)
+	}
+
+	for x := 0; x < minlen; x++ {
+		acc := float64(tp[x]+tn[x]) /
+			float64(tp[x]+fp[x]+tn[x]+fn[x])
+		accuracies = append(accuracies, acc)
+	}
+	return
+}
+
+//
+// ComputeElapsedTimes will compute and return elapsed time between `start`
+// and `end` timestamps.
+//
+func ComputeElapsedTimes(start, end []int64) (elaps []int64) {
+	// Get minimum length.
+ minlen := len(start) + if len(end) < minlen { + minlen = len(end) + } + + for x := 0; x < minlen; x++ { + elaps = append(elaps, end[x]-start[x]) + } + return +} diff --git a/lib/mining/cmd/cart/iris.dsv b/lib/mining/cmd/cart/iris.dsv new file mode 100644 index 00000000..82a52d04 --- /dev/null +++ b/lib/mining/cmd/cart/iris.dsv @@ -0,0 +1,35 @@ +{ + "Input" :"../../testdata/iris/iris.dat" +, "Rejected" :"iris.rej" +, "MaxRows" :-1 +, "ClassIndex" :4 +, "DatasetMode" :"matrix" +, "SplitMethod" :"gini" +, "InputMetadata" : + [{ + "Name" :"sepal-length" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" :"sepal-width" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" :"petal-length" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" :"petal-width" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" :"class" + , "Type" :"string" + , "ValueSpace" : + [ + "Iris-setosa" + , "Iris-versicolor" + , "Iris-virginica" + ] + }] +} diff --git a/lib/mining/cmd/cart/main.go b/lib/mining/cmd/cart/main.go new file mode 100644 index 00000000..08510203 --- /dev/null +++ b/lib/mining/cmd/cart/main.go @@ -0,0 +1,107 @@ +// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package main + +import ( + "encoding/json" + "flag" + "fmt" + "io/ioutil" + "os" + "time" + + "github.com/shuLhan/share/lib/debug" + "github.com/shuLhan/share/lib/dsv" + "github.com/shuLhan/share/lib/mining/classifier/cart" + "github.com/shuLhan/share/lib/tabula" +) + +var ( + nRandomFeature = 0 +) + +var usage = func() { + cmd := os.Args[0] + fmt.Fprintf(os.Stderr, "Usage of %s: [-n number] [config.dsv]\n", cmd) + flag.PrintDefaults() +} + +func init() { + flagUsage := []string{ + "Number of random feature (default 0)", + } + + flag.IntVar(&nRandomFeature, "n", 0, flagUsage[0]) +} + +func trace(s string) (string, time.Time) { + fmt.Println("[START]", s) + return s, time.Now() +} + +func un(s string, startTime time.Time) { + endTime := time.Now() + fmt.Println("[END]", s, "with elapsed time", + endTime.Sub(startTime)) +} + +func createCart(fcfg string) (*cart.Runtime, error) { + cartrt := &cart.Runtime{} + + config, e := ioutil.ReadFile(fcfg) + if e != nil { + return nil, e + } + + e = json.Unmarshal(config, cartrt) + if e != nil { + return nil, e + } + + if nRandomFeature > 0 { + cartrt.NRandomFeature = nRandomFeature + } + + return cartrt, nil +} + +func main() { + defer un(trace("cart")) + + flag.Parse() + + if len(flag.Args()) <= 0 { + usage() + os.Exit(1) + } + + fcfg := flag.Arg(0) + + // Parsing config file and check command parameter values. 
+ cartrt, e := createCart(fcfg) + if e != nil { + panic(e) + } + + // Get dataset + dataset := tabula.Claset{} + _, e = dsv.SimpleRead(fcfg, &dataset) + if e != nil { + panic(e) + } + + if debug.Value >= 1 { + fmt.Printf("[cart] Class index: %v\n", dataset.GetClassIndex()) + } + + e = cartrt.Build(&dataset) + if e != nil { + panic(e) + } + + if debug.Value >= 1 { + fmt.Println("[cart] CART tree:\n", cartrt) + } +} diff --git a/lib/mining/cmd/crf/main.go b/lib/mining/cmd/crf/main.go new file mode 100644 index 00000000..2bdd698a --- /dev/null +++ b/lib/mining/cmd/crf/main.go @@ -0,0 +1,205 @@ +// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "encoding/json" + "flag" + "fmt" + "io/ioutil" + "os" + "time" + + "github.com/shuLhan/share/lib/dsv" + "github.com/shuLhan/share/lib/mining/classifier/crf" + "github.com/shuLhan/share/lib/tabula" +) + +const ( + tag = "[crf]" +) + +var ( + // nStage number of stage. + nStage = 0 + // nTree number of tree. + nTree = 0 + // nRandomFeature number of feature to compute. + nRandomFeature = 0 + // percentBoot percentage of sample for bootstraping. + percentBoot = 0 + // oobStatsFile where statistic will be written. + oobStatsFile = "" + // perfFile where performance of classifier will be written. + perfFile = "" + // trainCfg point to the configuration file for training or creating + // a model + trainCfg = "" + // testCfg point to the configuration file for testing + testCfg = "" + + // crforest the main object. 
+ crforest crf.Runtime +) + +var usage = func() { + flag.PrintDefaults() +} + +func init() { + flagUsage := []string{ + "Number of stage (default 200)", + "Number of tree in each stage (default 1)", + "Number of feature to compute (default 0, all features)", + "Percentage of bootstrap (default 64%)", + "OOB statistic file, where OOB data will be written", + "Performance file, where statistic of classifying data set will be written", + "Training configuration", + "Test configuration", + } + + flag.IntVar(&nStage, "nstage", -1, flagUsage[0]) + flag.IntVar(&nTree, "ntree", -1, flagUsage[1]) + flag.IntVar(&nRandomFeature, "nrandomfeature", -1, flagUsage[2]) + flag.IntVar(&percentBoot, "percentboot", -1, flagUsage[3]) + + flag.StringVar(&oobStatsFile, "oobstatsfile", "", flagUsage[4]) + flag.StringVar(&perfFile, "perffile", "", flagUsage[5]) + + flag.StringVar(&trainCfg, "train", "", flagUsage[6]) + flag.StringVar(&testCfg, "test", "", flagUsage[7]) +} + +func trace() (start time.Time) { + start = time.Now() + fmt.Println(tag, "start", start) + return +} + +func un(startTime time.Time) { + endTime := time.Now() + fmt.Println(tag, "elapsed time", endTime.Sub(startTime)) +} + +// +// createCRF will create cascaded random forest for training, with the +// following steps, +// (1) load training configuration. +// (2) Overwrite configuration parameter if its set from command line. 
+// +func createCRF() error { + // (1) + config, e := ioutil.ReadFile(trainCfg) + if e != nil { + return e + } + + crforest = crf.Runtime{} + + e = json.Unmarshal(config, &crforest) + if e != nil { + return e + } + + // (2) + if nStage > 0 { + crforest.NStage = nStage + } + if nTree > 0 { + crforest.NTree = nTree + } + if nRandomFeature > 0 { + crforest.NRandomFeature = nRandomFeature + } + if percentBoot > 0 { + crforest.PercentBoot = percentBoot + } + if oobStatsFile != "" { + crforest.OOBStatsFile = oobStatsFile + } + if perfFile != "" { + crforest.PerfFile = perfFile + } + + crforest.RunOOB = true + + return nil +} + +func train() { + e := createCRF() + if e != nil { + panic(e) + } + + trainset := tabula.Claset{} + + _, e = dsv.SimpleRead(trainCfg, &trainset) + if e != nil { + panic(e) + } + + e = crforest.Build(&trainset) + if e != nil { + panic(e) + } +} + +func test() { + testset := tabula.Claset{} + _, e := dsv.SimpleRead(testCfg, &testset) + if e != nil { + panic(e) + } + + fmt.Println(tag, "Test set:", &testset) + fmt.Println(tag, "Sample test set:", testset.GetRow(0)) + + predicts, cm, probs := crforest.ClassifySetByWeight(&testset, nil) + + fmt.Println("[crf] Test set CM:", cm) + + crforest.Performance(&testset, predicts, probs) + + e = crforest.WritePerformance() + if e != nil { + panic(e) + } +} + +// +// (0) Parse and check command line parameters. +// (1) If trainCfg parameter is set, +// (1.1) train the model, +// (1.2) TODO: load saved model. +// (2) If testCfg parameter is set, +// (2.1) Test the model using data from testCfg. 
+// +func main() { + defer un(trace()) + + // (0) + flag.Parse() + + fmt.Println(tag, "Training config:", trainCfg) + fmt.Println(tag, "Test config:", testCfg) + + // (1) + if trainCfg != "" { + // (1.1) + train() + } else { + // (1.2) + if len(flag.Args()) <= 0 { + usage() + os.Exit(1) + } + } + + // (2) + if testCfg != "" { + test() + } +} diff --git a/lib/mining/cmd/crf/phoneme.dsv b/lib/mining/cmd/crf/phoneme.dsv new file mode 100644 index 00000000..1219989f --- /dev/null +++ b/lib/mining/cmd/crf/phoneme.dsv @@ -0,0 +1,45 @@ +{ + "Input" :"../../testdata/phoneme/phoneme.dat" +, "Rejected" :"phoneme.rej" +, "MaxRows" :-1 +, "DatasetMode" :"matrix" +, "ClassIndex" :5 + +, "NTree" :50 +, "NRandomFeature" :2 +, "RunOOB" :true +, "OOBStatsFile" :"phoneme.oob.stat" +, "PerfFile" :"phoneme.perf" +, "StatFile" :"phoneme.stat" + +, "InputMetadata" : + [{ + "Name" :"f1" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"f2" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"f3" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"f4" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"f5" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"class" + , "Type" :"real" + , "ValueSpace" : + [ + "1" + , "0" + ] + }] +} diff --git a/lib/mining/cmd/lnsmote/main.go b/lib/mining/cmd/lnsmote/main.go new file mode 100644 index 00000000..692e9cf4 --- /dev/null +++ b/lib/mining/cmd/lnsmote/main.go @@ -0,0 +1,188 @@ +// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "encoding/json" + "flag" + "fmt" + "io/ioutil" + "os" + "time" + + "github.com/shuLhan/share/lib/debug" + "github.com/shuLhan/share/lib/dsv" + "github.com/shuLhan/share/lib/mining/resampling/lnsmote" + "github.com/shuLhan/share/lib/tabula" +) + +var ( + // percentOver contain percentage of over sampling. 
+ percentOver = 100 + // knn contain number of nearest neighbours considered when + // oversampling. + knn = 5 + // synFile flag for synthetic file output. + synFile = "" + // merge flag, if its true the original and synthetic will be merged + // into `synFile`. + merge = false +) + +var usage = func() { + cmd := os.Args[0] + fmt.Fprintf(os.Stderr, "Usage of %s:\n"+ + "[-percentover number] "+ + "[-knn number] "+ + "[-syntheticfile string] "+ + "[-merge bool] "+ + "[config.dsv]\n", cmd) + flag.PrintDefaults() +} + +func init() { + flagUsage := []string{ + "Percentage of oversampling (default 100)", + "Number of nearest neighbours (default 5)", + "File where synthetic samples will be written (default '')", + "If true then original and synthetic will be merged when" + + " written to file (default false)", + } + + flag.IntVar(&percentOver, "percentover", -1, flagUsage[0]) + flag.IntVar(&knn, "knn", -1, flagUsage[1]) + flag.StringVar(&synFile, "syntheticfile", "", flagUsage[2]) + flag.BoolVar(&merge, "merge", false, flagUsage[3]) +} + +func trace(s string) (string, time.Time) { + fmt.Println("[START]", s) + return s, time.Now() +} + +func un(s string, startTime time.Time) { + endTime := time.Now() + fmt.Println("[END]", s, "with elapsed time", + endTime.Sub(startTime)) +} + +// +// createLnsmote will create and initialize SMOTE object from config file and +// from command parameter. +// +func createLnsmote(fcfg string) (lnsmoteRun *lnsmote.Runtime, e error) { + lnsmoteRun = &lnsmote.Runtime{} + + config, e := ioutil.ReadFile(fcfg) + if e != nil { + return nil, e + } + + e = json.Unmarshal(config, lnsmoteRun) + if e != nil { + return nil, e + } + + // Use option value from command parameter. + if percentOver > 0 { + lnsmoteRun.PercentOver = percentOver + } + if knn > 0 { + lnsmoteRun.K = knn + } + + if debug.Value >= 1 { + fmt.Println("[lnsmote]", lnsmoteRun) + } + + return +} + +// +// runLnsmote will select minority class from dataset and run oversampling. 
+// +func runLnsmote(lnsmoteRun *lnsmote.Runtime, dataset *tabula.Claset) (e error) { + e = lnsmoteRun.Resampling(dataset) + if e != nil { + return + } + + if debug.Value >= 1 { + fmt.Println("[lnsmote] # synthetics:", + lnsmoteRun.GetSynthetics().Len()) + } + + return +} + +// runMerge will append original dataset to synthetic file. +func runMerge(lnsmoteRun *lnsmote.Runtime, dataset *tabula.Claset) (e error) { + writer, e := dsv.NewWriter("") + if e != nil { + return + } + + e = writer.ReopenOutput(lnsmoteRun.SyntheticFile) + if e != nil { + return + } + + sep := dsv.DefSeparator + n, e := writer.WriteRawDataset(dataset, &sep) + if e != nil { + return + } + + if debug.Value >= 1 { + fmt.Println("[lnsmote] # appended:", n) + } + + return writer.Close() +} + +func main() { + defer un(trace("lnsmote")) + + flag.Parse() + + if len(flag.Args()) <= 0 { + usage() + os.Exit(1) + } + + fcfg := flag.Arg(0) + + // Parsing config file and parameter. + lnsmoteRun, e := createLnsmote(fcfg) + if e != nil { + panic(e) + } + + // Get dataset. 
+ dataset := tabula.Claset{} + _, e = dsv.SimpleRead(fcfg, &dataset) + if e != nil { + panic(e) + } + + fmt.Println("[lnsmote] Dataset:", &dataset) + + row := dataset.GetRow(0) + fmt.Println("[lnsmote] sample:", row) + + e = runLnsmote(lnsmoteRun, &dataset) + if e != nil { + panic(e) + } + + if !merge { + return + } + + e = runMerge(lnsmoteRun, &dataset) + if e != nil { + panic(e) + } +} diff --git a/lib/mining/cmd/lnsmote/phoneme.dsv b/lib/mining/cmd/lnsmote/phoneme.dsv new file mode 100644 index 00000000..6e179c50 --- /dev/null +++ b/lib/mining/cmd/lnsmote/phoneme.dsv @@ -0,0 +1,45 @@ +{ + "Input" : "../../testdata/phoneme/phoneme.dat" +, "Rejected" : "phoneme.rej" +, "MaxRows" : -1 +, "ClassIndex" : 5 +, "DatasetMode" : "matrix" + +, "K" : 5 + +, "PercentOver" : 100 +, "SyntheticFile" : "phoneme.lnsmote.dat" + +, "ClassMinor" : "1" + +, "InputMetadata" : + [{ + "Name" :"f1" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"f2" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"f3" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"f4" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"f5" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"class" + , "Type" :"string" + , "ValueSpace" : + [ + "0" + , "1" + ] + }] +} diff --git a/lib/mining/cmd/rf/iris.dsv b/lib/mining/cmd/rf/iris.dsv new file mode 100644 index 00000000..e86af6d1 --- /dev/null +++ b/lib/mining/cmd/rf/iris.dsv @@ -0,0 +1,41 @@ +{ + "Input" :"../../testdata/iris/iris.dat" +, "Rejected" :"iris.rej" +, "MaxRows" :-1 +, "ClassIndex" :4 +, "DatasetMode" :"matrix" +, "SplitMethod" :"gini" +, "NTree" :50 +, "NRandomFeature" :2 +, "RunOOB" :true +, "OOBStatsFile" :"iris.oob.stat" +, "PerfFile" :"iris.perf" +, "StatFile" :"iris.stat" +, "InputMetadata" : + [{ + "Name" :"sepal-length" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" :"sepal-width" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" :"petal-length" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" 
:"petal-width" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" :"class" + , "Type" :"string" + , "ValueSpace" : + [ + "Iris-setosa" + , "Iris-versicolor" + , "Iris-virginica" + ] + }] +} diff --git a/lib/mining/cmd/rf/main.go b/lib/mining/cmd/rf/main.go new file mode 100644 index 00000000..2c029c37 --- /dev/null +++ b/lib/mining/cmd/rf/main.go @@ -0,0 +1,188 @@ +// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "encoding/json" + "flag" + "fmt" + "io/ioutil" + "os" + "time" + + "github.com/shuLhan/share/lib/dsv" + "github.com/shuLhan/share/lib/mining/classifier/rf" + "github.com/shuLhan/share/lib/tabula" +) + +const ( + tag = "[rf]" +) + +var ( + // nTree number of tree. + nTree = 0 + // nRandomFeature number of feature to compute. + nRandomFeature = 0 + // percentBoot percentage of sample for bootstraping. + percentBoot = 0 + // oobStatsFile where statistic will be written. + oobStatsFile = "" + // perfFile where performance of classifier will be written. + perfFile = "" + // trainCfg point to the configuration file for training or creating + // a model + trainCfg = "" + // testCfg point to the configuration file for testing + testCfg = "" + + // forest the main object. 
+ forest rf.Runtime +) + +var usage = func() { + flag.PrintDefaults() +} + +func init() { + flagUsage := []string{ + "Number of tree in forest (default 100)", + "Number of feature to compute (default 0)", + "Percentage of bootstrap (default 64%)", + "OOB statistic file, where OOB data will be written", + "Performance file, where statistic of classifying data set will be written", + "Training configuration", + "Test configuration", + } + + flag.IntVar(&nTree, "ntree", -1, flagUsage[0]) + flag.IntVar(&nRandomFeature, "nrandomfeature", -1, flagUsage[1]) + flag.IntVar(&percentBoot, "percentboot", -1, flagUsage[2]) + + flag.StringVar(&oobStatsFile, "oobstatsfile", "", flagUsage[3]) + flag.StringVar(&perfFile, "perffile", "", flagUsage[4]) + + flag.StringVar(&trainCfg, "train", "", flagUsage[5]) + flag.StringVar(&testCfg, "test", "", flagUsage[6]) +} + +func trace() (start time.Time) { + start = time.Now() + fmt.Println(tag, "start", start) + return +} + +func un(startTime time.Time) { + endTime := time.Now() + fmt.Println(tag, "elapsed time", endTime.Sub(startTime)) +} + +// +// createRandomForest will create random forest for training, with the +// following steps, +// (1) load training configuration. +// (2) Overwrite configuration parameter if its set from command line. 
+// +func createRandomForest() error { + // (1) + config, e := ioutil.ReadFile(trainCfg) + if e != nil { + return e + } + + forest = rf.Runtime{} + + e = json.Unmarshal(config, &forest) + if e != nil { + return e + } + + // (2) + if nTree > 0 { + forest.NTree = nTree + } + if nRandomFeature > 0 { + forest.NRandomFeature = nRandomFeature + } + if percentBoot > 0 { + forest.PercentBoot = percentBoot + } + if oobStatsFile != "" { + forest.OOBStatsFile = oobStatsFile + } + if perfFile != "" { + forest.PerfFile = perfFile + } + + return nil +} + +func train() { + e := createRandomForest() + if e != nil { + panic(e) + } + + trainset := tabula.Claset{} + + _, e = dsv.SimpleRead(trainCfg, &trainset) + if e != nil { + panic(e) + } + + e = forest.Build(&trainset) + if e != nil { + panic(e) + } +} + +func test() { + testset := tabula.Claset{} + _, e := dsv.SimpleRead(testCfg, &testset) + if e != nil { + panic(e) + } + + predicts, _, probs := forest.ClassifySet(&testset, nil) + + forest.Performance(&testset, predicts, probs) + + e = forest.WritePerformance() + if e != nil { + panic(e) + } +} + +// +// (0) Parse and check command line parameters. +// (1) If trainCfg parameter is set, +// (1.1) train the model, +// (1.2) TODO: load saved model. +// (2) If testCfg parameter is set, +// (2.1) Test the model using data from testCfg. 
+// +func main() { + defer un(trace()) + + // (0) + flag.Parse() + + // (1) + if trainCfg != "" { + // (1.1) + train() + } else { + // (1.2) + if len(flag.Args()) <= 0 { + usage() + os.Exit(1) + } + } + + // (2) + if testCfg != "" { + test() + } +} diff --git a/lib/mining/cmd/rf/phoneme.dsv b/lib/mining/cmd/rf/phoneme.dsv new file mode 100644 index 00000000..1219989f --- /dev/null +++ b/lib/mining/cmd/rf/phoneme.dsv @@ -0,0 +1,45 @@ +{ + "Input" :"../../testdata/phoneme/phoneme.dat" +, "Rejected" :"phoneme.rej" +, "MaxRows" :-1 +, "DatasetMode" :"matrix" +, "ClassIndex" :5 + +, "NTree" :50 +, "NRandomFeature" :2 +, "RunOOB" :true +, "OOBStatsFile" :"phoneme.oob.stat" +, "PerfFile" :"phoneme.perf" +, "StatFile" :"phoneme.stat" + +, "InputMetadata" : + [{ + "Name" :"f1" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"f2" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"f3" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"f4" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"f5" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"class" + , "Type" :"real" + , "ValueSpace" : + [ + "1" + , "0" + ] + }] +} diff --git a/lib/mining/cmd/smote/main.go b/lib/mining/cmd/smote/main.go new file mode 100644 index 00000000..48a53f51 --- /dev/null +++ b/lib/mining/cmd/smote/main.go @@ -0,0 +1,193 @@ +// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "encoding/json" + "flag" + "fmt" + "io/ioutil" + "os" + "time" + + "github.com/shuLhan/share/lib/debug" + "github.com/shuLhan/share/lib/dsv" + "github.com/shuLhan/share/lib/mining/resampling/smote" + "github.com/shuLhan/share/lib/tabula" +) + +var ( + // percentOver contain percentage of over sampling. + percentOver = 100 + // knn contain number of nearest neighbours considered when + // oversampling. 
+ knn = 5 + // synFile flag for synthetic file output. + synFile = "" + // merge flag, if its true the original and synthetic will be merged + // into `synFile`. + merge = false +) + +var usage = func() { + cmd := os.Args[0] + fmt.Fprintf(os.Stderr, "Usage of %s:\n"+ + "[-percentover number] "+ + "[-knn number] "+ + "[-syntheticfile string] "+ + "[-merge bool] "+ + "[config.dsv]\n", cmd) + flag.PrintDefaults() +} + +func init() { + flagUsage := []string{ + "Percentage of oversampling (default 100)", + "Number of nearest neighbours (default 5)", + "File where synthetic samples will be written (default '')", + "If true then original and synthetic will be merged when" + + " written to file (default false)", + } + + flag.IntVar(&percentOver, "percentover", -1, flagUsage[0]) + flag.IntVar(&knn, "knn", -1, flagUsage[1]) + flag.StringVar(&synFile, "syntheticfile", "", flagUsage[2]) + flag.BoolVar(&merge, "merge", false, flagUsage[3]) +} + +func trace(s string) (string, time.Time) { + fmt.Println("[START]", s) + return s, time.Now() +} + +func un(s string, startTime time.Time) { + endTime := time.Now() + fmt.Println("[END]", s, "with elapsed time", + endTime.Sub(startTime)) +} + +// +// createSmote will create and initialize SMOTE object from config file and +// from command parameter. +// +func createSmote(fcfg string) (smoteRun *smote.Runtime, e error) { + smoteRun = &smote.Runtime{} + + config, e := ioutil.ReadFile(fcfg) + if e != nil { + return nil, e + } + + e = json.Unmarshal(config, smoteRun) + if e != nil { + return nil, e + } + + // Use option value from command parameter. + if percentOver > 0 { + smoteRun.PercentOver = percentOver + } + if knn > 0 { + smoteRun.K = knn + } + + if debug.Value >= 1 { + fmt.Println("[smote]", smoteRun) + } + + return +} + +// +// runSmote will select minority class from dataset and run oversampling. 
+// +func runSmote(smote *smote.Runtime, dataset *tabula.Claset) (e error) { + minorset := dataset.GetMinorityRows() + + if debug.Value >= 1 { + fmt.Println("[smote] # minority samples:", minorset.Len()) + } + + e = smote.Resampling(*minorset) + if e != nil { + return + } + + if debug.Value >= 1 { + fmt.Println("[smote] # synthetics:", smote.Synthetics.Len()) + } + + return +} + +// runMerge will append original dataset to synthetic file. +func runMerge(smote *smote.Runtime, dataset *tabula.Claset) (e error) { + writer, e := dsv.NewWriter("") + if e != nil { + return + } + + e = writer.ReopenOutput(smote.SyntheticFile) + if e != nil { + return + } + + sep := dsv.DefSeparator + n, e := writer.WriteRawDataset(dataset, &sep) + if e != nil { + return + } + + if debug.Value >= 1 { + fmt.Println("[smote] # appended:", n) + } + + return writer.Close() +} + +func main() { + defer un(trace("smote")) + + flag.Parse() + + if len(flag.Args()) <= 0 { + usage() + os.Exit(1) + } + + fcfg := flag.Arg(0) + + // Parsing config file and parameter. + smote, e := createSmote(fcfg) + if e != nil { + panic(e) + } + + // Get dataset. 
+ dataset := tabula.Claset{} + _, e = dsv.SimpleRead(fcfg, &dataset) + if e != nil { + panic(e) + } + + fmt.Println("[smote] Dataset:", &dataset) + + row := dataset.GetRow(0) + fmt.Println("[smote] sample:", row) + + e = runSmote(smote, &dataset) + if e != nil { + panic(e) + } + + if !merge { + return + } + + e = runMerge(smote, &dataset) + if e != nil { + panic(e) + } +} diff --git a/lib/mining/cmd/smote/phoneme.dsv b/lib/mining/cmd/smote/phoneme.dsv new file mode 100644 index 00000000..8cba121c --- /dev/null +++ b/lib/mining/cmd/smote/phoneme.dsv @@ -0,0 +1,40 @@ +{ + "Input" : "../../testdata/phoneme/phoneme.dat" +, "Rejected" : "phoneme.rej" +, "MaxRows" : -1 +, "ClassIndex" : 5 +, "DatasetMode" : "matrix" +, "PercentOver" : 100 +, "K" : 5 +, "SyntheticFile" : "phoneme_smote.dat" +, "InputMetadata" : + [{ + "Name" :"f1" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"f2" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"f3" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"f4" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"f5" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"class" + , "Type" :"string" + , "ValueSpace" : + [ + "0" + , "1" + ] + }] +} diff --git a/lib/mining/gain/gini/gini.go b/lib/mining/gain/gini/gini.go new file mode 100644 index 00000000..2fcc2f9d --- /dev/null +++ b/lib/mining/gain/gini/gini.go @@ -0,0 +1,461 @@ +// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +// Package gini contain function to calculating Gini gain. +// +// Gini gain, which is an impurity-based criterion that measures the divergences +// between the probability distribution of the target attributes' values. 
+// +package gini + +import ( + "fmt" + + "github.com/shuLhan/share/lib/debug" + "github.com/shuLhan/share/lib/numbers" + libstrings "github.com/shuLhan/share/lib/strings" +) + +// +// Gini contain slice of sorted index, slice of partition values, slice of Gini +// index, Gini value for all samples. +// +type Gini struct { + // Skip if its true, the gain value would not be searched on this + // instance. + Skip bool + // IsContinue define whether the Gini index came from continuous + // attribute or not. + IsContinu bool + // Value of Gini index for all value in attribute. + Value float64 + // MaxPartGain contain the index of partition which have the maximum + // gain. + MaxPartGain int + // MaxGainValue contain maximum gain of index. + MaxGainValue float64 + // MinIndexPart contain the index of partition which have the minimum + // Gini index. + MinIndexPart int + // MinIndexGini contain minimum Gini index value. + MinIndexValue float64 + // SortedIndex of attribute, sorted by values of attribute. This will + // be used to reference the unsorted target attribute. + SortedIndex []int + // ContinuPart contain list of partition value for continuous attribute. + ContinuPart []float64 + // DiscretePart contain the possible combination of discrete values. + DiscretePart libstrings.Table + // Index contain list of Gini Index for each partition. + Index []float64 + // Gain contain information gain for each partition. + Gain []float64 +} + +// +// ComputeDiscrete Given an attribute A with discreate value 'discval', and the +// target attribute T which contain N classes in C, compute the information gain +// of A. +// +// The result is saved as gain value in MaxGainValue for each partition. +// +func (gini *Gini) ComputeDiscrete(A *[]string, discval *[]string, T *[]string, + C *[]string) { + gini.IsContinu = false + + // create partition for possible combination of discrete values. 
+ gini.createDiscretePartition((*discval)) + + if debug.Value >= 2 { + fmt.Println("[gini] part :", gini.DiscretePart) + } + + gini.Index = make([]float64, len(gini.DiscretePart)) + gini.Gain = make([]float64, len(gini.DiscretePart)) + gini.MinIndexValue = 1.0 + + // compute gini index for all samples + gini.Value = gini.compute(T, C) + + gini.computeDiscreteGain(A, T, C) +} + +// +// computeDiscreteGain will compute Gini index and Gain for each partition. +// +func (gini *Gini) computeDiscreteGain(A *[]string, T *[]string, C *[]string) { + // number of samples + nsample := float64(len(*A)) + + if debug.Value >= 3 { + fmt.Println("[gini] sample:", T) + fmt.Printf("[gini] Gini(a=%s) = %f\n", (*A), gini.Value) + } + + // compute gini index for each discrete values + for i, subPart := range gini.DiscretePart { + // check if sub partition has at least an element + if len(subPart) <= 0 { + continue + } + + sumGI := 0.0 + for _, part := range subPart { + ndisc := 0.0 + var subT []string + + for _, el := range part { + for t, a := range *A { + if a != el { + continue + } + + // count how many sample with this discrete value + ndisc++ + // split the target by discrete value + subT = append(subT, (*T)[t]) + } + } + + // compute gini index for subtarget + giniIndex := gini.compute(&subT, C) + + // compute probabilites of discrete value through all samples + p := ndisc / nsample + + probIndex := p * giniIndex + + // sum all probabilities times gini index. 
+ sumGI += probIndex + + if debug.Value >= 3 { + fmt.Printf("[gini] subsample: %v\n", subT) + fmt.Printf("[gini] Gini(a=%s) = %f/%f * %f = %f\n", + part, ndisc, nsample, + giniIndex, probIndex) + } + } + + gini.Index[i] = sumGI + gini.Gain[i] = gini.Value - sumGI + + if debug.Value >= 3 { + fmt.Printf("[gini] sample: %v\n", subPart) + fmt.Printf("[gini] Gain(a=%s) = %f - %f = %f\n", + subPart, gini.Value, sumGI, + gini.Gain[i]) + } + + if gini.MinIndexValue > gini.Index[i] && gini.Index[i] != 0 { + gini.MinIndexValue = gini.Index[i] + gini.MinIndexPart = i + } + + if gini.MaxGainValue < gini.Gain[i] { + gini.MaxGainValue = gini.Gain[i] + gini.MaxPartGain = i + } + } +} + +// +// createDiscretePartition will create possible combination for discrete value +// in DiscretePart. +// +func (gini *Gini) createDiscretePartition(discval []string) { + // no discrete values ? + if len(discval) <= 0 { + return + } + + // use set partition function to group the discrete values into two + // subset. + gini.DiscretePart = libstrings.Partition(discval, 2) +} + +// +// ComputeContinu Given an attribute A and the target attribute T which contain +// N classes in C, compute the information gain of A. +// +// The result of Gini partitions value, Gini Index, and Gini Gain is saved in +// ContinuPart, Index, and Gain. +// +func (gini *Gini) ComputeContinu(A *[]float64, T *[]string, C *[]string) { + gini.IsContinu = true + + // make a copy of attribute and target. + A2 := make([]float64, len(*A)) + copy(A2, *A) + + T2 := make([]string, len(*T)) + copy(T2, *T) + + gini.SortedIndex = numbers.Floats64IndirectSort(A2, true) + + if debug.Value >= 1 { + fmt.Println("[gini] attr sorted :", A2) + } + + // sort the target attribute using sorted index. 
+ libstrings.SortByIndex(&T2, gini.SortedIndex) + + // create partition + gini.createContinuPartition(&A2) + + // create holder for gini index and gini gain + gini.Index = make([]float64, len(gini.ContinuPart)) + gini.Gain = make([]float64, len(gini.ContinuPart)) + gini.MinIndexValue = 1.0 + + // compute gini index for all samples + gini.Value = gini.compute(&T2, C) + + gini.computeContinuGain(&A2, &T2, C) +} + +// +// createContinuPartition for dividing class and computing Gini index. +// +// This is assuming that the data `A` has been sorted in ascending order. +// +func (gini *Gini) createContinuPartition(A *[]float64) { + l := len(*A) + gini.ContinuPart = make([]float64, 0) + + // loop from first index until last index - 1 + for i := 0; i < l-1; i++ { + sum := (*A)[i] + (*A)[i+1] + med := sum / 2.0 + + // If median is zero, its mean both left and right value is + // zero. We are not allowing this, because it will result the + // minimum Gini Index or maximum Gain value. + if med == 0 { + continue + } + + // Reject if median is contained in attribute's value. + // We use equality because if both A[i] and A[i+1] value is + // equal, the median is equal to both of them. + exist := false + for j := 0; j <= i; j++ { + if (*A)[j] == med { + exist = true + break + } + } + if !exist { + gini.ContinuPart = append(gini.ContinuPart, med) + } + } +} + +// +// compute value for attribute T. +// +// Return Gini value in the form of, +// +// 1 - sum (probability of each classes in T) +// +func (gini *Gini) compute(T *[]string, C *[]string) float64 { + n := float64(len(*T)) + if n == 0 { + return 0 + } + + classCount := libstrings.CountTokens(*T, *C, true) + + var sump2 float64 + + for x, v := range classCount { + p := float64(v) / n + sump2 += (p * p) + + if debug.Value >= 3 { + fmt.Printf("[gini] compute (%s): (%d/%f)^2 = %f\n", + (*C)[x], v, n, p*p) + } + + } + + return 1 - sump2 +} + +// +// computeContinuGain for each partition. 
+// +// The Gini gain formula we used here is, +// +// Gain(part,S) = Gini(S) - ((count(left)/S * Gini(left)) +// + (count(right)/S * Gini(right))) +// +// where, +// - left is sub-sample from S that is less than part value. +// - right is sub-sample from S that is greater than part value. +// +func (gini *Gini) computeContinuGain(A *[]float64, T *[]string, C *[]string) { + var gleft, gright float64 + var tleft, tright []string + + nsample := len(*A) + + if debug.Value >= 2 { + fmt.Println("[gini] sorted data:", A) + fmt.Println("[gini] Gini.Value:", gini.Value) + } + + for p, contVal := range gini.ContinuPart { + + // find the split of samples between partition based on + // partition value + partidx := nsample + for x, attrVal := range *A { + if attrVal > contVal { + partidx = x + break + } + } + + nleft := partidx + nright := nsample - partidx + pleft := float64(nleft) / float64(nsample) + pright := float64(nright) / float64(nsample) + + if partidx > 0 { + tleft = (*T)[0:partidx] + tright = (*T)[partidx:] + + gleft = gini.compute(&tleft, C) + gright = gini.compute(&tright, C) + } else { + tleft = nil + tright = (*T)[0:] + + gleft = 0 + gright = gini.compute(&tright, C) + } + + // count class in partition + gini.Index[p] = ((pleft * gleft) + (pright * gright)) + gini.Gain[p] = gini.Value - gini.Index[p] + + if debug.Value >= 3 { + fmt.Println("[gini] tleft:", tleft) + fmt.Println("[gini] tright:", tright) + + fmt.Printf("[gini] GiniGain(%v) = %f - (%f * %f) + (%f * %f) = %f\n", + contVal, gini.Value, pleft, gleft, + pright, gright, gini.Gain[p]) + } + + if gini.MinIndexValue > gini.Index[p] && gini.Index[p] != 0 { + gini.MinIndexValue = gini.Index[p] + gini.MinIndexPart = p + } + + if gini.MaxGainValue < gini.Gain[p] { + gini.MaxGainValue = gini.Gain[p] + gini.MaxPartGain = p + } + } +} + +// +// GetMaxPartGainValue return the partition that have the maximum Gini gain. 
+// +func (gini *Gini) GetMaxPartGainValue() interface{} { + if gini.IsContinu { + return gini.ContinuPart[gini.MaxPartGain] + } + + return gini.DiscretePart[gini.MaxPartGain] +} + +// +// GetMaxGainValue return the value of partition which contain the maximum Gini +// gain. +// +func (gini *Gini) GetMaxGainValue() float64 { + return gini.MaxGainValue +} + +// +// GetMinIndexPartValue return the partition that have the minimum Gini index. +// +func (gini *Gini) GetMinIndexPartValue() interface{} { + if gini.IsContinu { + return gini.ContinuPart[gini.MinIndexPart] + } + + return gini.DiscretePart[gini.MinIndexPart] +} + +// +// GetMinIndexValue return the minimum Gini index value. +// +func (gini *Gini) GetMinIndexValue() float64 { + return gini.MinIndexValue +} + +// +// FindMaxGain find the attribute and value that have the maximum gain. +// The returned value is index of attribute. +// +func FindMaxGain(gains *[]Gini) (MaxGainIdx int) { + var gainValue = 0.0 + var maxGainValue = 0.0 + + for i := range *gains { + if (*gains)[i].Skip { + continue + } + gainValue = (*gains)[i].GetMaxGainValue() + if gainValue > maxGainValue { + maxGainValue = gainValue + MaxGainIdx = i + } + } + + return +} + +// +// FindMinGiniIndex return the index of attribute that have the minimum Gini index. +// +func FindMinGiniIndex(ginis *[]Gini) (MinIndexIdx int) { + var indexV = 0.0 + var minIndexV = 1.0 + + for i := range *ginis { + indexV = (*ginis)[i].GetMinIndexValue() + if indexV > minIndexV { + minIndexV = indexV + MinIndexIdx = i + } + } + + return +} + +// +// String yes, it will print it JSON like format. 
+// +func (gini Gini) String() (s string) { + s = fmt.Sprint("{\n", + " Skip :", gini.Skip, "\n", + " IsContinu :", gini.IsContinu, "\n", + " Index :", gini.Index, "\n", + " Value :", gini.Value, "\n", + " Gain :", gini.Gain, "\n", + " MaxPartGain :", gini.MaxPartGain, "\n", + " MaxGainValue :", gini.MaxGainValue, "\n", + " MinIndexPart :", gini.MinIndexPart, "\n", + " MinIndexValue :", gini.MinIndexValue, "\n", + " SortedIndex :", gini.SortedIndex, "\n", + " ContinuPart :", gini.ContinuPart, "\n", + " DiscretePart :", gini.DiscretePart, "\n", + "}") + return +} diff --git a/lib/mining/gain/gini/gini_test.go b/lib/mining/gain/gini/gini_test.go new file mode 100644 index 00000000..682d5159 --- /dev/null +++ b/lib/mining/gain/gini/gini_test.go @@ -0,0 +1,59 @@ +// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gini + +import ( + "fmt" + "testing" +) + +var data = [][]float64{ + {1.0, 6.0, 5.0, 4.0, 7.0, 3.0, 8.0, 7.0, 5.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, +} +var targetValues = []string{"P", "P", "N", "P", "N", "N", "N", "P", "N"} +var classes = []string{"P", "N"} + +func TestComputeContinu(t *testing.T) { + target := make([]string, len(targetValues)) + + copy(target, targetValues) + + fmt.Println("target:", target) + + fmt.Println("data:", data[0]) + GINI := Gini{} + GINI.ComputeContinu(&data[0], &target, &classes) + fmt.Println(">>> gini:", GINI) + + fmt.Println("data:", data[1]) + GINI = Gini{} + GINI.ComputeContinu(&data[1], &target, &classes) + fmt.Println(">>> gini:", GINI) +} + +var discreteSamples = [][]string{ + {"T", "T", "T", "F", "F", "F", "F", "T", "F"}, + {"T", "T", "F", "F", "T", "T", "F", "F", "T"}, + {"T", "T", "F", "T", "F", "F", "F", "T", "F"}, +} +var discreteValues = []string{"T", "F"} + +func TestComputeDiscrete(t *testing.T) { + gini := Gini{} + target := make([]string, len(targetValues)) + + 
for _, sample := range discreteSamples { + copy(target, targetValues) + + fmt.Println("target:", target) + fmt.Println("data:", sample) + + gini.ComputeDiscrete(&sample, &discreteValues, &target, + &classes) + + fmt.Println(gini) + } +} diff --git a/lib/mining/gain/gini/ginifloat.go b/lib/mining/gain/gini/ginifloat.go new file mode 100644 index 00000000..b890b39d --- /dev/null +++ b/lib/mining/gain/gini/ginifloat.go @@ -0,0 +1,168 @@ +// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package gini + +import ( + "fmt" + + "github.com/shuLhan/share/lib/debug" + "github.com/shuLhan/share/lib/numbers" +) + +// +// ComputeContinuFloat Given an attribute A and the target attribute T which contain +// N classes in C, compute the information gain of A. +// +// The result of Gini partitions value, Gini Index, and Gini Gain is saved in +// ContinuPart, Index, and Gain. +// +// Algorithm, +// (0) Sort the attribute. +// (1) Sort the target attribute using sorted index. +// (2) Create continu partition. +// (3) Create temporary space for gini index and gini gain. +// (4) Compute gini index for all target. +// +func (gini *Gini) ComputeContinuFloat(A, T, C *[]float64) { + gini.IsContinu = true + + gini.SortedIndex = numbers.Floats64IndirectSort(*A, true) + + if debug.Value >= 1 { + fmt.Println("[gini] attr sorted :", A) + } + + // (1) + numbers.Floats64SortByIndex(T, gini.SortedIndex) + + // (2) + gini.createContinuPartition(A) + + // (3) + gini.Index = make([]float64, len(gini.ContinuPart)) + gini.Gain = make([]float64, len(gini.ContinuPart)) + gini.MinIndexValue = 1.0 + + // (4) + gini.Value = gini.computeFloat(T, C) + + gini.computeContinuGainFloat(A, T, C) +} + +// +// computeFloat will compute Gini value for attribute T. 
+// +// Gini value is computed using formula, +// +// 1 - sum (probability of each classes in T) +// +func (gini *Gini) computeFloat(T, C *[]float64) float64 { + n := float64(len(*T)) + if n == 0 { + return 0 + } + + classCount := numbers.Floats64Counts(*T, *C) + + var sump2 float64 + + for x, v := range classCount { + p := float64(v) / n + sump2 += (p * p) + + if debug.Value >= 3 { + fmt.Printf("[gini] compute (%f): (%d/%f)^2 = %f\n", + (*C)[x], v, n, p*p) + } + + } + + return 1 - sump2 +} + +// +// computeContinuGainFloat will compute gain for each partition. +// +// The Gini gain formula we used here is, +// +// Gain(part,S) = Gini(S) - ((count(left)/S * Gini(left)) +// + (count(right)/S * Gini(right))) +// +// where, +// - left is sub-sample from S that is less than part value. +// - right is sub-sample from S that is greater than part value. +// +// Algorithm, +// (0) For each partition value, +// (0.1) Find the split of samples between partition based on partition value. +// (0.2) Count class in partition. 
+// +func (gini *Gini) computeContinuGainFloat(A, T, C *[]float64) { + var gainLeft, gainRight float64 + var tleft, tright []float64 + + nsample := len(*A) + + if debug.Value >= 2 { + fmt.Println("[gini] sorted data:", A) + fmt.Println("[gini] Gini.Value:", gini.Value) + } + + // (0) + for p, contVal := range gini.ContinuPart { + + // (0.1) + partidx := nsample + for x, attrVal := range *A { + if attrVal > contVal { + partidx = x + break + } + } + + nleft := float64(partidx) + nright := float64(nsample - partidx) + probLeft := nleft / float64(nsample) + probRight := nright / float64(nsample) + + if partidx > 0 { + tleft = (*T)[0:partidx] + tright = (*T)[partidx:] + + gainLeft = gini.computeFloat(&tleft, C) + gainRight = gini.computeFloat(&tright, C) + } else { + tleft = nil + tright = (*T)[0:] + + gainLeft = 0 + gainRight = gini.computeFloat(&tright, C) + } + + // (0.2) + gini.Index[p] = ((probLeft * gainLeft) + + (probRight * gainRight)) + gini.Gain[p] = gini.Value - gini.Index[p] + + if debug.Value >= 3 { + fmt.Println("[gini] tleft:", tleft) + fmt.Println("[gini] tright:", tright) + + fmt.Printf("[gini] GiniGain(%v) = %f - (%f * %f) + (%f * %f) = %f\n", + contVal, gini.Value, probLeft, gainLeft, + probRight, gainRight, gini.Gain[p]) + } + + if gini.MinIndexValue > gini.Index[p] && gini.Index[p] != 0 { + gini.MinIndexValue = gini.Index[p] + gini.MinIndexPart = p + } + + if gini.MaxGainValue < gini.Gain[p] { + gini.MaxGainValue = gini.Gain[p] + gini.MaxPartGain = p + } + } +} diff --git a/lib/mining/knn/knn.go b/lib/mining/knn/knn.go new file mode 100644 index 00000000..0fa38f9c --- /dev/null +++ b/lib/mining/knn/knn.go @@ -0,0 +1,109 @@ +// Copyright 2015-2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +// Package knn implement the K Nearest Neighbor using Euclidian to compute the +// distance between samples. 
+// +package knn + +import ( + "fmt" + "math" + "sort" + + "github.com/shuLhan/share/lib/debug" + "github.com/shuLhan/share/lib/tabula" +) + +const ( + // TEuclidianDistance used in Runtime.DistanceMethod. + TEuclidianDistance = 0 +) + +// +// Runtime parameters for KNN processing. +// +type Runtime struct { + // DistanceMethod define how the distance between sample will be + // measured. + DistanceMethod int + // ClassIndex define index of class in dataset. + ClassIndex int `json:"ClassIndex"` + // K define number of nearest neighbors that will be searched. + K int `json:"K"` + + // AllNeighbors contain all neighbours + AllNeighbors Neighbors +} + +// +// ComputeEuclidianDistance compute the distance of instance with each sample in +// dataset `samples` and return it. +// +func (in *Runtime) ComputeEuclidianDistance(samples *tabula.Rows, + instance *tabula.Row, +) { + for x := range *samples { + row := (*samples)[x] + + // compute euclidian distance + d := 0.0 + for y, rec := range *row { + if y == in.ClassIndex { + // skip class attribute + continue + } + + ir := (*instance)[y] + diff := 0.0 + + diff = ir.Float() - rec.Float() + + d += math.Abs(diff) + } + + // only add sample distance which is not zero (its probably + // we calculating with the instance itself) + if d != 0 { + in.AllNeighbors.Add(row, math.Sqrt(d)) + } + } + + sort.Sort(&in.AllNeighbors) +} + +// +// FindNeighbors Given sample set and an instance, return the nearest neighbors as +// a slice of neighbors. +// +func (in *Runtime) FindNeighbors(samples *tabula.Rows, instance *tabula.Row) ( + kneighbors Neighbors, +) { + // Reset current input neighbours + in.AllNeighbors = Neighbors{} + + switch in.DistanceMethod { + case TEuclidianDistance: + in.ComputeEuclidianDistance(samples, instance) + } + + // Make sure number of neighbors is greater than request. 
+ minK := in.AllNeighbors.Len() + if minK > in.K { + minK = in.K + } + + if debug.Value >= 2 { + fmt.Println("[knn] all neighbors:", in.AllNeighbors.Len()) + } + + kneighbors = in.AllNeighbors.SelectRange(0, minK) + + if debug.Value >= 2 { + fmt.Println("[knn] k neighbors:", kneighbors.Len()) + } + + return +} diff --git a/lib/mining/knn/knn_test.go b/lib/mining/knn/knn_test.go new file mode 100644 index 00000000..86ef75c0 --- /dev/null +++ b/lib/mining/knn/knn_test.go @@ -0,0 +1,62 @@ +// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package knn + +import ( + "fmt" + "testing" + + "github.com/shuLhan/share/lib/dsv" + "github.com/shuLhan/share/lib/tabula" + "github.com/shuLhan/share/lib/test" +) + +func TestComputeEuclidianDistance(t *testing.T) { + var exp = []string{ + `[0.302891 0.608544 0.47413 1.42718 -0.811085 1]`, + `[0.243474 0.505146 0.472892 1.34802 -0.844252 1]` + + `[0.202343 0.485983 0.527533 1.47307 -0.809672 1]` + + `[0.215496 0.523418 0.51719 1.43548 -0.933981 1]` + + `[0.214331 0.546086 0.414773 1.38542 -0.702336 1]` + + `[0.301676 0.554505 0.594757 1.21258 -0.873084 1]`, + } + var expDistances = "[0.5257185558832786" + + " 0.5690474496911485" + + " 0.5888777462258191" + + " 0.6007362149895741" + + " 0.672666336306493]" + + // Reading data + dataset := tabula.Dataset{} + _, e := dsv.SimpleRead("../testdata/phoneme/phoneme.dsv", &dataset) + if nil != e { + return + } + + // Processing + knnIn := Runtime{ + DistanceMethod: TEuclidianDistance, + ClassIndex: 5, + K: 5, + } + + classes := dataset.GetRows().GroupByValue(knnIn.ClassIndex) + + _, minoritySet := classes.GetMinority() + + kneighbors := knnIn.FindNeighbors(&minoritySet, minoritySet[0]) + + var got string + rows := kneighbors.Rows() + for _, row := range *rows { + got += fmt.Sprint(*row) + } + + test.Assert(t, "", exp[1], got, true) + + distances := 
kneighbors.Distances() + got = fmt.Sprint(*distances) + test.Assert(t, "", expDistances, got, true) +} diff --git a/lib/mining/knn/neighbor.go b/lib/mining/knn/neighbor.go new file mode 100644 index 00000000..e472d09f --- /dev/null +++ b/lib/mining/knn/neighbor.go @@ -0,0 +1,150 @@ +// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package knn + +import ( + "github.com/shuLhan/share/lib/tabula" +) + +// +// Neighbors is a mapping between sample and their distance. +// This type implement the sort interface. +// +type Neighbors struct { + // rows contain pointer to rows. + rows []*tabula.Row + // Distance value + distances []float64 +} + +// +// Rows return all rows. +// +func (neighbors *Neighbors) Rows() *[]*tabula.Row { + return &neighbors.rows +} + +// +// Row return pointer to row at index `idx`. +// +func (neighbors *Neighbors) Row(idx int) *tabula.Row { + return neighbors.rows[idx] +} + +// +// Distances return slice of distance of each neighbours. +// +func (neighbors *Neighbors) Distances() *[]float64 { + return &neighbors.distances +} + +// +// Distance return distance value at index `idx`. +// +func (neighbors *Neighbors) Distance(idx int) float64 { + return neighbors.distances[idx] +} + +// +// Len return the number of neighbors. +// This is for sort interface. +// +func (neighbors *Neighbors) Len() int { + return len(neighbors.distances) +} + +// +// Less return true if i < j. +// This is for sort interface. +// +func (neighbors *Neighbors) Less(i, j int) bool { + if neighbors.distances[i] < neighbors.distances[j] { + return true + } + return false +} + +// +// Swap content of object in index i with index j. +// This is for sort interface. 
+// +func (neighbors *Neighbors) Swap(i, j int) { + row := neighbors.rows[i] + distance := neighbors.distances[i] + + neighbors.rows[i] = neighbors.rows[j] + neighbors.distances[i] = neighbors.distances[j] + + neighbors.rows[j] = row + neighbors.distances[j] = distance +} + +// +// Add new neighbor. +// +func (neighbors *Neighbors) Add(row *tabula.Row, distance float64) { + neighbors.rows = append(neighbors.rows, row) + neighbors.distances = append(neighbors.distances, distance) +} + +// +// SelectRange select all neighbors from index `start` to `end`. +// Return an empty set if start or end is out of range. +// +func (neighbors *Neighbors) SelectRange(start, end int) (newn Neighbors) { + if start < 0 { + return + } + + if end > neighbors.Len() { + return + } + + for x := start; x < end; x++ { + row := neighbors.rows[x] + newn.Add(row, neighbors.distances[x]) + } + return +} + +// +// SelectWhere return all neighbors where row value at index `idx` is equal +// to string `val`. +// +func (neighbors *Neighbors) SelectWhere(idx int, val string) (newn Neighbors) { + for x, row := range neighbors.rows { + colval := (*row)[idx].String() + + if colval == val { + newn.Add(row, neighbors.Distance(x)) + } + } + return +} + +// +// Contain return true if `row` is in neighbors and their index, otherwise +// return false and -1. +// +func (neighbors *Neighbors) Contain(row *tabula.Row) (bool, int) { + for x, xrow := range neighbors.rows { + if xrow.IsEqual(row) { + return true, x + } + } + return false, -1 +} + +// +// Replace neighbor at index `idx` with new row and distance value. 
+// +func (neighbors *Neighbors) Replace(idx int, row *tabula.Row, distance float64) { + if idx > len(neighbors.rows) { + return + } + + neighbors.rows[idx] = row + neighbors.distances[idx] = distance +} diff --git a/lib/mining/knn/neighbor_test.go b/lib/mining/knn/neighbor_test.go new file mode 100644 index 00000000..047d9a32 --- /dev/null +++ b/lib/mining/knn/neighbor_test.go @@ -0,0 +1,84 @@ +// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package knn + +import ( + "math/rand" + "sort" + "testing" + "time" + + "github.com/shuLhan/share/lib/tabula" + "github.com/shuLhan/share/lib/test" +) + +var dataFloat64 = [][]float64{ + {0.243474, 0.505146, 0.472892, 1.34802, -0.844252, 1}, + {0.202343, 0.485983, 0.527533, 1.47307, -0.809672, 1}, + {0.215496, 0.523418, 0.517190, 1.43548, -0.933981, 1}, + {0.214331, 0.546086, 0.414773, 1.38542, -0.702336, 1}, + {0.301676, 0.554505, 0.594757, 1.21258, -0.873084, 1}, +} + +var distances = []int{4, 3, 2, 1, 0} + +func createNeigbours() (neighbors Neighbors) { + for x, d := range dataFloat64 { + row := tabula.Row{} + + for _, v := range d { + rec := tabula.NewRecordReal(v) + row.PushBack(rec) + } + + neighbors.Add(&row, float64(distances[x])) + } + return +} + +func createNeigboursByIdx(indices []int) (neighbors Neighbors) { + for x, idx := range indices { + row := tabula.Row{} + + for _, v := range dataFloat64[idx] { + rec := tabula.NewRecordReal(v) + row.PushBack(rec) + } + + neighbors.Add(&row, float64(distances[x])) + } + return +} + +func TestContain(t *testing.T) { + rand.Seed(time.Now().UnixNano()) + + neighbors := createNeigbours() + + // pick random sample from neighbors + pickIdx := rand.Intn(neighbors.Len()) + randSample := neighbors.Row(pickIdx).Clone() + + isin, idx := neighbors.Contain(randSample) + + test.Assert(t, "", true, isin, true) + test.Assert(t, "", pickIdx, idx, true) + + // 
change one of record value to check for false. + (*randSample)[0].SetFloat(0) + + isin, _ = neighbors.Contain(randSample) + + test.Assert(t, "", false, isin, true) +} + +func TestSort(t *testing.T) { + neighbors := createNeigbours() + exp := createNeigboursByIdx(distances) + + sort.Sort(&neighbors) + + test.Assert(t, "", exp.Rows(), neighbors.Rows(), true) +} diff --git a/lib/mining/math/math.go b/lib/mining/math/math.go new file mode 100644 index 00000000..944890de --- /dev/null +++ b/lib/mining/math/math.go @@ -0,0 +1,68 @@ +// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +// Package math provide generic functions working with math. +// +package math + +import ( + "math" +) + +// +// Factorial compute the factorial of n. +// +func Factorial(n int) (f int) { + if n >= 0 { + f = 1 + } else { + f = -1 + n = n * -1 + } + + for ; n > 0; n-- { + f = f * n + } + + return f +} + +// +// BinomialCoefficient or combination, compute number of picking k from +// n possibilities. +// +// Result is n! / ((n - k)! * k!) +// +func BinomialCoefficient(n int, k int) int { + if k > n { + return 0 + } + return Factorial(n) / (Factorial(n-k) * Factorial(k)) +} + +// +// StirlingS2 The number of ways of partitioning a set of n elements into +// k nonempty sets (i.e., k set blocks), also called a Stirling set number. +// +// For example, the set {1,2,3} can be partitioned into three subsets in one way: +// {{1},{2},{3}}; into two subsets in three ways: {{1,2},{3}}, {{1,3},{2}}, +// and {{1},{2,3}}; and into one subset in one way: {{1,2,3}}. 
+// +// Ref: http://mathworld.wolfram.com/StirlingNumberoftheSecondKind.html +// +func StirlingS2(n int, k int) int { + if k == 1 || n == k { + return 1 + } + var sum int + + for i := 0; i <= k; i++ { + sum += int(math.Pow(-1, float64(i))) * + BinomialCoefficient(k, i) * + int(math.Pow(float64(k-i), float64(n))) + } + + return sum / Factorial(k) +} diff --git a/lib/mining/math/math_test.go b/lib/mining/math/math_test.go new file mode 100644 index 00000000..ae7bf563 --- /dev/null +++ b/lib/mining/math/math_test.go @@ -0,0 +1,48 @@ +// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package math + +import ( + "testing" +) + +func TestFactorial(t *testing.T) { + in := []int{-3, -2, -1, 0, 1, 2, 3} + exp := []int{-6, -2, -1, 1, 1, 2, 6} + + for i := range in { + res := Factorial(in[i]) + + if res != exp[i] { + t.Fatal("Expecting ", exp[i], ", got ", res) + } + } +} + +func TestBinomialCoefficient(t *testing.T) { + in := [][]int{{1, 2}, {1, 1}, {3, 2}, {5, 3}} + exp := []int{0, 1, 3, 10} + + for i := range in { + res := BinomialCoefficient(in[i][0], in[i][1]) + + if res != exp[i] { + t.Fatal("Expecting ", exp[i], ", got ", res) + } + } +} + +func TestStirlingS2(t *testing.T) { + in := [][]int{{3, 1}, {3, 2}, {3, 3}, {5, 3}} + exp := []int{1, 3, 1, 25} + + for i := range in { + res := StirlingS2(in[i][0], in[i][1]) + + if res != exp[i] { + t.Fatal("Expecting ", exp[i], ", got ", res) + } + } +} diff --git a/lib/mining/resampling/interface.go b/lib/mining/resampling/interface.go new file mode 100644 index 00000000..58c333ce --- /dev/null +++ b/lib/mining/resampling/interface.go @@ -0,0 +1,52 @@ +// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +// Package resampling provide common interface, constants, and methods for +// resampling modules. +// +package resampling + +import ( + "github.com/shuLhan/share/lib/dsv" + "github.com/shuLhan/share/lib/tabula" +) + +const ( + // DefaultK nearest neighbors. + DefaultK = 5 + // DefaultPercentOver sampling. + DefaultPercentOver = 100 +) + +// +// Interface define common methods used by resampling module. +// +type Interface interface { + GetSynthetics() tabula.DatasetInterface +} + +// +// WriteSynthetics will write synthetic samples in resampling module `ri` into +// `file`. +// +func WriteSynthetics(ri Interface, file string) (e error) { + writer, e := dsv.NewWriter("") + if nil != e { + return + } + + e = writer.OpenOutput(file) + if e != nil { + return + } + + sep := dsv.DefSeparator + _, e = writer.WriteRawDataset(ri.GetSynthetics(), &sep) + if e != nil { + return + } + + return writer.Close() +} diff --git a/lib/mining/resampling/lnsmote/lnsmote.go b/lib/mining/resampling/lnsmote/lnsmote.go new file mode 100644 index 00000000..ce6f26e9 --- /dev/null +++ b/lib/mining/resampling/lnsmote/lnsmote.go @@ -0,0 +1,287 @@ +// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +// Package lnsmote implement the Local-Neighborhood algorithm from the paper, +// +// Maciejewski, Tomasz, and Jerzy Stefanowski. "Local neighbourhood +// extension of SMOTE for mining imbalanced data." Computational +// Intelligence and Data Mining (CIDM), 2011 IEEE Symposium on. IEEE, +// 2011. +// +package lnsmote + +import ( + "fmt" + "math/rand" + + "github.com/shuLhan/share/lib/debug" + "github.com/shuLhan/share/lib/dsv" + "github.com/shuLhan/share/lib/mining/knn" + "github.com/shuLhan/share/lib/mining/resampling/smote" + "github.com/shuLhan/share/lib/tabula" +) + +// +// Runtime parameters for input and output. 
+// +type Runtime struct { + // Runtime of SMOTE, since this module extend the SMOTE method. + smote.Runtime + + // ClassMinor the minority sample in dataset that we want to + // oversampling. + ClassMinor string `json:"ClassMinor"` + + // minorset contain minor class in samples. + minorset tabula.DatasetInterface + + // datasetRows contain all rows in dataset. + datasetRows *tabula.Rows + + // outliersRows contain all sample that is detected as outliers. + outliers tabula.Rows + + // OutliersFile if its not empty then outliers will be saved in file + // specified by this option. + OutliersFile string `json:"OutliersFile"` +} + +// +// New create and return new LnSmote object. +// +func New(percentOver, k, classIndex int, classMinor, outliers string) ( + lnsmoteRun *Runtime, +) { + lnsmoteRun = &Runtime{ + Runtime: smote.Runtime{ + Runtime: knn.Runtime{ + DistanceMethod: knn.TEuclidianDistance, + ClassIndex: classIndex, + K: k, + }, + PercentOver: percentOver, + }, + ClassMinor: classMinor, + OutliersFile: outliers, + } + + return +} + +// +// Init will initialize LNSmote runtime by checking input values and set it to +// default if not set or invalid. +// +func (in *Runtime) Init(dataset tabula.ClasetInterface) { + in.Runtime.Init() + + in.NSynthetic = in.PercentOver / 100.0 + in.datasetRows = dataset.GetDataAsRows() + + in.minorset = tabula.SelectRowsWhere(dataset, in.ClassIndex, + in.ClassMinor) + + in.outliers = make(tabula.Rows, 0) + + if debug.Value >= 1 { + fmt.Println("[lnsmote] n:", in.NSynthetic) + fmt.Println("[lnsmote] n minority:", in.minorset.Len()) + } +} + +// +// Resampling will run resampling process on dataset and return the synthetic +// samples. 
+// +func (in *Runtime) Resampling(dataset tabula.ClasetInterface) ( + e error, +) { + in.Init(dataset) + + minorRows := in.minorset.GetDataAsRows() + + for x := range *minorRows { + p := (*minorRows)[x] + + neighbors := in.FindNeighbors(in.datasetRows, p) + + if debug.Value >= 3 { + fmt.Println("[lnsmote] neighbors:", neighbors.Rows()) + } + + for y := 0; y < in.NSynthetic; y++ { + syn := in.createSynthetic(p, neighbors) + + if syn != nil { + in.Synthetics.PushRow(syn) + } + } + + if debug.Value >= 1 { + fmt.Printf("[lnsmote] %-4d n synthetics: %v\n", x, + in.Synthetics.Len()) + } + } + + if in.SyntheticFile != "" { + e = in.Write(in.SyntheticFile) + } + if in.OutliersFile != "" && in.outliers.Len() > 0 { + e = in.writeOutliers() + } + + return +} + +// +// createSynthetic will create synthetics row from original row `p` and their +// `neighbors`. +// +func (in *Runtime) createSynthetic(p *tabula.Row, neighbors knn.Neighbors) ( + synthetic *tabula.Row, +) { + // choose one of the K nearest neighbors + randIdx := rand.Intn(neighbors.Len()) + n := neighbors.Row(randIdx) + + // Check if synthetic sample can be created from p and n. + canit, slp, sln := in.canCreate(p, n) + if !canit { + if debug.Value >= 2 { + fmt.Println("[lnsmote] can not create synthetic") + } + + if slp.Len() <= 0 { + in.outliers.PushBack(p) + } + + // we can not create from p and synthetic. + return nil + } + + synthetic = p.Clone() + + for x, srec := range *synthetic { + // Skip class attribute. + if x == in.ClassIndex { + continue + } + + delta := in.randomGap(p, n, slp.Len(), sln.Len()) + pv := (*p)[x].Float() + diff := (*n)[x].Float() - pv + srec.SetFloat(pv + delta*diff) + } + + return +} + +// +// canCreate return true if synthetic can be created between two sample `p` and +// `n`. Otherwise it will return false. 
+// +func (in *Runtime) canCreate(p, n *tabula.Row) (bool, knn.Neighbors, + knn.Neighbors, +) { + slp := in.safeLevel(p) + sln := in.safeLevel2(p, n) + + if debug.Value >= 2 { + fmt.Println("[lnsmote] slp : ", slp.Len()) + fmt.Println("[lnsmote] sln : ", sln.Len()) + } + + return slp.Len() != 0 || sln.Len() != 0, slp, sln +} + +// +// safeLevel return the minority neighbors in sample `p`. +// +func (in *Runtime) safeLevel(p *tabula.Row) knn.Neighbors { + neighbors := in.FindNeighbors(in.datasetRows, p) + minorNeighbors := neighbors.SelectWhere(in.ClassIndex, in.ClassMinor) + + return minorNeighbors +} + +// +// safeLevel2 return the minority neighbors between sample `p` and `n`. +// +func (in *Runtime) safeLevel2(p, n *tabula.Row) knn.Neighbors { + neighbors := in.FindNeighbors(in.datasetRows, n) + + // check if n is in minority class. + nIsMinor := (*n)[in.ClassIndex].IsEqualToString(in.ClassMinor) + + // check if p is in neighbors. + pInNeighbors, pidx := neighbors.Contain(p) + + // if p in neighbors, replace it with neighbours in K+1 + if nIsMinor && pInNeighbors { + if debug.Value >= 1 { + fmt.Println("[lnsmote] Replacing ", pidx) + } + if debug.Value >= 2 { + fmt.Println("[lnsmote] Replacing ", pidx, " in ", neighbors) + } + + row := in.AllNeighbors.Row(in.K + 1) + dist := in.AllNeighbors.Distance(in.K + 1) + neighbors.Replace(pidx, row, dist) + + if debug.Value >= 2 { + fmt.Println("[lnsmote] Replacement ", neighbors) + } + } + + minorNeighbors := neighbors.SelectWhere(in.ClassIndex, in.ClassMinor) + + return minorNeighbors +} + +// +// randomGap return the neighbors gap between sample `p` and `n` using safe +// level (number of minority neighbors) of p in `lenslp` and `n` in `lensln`. 
+// +func (in *Runtime) randomGap(p, n *tabula.Row, lenslp, lensln int) ( + delta float64, +) { + if lensln == 0 && lenslp > 0 { + return + } + + slratio := float64(lenslp) / float64(lensln) + if slratio == 1 { + delta = rand.Float64() + } else if slratio > 1 { + delta = rand.Float64() * (1 / slratio) + } else { + delta = 1 - rand.Float64()*slratio + } + + return delta +} + +// writeOutliers will save the `outliers` to file specified by +// `OutliersFile`. +func (in *Runtime) writeOutliers() (e error) { + writer, e := dsv.NewWriter("") + if nil != e { + return + } + + e = writer.OpenOutput(in.OutliersFile) + if e != nil { + return + } + + sep := dsv.DefSeparator + _, e = writer.WriteRawRows(&in.outliers, &sep) + if e != nil { + return + } + + return writer.Close() +} diff --git a/lib/mining/resampling/lnsmote/lnsmote_test.go b/lib/mining/resampling/lnsmote/lnsmote_test.go new file mode 100644 index 00000000..1c8e45e4 --- /dev/null +++ b/lib/mining/resampling/lnsmote/lnsmote_test.go @@ -0,0 +1,74 @@ +// Copyright 2016 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package lnsmote + +import ( + "fmt" + "os" + "testing" + + "github.com/shuLhan/share/lib/dsv" + "github.com/shuLhan/share/lib/tabula" +) + +const ( + fcfg = "../../testdata/phoneme/phoneme.dsv" +) + +func TestLNSmote(t *testing.T) { + // Read sample dataset. + dataset := tabula.Claset{} + _, e := dsv.SimpleRead(fcfg, &dataset) + if nil != e { + t.Fatal(e) + } + + fmt.Println("[lnsmote_test] Total samples:", dataset.GetNRow()) + + // Write original samples. + writer, e := dsv.NewWriter("") + + if nil != e { + t.Fatal(e) + } + + e = writer.OpenOutput("phoneme_lnsmote.csv") + if e != nil { + t.Fatal(e) + } + + sep := dsv.DefSeparator + _, e = writer.WriteRawRows(dataset.GetRows(), &sep) + if e != nil { + t.Fatal(e) + } + + // Initialize LN-SMOTE. 
+ lnsmoteRun := New(100, 5, 5, "1", "lnsmote.outliers") + + e = lnsmoteRun.Resampling(&dataset) + + fmt.Println("[lnsmote_test] # synthetic:", lnsmoteRun.Synthetics.Len()) + + sep = dsv.DefSeparator + _, e = writer.WriteRawRows(lnsmoteRun.Synthetics.GetRows(), &sep) + if e != nil { + t.Fatal(e) + } + + e = writer.Close() + if e != nil { + t.Fatal(e) + } +} + +func TestMain(m *testing.M) { + envTestLnsmote := os.Getenv("TEST_LNSMOTE") + if len(envTestLnsmote) == 0 { + os.Exit(0) + } + + os.Exit(m.Run()) +} diff --git a/lib/mining/resampling/lnsmote/phoneme_lnsmote.dsv b/lib/mining/resampling/lnsmote/phoneme_lnsmote.dsv new file mode 100644 index 00000000..cd1fef8b --- /dev/null +++ b/lib/mining/resampling/lnsmote/phoneme_lnsmote.dsv @@ -0,0 +1,38 @@ +{ + "Input" :"phoneme_lnsmote.csv" +, "Rejected" :"phoneme.rej" +, "MaxRows" :-1 +, "ClassMetadataIndex" :5 +, "ClassIndex" :5 +, "DatasetMode" :"matrix" +, "InputMetadata" : + [{ + "Name" :"f1" + , "Type" :"real" + , "Separator" :"," + },{ + "Name" :"f2" + , "Type" :"real" + , "Separator" :"," + },{ + "Name" :"f3" + , "Type" :"real" + , "Separator" :"," + },{ + "Name" :"f4" + , "Type" :"real" + , "Separator" :"," + },{ + "Name" :"f5" + , "Type" :"real" + , "Separator" :"," + },{ + "Name" :"class" + , "Type" :"string" + , "ValueSpace" : + [ + "0" + , "1" + ] + }] +} diff --git a/lib/mining/resampling/smote/phoneme_smote.dsv b/lib/mining/resampling/smote/phoneme_smote.dsv new file mode 100644 index 00000000..63180379 --- /dev/null +++ b/lib/mining/resampling/smote/phoneme_smote.dsv @@ -0,0 +1,38 @@ +{ + "Input" :"phoneme_smote.csv" +, "Rejected" :"phoneme.rej" +, "MaxRows" :-1 +, "ClassMetadataIndex" :5 +, "ClassIndex" :5 +, "DatasetMode" :"matrix" +, "InputMetadata" : + [{ + "Name" :"f1" + , "Type" :"real" + , "Separator" :"," + },{ + "Name" :"f2" + , "Type" :"real" + , "Separator" :"," + },{ + "Name" :"f3" + , "Type" :"real" + , "Separator" :"," + },{ + "Name" :"f4" + , "Type" :"real" + , "Separator" :"," + },{ + 
"Name" :"f5" + , "Type" :"real" + , "Separator" :"," + },{ + "Name" :"class" + , "Type" :"string" + , "ValueSpace" : + [ + "0" + , "1" + ] + }] +} diff --git a/lib/mining/resampling/smote/smote.go b/lib/mining/resampling/smote/smote.go new file mode 100644 index 00000000..32f21b7e --- /dev/null +++ b/lib/mining/resampling/smote/smote.go @@ -0,0 +1,183 @@ +// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +// Package smote resamples a dataset by applying the Synthetic Minority +// Oversampling TEchnique (SMOTE). The original dataset must fit entirely in +// memory. The amount of SMOTE and number of nearest neighbors may be specified. +// For more information, see +// +// Nitesh V. Chawla et. al. (2002). Synthetic Minority Over-sampling +// Technique. Journal of Artificial Intelligence Research. 16:321-357. +// +package smote + +import ( + "fmt" + "math/rand" + "time" + + "github.com/shuLhan/share/lib/mining/knn" + "github.com/shuLhan/share/lib/mining/resampling" + "github.com/shuLhan/share/lib/tabula" +) + +// +// Runtime for input and output. +// +type Runtime struct { + // Runtime the K-Nearest-Neighbourhood parameters. + knn.Runtime + // PercentOver input for oversampling percentage. + PercentOver int `json:"PercentOver"` + // SyntheticFile is a filename where synthetic samples will be written. + SyntheticFile string `json:"SyntheticFile"` + // NSynthetic input for number of new synthetic per sample. + NSynthetic int + // Synthetics contain output of resampling as synthetic samples. + Synthetics tabula.Dataset +} + +// +// New create and return new smote runtime. 
+// +func New(percentOver, k, classIndex int) (smoteRun *Runtime) { + smoteRun = &Runtime{ + Runtime: knn.Runtime{ + DistanceMethod: knn.TEuclidianDistance, + ClassIndex: classIndex, + K: k, + }, + PercentOver: percentOver, + } + return +} + +// +// Init will recheck input and set to default value if its not valid. +// +func (smote *Runtime) Init() { + rand.Seed(time.Now().UnixNano()) + + if smote.K <= 0 { + smote.K = resampling.DefaultK + } + if smote.PercentOver <= 0 { + smote.PercentOver = resampling.DefaultPercentOver + } +} + +// +// GetSynthetics return synthetic samples. +// +func (smote *Runtime) GetSynthetics() tabula.DatasetInterface { + return &smote.Synthetics +} + +// +// populate will generate new synthetic sample using nearest neighbors. +// +func (smote *Runtime) populate(instance *tabula.Row, neighbors knn.Neighbors) { + lenAttr := len(*instance) + + for x := 0; x < smote.NSynthetic; x++ { + // choose one of the K nearest neighbors + n := rand.Intn(neighbors.Len()) + sample := neighbors.Row(n) + + newSynt := make(tabula.Row, lenAttr) + + // Compute new synthetic attributes. + for attr, sr := range *sample { + if attr == smote.ClassIndex { + continue + } + + ir := (*instance)[attr] + + iv := ir.Float() + sv := sr.Float() + + dif := sv - iv + gap := rand.Float64() + newAttr := iv + (gap * dif) + + record := &tabula.Record{} + record.SetFloat(newAttr) + newSynt[attr] = record + } + + newSynt[smote.ClassIndex] = (*instance)[smote.ClassIndex] + + smote.Synthetics.PushRow(&newSynt) + } +} + +// +// Resampling will run resampling algorithm using values that has been defined +// in `Runtime` and return list of synthetic samples. +// +// The `dataset` must be samples of minority class not the whole dataset. 
+// +// Algorithms, +// +// (0) If oversampling percentage less than 100, then +// (0.1) replace the input dataset by selecting n random sample from dataset +// without replacement, where n is +// +// (percentage-oversampling / 100) * number-of-sample +// +// (1) For each `sample` in dataset, +// (1.1) find k-nearest-neighbors of `sample`, +// (1.2) generate synthetic sample in neighbors. +// (2) Write synthetic samples to file, only if `SyntheticFile` is not empty. +// +func (smote *Runtime) Resampling(dataset tabula.Rows) (e error) { + smote.Init() + + if smote.PercentOver < 100 { + // (0.1) + smote.NSynthetic = (smote.PercentOver / 100.0) * len(dataset) + dataset, _, _, _ = dataset.RandomPick(smote.NSynthetic, false) + } else { + smote.NSynthetic = smote.PercentOver / 100.0 + } + + // (1) + for x := range dataset { + sample := dataset[x] + + // (1.1) + neighbors := smote.FindNeighbors(&dataset, sample) + + // (1.2) + smote.populate(sample, neighbors) + } + + // (2) + if smote.SyntheticFile != "" { + e = resampling.WriteSynthetics(smote, smote.SyntheticFile) + } + + return +} + +// +// Write will write synthetic samples to file defined in `file`. +// +func (smote *Runtime) Write(file string) error { + return resampling.WriteSynthetics(smote, file) +} + +func (smote *Runtime) String() (s string) { + s = fmt.Sprintf("'smote' : {\n"+ + " 'ClassIndex' :%d\n"+ + " , 'K' :%d\n"+ + " , 'PercentOver' :%d\n"+ + " , 'DistanceMethod' :%d\n"+ + "}", smote.ClassIndex, smote.K, smote.PercentOver, + smote.DistanceMethod) + + return +} diff --git a/lib/mining/resampling/smote/smote_test.go b/lib/mining/resampling/smote/smote_test.go new file mode 100644 index 00000000..c3a633cb --- /dev/null +++ b/lib/mining/resampling/smote/smote_test.go @@ -0,0 +1,49 @@ +// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package smote + +import ( + "fmt" + "testing" + + "github.com/shuLhan/share/lib/dsv" + "github.com/shuLhan/share/lib/tabula" +) + +var ( + fcfg = "../../testdata/phoneme/phoneme.dsv" + PercentOver = 100 + K = 5 +) + +func TestSmote(t *testing.T) { + smot := New(PercentOver, K, 5) + + // Read samples. + dataset := tabula.Claset{} + + _, e := dsv.SimpleRead(fcfg, &dataset) + if nil != e { + t.Fatal(e) + } + + fmt.Println("[smote_test] Total samples:", dataset.Len()) + + minorset := dataset.GetMinorityRows() + + fmt.Println("[smote_test] # minority samples:", minorset.Len()) + + e = smot.Resampling(*minorset) + if e != nil { + t.Fatal(e) + } + + fmt.Println("[smote_test] # synthetic:", smot.GetSynthetics().Len()) + + e = smot.Write("phoneme_smote.csv") + if e != nil { + t.Fatal(e) + } +} diff --git a/lib/mining/testdata/forensic_glass/fgl.data b/lib/mining/testdata/forensic_glass/fgl.data new file mode 100644 index 00000000..8dc6fc0c --- /dev/null +++ b/lib/mining/testdata/forensic_glass/fgl.data @@ -0,0 +1,214 @@ +1 3.01 13.64 4.49 1.10 71.78 0.06 8.75 0.00 0.00 WinF +2 -0.39 13.89 3.60 1.36 72.73 0.48 7.83 0.00 0.00 WinF +3 -1.82 13.53 3.55 1.54 72.99 0.39 7.78 0.00 0.00 WinF +4 -0.34 13.21 3.69 1.29 72.61 0.57 8.22 0.00 0.00 WinF +5 -0.58 13.27 3.62 1.24 73.08 0.55 8.07 0.00 0.00 WinF +6 -2.04 12.79 3.61 1.62 72.97 0.64 8.07 0.00 0.26 WinF +7 -0.57 13.30 3.60 1.14 73.09 0.58 8.17 0.00 0.00 WinF +8 -0.44 13.15 3.61 1.05 73.24 0.57 8.24 0.00 0.00 WinF +9 1.18 14.04 3.58 1.37 72.08 0.56 8.30 0.00 0.00 WinF +10 -0.45 13.00 3.60 1.36 72.99 0.57 8.40 0.00 0.11 WinF +11 -2.29 12.72 3.46 1.56 73.20 0.67 8.09 0.00 0.24 WinF +12 -0.37 12.80 3.66 1.27 73.01 0.60 8.56 0.00 0.00 WinF +13 -2.11 12.88 3.43 1.40 73.28 0.69 8.05 0.00 0.24 WinF +14 -0.52 12.86 3.56 1.27 73.21 0.54 8.38 0.00 0.17 WinF +15 -0.37 12.61 3.59 1.31 73.29 0.58 8.50 0.00 0.00 WinF +16 -0.39 12.81 3.54 1.23 73.24 0.58 8.39 0.00 0.00 WinF +17 -0.16 12.68 3.67 1.16 73.11 0.61 8.70 0.00 0.00 WinF +18 3.96 
14.36 3.85 0.89 71.36 0.15 9.15 0.00 0.00 WinF +19 1.11 13.90 3.73 1.18 72.12 0.06 8.89 0.00 0.00 WinF +20 -0.65 13.02 3.54 1.69 72.73 0.54 8.44 0.00 0.07 WinF +21 -0.50 12.82 3.55 1.49 72.75 0.54 8.52 0.00 0.19 WinF +22 1.66 14.77 3.75 0.29 72.02 0.03 9.00 0.00 0.00 WinF +23 -0.64 12.78 3.62 1.29 72.79 0.59 8.70 0.00 0.00 WinF +24 -0.49 12.81 3.57 1.35 73.02 0.62 8.59 0.00 0.00 WinF +25 -0.80 13.38 3.50 1.15 72.85 0.50 8.43 0.00 0.00 WinF +26 -0.36 12.98 3.54 1.21 73.00 0.65 8.53 0.00 0.00 WinF +27 -0.07 13.21 3.48 1.41 72.64 0.59 8.43 0.00 0.00 WinF +28 -0.79 12.87 3.48 1.33 73.04 0.56 8.43 0.00 0.00 WinF +29 -0.32 12.56 3.52 1.43 73.15 0.57 8.54 0.00 0.00 WinF +30 -0.16 13.08 3.49 1.28 72.86 0.60 8.49 0.00 0.00 WinF +31 -0.32 12.65 3.56 1.30 73.08 0.61 8.69 0.00 0.14 WinF +32 -0.53 12.84 3.50 1.14 73.27 0.56 8.55 0.00 0.00 WinF +33 -0.25 12.85 3.48 1.23 72.97 0.61 8.56 0.09 0.22 WinF +34 -0.47 12.57 3.47 1.38 73.39 0.60 8.55 0.00 0.06 WinF +35 -0.17 12.69 3.54 1.34 72.95 0.57 8.75 0.00 0.00 WinF +36 -2.33 13.29 3.45 1.21 72.74 0.56 8.57 0.00 0.00 WinF +37 1.09 13.89 3.53 1.32 71.81 0.51 8.78 0.11 0.00 WinF +38 -0.03 12.74 3.48 1.35 72.96 0.64 8.68 0.00 0.00 WinF +39 4.13 14.21 3.82 0.47 71.77 0.11 9.57 0.00 0.00 WinF +40 4.13 14.21 3.82 0.47 71.77 0.11 9.57 0.00 0.00 WinF +41 -0.07 12.79 3.50 1.12 73.03 0.64 8.77 0.00 0.00 WinF +42 -0.45 12.71 3.42 1.20 73.20 0.59 8.64 0.00 0.00 WinF +43 -0.21 13.21 3.39 1.33 72.76 0.59 8.59 0.00 0.00 WinF +44 4.10 13.73 3.84 0.72 71.76 0.17 9.74 0.00 0.00 WinF +45 -0.14 12.73 3.43 1.19 72.95 0.62 8.76 0.00 0.30 WinF +46 1.00 13.49 3.48 1.35 71.95 0.55 9.00 0.00 0.00 WinF +47 0.69 13.19 3.37 1.18 72.72 0.57 8.83 0.00 0.16 WinF +48 8.67 13.99 3.70 0.71 71.57 0.02 9.82 0.00 0.10 WinF +49 4.23 13.21 3.77 0.79 71.99 0.13 10.02 0.00 0.00 WinF +50 0.98 13.58 3.35 1.23 72.08 0.59 8.91 0.00 0.00 WinF +51 5.20 13.72 3.72 0.51 71.75 0.09 10.06 0.00 0.16 WinF +52 1.26 13.20 3.33 1.28 72.36 0.60 9.14 0.00 0.11 WinF +53 0.08 13.43 2.87 1.19 
72.84 0.55 9.03 0.00 0.00 WinF +54 0.37 13.14 2.84 1.28 72.85 0.55 9.07 0.00 0.00 WinF +55 -0.22 13.21 2.81 1.29 72.98 0.51 9.02 0.00 0.09 WinF +56 -0.31 12.45 2.71 1.29 73.70 0.56 9.06 0.00 0.24 WinF +57 -5.85 12.99 3.47 1.12 72.98 0.62 8.35 0.00 0.31 WinF +58 0.24 12.87 3.48 1.29 72.95 0.60 8.43 0.00 0.00 WinF +59 -0.46 13.48 3.74 1.17 72.99 0.59 8.03 0.00 0.00 WinF +60 -0.46 13.39 3.66 1.19 72.79 0.57 8.27 0.00 0.11 WinF +61 1.05 13.60 3.62 1.11 72.64 0.14 8.76 0.00 0.00 WinF +62 1.77 13.81 3.58 1.32 71.72 0.12 8.67 0.69 0.00 WinF +63 3.72 13.51 3.86 0.88 71.79 0.23 9.54 0.00 0.11 WinF +64 4.27 14.17 3.81 0.78 71.35 0.00 9.69 0.00 0.00 WinF +65 3.72 13.48 3.74 0.90 72.01 0.18 9.61 0.00 0.07 WinF +66 2.99 13.69 3.59 1.12 71.96 0.09 9.40 0.00 0.00 WinF +67 3.52 13.05 3.65 0.87 72.22 0.19 9.85 0.00 0.17 WinF +68 3.52 13.05 3.65 0.87 72.32 0.19 9.85 0.00 0.17 WinF +69 3.52 13.12 3.58 0.90 72.20 0.23 9.82 0.00 0.16 WinF +70 5.00 13.31 3.58 0.82 71.99 0.12 10.17 0.00 0.03 WinF +71 -2.26 14.86 3.67 1.74 71.87 0.16 7.36 0.00 0.12 WinNF +72 0.48 13.64 3.87 1.27 71.96 0.54 8.32 0.00 0.32 WinNF +73 -2.07 13.09 3.59 1.52 73.10 0.67 7.83 0.00 0.00 WinNF +74 -1.69 13.34 3.57 1.57 72.87 0.61 7.89 0.00 0.00 WinNF +75 -2.04 13.02 3.56 1.54 73.11 0.72 7.90 0.00 0.00 WinNF +76 -2.10 13.02 3.58 1.51 73.12 0.69 7.96 0.00 0.00 WinNF +77 -1.55 13.44 3.61 1.54 72.39 0.66 8.03 0.00 0.00 WinNF +78 -1.73 13.00 3.58 1.54 72.83 0.61 8.04 0.00 0.00 WinNF +79 -1.87 13.92 3.52 1.25 72.88 0.37 7.94 0.00 0.14 WinNF +80 -2.10 12.82 3.52 1.90 72.86 0.69 7.97 0.00 0.00 WinNF +81 -2.08 12.86 3.52 2.12 72.66 0.69 7.97 0.00 0.00 WinNF +82 -2.07 13.25 3.45 1.43 73.17 0.61 7.86 0.00 0.00 WinNF +83 -1.54 13.41 3.55 1.25 72.81 0.68 8.10 0.00 0.00 WinNF +84 -2.06 13.09 3.52 1.55 72.87 0.68 8.05 0.00 0.09 WinNF +85 -3.91 14.25 3.09 2.08 72.28 1.10 7.08 0.00 0.00 WinNF +86 -1.75 13.36 3.58 1.49 72.72 0.45 8.21 0.00 0.00 WinNF +87 -2.31 13.24 3.49 1.47 73.25 0.38 8.03 0.00 0.00 WinNF +88 -1.55 13.40 3.49 1.52 
72.65 0.67 8.08 0.00 0.10 WinNF +89 -1.82 13.01 3.50 1.48 72.89 0.60 8.12 0.00 0.00 WinNF +90 -1.60 12.55 3.48 1.87 73.23 0.63 8.08 0.00 0.09 WinNF +91 0.41 12.93 3.74 1.11 72.28 0.64 8.96 0.00 0.22 WinNF +92 -1.95 12.90 3.44 1.45 73.06 0.44 8.27 0.00 0.00 WinNF +93 -2.12 13.12 3.41 1.58 73.26 0.07 8.39 0.00 0.19 WinNF +94 -2.10 13.24 3.34 1.47 73.10 0.39 8.22 0.00 0.00 WinNF +95 -1.71 12.71 3.33 1.49 73.28 0.67 8.24 0.00 0.00 WinNF +96 0.60 13.36 3.43 1.43 72.26 0.51 8.60 0.00 0.00 WinNF +97 0.41 13.02 3.62 1.06 72.34 0.64 9.13 0.00 0.15 WinNF +98 -0.57 12.20 3.25 1.16 73.55 0.62 8.90 0.00 0.24 WinNF +99 -1.11 12.67 2.88 1.71 73.21 0.73 8.54 0.00 0.00 WinNF +100 0.11 12.96 2.96 1.43 72.92 0.60 8.79 0.14 0.00 WinNF +101 -1.45 12.75 2.85 1.44 73.27 0.57 8.79 0.11 0.22 WinNF +102 -0.70 12.35 2.72 1.63 72.87 0.70 9.23 0.00 0.00 WinNF +103 0.20 12.62 2.76 0.83 73.81 0.35 9.42 0.00 0.20 WinNF +104 9.25 13.80 3.15 0.66 70.57 0.08 11.64 0.00 0.00 WinNF +105 6.10 13.83 2.90 1.17 71.15 0.08 10.79 0.00 0.00 WinNF +106 6.75 11.45 0.00 1.88 72.19 0.81 13.24 0.00 0.34 WinNF +107 13.25 10.73 0.00 2.10 69.81 0.58 13.30 3.15 0.28 WinNF +108 15.93 12.30 0.00 1.00 70.16 0.12 16.19 0.00 0.24 WinNF +109 4.22 14.43 0.00 1.00 72.67 0.10 11.52 0.00 0.08 WinNF +110 0.18 13.72 0.00 0.56 74.45 0.00 10.99 0.00 0.00 WinNF +111 8.64 11.23 0.00 0.77 73.21 0.00 14.68 0.00 0.00 WinNF +112 9.39 11.02 0.00 0.75 73.08 0.00 14.96 0.00 0.00 WinNF +113 9.77 12.64 0.00 0.67 72.02 0.06 14.40 0.00 0.00 WinNF +114 0.92 13.46 3.83 1.26 72.55 0.57 8.21 0.00 0.14 WinNF +115 0.47 13.10 3.97 1.19 72.44 0.60 8.43 0.00 0.00 WinNF +116 0.46 13.41 3.89 1.33 72.38 0.51 8.28 0.00 0.00 WinNF +117 0.29 13.24 3.90 1.41 72.33 0.55 8.31 0.00 0.10 WinNF +118 -0.92 13.72 3.68 1.81 72.06 0.64 7.88 0.00 0.00 WinNF +119 -1.27 13.30 3.64 1.53 72.53 0.65 8.03 0.00 0.29 WinNF +120 -1.48 13.56 3.57 1.47 72.45 0.64 7.96 0.00 0.00 WinNF +121 0.44 13.25 3.76 1.32 72.40 0.58 8.42 0.00 0.00 WinNF +122 -1.37 12.93 3.54 1.62 72.96 0.64 
8.03 0.00 0.21 WinNF +123 -1.13 13.23 3.54 1.48 72.84 0.56 8.10 0.00 0.00 WinNF +124 -0.93 13.48 3.48 1.71 72.52 0.62 7.99 0.00 0.00 WinNF +125 3.77 13.20 3.68 1.15 72.75 0.54 8.52 0.00 0.00 WinNF +126 0.72 12.93 3.66 1.56 72.51 0.58 8.55 0.00 0.12 WinNF +127 -1.33 12.94 3.61 1.26 72.75 0.56 8.60 0.00 0.00 WinNF +128 2.81 13.78 2.28 1.43 71.99 0.49 9.85 0.00 0.17 WinNF +129 2.68 13.55 2.09 1.67 72.18 0.53 9.57 0.27 0.17 WinNF +130 2.20 13.98 1.35 1.63 71.76 0.39 10.56 0.00 0.18 WinNF +131 3.77 13.75 1.01 1.36 72.19 0.33 11.14 0.00 0.00 WinNF +132 8.14 13.70 0.00 1.36 71.24 0.19 13.44 0.00 0.10 WinNF +133 0.13 13.43 3.98 1.18 72.49 0.58 8.15 0.00 0.00 WinNF +134 0.00 13.71 3.93 1.54 71.81 0.54 8.21 0.00 0.15 WinNF +135 0.11 13.33 3.85 1.25 72.78 0.52 8.12 0.00 0.00 WinNF +136 -0.11 13.19 3.90 1.30 72.33 0.55 8.44 0.00 0.28 WinNF +137 0.06 13.00 3.80 1.08 73.07 0.56 8.38 0.00 0.12 WinNF +138 -0.89 12.89 3.62 1.57 72.96 0.61 8.11 0.00 0.00 WinNF +139 -1.26 12.79 3.52 1.54 73.36 0.66 7.90 0.00 0.00 WinNF +140 -1.26 12.87 3.56 1.64 73.14 0.65 7.99 0.00 0.00 WinNF +141 -1.10 13.33 3.54 1.61 72.54 0.68 8.11 0.00 0.00 WinNF +142 0.51 13.20 3.63 1.07 72.83 0.57 8.41 0.09 0.17 WinNF +143 -1.38 12.85 3.51 1.44 73.01 0.68 8.23 0.06 0.25 WinNF +144 -0.91 13.00 3.47 1.79 72.72 0.66 8.18 0.00 0.00 WinNF +145 -1.40 12.99 3.18 1.23 72.97 0.58 8.81 0.00 0.24 WinNF +146 0.39 12.85 3.67 1.24 72.57 0.62 8.68 0.00 0.35 WinNF +147 -0.31 13.65 3.66 1.11 72.77 0.11 8.60 0.00 0.00 Veh +148 -1.90 13.33 3.53 1.34 72.67 0.56 8.33 0.00 0.00 Veh +149 -1.30 13.24 3.57 1.38 72.70 0.56 8.44 0.00 0.10 Veh +150 -1.57 12.16 3.52 1.35 72.89 0.57 8.53 0.00 0.00 Veh +151 -1.35 13.14 3.45 1.76 72.48 0.60 8.38 0.00 0.17 Veh +152 3.27 14.32 3.90 0.83 71.50 0.00 9.49 0.00 0.00 Veh +153 -0.21 13.64 3.65 0.65 73.00 0.06 8.93 0.00 0.00 Veh +154 -1.90 13.42 3.40 1.22 72.69 0.59 8.32 0.00 0.00 Veh +155 -1.06 12.86 3.58 1.31 72.61 0.61 8.79 0.00 0.00 Veh +156 -1.54 13.04 3.40 1.26 73.01 0.52 8.58 0.00 0.00 Veh 
+157 -1.45 13.41 3.39 1.28 72.64 0.52 8.65 0.00 0.00 Veh +158 3.21 14.03 3.76 0.58 71.79 0.11 9.65 0.00 0.00 Veh +159 -0.24 13.53 3.41 1.52 72.04 0.58 8.79 0.00 0.00 Veh +160 -0.04 13.50 3.36 1.63 71.94 0.57 8.81 0.00 0.09 Veh +161 0.32 13.33 3.34 1.54 72.14 0.56 8.99 0.00 0.00 Veh +162 1.34 13.64 3.54 0.75 72.65 0.16 8.89 0.15 0.24 Veh +163 4.11 14.19 3.78 0.91 71.36 0.23 9.14 0.00 0.37 Veh +164 -2.86 14.01 2.68 3.50 69.89 1.68 5.87 2.20 0.00 Con +165 1.15 12.73 1.85 1.86 72.69 0.60 10.09 0.00 0.00 Con +166 3.71 11.56 1.88 1.56 72.86 0.47 11.41 0.00 0.00 Con +167 3.51 11.03 1.71 1.56 73.44 0.58 11.62 0.00 0.00 Con +168 1.69 12.64 0.00 1.65 73.75 0.38 11.53 0.00 0.00 Con +169 -1.34 12.86 0.00 1.83 73.88 0.97 10.17 0.00 0.00 Con +170 1.94 13.27 0.00 1.76 73.03 0.47 11.32 0.00 0.00 Con +171 5.69 13.44 0.00 1.58 72.22 0.32 12.24 0.00 0.00 Con +172 -4.84 13.02 0.00 3.04 70.48 6.21 6.96 0.00 0.00 Con +173 -4.79 13.00 0.00 3.02 70.70 6.21 6.93 0.00 0.00 Con +174 2.43 13.38 0.00 1.40 72.25 0.33 12.50 0.00 0.00 Con +175 2.58 12.85 1.61 2.17 72.18 0.76 9.70 0.24 0.51 Con +176 3.19 12.97 0.33 1.51 73.39 0.13 11.27 0.00 0.28 Con +177 1.05 14.00 2.39 1.56 72.37 0.00 9.57 0.00 0.00 Tabl +178 1.37 13.79 2.41 1.19 72.76 0.00 9.77 0.00 0.00 Tabl +179 0.29 14.46 2.24 1.62 72.38 0.00 9.26 0.00 0.00 Tabl +180 0.52 14.09 2.19 1.66 72.67 0.00 9.32 0.00 0.00 Tabl +181 -5.01 14.40 1.74 1.54 74.55 0.00 7.59 0.00 0.00 Tabl +182 0.88 14.99 0.78 1.74 72.50 0.00 9.95 0.00 0.00 Tabl +183 1.16 14.15 0.00 2.09 72.74 0.00 10.88 0.00 0.00 Tabl +184 1.69 14.56 0.00 0.56 73.48 0.00 11.22 0.00 0.00 Tabl +185 -6.85 17.38 0.00 0.34 75.41 0.00 6.65 0.00 0.00 Tabl +186 -6.69 13.69 3.20 1.81 72.81 1.76 5.43 1.19 0.00 Head +187 0.38 14.32 3.26 2.22 71.25 1.46 5.79 1.63 0.00 Head +188 5.15 13.44 3.34 1.23 72.38 0.60 8.83 0.00 0.00 Head +189 4.47 14.86 2.20 2.06 70.26 0.76 9.76 0.00 0.00 Head +190 5.65 15.79 1.83 1.31 70.43 0.31 8.61 1.68 0.00 Head +191 -1.87 13.88 1.78 1.79 73.10 0.00 8.67 0.76 0.00 Head 
+192 -1.98 14.85 0.00 2.38 73.28 0.00 8.76 0.64 0.09 Head +193 -1.77 14.20 0.00 2.79 73.46 0.04 9.04 0.40 0.09 Head +194 -0.81 14.75 0.00 2.00 73.02 0.00 8.53 1.59 0.08 Head +195 -1.17 14.56 0.00 1.98 73.29 0.00 8.52 1.57 0.07 Head +196 -2.55 14.14 0.00 2.68 73.39 0.08 9.07 0.61 0.05 Head +197 -2.44 13.87 0.00 2.54 73.23 0.14 9.41 0.81 0.01 Head +198 -0.73 14.70 0.00 2.34 73.28 0.00 8.95 0.66 0.00 Head +199 -2.69 14.38 0.00 2.66 73.10 0.04 9.08 0.64 0.00 Head +200 -1.91 15.01 0.00 2.51 73.05 0.05 8.83 0.53 0.00 Head +201 -2.92 15.15 0.00 2.25 73.50 0.00 8.34 0.63 0.00 Head +202 -1.47 11.95 0.00 1.19 75.18 2.70 8.93 0.00 0.00 Head +203 -2.86 14.85 0.00 2.42 73.72 0.00 8.39 0.56 0.00 Head +204 -1.42 14.80 0.00 1.99 73.11 0.00 8.28 1.71 0.00 Head +205 -1.83 14.95 0.00 2.27 73.30 0.00 8.71 0.67 0.00 Head +206 -0.68 14.95 0.00 1.80 72.99 0.00 8.61 1.55 0.00 Head +207 -1.55 14.94 0.00 1.87 73.11 0.00 8.67 1.38 0.00 Head +208 0.31 14.39 0.00 1.82 72.86 1.41 6.47 2.88 0.00 Head +209 -1.60 14.37 0.00 2.74 72.85 0.00 9.45 0.54 0.00 Head +210 -1.77 14.14 0.00 2.88 72.61 0.08 9.18 1.06 0.00 Head +211 -1.15 14.92 0.00 1.99 73.06 0.00 8.40 1.59 0.00 Head +212 2.65 14.36 0.00 2.02 73.42 0.00 8.44 1.64 0.00 Head +213 -1.49 14.38 0.00 1.94 73.61 0.00 8.48 1.57 0.00 Head +214 -0.89 14.23 0.00 2.08 73.36 0.00 8.62 1.67 0.00 Head diff --git a/lib/mining/testdata/forensic_glass/fgl.dsv b/lib/mining/testdata/forensic_glass/fgl.dsv new file mode 100644 index 00000000..fd33e0d5 --- /dev/null +++ b/lib/mining/testdata/forensic_glass/fgl.dsv @@ -0,0 +1,97 @@ +{ + "Input" :"fgl.data" +, "Rejected" :"glass.rej" +, "MaxRows" :-1 +, "ClassMetadataIndex" :10 +, "ClassIndex" :9 +, "DatasetMode" :"matrix" +, "InputMetadata" : + [{ + "Name" :"ID" + , "Separator" :" " + , "Type" :"integer" + , "Skip" :true + },{ + "Name" :"RI" + , "Separator" :" " + , "Type" :"real" + },{ + "Name" :"NA20" + , "Separator" :" " + , "Type" :"real" + },{ + "Name" :"MGO" + , "Separator" :" " + , "Type" :"real" + },{ + 
"Name" :"AL203" + , "Separator" :" " + , "Type" :"real" + },{ + "Name" :"SI02" + , "Separator" :" " + , "Type" :"real" + },{ + "Name" :"K20" + , "Separator" :" " + , "Type" :"real" + },{ + "Name" :"CAO" + , "Separator" :" " + , "Type" :"real" + },{ + "Name" :"BAO" + , "Separator" :" " + , "Type" :"real" + },{ + "Name" :"FE203" + , "Separator" :" " + , "Type" :"real" + },{ + "Name" :"TYPE" + , "Type" :"string" + , "ValueSpace" : + [ + "WinF" + , "WinNF" + , "Veh" + , "Con" + , "Tabl" + , "Head" + ] + }] + +, "Output" :"glass.out" +, "OutputMetadata" : + [{ + "Name" :"RI" + , "Separator" :"," + },{ + "Name" :"NA20" + , "Separator" :"," + },{ + "Name" :"MGO" + , "Separator" :"," + },{ + "Name" :"AL203" + , "Separator" :"," + },{ + "Name" :"SI02" + , "Separator" :"," + },{ + "Name" :"K20" + , "Separator" :"," + },{ + "Name" :"CAO" + , "Separator" :"," + },{ + "Name" :"BAO" + , "Separator" :"," + },{ + "Name" :"FE203" + , "Separator" :"," + },{ + "Name" :"TYPE" + , "Type" :"integer" + }] +} diff --git a/lib/mining/testdata/forensic_glass/glass.data b/lib/mining/testdata/forensic_glass/glass.data new file mode 100644 index 00000000..ab21361c --- /dev/null +++ b/lib/mining/testdata/forensic_glass/glass.data @@ -0,0 +1,214 @@ +1,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.00,1 +2,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.00,1 +3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.00,1 +4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.00,1 +5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.00,1 +6,1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0.00,0.26,1 +7,1.51743,13.30,3.60,1.14,73.09,0.58,8.17,0.00,0.00,1 +8,1.51756,13.15,3.61,1.05,73.24,0.57,8.24,0.00,0.00,1 +9,1.51918,14.04,3.58,1.37,72.08,0.56,8.30,0.00,0.00,1 +10,1.51755,13.00,3.60,1.36,72.99,0.57,8.40,0.00,0.11,1 +11,1.51571,12.72,3.46,1.56,73.20,0.67,8.09,0.00,0.24,1 +12,1.51763,12.80,3.66,1.27,73.01,0.60,8.56,0.00,0.00,1 +13,1.51589,12.88,3.43,1.40,73.28,0.69,8.05,0.00,0.24,1 
+14,1.51748,12.86,3.56,1.27,73.21,0.54,8.38,0.00,0.17,1 +15,1.51763,12.61,3.59,1.31,73.29,0.58,8.50,0.00,0.00,1 +16,1.51761,12.81,3.54,1.23,73.24,0.58,8.39,0.00,0.00,1 +17,1.51784,12.68,3.67,1.16,73.11,0.61,8.70,0.00,0.00,1 +18,1.52196,14.36,3.85,0.89,71.36,0.15,9.15,0.00,0.00,1 +19,1.51911,13.90,3.73,1.18,72.12,0.06,8.89,0.00,0.00,1 +20,1.51735,13.02,3.54,1.69,72.73,0.54,8.44,0.00,0.07,1 +21,1.51750,12.82,3.55,1.49,72.75,0.54,8.52,0.00,0.19,1 +22,1.51966,14.77,3.75,0.29,72.02,0.03,9.00,0.00,0.00,1 +23,1.51736,12.78,3.62,1.29,72.79,0.59,8.70,0.00,0.00,1 +24,1.51751,12.81,3.57,1.35,73.02,0.62,8.59,0.00,0.00,1 +25,1.51720,13.38,3.50,1.15,72.85,0.50,8.43,0.00,0.00,1 +26,1.51764,12.98,3.54,1.21,73.00,0.65,8.53,0.00,0.00,1 +27,1.51793,13.21,3.48,1.41,72.64,0.59,8.43,0.00,0.00,1 +28,1.51721,12.87,3.48,1.33,73.04,0.56,8.43,0.00,0.00,1 +29,1.51768,12.56,3.52,1.43,73.15,0.57,8.54,0.00,0.00,1 +30,1.51784,13.08,3.49,1.28,72.86,0.60,8.49,0.00,0.00,1 +31,1.51768,12.65,3.56,1.30,73.08,0.61,8.69,0.00,0.14,1 +32,1.51747,12.84,3.50,1.14,73.27,0.56,8.55,0.00,0.00,1 +33,1.51775,12.85,3.48,1.23,72.97,0.61,8.56,0.09,0.22,1 +34,1.51753,12.57,3.47,1.38,73.39,0.60,8.55,0.00,0.06,1 +35,1.51783,12.69,3.54,1.34,72.95,0.57,8.75,0.00,0.00,1 +36,1.51567,13.29,3.45,1.21,72.74,0.56,8.57,0.00,0.00,1 +37,1.51909,13.89,3.53,1.32,71.81,0.51,8.78,0.11,0.00,1 +38,1.51797,12.74,3.48,1.35,72.96,0.64,8.68,0.00,0.00,1 +39,1.52213,14.21,3.82,0.47,71.77,0.11,9.57,0.00,0.00,1 +40,1.52213,14.21,3.82,0.47,71.77,0.11,9.57,0.00,0.00,1 +41,1.51793,12.79,3.50,1.12,73.03,0.64,8.77,0.00,0.00,1 +42,1.51755,12.71,3.42,1.20,73.20,0.59,8.64,0.00,0.00,1 +43,1.51779,13.21,3.39,1.33,72.76,0.59,8.59,0.00,0.00,1 +44,1.52210,13.73,3.84,0.72,71.76,0.17,9.74,0.00,0.00,1 +45,1.51786,12.73,3.43,1.19,72.95,0.62,8.76,0.00,0.30,1 +46,1.51900,13.49,3.48,1.35,71.95,0.55,9.00,0.00,0.00,1 +47,1.51869,13.19,3.37,1.18,72.72,0.57,8.83,0.00,0.16,1 +48,1.52667,13.99,3.70,0.71,71.57,0.02,9.82,0.00,0.10,1 
+49,1.52223,13.21,3.77,0.79,71.99,0.13,10.02,0.00,0.00,1 +50,1.51898,13.58,3.35,1.23,72.08,0.59,8.91,0.00,0.00,1 +51,1.52320,13.72,3.72,0.51,71.75,0.09,10.06,0.00,0.16,1 +52,1.51926,13.20,3.33,1.28,72.36,0.60,9.14,0.00,0.11,1 +53,1.51808,13.43,2.87,1.19,72.84,0.55,9.03,0.00,0.00,1 +54,1.51837,13.14,2.84,1.28,72.85,0.55,9.07,0.00,0.00,1 +55,1.51778,13.21,2.81,1.29,72.98,0.51,9.02,0.00,0.09,1 +56,1.51769,12.45,2.71,1.29,73.70,0.56,9.06,0.00,0.24,1 +57,1.51215,12.99,3.47,1.12,72.98,0.62,8.35,0.00,0.31,1 +58,1.51824,12.87,3.48,1.29,72.95,0.60,8.43,0.00,0.00,1 +59,1.51754,13.48,3.74,1.17,72.99,0.59,8.03,0.00,0.00,1 +60,1.51754,13.39,3.66,1.19,72.79,0.57,8.27,0.00,0.11,1 +61,1.51905,13.60,3.62,1.11,72.64,0.14,8.76,0.00,0.00,1 +62,1.51977,13.81,3.58,1.32,71.72,0.12,8.67,0.69,0.00,1 +63,1.52172,13.51,3.86,0.88,71.79,0.23,9.54,0.00,0.11,1 +64,1.52227,14.17,3.81,0.78,71.35,0.00,9.69,0.00,0.00,1 +65,1.52172,13.48,3.74,0.90,72.01,0.18,9.61,0.00,0.07,1 +66,1.52099,13.69,3.59,1.12,71.96,0.09,9.40,0.00,0.00,1 +67,1.52152,13.05,3.65,0.87,72.22,0.19,9.85,0.00,0.17,1 +68,1.52152,13.05,3.65,0.87,72.32,0.19,9.85,0.00,0.17,1 +69,1.52152,13.12,3.58,0.90,72.20,0.23,9.82,0.00,0.16,1 +70,1.52300,13.31,3.58,0.82,71.99,0.12,10.17,0.00,0.03,1 +71,1.51574,14.86,3.67,1.74,71.87,0.16,7.36,0.00,0.12,2 +72,1.51848,13.64,3.87,1.27,71.96,0.54,8.32,0.00,0.32,2 +73,1.51593,13.09,3.59,1.52,73.10,0.67,7.83,0.00,0.00,2 +74,1.51631,13.34,3.57,1.57,72.87,0.61,7.89,0.00,0.00,2 +75,1.51596,13.02,3.56,1.54,73.11,0.72,7.90,0.00,0.00,2 +76,1.51590,13.02,3.58,1.51,73.12,0.69,7.96,0.00,0.00,2 +77,1.51645,13.44,3.61,1.54,72.39,0.66,8.03,0.00,0.00,2 +78,1.51627,13.00,3.58,1.54,72.83,0.61,8.04,0.00,0.00,2 +79,1.51613,13.92,3.52,1.25,72.88,0.37,7.94,0.00,0.14,2 +80,1.51590,12.82,3.52,1.90,72.86,0.69,7.97,0.00,0.00,2 +81,1.51592,12.86,3.52,2.12,72.66,0.69,7.97,0.00,0.00,2 +82,1.51593,13.25,3.45,1.43,73.17,0.61,7.86,0.00,0.00,2 +83,1.51646,13.41,3.55,1.25,72.81,0.68,8.10,0.00,0.00,2 
+84,1.51594,13.09,3.52,1.55,72.87,0.68,8.05,0.00,0.09,2 +85,1.51409,14.25,3.09,2.08,72.28,1.10,7.08,0.00,0.00,2 +86,1.51625,13.36,3.58,1.49,72.72,0.45,8.21,0.00,0.00,2 +87,1.51569,13.24,3.49,1.47,73.25,0.38,8.03,0.00,0.00,2 +88,1.51645,13.40,3.49,1.52,72.65,0.67,8.08,0.00,0.10,2 +89,1.51618,13.01,3.50,1.48,72.89,0.60,8.12,0.00,0.00,2 +90,1.51640,12.55,3.48,1.87,73.23,0.63,8.08,0.00,0.09,2 +91,1.51841,12.93,3.74,1.11,72.28,0.64,8.96,0.00,0.22,2 +92,1.51605,12.90,3.44,1.45,73.06,0.44,8.27,0.00,0.00,2 +93,1.51588,13.12,3.41,1.58,73.26,0.07,8.39,0.00,0.19,2 +94,1.51590,13.24,3.34,1.47,73.10,0.39,8.22,0.00,0.00,2 +95,1.51629,12.71,3.33,1.49,73.28,0.67,8.24,0.00,0.00,2 +96,1.51860,13.36,3.43,1.43,72.26,0.51,8.60,0.00,0.00,2 +97,1.51841,13.02,3.62,1.06,72.34,0.64,9.13,0.00,0.15,2 +98,1.51743,12.20,3.25,1.16,73.55,0.62,8.90,0.00,0.24,2 +99,1.51689,12.67,2.88,1.71,73.21,0.73,8.54,0.00,0.00,2 +100,1.51811,12.96,2.96,1.43,72.92,0.60,8.79,0.14,0.00,2 +101,1.51655,12.75,2.85,1.44,73.27,0.57,8.79,0.11,0.22,2 +102,1.51730,12.35,2.72,1.63,72.87,0.70,9.23,0.00,0.00,2 +103,1.51820,12.62,2.76,0.83,73.81,0.35,9.42,0.00,0.20,2 +104,1.52725,13.80,3.15,0.66,70.57,0.08,11.64,0.00,0.00,2 +105,1.52410,13.83,2.90,1.17,71.15,0.08,10.79,0.00,0.00,2 +106,1.52475,11.45,0.00,1.88,72.19,0.81,13.24,0.00,0.34,2 +107,1.53125,10.73,0.00,2.10,69.81,0.58,13.30,3.15,0.28,2 +108,1.53393,12.30,0.00,1.00,70.16,0.12,16.19,0.00,0.24,2 +109,1.52222,14.43,0.00,1.00,72.67,0.10,11.52,0.00,0.08,2 +110,1.51818,13.72,0.00,0.56,74.45,0.00,10.99,0.00,0.00,2 +111,1.52664,11.23,0.00,0.77,73.21,0.00,14.68,0.00,0.00,2 +112,1.52739,11.02,0.00,0.75,73.08,0.00,14.96,0.00,0.00,2 +113,1.52777,12.64,0.00,0.67,72.02,0.06,14.40,0.00,0.00,2 +114,1.51892,13.46,3.83,1.26,72.55,0.57,8.21,0.00,0.14,2 +115,1.51847,13.10,3.97,1.19,72.44,0.60,8.43,0.00,0.00,2 +116,1.51846,13.41,3.89,1.33,72.38,0.51,8.28,0.00,0.00,2 +117,1.51829,13.24,3.90,1.41,72.33,0.55,8.31,0.00,0.10,2 +118,1.51708,13.72,3.68,1.81,72.06,0.64,7.88,0.00,0.00,2 
+119,1.51673,13.30,3.64,1.53,72.53,0.65,8.03,0.00,0.29,2 +120,1.51652,13.56,3.57,1.47,72.45,0.64,7.96,0.00,0.00,2 +121,1.51844,13.25,3.76,1.32,72.40,0.58,8.42,0.00,0.00,2 +122,1.51663,12.93,3.54,1.62,72.96,0.64,8.03,0.00,0.21,2 +123,1.51687,13.23,3.54,1.48,72.84,0.56,8.10,0.00,0.00,2 +124,1.51707,13.48,3.48,1.71,72.52,0.62,7.99,0.00,0.00,2 +125,1.52177,13.20,3.68,1.15,72.75,0.54,8.52,0.00,0.00,2 +126,1.51872,12.93,3.66,1.56,72.51,0.58,8.55,0.00,0.12,2 +127,1.51667,12.94,3.61,1.26,72.75,0.56,8.60,0.00,0.00,2 +128,1.52081,13.78,2.28,1.43,71.99,0.49,9.85,0.00,0.17,2 +129,1.52068,13.55,2.09,1.67,72.18,0.53,9.57,0.27,0.17,2 +130,1.52020,13.98,1.35,1.63,71.76,0.39,10.56,0.00,0.18,2 +131,1.52177,13.75,1.01,1.36,72.19,0.33,11.14,0.00,0.00,2 +132,1.52614,13.70,0.00,1.36,71.24,0.19,13.44,0.00,0.10,2 +133,1.51813,13.43,3.98,1.18,72.49,0.58,8.15,0.00,0.00,2 +134,1.51800,13.71,3.93,1.54,71.81,0.54,8.21,0.00,0.15,2 +135,1.51811,13.33,3.85,1.25,72.78,0.52,8.12,0.00,0.00,2 +136,1.51789,13.19,3.90,1.30,72.33,0.55,8.44,0.00,0.28,2 +137,1.51806,13.00,3.80,1.08,73.07,0.56,8.38,0.00,0.12,2 +138,1.51711,12.89,3.62,1.57,72.96,0.61,8.11,0.00,0.00,2 +139,1.51674,12.79,3.52,1.54,73.36,0.66,7.90,0.00,0.00,2 +140,1.51674,12.87,3.56,1.64,73.14,0.65,7.99,0.00,0.00,2 +141,1.51690,13.33,3.54,1.61,72.54,0.68,8.11,0.00,0.00,2 +142,1.51851,13.20,3.63,1.07,72.83,0.57,8.41,0.09,0.17,2 +143,1.51662,12.85,3.51,1.44,73.01,0.68,8.23,0.06,0.25,2 +144,1.51709,13.00,3.47,1.79,72.72,0.66,8.18,0.00,0.00,2 +145,1.51660,12.99,3.18,1.23,72.97,0.58,8.81,0.00,0.24,2 +146,1.51839,12.85,3.67,1.24,72.57,0.62,8.68,0.00,0.35,2 +147,1.51769,13.65,3.66,1.11,72.77,0.11,8.60,0.00,0.00,3 +148,1.51610,13.33,3.53,1.34,72.67,0.56,8.33,0.00,0.00,3 +149,1.51670,13.24,3.57,1.38,72.70,0.56,8.44,0.00,0.10,3 +150,1.51643,12.16,3.52,1.35,72.89,0.57,8.53,0.00,0.00,3 +151,1.51665,13.14,3.45,1.76,72.48,0.60,8.38,0.00,0.17,3 +152,1.52127,14.32,3.90,0.83,71.50,0.00,9.49,0.00,0.00,3 +153,1.51779,13.64,3.65,0.65,73.00,0.06,8.93,0.00,0.00,3 
+154,1.51610,13.42,3.40,1.22,72.69,0.59,8.32,0.00,0.00,3 +155,1.51694,12.86,3.58,1.31,72.61,0.61,8.79,0.00,0.00,3 +156,1.51646,13.04,3.40,1.26,73.01,0.52,8.58,0.00,0.00,3 +157,1.51655,13.41,3.39,1.28,72.64,0.52,8.65,0.00,0.00,3 +158,1.52121,14.03,3.76,0.58,71.79,0.11,9.65,0.00,0.00,3 +159,1.51776,13.53,3.41,1.52,72.04,0.58,8.79,0.00,0.00,3 +160,1.51796,13.50,3.36,1.63,71.94,0.57,8.81,0.00,0.09,3 +161,1.51832,13.33,3.34,1.54,72.14,0.56,8.99,0.00,0.00,3 +162,1.51934,13.64,3.54,0.75,72.65,0.16,8.89,0.15,0.24,3 +163,1.52211,14.19,3.78,0.91,71.36,0.23,9.14,0.00,0.37,3 +164,1.51514,14.01,2.68,3.50,69.89,1.68,5.87,2.20,0.00,5 +165,1.51915,12.73,1.85,1.86,72.69,0.60,10.09,0.00,0.00,5 +166,1.52171,11.56,1.88,1.56,72.86,0.47,11.41,0.00,0.00,5 +167,1.52151,11.03,1.71,1.56,73.44,0.58,11.62,0.00,0.00,5 +168,1.51969,12.64,0.00,1.65,73.75,0.38,11.53,0.00,0.00,5 +169,1.51666,12.86,0.00,1.83,73.88,0.97,10.17,0.00,0.00,5 +170,1.51994,13.27,0.00,1.76,73.03,0.47,11.32,0.00,0.00,5 +171,1.52369,13.44,0.00,1.58,72.22,0.32,12.24,0.00,0.00,5 +172,1.51316,13.02,0.00,3.04,70.48,6.21,6.96,0.00,0.00,5 +173,1.51321,13.00,0.00,3.02,70.70,6.21,6.93,0.00,0.00,5 +174,1.52043,13.38,0.00,1.40,72.25,0.33,12.50,0.00,0.00,5 +175,1.52058,12.85,1.61,2.17,72.18,0.76,9.70,0.24,0.51,5 +176,1.52119,12.97,0.33,1.51,73.39,0.13,11.27,0.00,0.28,5 +177,1.51905,14.00,2.39,1.56,72.37,0.00,9.57,0.00,0.00,6 +178,1.51937,13.79,2.41,1.19,72.76,0.00,9.77,0.00,0.00,6 +179,1.51829,14.46,2.24,1.62,72.38,0.00,9.26,0.00,0.00,6 +180,1.51852,14.09,2.19,1.66,72.67,0.00,9.32,0.00,0.00,6 +181,1.51299,14.40,1.74,1.54,74.55,0.00,7.59,0.00,0.00,6 +182,1.51888,14.99,0.78,1.74,72.50,0.00,9.95,0.00,0.00,6 +183,1.51916,14.15,0.00,2.09,72.74,0.00,10.88,0.00,0.00,6 +184,1.51969,14.56,0.00,0.56,73.48,0.00,11.22,0.00,0.00,6 +185,1.51115,17.38,0.00,0.34,75.41,0.00,6.65,0.00,0.00,6 +186,1.51131,13.69,3.20,1.81,72.81,1.76,5.43,1.19,0.00,7 +187,1.51838,14.32,3.26,2.22,71.25,1.46,5.79,1.63,0.00,7 
+188,1.52315,13.44,3.34,1.23,72.38,0.60,8.83,0.00,0.00,7 +189,1.52247,14.86,2.20,2.06,70.26,0.76,9.76,0.00,0.00,7 +190,1.52365,15.79,1.83,1.31,70.43,0.31,8.61,1.68,0.00,7 +191,1.51613,13.88,1.78,1.79,73.10,0.00,8.67,0.76,0.00,7 +192,1.51602,14.85,0.00,2.38,73.28,0.00,8.76,0.64,0.09,7 +193,1.51623,14.20,0.00,2.79,73.46,0.04,9.04,0.40,0.09,7 +194,1.51719,14.75,0.00,2.00,73.02,0.00,8.53,1.59,0.08,7 +195,1.51683,14.56,0.00,1.98,73.29,0.00,8.52,1.57,0.07,7 +196,1.51545,14.14,0.00,2.68,73.39,0.08,9.07,0.61,0.05,7 +197,1.51556,13.87,0.00,2.54,73.23,0.14,9.41,0.81,0.01,7 +198,1.51727,14.70,0.00,2.34,73.28,0.00,8.95,0.66,0.00,7 +199,1.51531,14.38,0.00,2.66,73.10,0.04,9.08,0.64,0.00,7 +200,1.51609,15.01,0.00,2.51,73.05,0.05,8.83,0.53,0.00,7 +201,1.51508,15.15,0.00,2.25,73.50,0.00,8.34,0.63,0.00,7 +202,1.51653,11.95,0.00,1.19,75.18,2.70,8.93,0.00,0.00,7 +203,1.51514,14.85,0.00,2.42,73.72,0.00,8.39,0.56,0.00,7 +204,1.51658,14.80,0.00,1.99,73.11,0.00,8.28,1.71,0.00,7 +205,1.51617,14.95,0.00,2.27,73.30,0.00,8.71,0.67,0.00,7 +206,1.51732,14.95,0.00,1.80,72.99,0.00,8.61,1.55,0.00,7 +207,1.51645,14.94,0.00,1.87,73.11,0.00,8.67,1.38,0.00,7 +208,1.51831,14.39,0.00,1.82,72.86,1.41,6.47,2.88,0.00,7 +209,1.51640,14.37,0.00,2.74,72.85,0.00,9.45,0.54,0.00,7 +210,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.00,7 +211,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.00,7 +212,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.00,7 +213,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.00,7 +214,1.51711,14.23,0.00,2.08,73.36,0.00,8.62,1.67,0.00,7 diff --git a/lib/mining/testdata/forensic_glass/glass.dsv b/lib/mining/testdata/forensic_glass/glass.dsv new file mode 100644 index 00000000..10e9ba98 --- /dev/null +++ b/lib/mining/testdata/forensic_glass/glass.dsv @@ -0,0 +1,98 @@ +{ + "Input" :"glass.data" +, "Rejected" :"glass.rej" +, "MaxRows" :-1 +, "ClassMetadataIndex" :10 +, "ClassIndex" :9 +, "DatasetMode" :"matrix" +, "InputMetadata" : + [{ + "Name" :"ID" + , "Separator" :"," + , "Type" 
:"integer" + , "Skip" :true + },{ + "Name" :"RI" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" :"NA20" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" :"MGO" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" :"AL203" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" :"SI02" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" :"K20" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" :"CAO" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" :"BAO" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" :"FE203" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" :"TYPE" + , "Type" :"real" + , "ValueSpace" : + [ + "1" + , "2" + , "3" + , "4" + , "5" + , "6" + , "7" + ] + }] + +, "Output" :"glass.out" +, "OutputMetadata" : + [{ + "Name" :"RI" + , "Separator" :"," + },{ + "Name" :"NA20" + , "Separator" :"," + },{ + "Name" :"MGO" + , "Separator" :"," + },{ + "Name" :"AL203" + , "Separator" :"," + },{ + "Name" :"SI02" + , "Separator" :"," + },{ + "Name" :"K20" + , "Separator" :"," + },{ + "Name" :"CAO" + , "Separator" :"," + },{ + "Name" :"BAO" + , "Separator" :"," + },{ + "Name" :"FE203" + , "Separator" :"," + },{ + "Name" :"TYPE" + , "Type" :"integer" + }] +} diff --git a/lib/mining/testdata/forensic_glass/glass.names b/lib/mining/testdata/forensic_glass/glass.names new file mode 100644 index 00000000..caff715c --- /dev/null +++ b/lib/mining/testdata/forensic_glass/glass.names @@ -0,0 +1,90 @@ +1. Title: Glass Identification Database + +2. Sources: + (a) Creator: B. German + -- Central Research Establishment + Home Office Forensic Science Service + Aldermaston, Reading, Berkshire RG7 4PN + (b) Donor: Vina Spiehler, Ph.D., DABFT + Diagnostic Products Corporation + (213) 776-0180 (ext 3014) + (c) Date: September, 1987 + +3. Past Usage: + -- Rule Induction in Forensic Science + -- Ian W. Evett and Ernest J. 
Spiehler + -- Central Research Establishment + Home Office Forensic Science Service + Aldermaston, Reading, Berkshire RG7 4PN + -- Unknown technical note number (sorry, not listed here) + -- General Results: nearest neighbor held its own with respect to the + rule-based system + +4. Relevant Information:n + Vina conducted a comparison test of her rule-based system, BEAGLE, the + nearest-neighbor algorithm, and discriminant analysis. BEAGLE is + a product available through VRS Consulting, Inc.; 4676 Admiralty Way, + Suite 206; Marina Del Ray, CA 90292 (213) 827-7890 and FAX: -3189. + In determining whether the glass was a type of "float" glass or not, + the following results were obtained (# incorrect answers): + + Type of Sample Beagle NN DA + Windows that were float processed (87) 10 12 21 + Windows that were not: (76) 19 16 22 + + The study of classification of types of glass was motivated by + criminological investigation. At the scene of the crime, the glass left + can be used as evidence...if it is correctly identified! + +5. Number of Instances: 214 + +6. Number of Attributes: 10 (including an Id#) plus the class attribute + -- all attributes are continuously valued + +7. Attribute Information: + 1. Id number: 1 to 214 + 2. RI: refractive index + 3. Na: Sodium (unit measurement: weight percent in corresponding oxide, as + are attributes 4-10) + 4. Mg: Magnesium + 5. Al: Aluminum + 6. Si: Silicon + 7. K: Potassium + 8. Ca: Calcium + 9. Ba: Barium + 10. Fe: Iron + 11. Type of glass: (class attribute) + -- 1 building_windows_float_processed + -- 2 building_windows_non_float_processed + -- 3 vehicle_windows_float_processed + -- 4 vehicle_windows_non_float_processed (none in this database) + -- 5 containers + -- 6 tableware + -- 7 headlamps + +8. Missing Attribute Values: None + +Summary Statistics: +Attribute: Min Max Mean SD Correlation with class + 2. RI: 1.5112 1.5339 1.5184 0.0030 -0.1642 + 3. Na: 10.73 17.38 13.4079 0.8166 0.5030 + 4. 
Mg: 0 4.49 2.6845 1.4424 -0.7447 + 5. Al: 0.29 3.5 1.4449 0.4993 0.5988 + 6. Si: 69.81 75.41 72.6509 0.7745 0.1515 + 7. K: 0 6.21 0.4971 0.6522 -0.0100 + 8. Ca: 5.43 16.19 8.9570 1.4232 0.0007 + 9. Ba: 0 3.15 0.1750 0.4972 0.5751 +10. Fe: 0 0.51 0.0570 0.0974 -0.1879 + +9. Class Distribution: (out of 214 total instances) + -- 163 Window glass (building windows and vehicle windows) + -- 87 float processed + -- 70 building windows + -- 17 vehicle windows + -- 76 non-float processed + -- 76 building windows + -- 0 vehicle windows + -- 51 Non-window glass + -- 13 containers + -- 9 tableware + -- 29 headlamps diff --git a/lib/mining/testdata/forensic_glass/glass.tag b/lib/mining/testdata/forensic_glass/glass.tag new file mode 100644 index 00000000..8bf70515 --- /dev/null +++ b/lib/mining/testdata/forensic_glass/glass.tag @@ -0,0 +1,27 @@ +An original file donated by Vina Speihler + +ID, N -- numeric identifier of the instance +RI, N -- refractive index +NA2O, N -- Sodium oxide +MGO, N -- magnesium oxide +AL2O3, N -- aluminum oxide +SIO2, N -- silcon oxide +K2O, N -- potassium oxide +CAO, N -- calcium oxide +BAO, N -- barium oxide +FE2O3, N -- iron oxide +TYPE, N -- An unknown, but must correspond to the types in the paper +CAMG, N -- Unsure + +Types include: + 1. WF (Float Window) + 2. WNF (Non-float Window) + 3. C (Container) + 4. T (Tableware) + 5. 
H (Headlamp) 214 2568 14127 glass.dat + 19 92 518 glass.tag + 62 742 4775 glassx.dat + 51 610 3928 nonwindo.dat + 6 14 120 phones + 163 1955 12552 window.dat + 515 5981 36020 total diff --git a/lib/mining/testdata/iris/iris.dsv b/lib/mining/testdata/iris/iris.dsv new file mode 100644 index 00000000..9839b12b --- /dev/null +++ b/lib/mining/testdata/iris/iris.dsv @@ -0,0 +1,35 @@ +{ + "Input" :"iris.dat" +, "Rejected" :"iris.rej" +, "MaxRows" :-1 +, "ClassMetadataIndex" :4 +, "ClassIndex" :4 +, "DatasetMode" :"matrix" +, "InputMetadata" : + [{ + "Name" :"sepal-length" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" :"sepal-width" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" :"petal-length" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" :"petal-width" + , "Separator" :"," + , "Type" :"real" + },{ + "Name" :"class" + , "Type" :"string" + , "ValueSpace" : + [ + "Iris-setosa" + , "Iris-versicolor" + , "Iris-virginica" + ] + }] +} diff --git a/lib/mining/testdata/phoneme/phoneme.dsv b/lib/mining/testdata/phoneme/phoneme.dsv new file mode 100644 index 00000000..595576b5 --- /dev/null +++ b/lib/mining/testdata/phoneme/phoneme.dsv @@ -0,0 +1,59 @@ +{ + "Input" :"phoneme.dat" +, "Rejected" :"phoneme.rej" +, "MaxRows" :-1 +, "ClassMetadataIndex" :5 +, "ClassIndex" :5 +, "DatasetMode" :"matrix" +, "OutliersFile" :"phoneme.outliers" +, "InputMetadata" : + [{ + "Name" :"f1" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"f2" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"f3" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"f4" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"f5" + , "Type" :"real" + , "Separator" :" " + },{ + "Name" :"class" + , "Type" :"real" + , "ValueSpace" : + [ + "1" + , "0" + ] + }] +, "Output" :"phoneme.csv" +, "OutputMetadata": + [{ + "Name" :"f1" + , "Separator" :"," + },{ + "Name" :"f2" + , "Separator" :"," + },{ + "Name" :"f3" + , "Separator" :"," + },{ + "Name" :"f4" + , "Separator" :"," + 
},{ + "Name" :"f5" + , "Separator" :"," + },{ + "Name" :"class" + }] +} diff --git a/lib/mining/testdata/wvc2010lnsmote/wvc2010_features.lnsmote.dsv b/lib/mining/testdata/wvc2010lnsmote/wvc2010_features.lnsmote.dsv new file mode 100644 index 00000000..3c88ca38 --- /dev/null +++ b/lib/mining/testdata/wvc2010lnsmote/wvc2010_features.lnsmote.dsv @@ -0,0 +1,129 @@ +{ + "Input" : "wvc2010_features.lnsmote.dat" +, "Rejected" : "wvc2010_features.lnsmote.rej" +, "DatasetMode" : "matrix" +, "MaxRows" : -1 +, "ClassIndex" : 27 + +, "NTree" : 500 +, "PercentBoot" : 66 +, "StatsFile" : "randomforest.lnsmote.stats.dat" + +, "InputMetadata" : [{ + "Name" : "anonim" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "comment_length" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "size_increment" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "size_ratio" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "upper_lower_ratio" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "upper_to_all_ratio" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "digit_ratio" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "non_alnum_ratio" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "char_diversity" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "char_distribution_insert" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "compress_rate" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "good_token" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "term_frequency" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "longest_word" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "longest_char_sequence" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "words_vulgar_frequency" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "words_vulgar_impact" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "words_pronoun_frequency" + , "Type" : 
"real" + , "Separator" : "," + },{ + "Name" : "words_pronoun_impact" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "words_bias_frequency" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "words_bias_impact" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "words_sex_frequency" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "words_sex_impact" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "words_bad_frequency" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "words_bad_impact" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "words_all_frequency" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "words_all_impact" + , "Type" : "real" + , "Separator" : "," + },{ + "Name" : "class" + , "Type" : "real" + , "ValueSpace" : + [ + "1" + , "0" + ] + }] +} diff --git a/lib/mining/tree/binary/binary.go b/lib/mining/tree/binary/binary.go new file mode 100644 index 00000000..57c459b0 --- /dev/null +++ b/lib/mining/tree/binary/binary.go @@ -0,0 +1,64 @@ +// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package binary contain implementation of binary tree. +package binary + +import ( + "fmt" +) + +// +// Tree is a abstract data type for tree with only two branch: left and right. +// +type Tree struct { + // Root is pointer to root of tree. + Root *BTNode +} + +// +// NewTree will create new binary tree with empty root. +// +func NewTree() *Tree { + return &Tree{ + Root: nil, + } +} + +// +// String will print all the branch and leaf in tree. +// +func (btree *Tree) String() (s string) { + var parent, node *BTNode + + parent = btree.Root + + s = fmt.Sprint(parent) + + node = parent.Right + + for parent != nil { + // Print right node down to the leaf. 
+ for node.Right != nil { + s += fmt.Sprint(node) + + parent = node + node = node.Right + } + s += fmt.Sprint(node) + + // crawling to the stop one at a time ... + for parent != nil && node.Parent == parent { + if parent.Right == node { + node = parent.Left + break + } else if parent.Left == node { + node = parent + parent = parent.Parent + } + } + } + + return s +} diff --git a/lib/mining/tree/binary/binary_test.go b/lib/mining/tree/binary/binary_test.go new file mode 100644 index 00000000..4f61deb0 --- /dev/null +++ b/lib/mining/tree/binary/binary_test.go @@ -0,0 +1,47 @@ +// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package binary + +import ( + "fmt" + "testing" +) + +func TestTree(t *testing.T) { + exp := `1 + 12 + 24 + 34 + 33 + 23 + 11 + 22 + 32 + 31 + 21 +` + + btree := NewTree() + + root := NewBTNode(1, + NewBTNode(11, + NewBTNode(21, nil, nil), + NewBTNode(22, + NewBTNode(31, nil, nil), + NewBTNode(32, nil, nil))), + NewBTNode(12, + NewBTNode(23, nil, nil), + NewBTNode(24, + NewBTNode(33, nil, nil), + NewBTNode(34, nil, nil)))) + + btree.Root = root + + res := fmt.Sprint(btree) + + if res != exp { + t.Fatal("error, expecting:\n", exp, "\n got:\n", res) + } +} diff --git a/lib/mining/tree/binary/btnode.go b/lib/mining/tree/binary/btnode.go new file mode 100644 index 00000000..57524265 --- /dev/null +++ b/lib/mining/tree/binary/btnode.go @@ -0,0 +1,77 @@ +// Copyright 2015 Mhd Sulhan <ms@kilabit.info>. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package binary + +import ( + "fmt" + "reflect" +) + +// +// BTNode is a data type for node in binary tree. +// +type BTNode struct { + // Left branch of node. + Left *BTNode + // Right branch of node. + Right *BTNode + // Parent of node. + Parent *BTNode + // Value of node. 
+ Value interface{} +} + +// +// NewBTNode create new node for binary tree. +// +func NewBTNode(v interface{}, l *BTNode, r *BTNode) (p *BTNode) { + p = &BTNode{ + Left: l, + Right: r, + Parent: nil, + Value: v, + } + if l != nil { + l.Parent = p + } + if r != nil { + r.Parent = p + } + + return p +} + +// +// SetLeft will set left branch of node to 'c'. +// +func (n *BTNode) SetLeft(c *BTNode) { + n.Left = c + c.Parent = n +} + +// +// SetRight will set right branch of node to 'c'. +// +func (n *BTNode) SetRight(c *BTNode) { + n.Right = c + c.Parent = n +} + +// +// String will convert the node to string. +// +func (n *BTNode) String() (s string) { + var p = n.Parent + + // add tab until it reached nil + for p != nil { + s += "\t" + p = p.Parent + } + + s += fmt.Sprintln(reflect.ValueOf(n.Value)) + + return s +} |
