aboutsummaryrefslogtreecommitdiff
path: root/lib/mining
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2023-05-20 13:42:39 +0700
committerShulhan <ms@kilabit.info>2023-05-20 13:44:07 +0700
commit3eae1d3df5eeef14f9e8389895bb6b835ac2cf78 (patch)
treef35ca6c111bed2ecf85ada965accf22ce83d3e8c /lib/mining
parentf43b8ead50575c6a279bef403af0204df98323c9 (diff)
downloadpakakeh.go-3eae1d3df5eeef14f9e8389895bb6b835ac2cf78.tar.xz
all: remove any usage of debug.Value in all packages
Using a global debug value for all packages turns out not to be a good idea.
Diffstat (limited to 'lib/mining')
-rw-r--r--lib/mining/classifier/cart/cart.go54
-rw-r--r--lib/mining/classifier/crf/crf.go25
-rw-r--r--lib/mining/classifier/rf/rf.go18
-rw-r--r--lib/mining/classifier/runtime.go10
-rw-r--r--lib/mining/gain/gini/gini.go49
-rw-r--r--lib/mining/gain/gini/ginifloat.go28
-rw-r--r--lib/mining/knn/knn.go10
-rw-r--r--lib/mining/resampling/lnsmote/lnsmote.go36
8 files changed, 2 insertions, 228 deletions
diff --git a/lib/mining/classifier/cart/cart.go b/lib/mining/classifier/cart/cart.go
index 4ee79198..99eb5b5d 100644
--- a/lib/mining/classifier/cart/cart.go
+++ b/lib/mining/classifier/cart/cart.go
@@ -17,7 +17,6 @@ package cart
import (
"fmt"
- "github.com/shuLhan/share/lib/debug"
"github.com/shuLhan/share/lib/mining/gain/gini"
"github.com/shuLhan/share/lib/mining/tree/binary"
"github.com/shuLhan/share/lib/numbers"
@@ -105,11 +104,6 @@ func (runtime *Runtime) splitTreeByGain(claset tabula.ClasetInterface) (
nrow := claset.GetNRow()
if nrow <= 0 {
- if debug.Value >= 2 {
- fmt.Printf("[cart] empty dataset (%s) : %v\n",
- claset.MajorityClass(), claset)
- }
-
node.Value = NodeValue{
IsLeaf: true,
Class: claset.MajorityClass(),
@@ -122,11 +116,6 @@ func (runtime *Runtime) splitTreeByGain(claset tabula.ClasetInterface) (
// is set to that class.
single, name := claset.IsInSingleClass()
if single {
- if debug.Value >= 2 {
- fmt.Printf("[cart] in single class (%s): %v\n", name,
- claset.GetColumns())
- }
-
node.Value = NodeValue{
IsLeaf: true,
Class: name,
@@ -135,10 +124,6 @@ func (runtime *Runtime) splitTreeByGain(claset tabula.ClasetInterface) (
return node, nil
}
- if debug.Value >= 2 {
- fmt.Println("[cart] claset:", claset)
- }
-
// calculate the Gini gain for each attribute.
gains := runtime.computeGain(claset)
@@ -149,12 +134,6 @@ func (runtime *Runtime) splitTreeByGain(claset tabula.ClasetInterface) (
// if maxgain value is 0, use majority class as node and terminate
// the process
if MaxGain.GetMaxGainValue() == 0 {
- if debug.Value >= 2 {
- fmt.Println("[cart] max gain 0 with target",
- claset.GetClassAsStrings(),
- " and majority class is ", claset.MajorityClass())
- }
-
node.Value = NodeValue{
IsLeaf: true,
Class: claset.MajorityClass(),
@@ -166,10 +145,6 @@ func (runtime *Runtime) splitTreeByGain(claset tabula.ClasetInterface) (
// using the sorted index in MaxGain, sort all field in dataset
tabula.SortColumnsByIndex(claset, MaxGain.SortedIndex)
- if debug.Value >= 2 {
- fmt.Println("[cart] maxgain:", MaxGain)
- }
-
// Now that we have attribute with max gain in MaxGainIdx, and their
// gain dan partition value in Gains[MaxGainIdx] and
// GetMaxPartValue(), we split the dataset based on type of max-gain
@@ -187,11 +162,6 @@ func (runtime *Runtime) splitTreeByGain(claset tabula.ClasetInterface) (
splitV = attrSubV[0]
}
- if debug.Value >= 2 {
- fmt.Println("[cart] maxgainindex:", MaxGainIdx)
- fmt.Println("[cart] split v:", splitV)
- }
-
node.Value = NodeValue{
SplitAttrName: claset.GetColumn(MaxGainIdx).GetName(),
IsLeaf: false,
@@ -286,11 +256,6 @@ func (runtime *Runtime) SelectRandomFeature(claset tabula.ClasetInterface) {
col := claset.GetColumn(idx)
col.Flag &^= ColFlagSkip
}
-
- if debug.Value >= 1 {
- fmt.Println("[cart] selected random features:", pickedIdx)
- fmt.Println("[cart] selected columns :", claset.GetColumns())
- }
}
// computeGain calculate the gini index for each value in each attribute.
@@ -345,19 +310,10 @@ func (runtime *Runtime) computeGain(claset tabula.ClasetInterface) (
attr := col.ToStringSlice()
attrV := col.ValueSpace
- if debug.Value >= 2 {
- fmt.Println("[cart] attr :", attr)
- fmt.Println("[cart] attrV:", attrV)
- }
-
target := claset.GetClassAsStrings()
gains[x].ComputeDiscrete(&attr, &attrV, &target,
&classVS)
}
-
- if debug.Value >= 2 {
- fmt.Println("[cart] gain :", gains[x])
- }
}
return gains
}
@@ -415,11 +371,6 @@ func (runtime *Runtime) CountOOBError(oob tabula.Claset) (
// save the original target to be compared later.
origTarget := oob.GetClassAsStrings()
- if debug.Value >= 2 {
- fmt.Println("[cart] OOB:", oob.Columns)
- fmt.Println("[cart] TREE:", &runtime.Tree)
- }
-
// reset the target.
oobtarget := oob.GetClassColumn()
oobtarget.ClearValues()
@@ -434,11 +385,6 @@ func (runtime *Runtime) CountOOBError(oob tabula.Claset) (
target := oobtarget.ToStringSlice()
- if debug.Value >= 2 {
- fmt.Println("[cart] original target:", origTarget)
- fmt.Println("[cart] classify target:", target)
- }
-
// count how many target value is miss-classified.
runtime.OOBErrVal, _, _ = libstrings.CountMissRate(origTarget, target)
diff --git a/lib/mining/classifier/crf/crf.go b/lib/mining/classifier/crf/crf.go
index 3c6f25c8..1a40e1c0 100644
--- a/lib/mining/classifier/crf/crf.go
+++ b/lib/mining/classifier/crf/crf.go
@@ -15,7 +15,6 @@ import (
"math"
"sort"
- "github.com/shuLhan/share/lib/debug"
"github.com/shuLhan/share/lib/floats64"
"github.com/shuLhan/share/lib/mining/classifier"
"github.com/shuLhan/share/lib/mining/classifier/rf"
@@ -151,10 +150,6 @@ func (crf *Runtime) Build(samples tabula.ClasetInterface) (e error) {
fmt.Println(tag, "Config:", crf)
for x := 0; x < crf.NStage; x++ {
- if debug.Value >= 1 {
- fmt.Println(tag, "Stage #", x)
- }
-
forest, e := crf.createForest(samples)
if e != nil {
return e
@@ -206,10 +201,6 @@ func (crf *Runtime) createForest(samples tabula.ClasetInterface) (
// (2)
for t := 0; t < crf.NTree; t++ {
- if debug.Value >= 2 {
- fmt.Println(tag, "Tree #", t)
- }
-
// (2.1)
for {
cm, stat, e = forest.GrowTree(samples)
@@ -233,10 +224,6 @@ func (crf *Runtime) createForest(samples tabula.ClasetInterface) (
// (3)
crf.computeWeight(stat)
- if debug.Value >= 1 {
- fmt.Println(tag, "Weight:", stat.FMeasure)
- }
-
// (4)
crf.deleteTrueNegative(samples, cm)
@@ -261,10 +248,6 @@ func (crf *Runtime) finalizeStage(forest *rf.Runtime) (e error) {
crf.AddStat(stat)
crf.ComputeStatTotal(stat)
- if debug.Value >= 1 {
- crf.PrintStatTotal(nil)
- }
-
// (7)
crf.AddForest(forest)
@@ -309,10 +292,6 @@ func (crf *Runtime) deleteTrueNegative(samples tabula.ClasetInterface,
c++
}
}
-
- if debug.Value >= 1 {
- fmt.Println(tag, "# TN", len(tnids), "# deleted", c)
- }
}
// refillWithFP will copy the false-positive data in training set `tnset`
@@ -338,10 +317,6 @@ func (crf *Runtime) refillWithFP(samples, tnset tabula.ClasetInterface,
c++
}
}
-
- if debug.Value >= 1 {
- fmt.Println(tag, "# FP", len(fpids), "# refilled", c)
- }
}
// runTPSet will run true-positive set into trained stage, to get the
diff --git a/lib/mining/classifier/rf/rf.go b/lib/mining/classifier/rf/rf.go
index 38612b97..3eb08aec 100644
--- a/lib/mining/classifier/rf/rf.go
+++ b/lib/mining/classifier/rf/rf.go
@@ -15,7 +15,6 @@ import (
"fmt"
"math"
- "github.com/shuLhan/share/lib/debug"
"github.com/shuLhan/share/lib/floats64"
"github.com/shuLhan/share/lib/ints"
"github.com/shuLhan/share/lib/mining/classifier"
@@ -147,10 +146,6 @@ func (forest *Runtime) Build(samples tabula.ClasetInterface) (e error) {
// (1)
for t := 0; t < forest.NTree; t++ {
- if debug.Value >= 1 {
- fmt.Println(tag, "tree #", t)
- }
-
// (1.1)
for {
_, _, e = forest.GrowTree(samples)
@@ -191,11 +186,6 @@ func (forest *Runtime) GrowTree(samples tabula.ClasetInterface) (
bagset := bag.(tabula.ClasetInterface)
- if debug.Value >= 2 {
- bagset.RecountMajorMinor()
- fmt.Println(tag, "Bagging:", bagset)
- }
-
// (2)
cart, e := cart.New(bagset, cart.SplitMethodGini,
forest.NRandomFeature)
@@ -219,19 +209,11 @@ func (forest *Runtime) GrowTree(samples tabula.ClasetInterface) (
stat.End()
- if debug.Value >= 3 && forest.RunOOB {
- fmt.Println(tag, "Elapsed time (s):", stat.ElapsedTime)
- }
-
forest.AddStat(stat)
// (6)
if forest.RunOOB {
forest.ComputeStatFromCM(stat, cm)
-
- if debug.Value >= 2 {
- fmt.Println(tag, "OOB stat:", stat)
- }
}
forest.ComputeStatTotal(stat)
diff --git a/lib/mining/classifier/runtime.go b/lib/mining/classifier/runtime.go
index 963c54b6..2022e8c4 100644
--- a/lib/mining/classifier/runtime.go
+++ b/lib/mining/classifier/runtime.go
@@ -8,7 +8,6 @@ import (
"fmt"
"math"
- "github.com/shuLhan/share/lib/debug"
"github.com/shuLhan/share/lib/dsv"
"github.com/shuLhan/share/lib/floats64"
"github.com/shuLhan/share/lib/ints"
@@ -110,10 +109,6 @@ func (rt *Runtime) ComputeCM(sampleIds []int,
cm.ComputeStrings(vs, actuals, predicts)
cm.GroupIndexPredictionsStrings(sampleIds, actuals, predicts)
- if debug.Value >= 2 {
- fmt.Println(tag, cm)
- }
-
return cm
}
@@ -170,11 +165,6 @@ func (rt *Runtime) ComputeStatFromCM(stat *Stat, cm *CM) {
} else {
stat.Accuracy = float64(stat.TP+stat.TN) / t
}
-
- if debug.Value >= 1 {
- rt.PrintOobStat(stat, cm)
- rt.PrintStat(stat)
- }
}
// ComputeStatTotal compute total statistic.
diff --git a/lib/mining/gain/gini/gini.go b/lib/mining/gain/gini/gini.go
index 881195bc..dd95602a 100644
--- a/lib/mining/gain/gini/gini.go
+++ b/lib/mining/gain/gini/gini.go
@@ -11,7 +11,6 @@ package gini
import (
"fmt"
- "github.com/shuLhan/share/lib/debug"
"github.com/shuLhan/share/lib/floats64"
libstrings "github.com/shuLhan/share/lib/strings"
)
@@ -61,10 +60,6 @@ func (gini *Gini) ComputeDiscrete(src, discval, target, classes *[]string) {
// create partition for possible combination of discrete values.
gini.createDiscretePartition((*discval))
- if debug.Value >= 2 {
- fmt.Println("[gini] part :", gini.DiscretePart)
- }
-
gini.Index = make([]float64, len(gini.DiscretePart))
gini.Gain = make([]float64, len(gini.DiscretePart))
gini.MinIndexValue = 1.0
@@ -80,11 +75,6 @@ func (gini *Gini) computeDiscreteGain(src, target, classes *[]string) {
// number of samples
nsample := float64(len(*src))
- if debug.Value >= 3 {
- fmt.Println("[gini] sample:", target)
- fmt.Printf("[gini] Gini(a=%s) = %f\n", (*src), gini.Value)
- }
-
// compute gini index for each discrete values
for i, subPart := range gini.DiscretePart {
// check if sub partition has at least an element
@@ -121,25 +111,11 @@ func (gini *Gini) computeDiscreteGain(src, target, classes *[]string) {
// sum all probabilities times gini index.
sumGI += probIndex
-
- if debug.Value >= 3 {
- fmt.Printf("[gini] subsample: %v\n", subT)
- fmt.Printf("[gini] Gini(a=%s) = %f/%f * %f = %f\n",
- part, ndisc, nsample,
- giniIndex, probIndex)
- }
}
gini.Index[i] = sumGI
gini.Gain[i] = gini.Value - sumGI
- if debug.Value >= 3 {
- fmt.Printf("[gini] sample: %v\n", subPart)
- fmt.Printf("[gini] Gain(a=%s) = %f - %f = %f\n",
- subPart, gini.Value, sumGI,
- gini.Gain[i])
- }
-
if gini.MinIndexValue > gini.Index[i] && gini.Index[i] != 0 {
gini.MinIndexValue = gini.Index[i]
gini.MinIndexPart = i
@@ -182,10 +158,6 @@ func (gini *Gini) ComputeContinu(src *[]float64, target, classes *[]string) {
gini.SortedIndex = floats64.IndirectSort(A2, true)
- if debug.Value >= 1 {
- fmt.Println("[gini] attr sorted :", A2)
- }
-
// sort the target attribute using sorted index.
libstrings.SortByIndex(&T2, gini.SortedIndex)
@@ -253,14 +225,9 @@ func (gini *Gini) compute(target, classes *[]string) float64 {
var sump2 float64
- for x, v := range classCount {
+ for _, v := range classCount {
p := float64(v) / n
sump2 += (p * p)
-
- if debug.Value >= 3 {
- fmt.Printf("[gini] compute (%s): (%d/%f)^2 = %f\n",
- (*classes)[x], v, n, p*p)
- }
}
return 1 - sump2
@@ -282,11 +249,6 @@ func (gini *Gini) computeContinuGain(src *[]float64, target, classes *[]string)
nsample := len(*src)
- if debug.Value >= 2 {
- fmt.Println("[gini] sorted data:", src)
- fmt.Println("[gini] Gini.Value:", gini.Value)
- }
-
for p, contVal := range gini.ContinuPart {
// find the split of samples between partition based on
// partition value
@@ -321,15 +283,6 @@ func (gini *Gini) computeContinuGain(src *[]float64, target, classes *[]string)
gini.Index[p] = ((pleft * gleft) + (pright * gright))
gini.Gain[p] = gini.Value - gini.Index[p]
- if debug.Value >= 3 {
- fmt.Println("[gini] tleft:", tleft)
- fmt.Println("[gini] tright:", tright)
-
- fmt.Printf("[gini] GiniGain(%v) = %f - (%f * %f) + (%f * %f) = %f\n",
- contVal, gini.Value, pleft, gleft,
- pright, gright, gini.Gain[p])
- }
-
if gini.MinIndexValue > gini.Index[p] && gini.Index[p] != 0 {
gini.MinIndexValue = gini.Index[p]
gini.MinIndexPart = p
diff --git a/lib/mining/gain/gini/ginifloat.go b/lib/mining/gain/gini/ginifloat.go
index 010d26ba..420f47c2 100644
--- a/lib/mining/gain/gini/ginifloat.go
+++ b/lib/mining/gain/gini/ginifloat.go
@@ -5,9 +5,6 @@
package gini
import (
- "fmt"
-
- "github.com/shuLhan/share/lib/debug"
"github.com/shuLhan/share/lib/floats64"
)
@@ -28,10 +25,6 @@ func (gini *Gini) ComputeContinuFloat(src, target, classes *[]float64) {
gini.SortedIndex = floats64.IndirectSort(*src, true)
- if debug.Value >= 1 {
- fmt.Println("[gini] attr sorted :", src)
- }
-
// (1)
floats64.SortByIndex(target, gini.SortedIndex)
@@ -64,14 +57,9 @@ func (gini *Gini) computeFloat(target, classes *[]float64) float64 {
var sump2 float64
- for x, v := range classCount {
+ for _, v := range classCount {
p := float64(v) / n
sump2 += (p * p)
-
- if debug.Value >= 3 {
- fmt.Printf("[gini] compute (%f): (%d/%f)^2 = %f\n",
- (*classes)[x], v, n, p*p)
- }
}
return 1 - sump2
@@ -98,11 +86,6 @@ func (gini *Gini) computeContinuGainFloat(src, target, classes *[]float64) {
nsample := len(*src)
- if debug.Value >= 2 {
- fmt.Println("[gini] sorted data:", src)
- fmt.Println("[gini] Gini.Value:", gini.Value)
- }
-
// (0)
for p, contVal := range gini.ContinuPart {
// (0.1)
@@ -138,15 +121,6 @@ func (gini *Gini) computeContinuGainFloat(src, target, classes *[]float64) {
(probRight * gainRight))
gini.Gain[p] = gini.Value - gini.Index[p]
- if debug.Value >= 3 {
- fmt.Println("[gini] tleft:", tleft)
- fmt.Println("[gini] tright:", tright)
-
- fmt.Printf("[gini] GiniGain(%v) = %f - (%f * %f) + (%f * %f) = %f\n",
- contVal, gini.Value, probLeft, gainLeft,
- probRight, gainRight, gini.Gain[p])
- }
-
if gini.MinIndexValue > gini.Index[p] && gini.Index[p] != 0 {
gini.MinIndexValue = gini.Index[p]
gini.MinIndexPart = p
diff --git a/lib/mining/knn/knn.go b/lib/mining/knn/knn.go
index fa303df6..6b8b9a30 100644
--- a/lib/mining/knn/knn.go
+++ b/lib/mining/knn/knn.go
@@ -7,11 +7,9 @@
package knn
import (
- "fmt"
"math"
"sort"
- "github.com/shuLhan/share/lib/debug"
"github.com/shuLhan/share/lib/tabula"
)
@@ -86,15 +84,7 @@ func (in *Runtime) FindNeighbors(samples *tabula.Rows, instance *tabula.Row) (
minK = in.K
}
- if debug.Value >= 2 {
- fmt.Println("[knn] all neighbors:", in.AllNeighbors.Len())
- }
-
kneighbors = in.AllNeighbors.SelectRange(0, minK)
- if debug.Value >= 2 {
- fmt.Println("[knn] k neighbors:", kneighbors.Len())
- }
-
return
}
diff --git a/lib/mining/resampling/lnsmote/lnsmote.go b/lib/mining/resampling/lnsmote/lnsmote.go
index 4e148014..66df6525 100644
--- a/lib/mining/resampling/lnsmote/lnsmote.go
+++ b/lib/mining/resampling/lnsmote/lnsmote.go
@@ -11,10 +11,8 @@
package lnsmote
import (
- "fmt"
"math/rand"
- "github.com/shuLhan/share/lib/debug"
"github.com/shuLhan/share/lib/dsv"
"github.com/shuLhan/share/lib/mining/knn"
"github.com/shuLhan/share/lib/mining/resampling/smote"
@@ -76,11 +74,6 @@ func (in *Runtime) Init(dataset tabula.DatasetInterface) {
in.ClassMinor)
in.outliers = make(tabula.Rows, 0)
-
- if debug.Value >= 1 {
- fmt.Println("[lnsmote] n:", in.NSynthetic)
- fmt.Println("[lnsmote] n minority:", in.minorset.Len())
- }
}
// Resampling will run resampling process on dataset and return the synthetic
@@ -97,10 +90,6 @@ func (in *Runtime) Resampling(dataset tabula.DatasetInterface) (
neighbors := in.FindNeighbors(in.datasetRows, p)
- if debug.Value >= 3 {
- fmt.Println("[lnsmote] neighbors:", neighbors.Rows())
- }
-
for y := 0; y < in.NSynthetic; y++ {
syn := in.createSynthetic(p, neighbors)
@@ -108,11 +97,6 @@ func (in *Runtime) Resampling(dataset tabula.DatasetInterface) (
in.Synthetics.PushRow(syn)
}
}
-
- if debug.Value >= 1 {
- fmt.Printf("[lnsmote] %-4d n synthetics: %v\n", x,
- in.Synthetics.Len())
- }
}
if in.SyntheticFile != "" {
@@ -137,10 +121,6 @@ func (in *Runtime) createSynthetic(p *tabula.Row, neighbors knn.Neighbors) (
// Check if synthetic sample can be created from p and n.
canit, slp, sln := in.canCreate(p, n)
if !canit {
- if debug.Value >= 2 {
- fmt.Println("[lnsmote] can not create synthetic")
- }
-
if slp.Len() <= 0 {
in.outliers.PushBack(p)
}
@@ -174,11 +154,6 @@ func (in *Runtime) canCreate(p, n *tabula.Row) (bool, knn.Neighbors,
slp := in.safeLevel(p)
sln := in.safeLevel2(p, n)
- if debug.Value >= 2 {
- fmt.Println("[lnsmote] slp : ", slp.Len())
- fmt.Println("[lnsmote] sln : ", sln.Len())
- }
-
return slp.Len() != 0 || sln.Len() != 0, slp, sln
}
@@ -202,20 +177,9 @@ func (in *Runtime) safeLevel2(p, n *tabula.Row) knn.Neighbors {
// if p in neighbors, replace it with neighbours in K+1
if nIsMinor && pInNeighbors {
- if debug.Value >= 1 {
- fmt.Println("[lnsmote] Replacing ", pidx)
- }
- if debug.Value >= 2 {
- fmt.Println("[lnsmote] Replacing ", pidx, " in ", neighbors)
- }
-
row := in.AllNeighbors.Row(in.K + 1)
dist := in.AllNeighbors.Distance(in.K + 1)
neighbors.Replace(pidx, row, dist)
-
- if debug.Value >= 2 {
- fmt.Println("[lnsmote] Replacement ", neighbors)
- }
}
minorNeighbors := neighbors.SelectWhere(in.ClassIndex, in.ClassMinor)