diff options
| author | Shulhan <ms@kilabit.info> | 2023-05-20 13:42:39 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2023-05-20 13:44:07 +0700 |
| commit | 3eae1d3df5eeef14f9e8389895bb6b835ac2cf78 (patch) | |
| tree | f35ca6c111bed2ecf85ada965accf22ce83d3e8c /lib/mining | |
| parent | f43b8ead50575c6a279bef403af0204df98323c9 (diff) | |
| download | pakakeh.go-3eae1d3df5eeef14f9e8389895bb6b835ac2cf78.tar.xz | |
all: remove any usage of debug.Value in all packages
Using a global debug value for all packages turns out not to be a good
idea.
Diffstat (limited to 'lib/mining')
| -rw-r--r-- | lib/mining/classifier/cart/cart.go | 54 | ||||
| -rw-r--r-- | lib/mining/classifier/crf/crf.go | 25 | ||||
| -rw-r--r-- | lib/mining/classifier/rf/rf.go | 18 | ||||
| -rw-r--r-- | lib/mining/classifier/runtime.go | 10 | ||||
| -rw-r--r-- | lib/mining/gain/gini/gini.go | 49 | ||||
| -rw-r--r-- | lib/mining/gain/gini/ginifloat.go | 28 | ||||
| -rw-r--r-- | lib/mining/knn/knn.go | 10 | ||||
| -rw-r--r-- | lib/mining/resampling/lnsmote/lnsmote.go | 36 |
8 files changed, 2 insertions, 228 deletions
diff --git a/lib/mining/classifier/cart/cart.go b/lib/mining/classifier/cart/cart.go index 4ee79198..99eb5b5d 100644 --- a/lib/mining/classifier/cart/cart.go +++ b/lib/mining/classifier/cart/cart.go @@ -17,7 +17,6 @@ package cart import ( "fmt" - "github.com/shuLhan/share/lib/debug" "github.com/shuLhan/share/lib/mining/gain/gini" "github.com/shuLhan/share/lib/mining/tree/binary" "github.com/shuLhan/share/lib/numbers" @@ -105,11 +104,6 @@ func (runtime *Runtime) splitTreeByGain(claset tabula.ClasetInterface) ( nrow := claset.GetNRow() if nrow <= 0 { - if debug.Value >= 2 { - fmt.Printf("[cart] empty dataset (%s) : %v\n", - claset.MajorityClass(), claset) - } - node.Value = NodeValue{ IsLeaf: true, Class: claset.MajorityClass(), @@ -122,11 +116,6 @@ func (runtime *Runtime) splitTreeByGain(claset tabula.ClasetInterface) ( // is set to that class. single, name := claset.IsInSingleClass() if single { - if debug.Value >= 2 { - fmt.Printf("[cart] in single class (%s): %v\n", name, - claset.GetColumns()) - } - node.Value = NodeValue{ IsLeaf: true, Class: name, @@ -135,10 +124,6 @@ func (runtime *Runtime) splitTreeByGain(claset tabula.ClasetInterface) ( return node, nil } - if debug.Value >= 2 { - fmt.Println("[cart] claset:", claset) - } - // calculate the Gini gain for each attribute. 
gains := runtime.computeGain(claset) @@ -149,12 +134,6 @@ func (runtime *Runtime) splitTreeByGain(claset tabula.ClasetInterface) ( // if maxgain value is 0, use majority class as node and terminate // the process if MaxGain.GetMaxGainValue() == 0 { - if debug.Value >= 2 { - fmt.Println("[cart] max gain 0 with target", - claset.GetClassAsStrings(), - " and majority class is ", claset.MajorityClass()) - } - node.Value = NodeValue{ IsLeaf: true, Class: claset.MajorityClass(), @@ -166,10 +145,6 @@ func (runtime *Runtime) splitTreeByGain(claset tabula.ClasetInterface) ( // using the sorted index in MaxGain, sort all field in dataset tabula.SortColumnsByIndex(claset, MaxGain.SortedIndex) - if debug.Value >= 2 { - fmt.Println("[cart] maxgain:", MaxGain) - } - // Now that we have attribute with max gain in MaxGainIdx, and their // gain dan partition value in Gains[MaxGainIdx] and // GetMaxPartValue(), we split the dataset based on type of max-gain @@ -187,11 +162,6 @@ func (runtime *Runtime) splitTreeByGain(claset tabula.ClasetInterface) ( splitV = attrSubV[0] } - if debug.Value >= 2 { - fmt.Println("[cart] maxgainindex:", MaxGainIdx) - fmt.Println("[cart] split v:", splitV) - } - node.Value = NodeValue{ SplitAttrName: claset.GetColumn(MaxGainIdx).GetName(), IsLeaf: false, @@ -286,11 +256,6 @@ func (runtime *Runtime) SelectRandomFeature(claset tabula.ClasetInterface) { col := claset.GetColumn(idx) col.Flag &^= ColFlagSkip } - - if debug.Value >= 1 { - fmt.Println("[cart] selected random features:", pickedIdx) - fmt.Println("[cart] selected columns :", claset.GetColumns()) - } } // computeGain calculate the gini index for each value in each attribute. 
@@ -345,19 +310,10 @@ func (runtime *Runtime) computeGain(claset tabula.ClasetInterface) ( attr := col.ToStringSlice() attrV := col.ValueSpace - if debug.Value >= 2 { - fmt.Println("[cart] attr :", attr) - fmt.Println("[cart] attrV:", attrV) - } - target := claset.GetClassAsStrings() gains[x].ComputeDiscrete(&attr, &attrV, &target, &classVS) } - - if debug.Value >= 2 { - fmt.Println("[cart] gain :", gains[x]) - } } return gains } @@ -415,11 +371,6 @@ func (runtime *Runtime) CountOOBError(oob tabula.Claset) ( // save the original target to be compared later. origTarget := oob.GetClassAsStrings() - if debug.Value >= 2 { - fmt.Println("[cart] OOB:", oob.Columns) - fmt.Println("[cart] TREE:", &runtime.Tree) - } - // reset the target. oobtarget := oob.GetClassColumn() oobtarget.ClearValues() @@ -434,11 +385,6 @@ func (runtime *Runtime) CountOOBError(oob tabula.Claset) ( target := oobtarget.ToStringSlice() - if debug.Value >= 2 { - fmt.Println("[cart] original target:", origTarget) - fmt.Println("[cart] classify target:", target) - } - // count how many target value is miss-classified. 
runtime.OOBErrVal, _, _ = libstrings.CountMissRate(origTarget, target) diff --git a/lib/mining/classifier/crf/crf.go b/lib/mining/classifier/crf/crf.go index 3c6f25c8..1a40e1c0 100644 --- a/lib/mining/classifier/crf/crf.go +++ b/lib/mining/classifier/crf/crf.go @@ -15,7 +15,6 @@ import ( "math" "sort" - "github.com/shuLhan/share/lib/debug" "github.com/shuLhan/share/lib/floats64" "github.com/shuLhan/share/lib/mining/classifier" "github.com/shuLhan/share/lib/mining/classifier/rf" @@ -151,10 +150,6 @@ func (crf *Runtime) Build(samples tabula.ClasetInterface) (e error) { fmt.Println(tag, "Config:", crf) for x := 0; x < crf.NStage; x++ { - if debug.Value >= 1 { - fmt.Println(tag, "Stage #", x) - } - forest, e := crf.createForest(samples) if e != nil { return e @@ -206,10 +201,6 @@ func (crf *Runtime) createForest(samples tabula.ClasetInterface) ( // (2) for t := 0; t < crf.NTree; t++ { - if debug.Value >= 2 { - fmt.Println(tag, "Tree #", t) - } - // (2.1) for { cm, stat, e = forest.GrowTree(samples) @@ -233,10 +224,6 @@ func (crf *Runtime) createForest(samples tabula.ClasetInterface) ( // (3) crf.computeWeight(stat) - if debug.Value >= 1 { - fmt.Println(tag, "Weight:", stat.FMeasure) - } - // (4) crf.deleteTrueNegative(samples, cm) @@ -261,10 +248,6 @@ func (crf *Runtime) finalizeStage(forest *rf.Runtime) (e error) { crf.AddStat(stat) crf.ComputeStatTotal(stat) - if debug.Value >= 1 { - crf.PrintStatTotal(nil) - } - // (7) crf.AddForest(forest) @@ -309,10 +292,6 @@ func (crf *Runtime) deleteTrueNegative(samples tabula.ClasetInterface, c++ } } - - if debug.Value >= 1 { - fmt.Println(tag, "# TN", len(tnids), "# deleted", c) - } } // refillWithFP will copy the false-positive data in training set `tnset` @@ -338,10 +317,6 @@ func (crf *Runtime) refillWithFP(samples, tnset tabula.ClasetInterface, c++ } } - - if debug.Value >= 1 { - fmt.Println(tag, "# FP", len(fpids), "# refilled", c) - } } // runTPSet will run true-positive set into trained stage, to get the diff --git 
a/lib/mining/classifier/rf/rf.go b/lib/mining/classifier/rf/rf.go index 38612b97..3eb08aec 100644 --- a/lib/mining/classifier/rf/rf.go +++ b/lib/mining/classifier/rf/rf.go @@ -15,7 +15,6 @@ import ( "fmt" "math" - "github.com/shuLhan/share/lib/debug" "github.com/shuLhan/share/lib/floats64" "github.com/shuLhan/share/lib/ints" "github.com/shuLhan/share/lib/mining/classifier" @@ -147,10 +146,6 @@ func (forest *Runtime) Build(samples tabula.ClasetInterface) (e error) { // (1) for t := 0; t < forest.NTree; t++ { - if debug.Value >= 1 { - fmt.Println(tag, "tree #", t) - } - // (1.1) for { _, _, e = forest.GrowTree(samples) @@ -191,11 +186,6 @@ func (forest *Runtime) GrowTree(samples tabula.ClasetInterface) ( bagset := bag.(tabula.ClasetInterface) - if debug.Value >= 2 { - bagset.RecountMajorMinor() - fmt.Println(tag, "Bagging:", bagset) - } - // (2) cart, e := cart.New(bagset, cart.SplitMethodGini, forest.NRandomFeature) @@ -219,19 +209,11 @@ func (forest *Runtime) GrowTree(samples tabula.ClasetInterface) ( stat.End() - if debug.Value >= 3 && forest.RunOOB { - fmt.Println(tag, "Elapsed time (s):", stat.ElapsedTime) - } - forest.AddStat(stat) // (6) if forest.RunOOB { forest.ComputeStatFromCM(stat, cm) - - if debug.Value >= 2 { - fmt.Println(tag, "OOB stat:", stat) - } } forest.ComputeStatTotal(stat) diff --git a/lib/mining/classifier/runtime.go b/lib/mining/classifier/runtime.go index 963c54b6..2022e8c4 100644 --- a/lib/mining/classifier/runtime.go +++ b/lib/mining/classifier/runtime.go @@ -8,7 +8,6 @@ import ( "fmt" "math" - "github.com/shuLhan/share/lib/debug" "github.com/shuLhan/share/lib/dsv" "github.com/shuLhan/share/lib/floats64" "github.com/shuLhan/share/lib/ints" @@ -110,10 +109,6 @@ func (rt *Runtime) ComputeCM(sampleIds []int, cm.ComputeStrings(vs, actuals, predicts) cm.GroupIndexPredictionsStrings(sampleIds, actuals, predicts) - if debug.Value >= 2 { - fmt.Println(tag, cm) - } - return cm } @@ -170,11 +165,6 @@ func (rt *Runtime) ComputeStatFromCM(stat *Stat, 
cm *CM) { } else { stat.Accuracy = float64(stat.TP+stat.TN) / t } - - if debug.Value >= 1 { - rt.PrintOobStat(stat, cm) - rt.PrintStat(stat) - } } // ComputeStatTotal compute total statistic. diff --git a/lib/mining/gain/gini/gini.go b/lib/mining/gain/gini/gini.go index 881195bc..dd95602a 100644 --- a/lib/mining/gain/gini/gini.go +++ b/lib/mining/gain/gini/gini.go @@ -11,7 +11,6 @@ package gini import ( "fmt" - "github.com/shuLhan/share/lib/debug" "github.com/shuLhan/share/lib/floats64" libstrings "github.com/shuLhan/share/lib/strings" ) @@ -61,10 +60,6 @@ func (gini *Gini) ComputeDiscrete(src, discval, target, classes *[]string) { // create partition for possible combination of discrete values. gini.createDiscretePartition((*discval)) - if debug.Value >= 2 { - fmt.Println("[gini] part :", gini.DiscretePart) - } - gini.Index = make([]float64, len(gini.DiscretePart)) gini.Gain = make([]float64, len(gini.DiscretePart)) gini.MinIndexValue = 1.0 @@ -80,11 +75,6 @@ func (gini *Gini) computeDiscreteGain(src, target, classes *[]string) { // number of samples nsample := float64(len(*src)) - if debug.Value >= 3 { - fmt.Println("[gini] sample:", target) - fmt.Printf("[gini] Gini(a=%s) = %f\n", (*src), gini.Value) - } - // compute gini index for each discrete values for i, subPart := range gini.DiscretePart { // check if sub partition has at least an element @@ -121,25 +111,11 @@ func (gini *Gini) computeDiscreteGain(src, target, classes *[]string) { // sum all probabilities times gini index. 
sumGI += probIndex - - if debug.Value >= 3 { - fmt.Printf("[gini] subsample: %v\n", subT) - fmt.Printf("[gini] Gini(a=%s) = %f/%f * %f = %f\n", - part, ndisc, nsample, - giniIndex, probIndex) - } } gini.Index[i] = sumGI gini.Gain[i] = gini.Value - sumGI - if debug.Value >= 3 { - fmt.Printf("[gini] sample: %v\n", subPart) - fmt.Printf("[gini] Gain(a=%s) = %f - %f = %f\n", - subPart, gini.Value, sumGI, - gini.Gain[i]) - } - if gini.MinIndexValue > gini.Index[i] && gini.Index[i] != 0 { gini.MinIndexValue = gini.Index[i] gini.MinIndexPart = i @@ -182,10 +158,6 @@ func (gini *Gini) ComputeContinu(src *[]float64, target, classes *[]string) { gini.SortedIndex = floats64.IndirectSort(A2, true) - if debug.Value >= 1 { - fmt.Println("[gini] attr sorted :", A2) - } - // sort the target attribute using sorted index. libstrings.SortByIndex(&T2, gini.SortedIndex) @@ -253,14 +225,9 @@ func (gini *Gini) compute(target, classes *[]string) float64 { var sump2 float64 - for x, v := range classCount { + for _, v := range classCount { p := float64(v) / n sump2 += (p * p) - - if debug.Value >= 3 { - fmt.Printf("[gini] compute (%s): (%d/%f)^2 = %f\n", - (*classes)[x], v, n, p*p) - } } return 1 - sump2 @@ -282,11 +249,6 @@ func (gini *Gini) computeContinuGain(src *[]float64, target, classes *[]string) nsample := len(*src) - if debug.Value >= 2 { - fmt.Println("[gini] sorted data:", src) - fmt.Println("[gini] Gini.Value:", gini.Value) - } - for p, contVal := range gini.ContinuPart { // find the split of samples between partition based on // partition value @@ -321,15 +283,6 @@ func (gini *Gini) computeContinuGain(src *[]float64, target, classes *[]string) gini.Index[p] = ((pleft * gleft) + (pright * gright)) gini.Gain[p] = gini.Value - gini.Index[p] - if debug.Value >= 3 { - fmt.Println("[gini] tleft:", tleft) - fmt.Println("[gini] tright:", tright) - - fmt.Printf("[gini] GiniGain(%v) = %f - (%f * %f) + (%f * %f) = %f\n", - contVal, gini.Value, pleft, gleft, - pright, gright, gini.Gain[p]) 
- } - if gini.MinIndexValue > gini.Index[p] && gini.Index[p] != 0 { gini.MinIndexValue = gini.Index[p] gini.MinIndexPart = p diff --git a/lib/mining/gain/gini/ginifloat.go b/lib/mining/gain/gini/ginifloat.go index 010d26ba..420f47c2 100644 --- a/lib/mining/gain/gini/ginifloat.go +++ b/lib/mining/gain/gini/ginifloat.go @@ -5,9 +5,6 @@ package gini import ( - "fmt" - - "github.com/shuLhan/share/lib/debug" "github.com/shuLhan/share/lib/floats64" ) @@ -28,10 +25,6 @@ func (gini *Gini) ComputeContinuFloat(src, target, classes *[]float64) { gini.SortedIndex = floats64.IndirectSort(*src, true) - if debug.Value >= 1 { - fmt.Println("[gini] attr sorted :", src) - } - // (1) floats64.SortByIndex(target, gini.SortedIndex) @@ -64,14 +57,9 @@ func (gini *Gini) computeFloat(target, classes *[]float64) float64 { var sump2 float64 - for x, v := range classCount { + for _, v := range classCount { p := float64(v) / n sump2 += (p * p) - - if debug.Value >= 3 { - fmt.Printf("[gini] compute (%f): (%d/%f)^2 = %f\n", - (*classes)[x], v, n, p*p) - } } return 1 - sump2 @@ -98,11 +86,6 @@ func (gini *Gini) computeContinuGainFloat(src, target, classes *[]float64) { nsample := len(*src) - if debug.Value >= 2 { - fmt.Println("[gini] sorted data:", src) - fmt.Println("[gini] Gini.Value:", gini.Value) - } - // (0) for p, contVal := range gini.ContinuPart { // (0.1) @@ -138,15 +121,6 @@ func (gini *Gini) computeContinuGainFloat(src, target, classes *[]float64) { (probRight * gainRight)) gini.Gain[p] = gini.Value - gini.Index[p] - if debug.Value >= 3 { - fmt.Println("[gini] tleft:", tleft) - fmt.Println("[gini] tright:", tright) - - fmt.Printf("[gini] GiniGain(%v) = %f - (%f * %f) + (%f * %f) = %f\n", - contVal, gini.Value, probLeft, gainLeft, - probRight, gainRight, gini.Gain[p]) - } - if gini.MinIndexValue > gini.Index[p] && gini.Index[p] != 0 { gini.MinIndexValue = gini.Index[p] gini.MinIndexPart = p diff --git a/lib/mining/knn/knn.go b/lib/mining/knn/knn.go index fa303df6..6b8b9a30 100644 --- 
a/lib/mining/knn/knn.go +++ b/lib/mining/knn/knn.go @@ -7,11 +7,9 @@ package knn import ( - "fmt" "math" "sort" - "github.com/shuLhan/share/lib/debug" "github.com/shuLhan/share/lib/tabula" ) @@ -86,15 +84,7 @@ func (in *Runtime) FindNeighbors(samples *tabula.Rows, instance *tabula.Row) ( minK = in.K } - if debug.Value >= 2 { - fmt.Println("[knn] all neighbors:", in.AllNeighbors.Len()) - } - kneighbors = in.AllNeighbors.SelectRange(0, minK) - if debug.Value >= 2 { - fmt.Println("[knn] k neighbors:", kneighbors.Len()) - } - return } diff --git a/lib/mining/resampling/lnsmote/lnsmote.go b/lib/mining/resampling/lnsmote/lnsmote.go index 4e148014..66df6525 100644 --- a/lib/mining/resampling/lnsmote/lnsmote.go +++ b/lib/mining/resampling/lnsmote/lnsmote.go @@ -11,10 +11,8 @@ package lnsmote import ( - "fmt" "math/rand" - "github.com/shuLhan/share/lib/debug" "github.com/shuLhan/share/lib/dsv" "github.com/shuLhan/share/lib/mining/knn" "github.com/shuLhan/share/lib/mining/resampling/smote" @@ -76,11 +74,6 @@ func (in *Runtime) Init(dataset tabula.DatasetInterface) { in.ClassMinor) in.outliers = make(tabula.Rows, 0) - - if debug.Value >= 1 { - fmt.Println("[lnsmote] n:", in.NSynthetic) - fmt.Println("[lnsmote] n minority:", in.minorset.Len()) - } } // Resampling will run resampling process on dataset and return the synthetic @@ -97,10 +90,6 @@ func (in *Runtime) Resampling(dataset tabula.DatasetInterface) ( neighbors := in.FindNeighbors(in.datasetRows, p) - if debug.Value >= 3 { - fmt.Println("[lnsmote] neighbors:", neighbors.Rows()) - } - for y := 0; y < in.NSynthetic; y++ { syn := in.createSynthetic(p, neighbors) @@ -108,11 +97,6 @@ func (in *Runtime) Resampling(dataset tabula.DatasetInterface) ( in.Synthetics.PushRow(syn) } } - - if debug.Value >= 1 { - fmt.Printf("[lnsmote] %-4d n synthetics: %v\n", x, - in.Synthetics.Len()) - } } if in.SyntheticFile != "" { @@ -137,10 +121,6 @@ func (in *Runtime) createSynthetic(p *tabula.Row, neighbors knn.Neighbors) ( // Check if 
synthetic sample can be created from p and n. canit, slp, sln := in.canCreate(p, n) if !canit { - if debug.Value >= 2 { - fmt.Println("[lnsmote] can not create synthetic") - } - if slp.Len() <= 0 { in.outliers.PushBack(p) } @@ -174,11 +154,6 @@ func (in *Runtime) canCreate(p, n *tabula.Row) (bool, knn.Neighbors, slp := in.safeLevel(p) sln := in.safeLevel2(p, n) - if debug.Value >= 2 { - fmt.Println("[lnsmote] slp : ", slp.Len()) - fmt.Println("[lnsmote] sln : ", sln.Len()) - } - return slp.Len() != 0 || sln.Len() != 0, slp, sln } @@ -202,20 +177,9 @@ func (in *Runtime) safeLevel2(p, n *tabula.Row) knn.Neighbors { // if p in neighbors, replace it with neighbours in K+1 if nIsMinor && pInNeighbors { - if debug.Value >= 1 { - fmt.Println("[lnsmote] Replacing ", pidx) - } - if debug.Value >= 2 { - fmt.Println("[lnsmote] Replacing ", pidx, " in ", neighbors) - } - row := in.AllNeighbors.Row(in.K + 1) dist := in.AllNeighbors.Distance(in.K + 1) neighbors.Replace(pidx, row, dist) - - if debug.Value >= 2 { - fmt.Println("[lnsmote] Replacement ", neighbors) - } } minorNeighbors := neighbors.SelectWhere(in.ClassIndex, in.ClassMinor) |
