Merge package "github.com/shuLhan/tekstus", part 3/3

author: Shulhan <ms@kilabit.info> 2018-09-17 00:26:46 +0700
committer: Shulhan <ms@kilabit.info> 2018-09-17 22:51:23 +0700
commit: 66caeb368336e9f149d6f40772e7f0fdd070cf78 (patch)
tree: d0295d2fb895ce9e046400fae90cd7d69081c16a /lib/strings/strings.go
parent: f911fdc362d2a98a9f4deb93c18231ae77df12a1 (diff)
download: pakakeh.go-66caeb368336e9f149d6f40772e7f0fdd070cf78.tar.xz
1 files changed, 251 insertions, 0 deletions
diff --git a/lib/strings/strings.go b/lib/strings/strings.go
index 3b4db829..d050ce52 100644
--- a/lib/strings/strings.go
+++ b/lib/strings/strings.go
@@ -7,3 +7,254 @@
 // strings.
 //
 package strings
+
+import (
+	"strings"
+
+	"github.com/shuLhan/share/lib/numbers"
+)
+
+//
+// CountMissRate compares `src` and `target` element by element, up to the
+// length of the shorter slice, and returns the miss rate as
+//
+//	number of not equal / number of compared elements
+//
+// together with the number of mismatches and of compared elements.
+// All results are zero, not NaN, when either slice is empty.
+//
+func CountMissRate(src []string, target []string) (
+	missrate float64,
+	nmiss, length int,
+) {
+	length = len(src)
+	if len(target) < length {
+		length = len(target)
+	}
+	// Nothing to compare; this also avoids the 0/0 (NaN) division below.
+	if length == 0 {
+		return 0, 0, 0
+	}
+
+	for x := 0; x < length; x++ {
+		if src[x] != target[x] {
+			nmiss++
+		}
+	}
+	return float64(nmiss) / float64(length), nmiss, length
+}
+
+//
+// CountToken will return number of token occurence in words.
+//
+func CountToken(words []string, token string, sensitive bool) int {
+	if !sensitive {
+		token = strings.ToLower(token)
+	}
+
+	var cnt int
+	for _, v := range words {
+		if !sensitive {
+			v = strings.ToLower(v)
+		}
+
+		if v == token {
+			cnt++
+		}
+	}
+	return cnt
+}
+
+//
+// CountTokens counts how many times each of `tokens` occurs in words.
+//
+// The count for tokens[i] is stored at index i of the returned slice.
+// It returns nil if tokens is empty.
+//
+func CountTokens(words []string, tokens []string, sensitive bool) []int {
+	if len(tokens) == 0 {
+		return nil
+	}
+
+	counters := make([]int, 0, len(tokens))
+	for _, token := range tokens {
+		counters = append(counters, CountToken(words, token, sensitive))
+	}
+
+	return counters
+}
+
+//
+// FrequencyOfToken returns the relative frequency of token in words,
+//
+//	count-of-token / total-words
+//
+// or 0 if words is empty.
+//
+func FrequencyOfToken(words []string, token string, sensitive bool) float64 {
+	if len(words) == 0 {
+		return 0
+	}
+
+	found := CountToken(words, token, sensitive)
+	return float64(found) / float64(len(words))
+}
+
+//
+// FrequencyOfTokens computes the frequency of each token in words; the
+// result at index i belongs to tokens[i].
+// It returns nil if either words or tokens is empty.
+//
+func FrequencyOfTokens(words, tokens []string, sensitive bool) (probs []float64) {
+	if len(words) == 0 || len(tokens) == 0 {
+		return nil
+	}
+
+	probs = make([]float64, 0, len(tokens))
+	for _, token := range tokens {
+		probs = append(probs, FrequencyOfToken(words, token, sensitive))
+	}
+	return probs
+}
+
+//
+// IsContain reports whether the string `el` is an element of the slice
+// `ss`.
+//
+func IsContain(ss []string, el string) bool {
+	for _, s := range ss {
+		if s == el {
+			return true
+		}
+	}
+	return false
+}
+
+//
+// IsEqual compares the elements of two slices of string without regard to
+// their order.
+//
+// It returns true only if both slices contain the same elements the same
+// number of times, so duplicates are taken into account: [a a b] is equal
+// to [b a a], but not to [a b b].
+// Two empty (or nil) slices are equal.
+//
+func IsEqual(a, b []string) bool {
+	if len(a) != len(b) {
+		return false
+	}
+
+	// Count each string in a, then cancel the counts out with b.
+	counts := make(map[string]int, len(a))
+	for _, s := range a {
+		counts[s]++
+	}
+	for _, s := range b {
+		if counts[s] == 0 {
+			// s is missing from a, or occurs more often in b.
+			return false
+		}
+		counts[s]--
+	}
+
+	// Lengths are equal and no count went negative, so all are zero.
+	return true
+}
+
+//
+// Longest finds the longest word, by length in bytes, in words and returns
+// it together with its index.
+// If several words share the longest length the first one is returned.
+//
+// If words is empty it returns an empty string and index -1.
+//
+func Longest(words []string) (string, int) {
+	if len(words) == 0 {
+		return "", -1
+	}
+
+	var (
+		best    = words[0]
+		bestIdx = 0
+	)
+	for x := 1; x < len(words); x++ {
+		if len(words[x]) > len(best) {
+			best = words[x]
+			bestIdx = x
+		}
+	}
+	return best, bestIdx
+}
+
+//
+// MostFrequentTokens returns the token that has the highest count in words.
+//
+// For example, given input
+//
+//	words:  [A A B A B C C]
+//	tokens: [A B]
+//
+// it returns A, which occurs three times against two times for B.
+// If tokens have an equal count, the first one in order is returned.
+// It returns an empty string if words or tokens is empty.
+//
+func MostFrequentTokens(words []string, tokens []string, sensitive bool) string {
+	if len(tokens) == 0 || len(words) == 0 {
+		return ""
+	}
+
+	counts := CountTokens(words, tokens, sensitive)
+	_, top, _ := numbers.IntsFindMax(counts)
+	return tokens[top]
+}
+
+//
+// SortByIndex replaces `*ss` with a new slice of the same length in which
+// position x holds the element that was at index sortedIds[x].
+// Positions not covered by sortedIds are left as empty strings.
+//
+func SortByIndex(ss *[]string, sortedIds []int) {
+	old := *ss
+	sorted := make([]string, len(old))
+	for x, id := range sortedIds {
+		sorted[x] = old[id]
+	}
+	*ss = sorted
+}
+
+//
+// Swap exchanges the values of ss at index x and y, in place.
+//
+// The slice is left unchanged if
+//
+//   - x and y are equal, or
+//   - x or y is less than zero, or
+//   - x or y is greater than or equal to the length of ss.
+//
+func Swap(ss []string, x, y int) {
+	if x == y {
+		return
+	}
+	// Valid indices are 0 to len(ss)-1; len(ss) itself is out of range.
+	if x < 0 || y < 0 || x >= len(ss) || y >= len(ss) {
+		return
+	}
+
+	ss[x], ss[y] = ss[y], ss[x]
+}
+
+//
+// TotalFrequencyOfTokens return total frequency of list of token in words.
+//
+func TotalFrequencyOfTokens(words, tokens []string, sensitive bool) float64 {
+	if len(words) <= 0 || len(tokens) <= 0 {
+		return 0
+	}
+
+	var sumfreq float64
+
+	for x := 0; x < len(tokens); x++ {
+		sumfreq += FrequencyOfToken(words, tokens[x], sensitive)
+	}
+
+	return sumfreq
+}
author	Shulhan <ms@kilabit.info>	2018-09-17 00:26:46 +0700
committer	Shulhan <ms@kilabit.info>	2018-09-17 22:51:23 +0700
commit	66caeb368336e9f149d6f40772e7f0fdd070cf78 (patch)
tree	d0295d2fb895ce9e046400fae90cd7d69081c16a /lib/strings/strings.go
parent	f911fdc362d2a98a9f4deb93c18231ae77df12a1 (diff)
download	pakakeh.go-66caeb368336e9f149d6f40772e7f0fdd070cf78.tar.xz