diff options
| author | Shulhan <ms@kilabit.info> | 2018-09-17 00:26:46 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2018-09-17 22:51:23 +0700 |
| commit | 66caeb368336e9f149d6f40772e7f0fdd070cf78 (patch) | |
| tree | d0295d2fb895ce9e046400fae90cd7d69081c16a /lib/strings/strings.go | |
| parent | f911fdc362d2a98a9f4deb93c18231ae77df12a1 (diff) | |
| download | pakakeh.go-66caeb368336e9f149d6f40772e7f0fdd070cf78.tar.xz | |
Merge package "github.com/shuLhan/tekstus", part 3/3
Diffstat (limited to 'lib/strings/strings.go')
| -rw-r--r-- | lib/strings/strings.go | 251 |
1 files changed, 251 insertions, 0 deletions
// Definitions added to lib/strings/strings.go (package strings),
// reconstructed from the collapsed diff hunk. Besides the package clause,
// they only need the standard library "strings" package to be imported.

// CountMissRate compares src and target element by element and reports how
// many positions hold different strings.
//
// Only the overlapping prefix is compared: when the slices differ in length,
// the shorter length is used. It returns the miss rate, computed as
//
//	number of mismatches / number of compared elements
//
// followed by the number of mismatches and the number of compared elements.
// If either slice is empty there is nothing to compare and all three results
// are zero.
func CountMissRate(src []string, target []string) (
	missrate float64,
	nmiss, length int,
) {
	length = len(src)
	if len(target) < length {
		length = len(target)
	}
	if length == 0 {
		// Guard against 0/0, which would produce NaN as the miss rate.
		return 0, 0, 0
	}

	for x := 0; x < length; x++ {
		if src[x] != target[x] {
			nmiss++
		}
	}

	return float64(nmiss) / float64(length), nmiss, length
}

// CountToken returns the number of elements in words that are equal to
// token.
//
// If sensitive is false, both the token and each word are lower-cased with
// strings.ToLower before being compared.
func CountToken(words []string, token string, sensitive bool) int {
	if !sensitive {
		token = strings.ToLower(token)
	}

	var cnt int
	for _, w := range words {
		if !sensitive {
			w = strings.ToLower(w)
		}
		if w == token {
			cnt++
		}
	}
	return cnt
}

// CountTokens counts the occurrences of each of the tokens in words.
//
// The returned slice has the same length as tokens; the count at index i
// belongs to tokens[i]. It returns nil if tokens is empty.
func CountTokens(words []string, tokens []string, sensitive bool) []int {
	if len(tokens) == 0 {
		return nil
	}

	counters := make([]int, len(tokens))
	for x, token := range tokens {
		counters[x] = CountToken(words, token, sensitive)
	}
	return counters
}

// FrequencyOfToken returns the frequency of token in words, computed as
//
//	count-of-token / total-words
//
// It returns 0 if words is empty.
func FrequencyOfToken(words []string, token string, sensitive bool) float64 {
	if len(words) == 0 {
		return 0
	}

	cnt := CountToken(words, token, sensitive)

	return float64(cnt) / float64(len(words))
}

// FrequencyOfTokens computes the frequency in words of each of the tokens.
//
// The frequency at index i of the result belongs to tokens[i]. It returns
// nil if words or tokens is empty.
func FrequencyOfTokens(words, tokens []string, sensitive bool) (probs []float64) {
	if len(words) == 0 || len(tokens) == 0 {
		return nil
	}

	probs = make([]float64, len(tokens))
	for x, token := range tokens {
		probs[x] = FrequencyOfToken(words, token, sensitive)
	}
	return probs
}

// IsContain returns true if element el is in the slice of strings ss,
// otherwise it returns false.
func IsContain(ss []string, el string) bool {
	for _, s := range ss {
		if s == el {
			return true
		}
	}
	return false
}

// IsEqual compares the elements of two slices of strings without regard to
// their order.
//
// It returns true if both slices hold the same elements the same number of
// times, false otherwise. Two empty slices are equal.
func IsEqual(a, b []string) bool {
	if len(a) != len(b) {
		return false
	}

	// Count the elements of a, then consume one count for every element
	// of b. Because the lengths are equal, b matches a exactly when no
	// element of b runs out of counts.
	remaining := make(map[string]int, len(a))
	for _, s := range a {
		remaining[s]++
	}
	for _, s := range b {
		if remaining[s] == 0 {
			return false
		}
		remaining[s]--
	}
	return true
}

// Longest finds the longest word in words and returns it together with its
// index. Length is measured in bytes; if several words share the greatest
// length, the first one wins.
//
// If words is empty it returns an empty string and a negative (-1) index.
func Longest(words []string) (string, int) {
	if len(words) == 0 {
		return "", -1
	}

	var (
		out    string
		outlen int
		idx    int
	)
	for x, w := range words {
		if len(w) > outlen {
			outlen = len(w)
			out = w
			idx = x
		}
	}
	return out, idx
}

// MostFrequentTokens returns the token that has the highest frequency in
// words.
//
// For example, given input
//
//	words:  [A A B A B C C]
//	tokens: [A B]
//
// it will return A as the majority token in words.
// If several tokens have the same highest frequency, the first of them in
// tokens order is returned. It returns an empty string if words or tokens is
// empty.
func MostFrequentTokens(words []string, tokens []string, sensitive bool) string {
	if len(words) == 0 || len(tokens) == 0 {
		return ""
	}

	counts := CountTokens(words, tokens, sensitive)

	// A strict comparison keeps the earliest index on ties.
	maxIdx := 0
	for x := 1; x < len(counts); x++ {
		if counts[x] > counts[maxIdx] {
			maxIdx = x
		}
	}

	return tokens[maxIdx]
}

// SortByIndex reorders the slice of strings pointed to by ss using the list
// of indices sortedIds: after the call, element x of the slice is the
// element that was previously at index sortedIds[x].
//
// The slice is replaced by a newly allocated one of the same length; if
// sortedIds is shorter than the slice, the remaining positions are left as
// empty strings.
//
// If ss is nil, if sortedIds is longer than the slice, or if any index is
// out of range, the slice is left unchanged.
func SortByIndex(ss *[]string, sortedIds []int) {
	if ss == nil {
		return
	}

	n := len(*ss)
	if len(sortedIds) > n {
		return
	}
	// Validate every index up front so that the slice is never replaced
	// by a half-filled result.
	for _, id := range sortedIds {
		if id < 0 || id >= n {
			return
		}
	}

	newd := make([]string, n)
	for x, id := range sortedIds {
		newd[x] = (*ss)[id]
	}

	*ss = newd
}

// Swap exchanges the values at indices x and y of ss.
// If x or y is less than zero, the slice is left unchanged.
// If x or y is equal to or greater than the length of the slice, the slice
// is left unchanged.
func Swap(ss []string, x, y int) {
	if x == y {
		return
	}
	if x < 0 || y < 0 {
		return
	}
	// An index equal to len(ss) is already out of range.
	if x >= len(ss) || y >= len(ss) {
		return
	}

	ss[x], ss[y] = ss[y], ss[x]
}

// TotalFrequencyOfTokens returns the sum of the frequencies in words of all
// the tokens. It returns 0 if words or tokens is empty.
func TotalFrequencyOfTokens(words, tokens []string, sensitive bool) float64 {
	if len(words) == 0 || len(tokens) == 0 {
		return 0
	}

	var sumfreq float64
	for _, token := range tokens {
		sumfreq += FrequencyOfToken(words, token, sensitive)
	}
	return sumfreq
}
