From 424583f727bf5d7da0552780ba369834a73c36d3 Mon Sep 17 00:00:00 2001 From: Shulhan Date: Sun, 31 Jan 2021 02:55:28 +0700 Subject: kamusku: the Go library for Kamus Besar Bahasa Indonesia (KBBI) This module contains HTTP client and command line interface for official KBBI web. --- LICENSE | 39 +++ Makefile | 16 + README.md | 91 ++++++ cmd/kamusku/main.go | 116 ++++++++ go.mod | 8 + go.sum | 23 ++ kamusku.go | 9 + kbbi_client.go | 417 ++++++++++++++++++++++++++ kbbi_client_test.go | 29 ++ lookup_response.go | 10 + testdata/entri.html | 408 +++++++++++++++++++++++++ testdata/entri_analisa.html | 342 +++++++++++++++++++++ testdata/kbbi_dasar.html | 707 ++++++++++++++++++++++++++++++++++++++++++++ word.go | 98 ++++++ word_definition.go | 94 ++++++ word_test.go | 66 +++++ words.go | 20 ++ 17 files changed, 2493 insertions(+) create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 README.md create mode 100644 cmd/kamusku/main.go create mode 100644 go.mod create mode 100644 go.sum create mode 100644 kamusku.go create mode 100644 kbbi_client.go create mode 100644 kbbi_client_test.go create mode 100644 lookup_response.go create mode 100644 testdata/entri.html create mode 100644 testdata/entri_analisa.html create mode 100644 testdata/kbbi_dasar.html create mode 100644 word.go create mode 100644 word_definition.go create mode 100644 word_test.go create mode 100644 words.go diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7e49067 --- /dev/null +++ b/LICENSE @@ -0,0 +1,39 @@ +Copyright 2020, M. Shulhan (ms@kilabit.info). +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + --- --- --- --- --- --- --- + + TT TT II BB AAAA LLLLLL II KKKKKKKK + TT TT II BB AA AA LL LL II KK + TTTT II BB AA AA LL LL II KK + TT TT II BB AAAAAAAA LLLLLL II KK + TT TT II BB AA AA LL LL II KK + TT TT II BBBBBBBB AA AA LLLLLL II KK + +Website: http://kilabit.info +Contact: ms@kilabit.info diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d7ccb28 --- /dev/null +++ b/Makefile @@ -0,0 +1,16 @@ +## Copyright 2020, Shulhan . All rights reserved. +## Use of this source code is governed by a BSD-style +## license that can be found in the LICENSE file. + +.PHONY: all install + +all: test check install + +test: + go test -v ./... + +check: + golangci-lint run ./... + +install: + go install ./cmd/kamusku diff --git a/README.md b/README.md new file mode 100644 index 0000000..c5c41c3 --- /dev/null +++ b/README.md @@ -0,0 +1,91 @@ +# kamusku + +kamusku adalah Go module yang berisi pustaka dan program untuk mencari +definisi kata Bahasa Indonesia dari situs resmi KBBI. + + +## Program kamusku + +Program kamusku yaitu antar-muka untuk mencari definisi dari kata lewat baris +perintah. + +Program ini sangat sederhana, caranya yaitu dengan memberikan kata yang dicari +setelah nama program, misalnya, + +``` +$ kamusku kamus,bahasa +``` + +Maka akan mencetak definisi dari kata "kamus" dan "bahasa" ke layar, + +``` +=== bahasa + Definisi #1: sistem lambang bunyi yang arbitrer, yang digunakan oleh + anggota suatu masyarakat untuk bekerja sama, berinteraksi, dan + mengidentifikasikan diri + Kelas #1: Nomina: kata benda + Kelas #2: Linguistik: - + + Definisi #2: percakapan (perkataan) yang baik; tingkah laku yang baik; sopan santun + Kelas #1: Nomina: kata benda + Contoh #1: baik budi --nya + + ... + +=== kamus + Definisi #1: karya rujukan atau acuan dalam bentuk cetak maupun digital yang + memuat kata dan ungkapan, dapat disusun menurut abjad atau tema, berisi + keterangan tentang makna, pemakaian, atau terjemahan + Kelas #1: Nomina: kata benda + + Definisi #2: buku yang memuat kumpulan istilah atau nama yang disusun + menurut abjad beserta penjelasan tentang makna dan pemakaiannya + Kelas #1: Nomina: kata benda + + ... +``` + + +## Bot Telegram + +Bot untuk aplikasi Telegram: https://t.me/KamuskuBot + +Untuk saat ini, KamuskuBot hanya punya satu perintah yaitu "/definisi". Cara +menggunakan perintah ini hampir sama dengan program kamusku yaitu dengan +memberikan kata yang dicari, contohnya, + +``` +/definisi kamus,bahasa +``` + +## LISENSI + +``` +Copyright 2020, M. Shulhan (ms@kilabit.info). +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +``` diff --git a/cmd/kamusku/main.go b/cmd/kamusku/main.go new file mode 100644 index 0000000..b48a791 --- /dev/null +++ b/cmd/kamusku/main.go @@ -0,0 +1,116 @@ +// Copyright 2020, Shulhan . All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +// Program kamusku is the command-line interface to Kamus Besar Bahasa +// Indonesia (KBBI). +// +package main + +import ( + "flag" + "fmt" + "log" + "sort" + + "git.sr.ht/~shulhan/kamusku" +) + +const ( + cmdNameSurel = "surel" + cmdNameSandi = "sandi" + cmdNameDaftarKataDasar = "daftar-kata-dasar" +) + +func main() { + var ( + isListRootWords bool + email string + pass string + ) + + log.SetFlags(0) + log.SetPrefix("kamusku: ") + + flag.StringVar(&email, cmdNameSurel, "", "Nama pengguna") + flag.StringVar(&pass, cmdNameSandi, "", "Sandi pengguna") + flag.BoolVar(&isListRootWords, cmdNameDaftarKataDasar, false, + "Ambil dan cetak semua kata dasar") + + flag.Parse() + + cl, err := kamusku.NewKbbiClient() + if err != nil { + log.Fatal(err) + } + + if len(email) > 0 && len(pass) > 0 { + err = cl.Login(email, pass) + if err != nil { + log.Fatal(err) + } + } + + if isListRootWords { + if !cl.IsAuthenticated() { + log.Fatalf("opsi %s membutuhkan opsi %s dan %s", + cmdNameDaftarKataDasar, cmdNameSurel, + cmdNameSandi) + } + listRootWords(cl) + return + } + + resDefinition, err := cl.Lookup(flag.Args()) + if err != nil { + log.Fatal(err) + } + + for word, wordDef := range resDefinition { + err = wordDef.Err() + if err != nil { + fmt.Printf("!!! %s: %s\n", word, err) + continue + } + + fmt.Println("===", word) + if len(wordDef.Message) != 0 { + fmt.Println(" " + wordDef.Message) + continue + } + if len(wordDef.Root) > 0 { + fmt.Printf(" Kata dasar: %s\n", wordDef.Root) + } + for x, def := range wordDef.Definition { + fmt.Printf(" Definisi #%d: %s\n", x+1, def.Value) + + for y, nomina := range def.Classes { + fmt.Printf(" Kelas #%d: %s\n", y+1, nomina) + } + for z, contoh := range def.Examples { + fmt.Printf(" Contoh #%d: %s\n", z+1, contoh) + } + fmt.Println() + } + } +} + +func listRootWords(cl *kamusku.KbbiClient) { + words, err := cl.ListRootWords() + if err != nil { + log.Println(err) + } + + list := make([]string, 0, len(words)) + + for k := range words { + list = append(list, k) + } + + sort.Strings(list) + + for _, word := range list { + fmt.Println(word) + } +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..b3a733a --- /dev/null +++ b/go.mod @@ -0,0 +1,8 @@ +module git.sr.ht/~shulhan/kamusku + +go 1.15 + +require ( + github.com/shuLhan/share v0.22.0 + golang.org/x/net v0.0.0-20210119194325-5f4716e94777 +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..aec0a9d --- /dev/null +++ b/go.sum @@ -0,0 +1,23 @@ +git.sr.ht/~shulhan/asciidoctor-go v0.0.0-20201205130914-be765f32b57b/go.mod h1:ejaxKeBMNL5EpP2zjRP4B8zuOr+MM4ZyGwE3y7807WI= +git.sr.ht/~shulhan/ciigo v0.3.0/go.mod h1:Y5FvSiJg88qshoR1ktj4fLzM5sk1pZcV0kJGU8GAuTo= +github.com/shuLhan/share v0.20.2-0.20201122173411-e8b3bf5ee6e9/go.mod h1:oBv+CGHG6u4Sa71+nJJJji8mCgPAadywjsB3I3k/b0o= +github.com/shuLhan/share v0.20.2-0.20201205202022-66069b9e49fe/go.mod h1:oBv+CGHG6u4Sa71+nJJJji8mCgPAadywjsB3I3k/b0o= +github.com/shuLhan/share v0.22.0 h1:oTV1M0X3TqyhwSoT0BxVBmnUZLbhkvRwmhyV0KkTOR4= +github.com/shuLhan/share v0.22.0/go.mod h1:u9caerexlcxmPVDttj7PnkxCBDY6yBRTZ+gGR+1tO98= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20201016220609-9e8e0b390897/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20201031054903-ff519b6c9102/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20201202161906-c7110b5ffcbb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20210119194325-5f4716e94777 h1:003p0dJM77cxMSyCPFphvZf/Y5/NXf5fzg6ufd1/Oew= +golang.org/x/net v0.0.0-20210119194325-5f4716e94777/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201107080550-4d91cf3a1aaf/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/kamusku.go b/kamusku.go new file mode 100644 index 0000000..8780723 --- /dev/null +++ b/kamusku.go @@ -0,0 +1,9 @@ +// Copyright 2020, Shulhan . All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +// Package kamusku is the Go library to access the Bahasa Indonesia dictionary +// from https://kbbi.kemdikbud.go.id. +// +package kamusku diff --git a/kbbi_client.go b/kbbi_client.go new file mode 100644 index 0000000..23767b4 --- /dev/null +++ b/kbbi_client.go @@ -0,0 +1,417 @@ +// Copyright 2020, Shulhan . All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package kamusku + +import ( + "bytes" + "encoding/gob" + "errors" + "fmt" + "io/ioutil" + "log" + "net/http" + "net/http/cookiejar" + "net/url" + "os" + "path/filepath" + "strconv" + "strings" + "time" + + "github.com/shuLhan/share/lib/debug" + libhttp "github.com/shuLhan/share/lib/http" + "github.com/shuLhan/share/lib/net/html" + "golang.org/x/net/publicsuffix" +) + +const ( + kbbiUrlBase = "https://kbbi.kemdikbud.go.id" + kbbiUrlLogin = kbbiUrlBase + "/Account/Login" + kbbiPathEntri = "/entri/" + + attrNameClass = "class" + attrNameHref = "href" + attrNameTitle = "title" + attrNameValue = "value" + + attrValueRootWord = "rootword" + + paramNameMasukan = "masukan" + paramNameMasukanLengkap = "masukanLengkap" + paramNameIngatSaya = "IngatSaya" + paramNameKataSandi = "KataSandi" + paramNamePage = "page" + paramNamePosel = "Posel" + paramNameRequestVerificationToken = "__RequestVerificationToken" //nolint: gosec + + paramValueDasar = "dasar" + paramValueFalse = "false" + + tagNameAnchor = "a" + tagNameFont = "font" + tagNameHeader2 = "h2" + tagNameInput = "input" + tagNameItalic = "i" + tagNameOrderedList = "ol" + tagNameSpan = "span" + tagNameUnorderedList = "ul" + + cookieFile = "cookie" + configDir = "kamusku" + defTimeout = 20 * time.Second + maxPageNumber = 501 +) + +// +// KbbiClient client for official KBBI web using HTTP. +// +type KbbiClient struct { + baseDir string + cookieURL *url.URL + cookies []*http.Cookie + httpc *http.Client +} + +// +// NewKbbiClient create and initialize new client that connect directly to +// KBBI official website. +// +func NewKbbiClient() (cl *KbbiClient, err error) { + cookieURL, err := url.Parse(kbbiUrlBase) + if err != nil { + return nil, fmt.Errorf("New: %w", err) + } + + jarOpt := &cookiejar.Options{ + PublicSuffixList: publicsuffix.List, + } + + jar, err := cookiejar.New(jarOpt) + if err != nil { + return nil, fmt.Errorf("New: %w", err) + } + + cl = &KbbiClient{ + cookieURL: cookieURL, + httpc: &http.Client{ + Jar: jar, + Timeout: defTimeout, + }, + } + + err = cl.loadCookies() + if err != nil { + return nil, fmt.Errorf("New: %w", err) + } + + if cl.cookies != nil { + jar.SetCookies(cookieURL, cl.cookies) + } + + return cl, nil +} + +// +// Lookup lookup definition of one or more words. +// +func (cl *KbbiClient) Lookup(ins []string) (res LookupResponse, err error) { + res = make(LookupResponse, len(ins)) + + for _, in := range ins { + _, ok := res[in] + if ok { + continue + } + + kata := &Word{} + res[in] = kata + + entriURL := kbbiUrlBase + kbbiPathEntri + in + httpRes, err := cl.httpc.Get(entriURL) + if err != nil { + kata.err = err + continue + } + + defer httpRes.Body.Close() + + body, err := ioutil.ReadAll(httpRes.Body) + if err != nil { + kata.err = err + continue + } + + if debug.Value >= 3 { + fmt.Printf(">>> HTML body for %s:\n%s", entriURL, body) + } + + err = kata.parseHTMLEntri(in, body) + if err != nil { + kata.err = err + } + + if len(kata.Definition) == 0 && len(kata.Message) == 0 { + kata.Message = "Entri tidak ditemukan" + } + } + + return res, nil +} + +// +// ListRootWords list all of the root words in dictionary. +// +func (cl *KbbiClient) ListRootWords() (rootWords Words, err error) { + params := url.Values{ + paramNameMasukan: []string{paramValueDasar}, + paramNameMasukanLengkap: []string{paramValueDasar}, + } + + urlPage := kbbiUrlBase + "/Cari/Jenis?" + + rootWords = make(Words) + + for pageNumber := 1; pageNumber <= maxPageNumber; pageNumber++ { + params.Set(paramNamePage, strconv.Itoa(pageNumber)) + + req, err := http.NewRequest(http.MethodGet, urlPage+params.Encode(), nil) + if err != nil { + return rootWords, err + } + + res, err := cl.httpc.Do(req) + if err != nil { + return rootWords, fmt.Errorf("ListRootWords: page %d: %w", + pageNumber, err) + } + + defer res.Body.Close() + + body, err := ioutil.ReadAll(res.Body) + if err != nil { + return rootWords, fmt.Errorf("ListRootWords: page %d: %w", + pageNumber, err) + } + + got, err := cl.parseHTMLRootWords(body) + if err != nil { + return rootWords, fmt.Errorf("ListRootWords: page %d: %w", + pageNumber, err) + } + if len(got) == 0 { + break + } + + rootWords.merge(got) + + log.Printf("ListRootWords: halaman %d, jumlah kata %d, total kata %d", + pageNumber, len(got), len(rootWords)) + } + + return rootWords, nil +} + +// +// IsAuthenticated will return true if the client already login; otherwise it +// will return false. +// +func (cl *KbbiClient) IsAuthenticated() bool { + return len(cl.cookies) > 0 +} + +// +// Login authenticate the client using user email and password. +// +func (cl *KbbiClient) Login(email, pass string) (err error) { + tokenLogin, err := cl.preLogin() + if err != nil { + return fmt.Errorf("Login: %w", err) + } + + params := url.Values{ + paramNameRequestVerificationToken: []string{tokenLogin}, + paramNamePosel: []string{email}, + paramNameKataSandi: []string{pass}, + paramNameIngatSaya: []string{paramValueFalse}, + } + + reqBody := strings.NewReader(params.Encode()) + + req, err := http.NewRequest(http.MethodPost, kbbiUrlLogin, reqBody) + if err != nil { + return fmt.Errorf("Login: %w", err) + } + + req.Header.Set(libhttp.HeaderContentType, libhttp.ContentTypeForm) + + res, err := cl.httpc.Do(req) + if err != nil { + return fmt.Errorf("Login: %w", err) + } + + defer res.Body.Close() + + resBody, err := ioutil.ReadAll(res.Body) + if err != nil { + return fmt.Errorf("Login: %w", err) + } + + if res.StatusCode >= http.StatusBadRequest { + return fmt.Errorf("login: %d %s", res.StatusCode, resBody) + } + + cl.cookies = cl.httpc.Jar.Cookies(cl.cookieURL) + cl.setCookies() + cl.saveCookies() + + return nil +} + +// +// setCookies for HTTP request that need an authentication. +// +func (cl *KbbiClient) setCookies() { + cl.httpc.Jar.SetCookies(cl.cookieURL, cl.cookies) +} + +func (cl *KbbiClient) parseHTMLRootWords(htmlBody []byte) ( + rootWords Words, err error, +) { + iter, err := html.Parse(bytes.NewReader(htmlBody)) + if err != nil { + return nil, err + } + + rootWords = make(Words) + + for node := iter.Next(); node != nil; node = iter.Next() { + if !node.IsElement() { + continue + } + if node.Data != tagNameAnchor { + continue + } + hrefValue := node.GetAttrValue(attrNameHref) + if !strings.HasPrefix(hrefValue, kbbiPathEntri) { + continue + } + k := strings.TrimSpace(node.FirstChild.Data) + rootWords[k] = struct{}{} + } + + return rootWords, nil +} + +// +// parseHTMLLogin get the token at the form login. +// +func (cl *KbbiClient) parseHTMLLogin(htmlBody []byte) ( + token string, err error, +) { + iter, err := html.Parse(bytes.NewReader(htmlBody)) + if err != nil { + return "", err + } + + for node := iter.Next(); node != nil; node = iter.Next() { + if !node.IsElement() { + continue + } + if node.Data != tagNameInput { + continue + } + + token := node.GetAttrValue(attrNameValue) + if len(token) > 0 { + return token, nil + } + } + + return "", fmt.Errorf("token login not found") +} + +// +// preLogin initialize the client to get the first cookie. +// +func (cl *KbbiClient) preLogin() (token string, err error) { + req, err := http.NewRequest(http.MethodGet, kbbiUrlLogin, nil) + if err != nil { + return "", err + } + + res, err := cl.httpc.Do(req) + if err != nil { + return "", err + } + + defer res.Body.Close() + + body, err := ioutil.ReadAll(res.Body) + if err != nil { + return "", err + } + + token, err = cl.parseHTMLLogin(body) + if err != nil { + return "", err + } + + return token, nil +} + +// +// loadCookies load the KBBI cookies from file. +// +func (cl *KbbiClient) loadCookies() (err error) { + cl.baseDir, err = os.UserConfigDir() + if err != nil { + return fmt.Errorf("loadCookies: %w", err) + } + + f := filepath.Join(cl.baseDir, configDir, cookieFile) + + _, err = os.Stat(f) + if errors.Is(err, os.ErrNotExist) { + return nil + } + + body, err := ioutil.ReadFile(f) + if err != nil { + return fmt.Errorf("loadCookies: %w", err) + } + + dec := gob.NewDecoder(bytes.NewReader(body)) + + err = dec.Decode(&cl.cookies) + if err != nil { + return fmt.Errorf("loadCookies: %w", err) + } + + return nil +} + +// +// saveCookies store the client cookies to the file for future use. +// +func (cl *KbbiClient) saveCookies() { + err := os.MkdirAll(filepath.Join(cl.baseDir, configDir), 0700) + if err != nil { + log.Println("saveCookies:", err) + } + + f := filepath.Join(cl.baseDir, configDir, cookieFile) + + var buf bytes.Buffer + enc := gob.NewEncoder(&buf) + err = enc.Encode(cl.cookies) + if err != nil { + log.Println("saveCookies: ", err) + } + + err = ioutil.WriteFile(f, buf.Bytes(), 0600) + if err != nil { + log.Println("saveCookies: ", err) + } +} diff --git a/kbbi_client_test.go b/kbbi_client_test.go new file mode 100644 index 0000000..d83ab5c --- /dev/null +++ b/kbbi_client_test.go @@ -0,0 +1,29 @@ +// Copyright 2020, Shulhan . All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package kamusku + +import ( + "io/ioutil" + "testing" +) + +func TestClient_parseHTMLKataDasar(t *testing.T) { + htmlBody, err := ioutil.ReadFile("testdata/kbbi_dasar.html") + if err != nil { + t.Fatal(err) + } + + cl, err := NewKbbiClient() + if err != nil { + t.Fatal(err) + } + + got, err := cl.parseHTMLRootWords(htmlBody) + if err != nil { + t.Fatal(err) + } + + t.Logf("Root words: %v", got) +} diff --git a/lookup_response.go b/lookup_response.go new file mode 100644 index 0000000..aac9175 --- /dev/null +++ b/lookup_response.go @@ -0,0 +1,10 @@ +// Copyright 2020, Shulhan . All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package kamusku + +// +// LookupResponse contains mapping of word and its definition. +// +type LookupResponse map[string]*Word diff --git a/testdata/entri.html b/testdata/entri.html new file mode 100644 index 0000000..1899950 --- /dev/null +++ b/testdata/entri.html @@ -0,0 +1,408 @@ + + + + + + + + Hasil Pencarian - KBBI Daring + + + + + + +
+ + +
+ +
+

+ + Halo Shulhan! + Sudahkah Anda mengecek + halaman manajemen akun Anda? Anda + dapat melihat cara membukanya + di sini. Jika + Anda pernah mengajukan + usulan-usulan, + mungkin usulan-usulan tersebut telah diproses oleh redaksi + kami. +

+
+
+ +
+
+
+
+
+ + + + +
+
+
+

+ +
+ +
+

+ in.for.ma.si + +

+

+ ⇢ Tesaurus +

+
    +
  1. + n + penerangan +
  2. +
  3. + n + pemberitahuan; kabar atau berita tentang sesuatu +
  4. +
  5. + n + Ling + keseluruhan makna yang menunjang amanat yang terlihat + dalam bagian-bagian amanat itu +
  6. +
  7. + Usulkan makna baru +
  8. +
+

Kata Turunan

+ +

Gabungan Kata

+ +

+

+ Usulkan entri baru +

+
+ +
+ + + + + + + + + + + diff --git a/testdata/entri_analisa.html b/testdata/entri_analisa.html new file mode 100644 index 0000000..9ba807d --- /dev/null +++ b/testdata/entri_analisa.html @@ -0,0 +1,342 @@ + + + + + + + + Hasil Pencarian - KBBI Daring + + + + + + +
+ + +
+ +
+

+ + Halo Shulhan! + Sudahkah Anda mengecek + halaman manajemen akun Anda? Anda + dapat melihat cara membukanya + di sini. Jika + Anda pernah mengajukan + usulan-usulan, + mungkin usulan-usulan tersebut telah diproses oleh redaksi + kami. +

+
+
+ +
+
+
+
+
+ + + + +
+
+
+

+ +
+ +
+

+ ana.li.sa + +

+

+ ⇢ Tesaurus +

+ +

+

+ Usulkan entri baru +

+
+ +
+ + + + + + + + + + + diff --git a/testdata/kbbi_dasar.html b/testdata/kbbi_dasar.html new file mode 100644 index 0000000..4bd5170 --- /dev/null +++ b/testdata/kbbi_dasar.html @@ -0,0 +1,707 @@ + + + + + + + + Jenis - KBBI Daring + + + + + + + +
+ + + +
+

Daftar Entri Jenis Dasar

+
+ +
+ +
+ + Halaman 1 / 501 + +
+ +
+ Hasil Pencarian: 1 - 100 dari 50001 +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ A + B + C + D + E + F + G + H + I + J + K + L + M + N + O + P + Q + R + S + T + U + V + W + X + Y + Z + Semua +
+ +
+ + + + +
+ + +
+ + + + + + + + + + + + + diff --git a/word.go b/word.go new file mode 100644 index 0000000..8bc0da0 --- /dev/null +++ b/word.go @@ -0,0 +1,98 @@ +// Copyright 2020, Shulhan . All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package kamusku + +import ( + "bytes" + + "github.com/shuLhan/share/lib/net/html" +) + +// +// Word store the single root word and its definitions. +// +type Word struct { + Root string `json:"dasar,omitempty"` // The root word + Definition []*WordDefinition `json:"definisi"` // The word definition. + + // Message will contains the information when the word is not found or + // the word is informal (kata tidak baku). + Message string `json:"pesan,omitempty"` + + err error +} + +// +// Err return an error from retrieving definition. +// +func (word *Word) Err() error { + return word.err +} + +// +// parseHTMLEntri parse HTML body from "/entri/" page to find the +// definition of the word. +// +func (word *Word) parseHTMLEntri(in string, htmlBody []byte) (err error) { + iter, err := html.Parse(bytes.NewReader(htmlBody)) + if err != nil { + return err + } + + for node := iter.Next(); node != nil; node = iter.Next() { + if !node.IsElement() { + continue + } + + switch node.Data { + case tagNameHeader2: + word.parseRootWord(node) + + case tagNameOrderedList, tagNameUnorderedList: + li := node.GetFirstChild() + for li != nil { + defKata, err := parseWordDefinition(in, li) + if err != nil { + word.Message = err.Error() + err = nil + break + } + if defKata == nil { + break + } + word.Definition = append(word.Definition, defKata) + li = li.GetNextSibling() + } + next := node.GetNextSibling() + iter.SetNext(next) + } + } + + return nil +} + +// +// parseRootWord given an HMTL element "h2" find a possible root word and +// return true; otherwise it will return false. +// +func (word *Word) parseRootWord(h2 *html.Node) bool { + el := h2.GetFirstChild() + if el.Data != tagNameSpan { + return false + } + v := el.GetAttrValue(attrNameClass) + if v != attrValueRootWord { + return false + } + + el = el.GetFirstChild() + if el.Data != tagNameAnchor { + return false + } + el = el.GetFirstChild() + word.Root = el.Data + + return true +} diff --git a/word_definition.go b/word_definition.go new file mode 100644 index 0000000..86b05bd --- /dev/null +++ b/word_definition.go @@ -0,0 +1,94 @@ +// Copyright 2020, Shulhan . All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package kamusku + +import ( + "fmt" + "strings" + + "github.com/shuLhan/share/lib/net/html" + libstrings "github.com/shuLhan/share/lib/strings" +) + +// +// WordDefinition contains the meaning of word in dictionary, and optional +// attribute for word classifications and examples. +// +type WordDefinition struct { + Value string `json:"isi"` + Classes []string `json:"kelas,omitempty"` + Examples []string `json:"contoh,omitempty"` +} + +func parseWordDefinition(in string, li *html.Node) (defKata *WordDefinition, err error) { + elFont := li.GetFirstChild() + if elFont == nil || elFont.Data != tagNameFont { + return nil, nil + } + elItalic := elFont.GetFirstChild() + if elItalic == nil || elItalic.Data != tagNameItalic { + return nil, nil + } + + defKata = &WordDefinition{} + + elSpan := elItalic.GetFirstChild() + for elSpan != nil && elSpan.Data == tagNameSpan { + kelas := elSpan.GetAttrValue(attrNameTitle) + if len(kelas) > 0 { + defKata.Classes = append(defKata.Classes, kelas) + } + elSpan = elSpan.GetNextSibling() + } + + el := elFont.GetNextSibling() + if el == nil { + return defKata, nil + } + + defKata.Value = strings.TrimSpace(libstrings.SingleSpace(el.Data)) + + if defKata.Value == "→" { + defKata.Value = "" + el = el.GetNextSibling() + if el == nil || el.Data != tagNameAnchor { + return nil, nil + } + el = el.GetFirstChild() + return nil, fmt.Errorf(`%q adalah bentuk tidak baku dari %q`, + in, el.Data) + } + + if defKata.Value[len(defKata.Value)-1] != ':' { + return defKata, nil + } + + defKata.Value = defKata.Value[:len(defKata.Value)-1] + + // Parse the example of kata in the next sibling. + el = el.GetNextSibling() + for el != nil { + if el.Data != tagNameFont { + break + } + + elItalic = el.GetFirstChild() + if elItalic.Data != tagNameItalic { + break + } + + elText := elItalic.GetFirstChild() + if elText != nil { + contoh := strings.TrimSpace(elText.Data) + if len(contoh) > 0 && contoh != ";" { + defKata.Examples = append(defKata.Examples, elText.Data) + } + } + + el = el.GetNextSibling() + } + + return defKata, nil +} diff --git a/word_test.go b/word_test.go new file mode 100644 index 0000000..5dcb468 --- /dev/null +++ b/word_test.go @@ -0,0 +1,66 @@ +// Copyright 2020, Shulhan . All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package kamusku + +import ( + "io/ioutil" + "testing" + + "github.com/shuLhan/share/lib/test" +) + +func TestWord_parseHTMLEntri(t *testing.T) { + cases := []struct { + infile string + cari string + exp *Word + }{{ + infile: "testdata/entri.html", + cari: "informasi", + exp: &Word{ + Definition: []*WordDefinition{{ + Value: "penerangan", + Classes: []string{"Nomina: kata benda"}, + }, { + Value: "pemberitahuan; kabar atau berita tentang sesuatu", + Classes: []string{"Nomina: kata benda"}, + }, { + Value: "keseluruhan makna yang menunjang amanat yang " + + "terlihat dalam bagian-bagian " + + "amanat itu", + Classes: []string{ + "Nomina: kata benda", + "Linguistik: -", + }, + }}, + }, + }, { + infile: "testdata/entri_analisa.html", + cari: "analisa", + exp: &Word{ + Message: `"analisa" adalah bentuk tidak baku dari "analisis"`, + }, + }} + + for _, c := range cases { + htmlBody, err := ioutil.ReadFile(c.infile) + if err != nil { + t.Fatal(err) + } + + got := new(Word) + + err = got.parseHTMLEntri(c.cari, htmlBody) + if err != nil { + t.Fatal(err) + } + + for x, def := range c.exp.Definition { + test.Assert(t, "Definition", def, got.Definition[x], true) + } + + test.Assert(t, c.infile, c.exp, got, true) + } +} diff --git a/words.go b/words.go new file mode 100644 index 0000000..6cdba0c --- /dev/null +++ b/words.go @@ -0,0 +1,20 @@ +// Copyright 2020, Shulhan . All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package kamusku + +// +// Words contains list of words. +// +type Words map[string]struct{} + +// +// merge other map into current map. +// +func (words Words) merge(in Words) Words { + for k := range in { + words[k] = struct{}{} + } + return words +} -- cgit v1.3