From 3d9f0baa2c96b101ec8047a5969c7ece14e16991 Mon Sep 17 00:00:00 2001 From: Shulhan Date: Sun, 12 Apr 2020 00:16:02 +0700 Subject: all: tangani kata tidak baku MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sebelumnya bila kata adalah kata tidak baku, definisi kata akan berisi karakter "→" saja. Perubahan ini memeriksa bila definisi kata adalah "→" maka akan dianggap tidak baku dan kata yang baku berada satu elemen sesudahnya. --- cmd/kbbi/main.go | 4 + definisi_kata.go | 24 +++- direct_client.go | 2 +- kamus_cache.go | 10 ++ kata.go | 10 +- kata_test.go | 9 +- telegram_bot.go | 5 + testdata/entri.html | 58 +++++--- testdata/entri_analisa.html | 342 ++++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 431 insertions(+), 33 deletions(-) create mode 100644 testdata/entri_analisa.html diff --git a/cmd/kbbi/main.go b/cmd/kbbi/main.go index ac9298c..adba621 100644 --- a/cmd/kbbi/main.go +++ b/cmd/kbbi/main.go @@ -85,6 +85,10 @@ func main() { } fmt.Println("===", k) + if len(kata.Pesan) != 0 { + fmt.Println(" " + kata.Pesan) + continue + } if len(kata.Dasar) > 0 { fmt.Printf(" Kata dasar: %s\n", kata.Dasar) } diff --git a/definisi_kata.go b/definisi_kata.go index 0060a97..8d812f8 100644 --- a/definisi_kata.go +++ b/definisi_kata.go @@ -5,6 +5,7 @@ package kbbi import ( + "fmt" "strings" "github.com/shuLhan/share/lib/net/html" @@ -21,14 +22,14 @@ type DefinisiKata struct { Contoh []string `json:"contoh,omitempty"` } -func parseDefinisiKata(li *html.Node) (defKata *DefinisiKata) { +func parseDefinisiKata(in string, li *html.Node) (defKata *DefinisiKata, err error) { elFont := li.GetFirstChild() if elFont == nil || elFont.Data != tagNameFont { - return nil + return nil, nil } elItalic := elFont.GetFirstChild() if elItalic == nil || elItalic.Data != tagNameItalic { - return nil + return nil, nil } defKata = &DefinisiKata{} @@ -44,13 +45,24 @@ func parseDefinisiKata(li *html.Node) (defKata *DefinisiKata) { el := 
elFont.GetNextSibling() if el == nil { - return defKata + return defKata, nil } defKata.Isi = strings.TrimSpace(libstrings.SingleSpace(el.Data)) + if defKata.Isi == "→" { + defKata.Isi = "" + el = el.GetNextSibling() + if el == nil || el.Data != tagNameAnchor { + return nil, nil + } + el = el.GetFirstChild() + return nil, fmt.Errorf(`%q adalah bentuk tidak baku dari %q`, + in, el.Data) + } + if defKata.Isi[len(defKata.Isi)-1] != ':' { - return defKata + return defKata, nil } defKata.Isi = defKata.Isi[:len(defKata.Isi)-1] @@ -78,5 +90,5 @@ func parseDefinisiKata(li *html.Node) (defKata *DefinisiKata) { el = el.GetNextSibling() } - return defKata + return defKata, nil } diff --git a/direct_client.go b/direct_client.go index b4bcf98..963a118 100644 --- a/direct_client.go +++ b/direct_client.go @@ -102,7 +102,7 @@ func (cl *directClient) CariDefinisi(ins []string) ( fmt.Printf(">>> HTML body for %s:\n%s", entriURL, body) } - err = kata.parseHTMLEntri(body) + err = kata.parseHTMLEntri(in, body) if err != nil { kata.err = err } diff --git a/kamus_cache.go b/kamus_cache.go index ffcf4db..124fdc1 100644 --- a/kamus_cache.go +++ b/kamus_cache.go @@ -95,6 +95,16 @@ func (kamus *kamusCache) load() (err error) { return err } + // Clean up. Remove all word that contain "→" as definition. + for k, kata := range kamus.cache { + for _, def := range kata.Definisi { + if def.Isi == "→" { + delete(kamus.cache, k) + break + } + } + } + kamus.lastSize = len(kamus.cache) return nil diff --git a/kata.go b/kata.go index dfadf61..85ecb82 100644 --- a/kata.go +++ b/kata.go @@ -23,6 +23,7 @@ func (kata *Kata) Err() error { type Kata struct { Dasar string `json:"dasar"` Definisi []*DefinisiKata `json:"definisi"` + Pesan string `json:"pesan"` err error } @@ -30,7 +31,7 @@ type Kata struct { // parseHTMLEntri parse HTML body from "/entri/" page to find the // definition of the word. 
// -func (kata *Kata) parseHTMLEntri(htmlBody []byte) (err error) { +func (kata *Kata) parseHTMLEntri(in string, htmlBody []byte) (err error) { iter, err := html.Parse(bytes.NewReader(htmlBody)) if err != nil { return err @@ -48,7 +49,12 @@ func (kata *Kata) parseHTMLEntri(htmlBody []byte) (err error) { case tagNameOrderedList, tagNameUnorderedList: li := node.GetFirstChild() for li != nil { - defKata := parseDefinisiKata(li) + defKata, err := parseDefinisiKata(in, li) + if err != nil { + kata.Pesan = err.Error() + err = nil + break + } if defKata == nil { break } diff --git a/kata_test.go b/kata_test.go index 83e9177..7f9c4a5 100644 --- a/kata_test.go +++ b/kata_test.go @@ -34,10 +34,17 @@ func TestKata_parseHTMLEntri(t *testing.T) { }, }}, }, + }, { + infile: "testdata/entri_analisa.html", + exp: &Kata{ + Definisi: []*DefinisiKata{{ + Isi: `Bentuk tidak baku dari "analisis"`, + }}, + }, }} for _, c := range cases { - htmlBody, err := ioutil.ReadFile("testdata/entri.html") + htmlBody, err := ioutil.ReadFile(c.infile) if err != nil { t.Fatal(err) } diff --git a/telegram_bot.go b/telegram_bot.go index cf023ab..1304972 100644 --- a/telegram_bot.go +++ b/telegram_bot.go @@ -128,6 +128,11 @@ func formatText(definisiKata DefinisiResponse) string { for k, kata := range definisiKata { fmt.Fprintf(buf, "%s\n", k) + if len(kata.Pesan) > 0 { + fmt.Fprintln(buf, " "+kata.Pesan) + fmt.Fprintln(buf, "") + continue + } if len(kata.Dasar) > 0 { fmt.Fprintf(buf, " Kata dasar: %s\n\n", kata.Dasar) diff --git a/testdata/entri.html b/testdata/entri.html index 9c53f07..1899950 100644 --- a/testdata/entri.html +++ b/testdata/entri.html @@ -19,10 +19,10 @@ - +