diff options
| author | Shulhan <ms@kilabit.info> | 2026-01-22 01:39:41 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2026-01-22 01:39:41 +0700 |
| commit | 79eaccc81b85eb92dab9cf18d52662f367903652 (patch) | |
| tree | af58482138c2ba9211029174ab579e951cd7fff6 /brokenlinks/worker.go | |
| parent | 26fc8bd3203dae6b4705ada227439c90129bbe36 (diff) | |
| download | jarink-79eaccc81b85eb92dab9cf18d52662f367903652.tar.xz | |
all: refactoring, use single struct to represent Link
Previously, there were three types — [jarink.Link], [brokenlinks.Broken], and
[brokenlinks.linkQueue] — used to store the metadata for a link.
These changes unify them into a single struct, [jarink.Link].
Diffstat (limited to 'brokenlinks/worker.go')
| -rw-r--r-- | brokenlinks/worker.go | 138 |
1 files changed, 68 insertions, 70 deletions
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go index c0a33dd..07bda88 100644 --- a/brokenlinks/worker.go +++ b/brokenlinks/worker.go @@ -29,7 +29,7 @@ type worker struct { seenLink map[string]int // queue contains list of link to be scanned. - queue []linkQueue + queue []jarink.Link // result contains the final result after all of the pages has been // scanned. @@ -124,15 +124,14 @@ func (wrk *worker) run() (result *Result, err error) { // scanAll scan all pages start from [Options.Url]. func (wrk *worker) scanAll() (result *Result, err error) { // Scan the first URL to make sure that the server is reachable. - var linkq = linkQueue{ - parentUrl: nil, - url: wrk.opts.scanUrl.String(), + var linkq = jarink.Link{ + Url: wrk.opts.scanUrl.String(), } var resultq = wrk.scan(linkq) - linkq = resultq[linkq.url] - if linkq.errScan != nil { - return nil, linkq.errScan + linkq = resultq[linkq.Url] + if linkq.ErrScan != nil { + return nil, linkq.ErrScan } wrk.processResult(resultq) @@ -176,17 +175,16 @@ func (wrk *worker) scanPastResult() (result *Result, err error) { // - Skip external link that has been checked before. // - Skip link that has been seen. // - Otherwise push it to queue. -func (wrk *worker) processResult(resultq map[string]linkQueue) { - var linkq linkQueue +func (wrk *worker) processResult(resultq map[string]jarink.Link) { var seen bool - for _, linkq = range resultq { - if linkq.status != 0 { + for _, linkq := range resultq { + if linkq.StatusCode != 0 { wrk.seen(linkq) continue } - if linkq.isExternal { - var scannedLink = wrk.cache.Get(linkq.url) + if linkq.IsExternal { + var scannedLink = wrk.cache.Get(linkq.Url) if scannedLink != nil { // The external link has been scanned // previously. 
@@ -194,9 +192,9 @@ func (wrk *worker) processResult(resultq map[string]linkQueue) { } } - linkq.status, seen = wrk.seenLink[linkq.url] + linkq.StatusCode, seen = wrk.seenLink[linkq.Url] if seen { - if linkq.status >= http.StatusBadRequest { + if linkq.StatusCode >= http.StatusBadRequest { // Different pages may have the same broken // link. wrk.markAsBroken(linkq) @@ -207,61 +205,61 @@ func (wrk *worker) processResult(resultq map[string]linkQueue) { } } -func (wrk *worker) seen(linkq linkQueue) { - wrk.seenLink[linkq.url] = linkq.status +func (wrk *worker) seen(linkq jarink.Link) { + wrk.seenLink[linkq.Url] = linkq.StatusCode - if linkq.isExternal { - if linkq.status != StatusBadLink { - wrk.cache.Set(linkq.url, linkq.status, linkq.size) + if linkq.IsExternal { + if linkq.StatusCode != StatusBadLink { + wrk.cache.Set(linkq) } } - if linkq.status >= http.StatusBadRequest { + if linkq.StatusCode >= http.StatusBadRequest { wrk.markAsBroken(linkq) } } -func (wrk *worker) markAsBroken(linkq linkQueue) { - if slices.Contains(wrk.opts.ignoreStatus, linkq.status) { +func (wrk *worker) markAsBroken(linkq jarink.Link) { + if slices.Contains(wrk.opts.ignoreStatus, linkq.StatusCode) { return } - var parentUrl = linkq.parentUrl.String() + var parentUrl = linkq.ParentUrl.String() var listBroken = wrk.result.BrokenLinks[parentUrl] - var brokenLink = Broken{ - Link: linkq.url, - Code: linkq.status, + if linkq.ErrScan != nil { + linkq.Error = linkq.ErrScan.Error() } - if linkq.errScan != nil { - brokenLink.Error = linkq.errScan.Error() - } - listBroken = append(listBroken, brokenLink) + listBroken = append(listBroken, linkq) wrk.result.BrokenLinks[parentUrl] = listBroken } // scan the link to HTML page or image. 
-func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) { - resultq = make(map[string]linkQueue) +func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) { + resultq = make(map[string]jarink.Link) var ( httpResp *http.Response err error ) httpResp, err = wrk.fetch(linkq) if err != nil { - linkq.status = StatusBadLink - linkq.errScan = err - resultq[linkq.url] = linkq + linkq.StatusCode = StatusBadLink + linkq.ErrScan = err + resultq[linkq.Url] = linkq return resultq } defer httpResp.Body.Close() - linkq.status = httpResp.StatusCode - linkq.size = httpResp.ContentLength - resultq[linkq.url] = linkq + linkq.StatusCode = httpResp.StatusCode + resultq[linkq.Url] = linkq if httpResp.StatusCode >= http.StatusBadRequest { return resultq } - if linkq.kind == atom.Img || linkq.isExternal { + if linkq.Kind == int(atom.Img) { + return resultq + } + linkq.Size = httpResp.ContentLength + if linkq.IsExternal { + resultq[linkq.Url] = linkq return resultq } @@ -276,11 +274,11 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) { var parentUrl *url.URL - parentUrl, err = url.Parse(linkq.url) + parentUrl, err = url.Parse(linkq.Url) if err != nil { - linkq.status = StatusBadLink - linkq.errScan = err - resultq[linkq.url] = linkq + linkq.StatusCode = StatusBadLink + linkq.ErrScan = err + resultq[linkq.Url] = linkq return resultq } @@ -292,13 +290,13 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) { if node.DataAtom != atom.A && node.DataAtom != atom.Img { continue } - var nodeLink *linkQueue + var nodeLink *jarink.Link if node.DataAtom == atom.A { for _, attr := range node.Attr { if attr.Key != `href` { continue } - nodeLink = wrk.processLink(parentUrl, attr.Val, atom.A) + nodeLink = wrk.processLink(parentUrl, attr.Val, int(atom.A)) break } } else { // atom.Img @@ -306,7 +304,7 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) { if attr.Key != `src` { continue } - nodeLink = 
wrk.processLink(parentUrl, attr.Val, atom.Img) + nodeLink = wrk.processLink(parentUrl, attr.Val, int(atom.Img)) break } } @@ -314,30 +312,30 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) { // Link is invalid. continue } - _, seen := resultq[nodeLink.url] + _, seen := resultq[nodeLink.Url] if seen { // The same link already exist previously. continue } - resultq[nodeLink.url] = *nodeLink + resultq[nodeLink.Url] = *nodeLink } return resultq } -func (wrk *worker) fetch(linkq linkQueue) (httpResp *http.Response, err error) { +func (wrk *worker) fetch(linkq jarink.Link) (httpResp *http.Response, err error) { const maxRetry = 5 var retry int for retry < 5 { - if linkq.kind == atom.Img { + if linkq.Kind == int(atom.Img) { if wrk.opts.IsVerbose { - wrk.log.Printf("fetch: HEAD %s", linkq.url) + wrk.log.Printf("fetch: HEAD %s", linkq.Url) } - httpResp, err = wrk.httpc.Head(linkq.url) + httpResp, err = wrk.httpc.Head(linkq.Url) } else { if wrk.opts.IsVerbose { - wrk.log.Printf("fetch: GET %s", linkq.url) + wrk.log.Printf("fetch: GET %s", linkq.Url) } - httpResp, err = wrk.httpc.Get(linkq.url) + httpResp, err = wrk.httpc.Get(linkq.Url) } if err == nil { return httpResp, nil @@ -348,7 +346,7 @@ func (wrk *worker) fetch(linkq linkQueue) (httpResp *http.Response, err error) { } if errDNS.Timeout() { retry++ - wrk.log.Printf(`fetch %s: %s (%d/%d)`, linkq.url, err, retry, maxRetry) + wrk.log.Printf(`fetch %s: %s (%d/%d)`, linkq.Url, err, retry, maxRetry) continue } break @@ -356,33 +354,33 @@ func (wrk *worker) fetch(linkq linkQueue) (httpResp *http.Response, err error) { return nil, err } -// processLink given a parentURL and link value `val` -// check if link `val` is valid and return it as linkQueue. -func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) ( - linkq *linkQueue, +// processLink given a parentURL, check if link `val` is valid, and return it +// as [jarink.Link]. 
+func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) ( + linkq *jarink.Link, ) { if len(val) == 0 { return nil } - linkq = &linkQueue{ - parentUrl: parentUrl, - kind: kind, + linkq = &jarink.Link{ + ParentUrl: parentUrl, + Kind: kind, } var newUrl *url.URL var err error newUrl, err = url.Parse(val) if err != nil { - linkq.errScan = err - linkq.url = val - linkq.status = StatusBadLink + linkq.ErrScan = err + linkq.Url = val + linkq.StatusCode = StatusBadLink return linkq } newUrl.Fragment = "" newUrl.RawFragment = "" - if kind == atom.A && val[0] == '#' { + if kind == int(atom.A) && val[0] == '#' { // Ignore link to ID, like `href="#element_id"`. return nil } @@ -395,9 +393,9 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) ( newUrl = parentUrl.JoinPath(`/`, newUrl.Path) } } - linkq.url = strings.TrimSuffix(newUrl.String(), `/`) - if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) { - linkq.isExternal = true + linkq.Url = strings.TrimSuffix(newUrl.String(), `/`) + if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) { + linkq.IsExternal = true } return linkq } |
