diff options
| author | Shulhan <ms@kilabit.info> | 2025-06-21 15:20:01 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2025-06-27 12:19:23 +0700 |
| commit | 1ca561ed0ecfa59b70a10191ac8e58cde90d126e (patch) | |
| tree | 80f0c65f7e9321ad92dfc1a53a444226cee4be3d /brokenlinks/worker.go | |
| parent | 8bc8fce1bd80b5a25c452ac5a24b1a1e3f5a4feb (diff) | |
| download | jarink-1ca561ed0ecfa59b70a10191ac8e58cde90d126e.tar.xz | |
brokenlinks: implement caching for external URLs
Any successful fetch on external URLs will be recorded into jarink
cache file, located in user's home cache directory.
For example, in Linux it would be `$HOME/.cache/jarink/cache.json`.
This helps improve future rescanning of the same or a different target
URL, minimizing network requests.
Diffstat (limited to 'brokenlinks/worker.go')
| -rw-r--r-- | brokenlinks/worker.go | 72 |
1 files changed, 61 insertions, 11 deletions
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go index 3eacf01..8d7918f 100644 --- a/brokenlinks/worker.go +++ b/brokenlinks/worker.go @@ -19,6 +19,8 @@ import ( "golang.org/x/net/html" "golang.org/x/net/html/atom" + + "git.sr.ht/~shulhan/jarink" ) type worker struct { @@ -41,6 +43,9 @@ type worker struct { // links or image. baseUrl *url.URL + // cache of scanned links. + cache *jarink.Cache + log *log.Logger httpc *http.Client @@ -79,6 +84,11 @@ func newWorker(opts Options) (wrk *worker, err error) { }, } + wrk.cache, err = jarink.LoadCache() + if err != nil { + return nil, err + } + wrk.baseUrl = &url.URL{ Scheme: wrk.opts.scanUrl.Scheme, Host: wrk.opts.scanUrl.Host, @@ -135,9 +145,14 @@ func (wrk *worker) scanAll() (result *Result, err error) { wrk.seenLink[linkq.url] = linkq.status continue } - if linkq.status >= http.StatusBadRequest { - wrk.markBroken(linkq) - continue + + if linkq.isExternal { + var scannedLink = wrk.cache.Get(linkq.url) + if scannedLink != nil { + linkq.status = scannedLink.ResponseCode + wrk.seen(linkq) + continue + } } wrk.seenLink[linkq.url] = http.StatusProcessing @@ -206,17 +221,27 @@ func (wrk *worker) processResult( newList []linkQueue, ) { for _, linkq := range resultq { - if linkq.status >= http.StatusBadRequest { - wrk.markBroken(linkq) - continue - } + // Process the scanned page first. + if linkq.status != 0 { - // linkq is the result of scan with - // non error status. - wrk.seenLink[linkq.url] = linkq.status + wrk.seen(linkq) + if linkq.isExternal && linkq.status != StatusBadLink { + wrk.cache.Set(linkq.url, linkq.status, linkq.size) + } continue } + // Now process the links inside the page. 
+ + if linkq.isExternal { + var scannedLink = wrk.cache.Get(linkq.url) + if scannedLink != nil { + linkq.status = scannedLink.ResponseCode + wrk.seen(linkq) + continue + } + } + seenStatus, seen := wrk.seenLink[linkq.url] if !seen { wrk.seenLink[linkq.url] = http.StatusProcessing @@ -257,6 +282,14 @@ func (wrk *worker) processResult( return newList } +func (wrk *worker) seen(linkq linkQueue) { + if linkq.status >= http.StatusBadRequest { + wrk.markBroken(linkq) + return + } + wrk.seenLink[linkq.url] = linkq.status +} + func (wrk *worker) markBroken(linkq linkQueue) { var parentUrl = linkq.parentUrl.String() var listBroken = wrk.result.BrokenLinks[parentUrl] @@ -299,6 +332,7 @@ func (wrk *worker) scan(linkq linkQueue) { defer httpResp.Body.Close() linkq.status = httpResp.StatusCode + linkq.size = httpResp.ContentLength resultq[linkq.url] = linkq if slices.Contains(wrk.opts.ignoreStatus, httpResp.StatusCode) { @@ -361,7 +395,7 @@ func (wrk *worker) scan(linkq linkQueue) { } _, seen := resultq[nodeLink.url] if !seen { - nodeLink.checkExternal(wrk) + wrk.checkExternal(nodeLink) resultq[nodeLink.url] = *nodeLink } } @@ -459,3 +493,19 @@ func (wrk *worker) pushResult(resultq map[string]linkQueue) { } } } + +// checkExternal set the [linkQueue.isExternal] field to true if +// +// (1) [linkQueue.url] does not start with [Options.Url] +// (2) linkQueue is not from scanPastResult, indicated by non-nil +// [worker.pastResult]. +func (wrk *worker) checkExternal(linkq *linkQueue) { + if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) { + linkq.isExternal = true + return + } + if wrk.pastResult != nil { + linkq.isExternal = true + return + } +} |
