diff options
Diffstat (limited to 'brokenlinks/worker.go')
| -rw-r--r-- | brokenlinks/worker.go | 72 |
1 files changed, 61 insertions, 11 deletions
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go index 3eacf01..8d7918f 100644 --- a/brokenlinks/worker.go +++ b/brokenlinks/worker.go @@ -19,6 +19,8 @@ import ( "golang.org/x/net/html" "golang.org/x/net/html/atom" + + "git.sr.ht/~shulhan/jarink" ) type worker struct { @@ -41,6 +43,9 @@ type worker struct { // links or image. baseUrl *url.URL + // cache of scanned links. + cache *jarink.Cache + log *log.Logger httpc *http.Client @@ -79,6 +84,11 @@ func newWorker(opts Options) (wrk *worker, err error) { }, } + wrk.cache, err = jarink.LoadCache() + if err != nil { + return nil, err + } + wrk.baseUrl = &url.URL{ Scheme: wrk.opts.scanUrl.Scheme, Host: wrk.opts.scanUrl.Host, @@ -135,9 +145,14 @@ func (wrk *worker) scanAll() (result *Result, err error) { wrk.seenLink[linkq.url] = linkq.status continue } - if linkq.status >= http.StatusBadRequest { - wrk.markBroken(linkq) - continue + + if linkq.isExternal { + var scannedLink = wrk.cache.Get(linkq.url) + if scannedLink != nil { + linkq.status = scannedLink.ResponseCode + wrk.seen(linkq) + continue + } } wrk.seenLink[linkq.url] = http.StatusProcessing @@ -206,17 +221,27 @@ func (wrk *worker) processResult( newList []linkQueue, ) { for _, linkq := range resultq { - if linkq.status >= http.StatusBadRequest { - wrk.markBroken(linkq) - continue - } + // Process the scanned page first. + if linkq.status != 0 { - // linkq is the result of scan with - // non error status. - wrk.seenLink[linkq.url] = linkq.status + wrk.seen(linkq) + if linkq.isExternal && linkq.status != StatusBadLink { + wrk.cache.Set(linkq.url, linkq.status, linkq.size) + } continue } + // Now process the links inside the page. + + if linkq.isExternal { + var scannedLink = wrk.cache.Get(linkq.url) + if scannedLink != nil { + linkq.status = scannedLink.ResponseCode + wrk.seen(linkq) + continue + } + } + seenStatus, seen := wrk.seenLink[linkq.url] if !seen { wrk.seenLink[linkq.url] = http.StatusProcessing @@ -257,6 +282,14 @@ func (wrk *worker) processResult( return newList } +func (wrk *worker) seen(linkq linkQueue) { + if linkq.status >= http.StatusBadRequest { + wrk.markBroken(linkq) + return + } + wrk.seenLink[linkq.url] = linkq.status +} + func (wrk *worker) markBroken(linkq linkQueue) { var parentUrl = linkq.parentUrl.String() var listBroken = wrk.result.BrokenLinks[parentUrl] @@ -299,6 +332,7 @@ func (wrk *worker) scan(linkq linkQueue) { defer httpResp.Body.Close() linkq.status = httpResp.StatusCode + linkq.size = httpResp.ContentLength resultq[linkq.url] = linkq if slices.Contains(wrk.opts.ignoreStatus, httpResp.StatusCode) { @@ -361,7 +395,7 @@ func (wrk *worker) scan(linkq linkQueue) { } _, seen := resultq[nodeLink.url] if !seen { - nodeLink.checkExternal(wrk) + wrk.checkExternal(nodeLink) resultq[nodeLink.url] = *nodeLink } } @@ -459,3 +493,19 @@ func (wrk *worker) pushResult(resultq map[string]linkQueue) { } } } + +// checkExternal set the [linkQueue.isExternal] field to true if +// +// (1) [linkQueue.url] does not start with [Options.Url] +// (2) linkQueue is not from scanPastResult, indicated by non-nil +// [worker.pastResult]. +func (wrk *worker) checkExternal(linkq *linkQueue) { + if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) { + linkq.isExternal = true + return + } + if wrk.pastResult != nil { + linkq.isExternal = true + return + } +} |
