summary refs log tree commit diff
path: root/brokenlinks/worker.go
diff options
context:
space:
mode:
Diffstat (limited to 'brokenlinks/worker.go')
-rw-r--r-- brokenlinks/worker.go 72
1 files changed, 61 insertions, 11 deletions
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index 3eacf01..8d7918f 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -19,6 +19,8 @@ import (
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
+
+ "git.sr.ht/~shulhan/jarink"
)
type worker struct {
@@ -41,6 +43,9 @@ type worker struct {
// links or image.
baseUrl *url.URL
+ // cache of scanned links.
+ cache *jarink.Cache
+
log *log.Logger
httpc *http.Client
@@ -79,6 +84,11 @@ func newWorker(opts Options) (wrk *worker, err error) {
},
}
+ wrk.cache, err = jarink.LoadCache()
+ if err != nil {
+ return nil, err
+ }
+
wrk.baseUrl = &url.URL{
Scheme: wrk.opts.scanUrl.Scheme,
Host: wrk.opts.scanUrl.Host,
@@ -135,9 +145,14 @@ func (wrk *worker) scanAll() (result *Result, err error) {
wrk.seenLink[linkq.url] = linkq.status
continue
}
- if linkq.status >= http.StatusBadRequest {
- wrk.markBroken(linkq)
- continue
+
+ if linkq.isExternal {
+ var scannedLink = wrk.cache.Get(linkq.url)
+ if scannedLink != nil {
+ linkq.status = scannedLink.ResponseCode
+ wrk.seen(linkq)
+ continue
+ }
}
wrk.seenLink[linkq.url] = http.StatusProcessing
@@ -206,17 +221,27 @@ func (wrk *worker) processResult(
newList []linkQueue,
) {
for _, linkq := range resultq {
- if linkq.status >= http.StatusBadRequest {
- wrk.markBroken(linkq)
- continue
- }
+ // Process the scanned page first.
+
if linkq.status != 0 {
- // linkq is the result of scan with
- // non error status.
- wrk.seenLink[linkq.url] = linkq.status
+ wrk.seen(linkq)
+ if linkq.isExternal && linkq.status != StatusBadLink {
+ wrk.cache.Set(linkq.url, linkq.status, linkq.size)
+ }
continue
}
+ // Now process the links inside the page.
+
+ if linkq.isExternal {
+ var scannedLink = wrk.cache.Get(linkq.url)
+ if scannedLink != nil {
+ linkq.status = scannedLink.ResponseCode
+ wrk.seen(linkq)
+ continue
+ }
+ }
+
seenStatus, seen := wrk.seenLink[linkq.url]
if !seen {
wrk.seenLink[linkq.url] = http.StatusProcessing
@@ -257,6 +282,14 @@ func (wrk *worker) processResult(
return newList
}
+func (wrk *worker) seen(linkq linkQueue) { // seen files linkq according to its status code.
+ if linkq.status >= http.StatusBadRequest { // Status 400 and above is handed to markBroken.
+ wrk.markBroken(linkq)
+ return
+ }
+ wrk.seenLink[linkq.url] = linkq.status // Any lower status is stored in seenLink, keyed by URL.
+}
+
func (wrk *worker) markBroken(linkq linkQueue) {
var parentUrl = linkq.parentUrl.String()
var listBroken = wrk.result.BrokenLinks[parentUrl]
@@ -299,6 +332,7 @@ func (wrk *worker) scan(linkq linkQueue) {
defer httpResp.Body.Close()
linkq.status = httpResp.StatusCode
+ linkq.size = httpResp.ContentLength
resultq[linkq.url] = linkq
if slices.Contains(wrk.opts.ignoreStatus, httpResp.StatusCode) {
@@ -361,7 +395,7 @@ func (wrk *worker) scan(linkq linkQueue) {
}
_, seen := resultq[nodeLink.url]
if !seen {
- nodeLink.checkExternal(wrk)
+ wrk.checkExternal(nodeLink)
resultq[nodeLink.url] = *nodeLink
}
}
@@ -459,3 +493,19 @@ func (wrk *worker) pushResult(resultq map[string]linkQueue) {
}
}
}
+
+// checkExternal sets the [linkQueue.isExternal] field to true when either
+//
+// (1) [linkQueue.url] does not start with the string form of wrk.opts.scanUrl, or
+// (2) [worker.pastResult] is non-nil (presumably the link comes from a re-scan
+// of a past result -- TODO confirm). Otherwise the field is left unchanged.
+func (wrk *worker) checkExternal(linkq *linkQueue) {
+ if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) {
+ linkq.isExternal = true
+ return
+ }
+ if wrk.pastResult != nil {
+ linkq.isExternal = true
+ return
+ }
+}