aboutsummaryrefslogtreecommitdiff
path: root/brokenlinks/worker.go
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2025-06-21 15:20:01 +0700
committerShulhan <ms@kilabit.info>2025-06-27 12:19:23 +0700
commit1ca561ed0ecfa59b70a10191ac8e58cde90d126e (patch)
tree80f0c65f7e9321ad92dfc1a53a444226cee4be3d /brokenlinks/worker.go
parent8bc8fce1bd80b5a25c452ac5a24b1a1e3f5a4feb (diff)
downloadjarink-1ca561ed0ecfa59b70a10191ac8e58cde90d126e.tar.xz
brokenlinks: implement caching for external URLs
Any successful fetch of an external URL is recorded in the jarink cache file, located in the user's home cache directory. For example, on Linux it would be `$HOME/.cache/jarink/cache.json`. This helps improve future rescans of the same or a different target URL by minimizing network requests.
Diffstat (limited to 'brokenlinks/worker.go')
-rw-r--r--brokenlinks/worker.go72
1 files changed, 61 insertions, 11 deletions
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index 3eacf01..8d7918f 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -19,6 +19,8 @@ import (
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
+
+ "git.sr.ht/~shulhan/jarink"
)
type worker struct {
@@ -41,6 +43,9 @@ type worker struct {
// links or image.
baseUrl *url.URL
+ // cache of scanned links.
+ cache *jarink.Cache
+
log *log.Logger
httpc *http.Client
@@ -79,6 +84,11 @@ func newWorker(opts Options) (wrk *worker, err error) {
},
}
+ wrk.cache, err = jarink.LoadCache()
+ if err != nil {
+ return nil, err
+ }
+
wrk.baseUrl = &url.URL{
Scheme: wrk.opts.scanUrl.Scheme,
Host: wrk.opts.scanUrl.Host,
@@ -135,9 +145,14 @@ func (wrk *worker) scanAll() (result *Result, err error) {
wrk.seenLink[linkq.url] = linkq.status
continue
}
- if linkq.status >= http.StatusBadRequest {
- wrk.markBroken(linkq)
- continue
+
+ if linkq.isExternal {
+ var scannedLink = wrk.cache.Get(linkq.url)
+ if scannedLink != nil {
+ linkq.status = scannedLink.ResponseCode
+ wrk.seen(linkq)
+ continue
+ }
}
wrk.seenLink[linkq.url] = http.StatusProcessing
@@ -206,17 +221,27 @@ func (wrk *worker) processResult(
newList []linkQueue,
) {
for _, linkq := range resultq {
- if linkq.status >= http.StatusBadRequest {
- wrk.markBroken(linkq)
- continue
- }
+ // Process the scanned page first.
+
if linkq.status != 0 {
- // linkq is the result of scan with
- // non error status.
- wrk.seenLink[linkq.url] = linkq.status
+ wrk.seen(linkq)
+ if linkq.isExternal && linkq.status != StatusBadLink {
+ wrk.cache.Set(linkq.url, linkq.status, linkq.size)
+ }
continue
}
+ // Now process the links inside the page.
+
+ if linkq.isExternal {
+ var scannedLink = wrk.cache.Get(linkq.url)
+ if scannedLink != nil {
+ linkq.status = scannedLink.ResponseCode
+ wrk.seen(linkq)
+ continue
+ }
+ }
+
seenStatus, seen := wrk.seenLink[linkq.url]
if !seen {
wrk.seenLink[linkq.url] = http.StatusProcessing
@@ -257,6 +282,14 @@ func (wrk *worker) processResult(
return newList
}
+func (wrk *worker) seen(linkq linkQueue) {
+ if linkq.status >= http.StatusBadRequest {
+ wrk.markBroken(linkq)
+ return
+ }
+ wrk.seenLink[linkq.url] = linkq.status
+}
+
func (wrk *worker) markBroken(linkq linkQueue) {
var parentUrl = linkq.parentUrl.String()
var listBroken = wrk.result.BrokenLinks[parentUrl]
@@ -299,6 +332,7 @@ func (wrk *worker) scan(linkq linkQueue) {
defer httpResp.Body.Close()
linkq.status = httpResp.StatusCode
+ linkq.size = httpResp.ContentLength
resultq[linkq.url] = linkq
if slices.Contains(wrk.opts.ignoreStatus, httpResp.StatusCode) {
@@ -361,7 +395,7 @@ func (wrk *worker) scan(linkq linkQueue) {
}
_, seen := resultq[nodeLink.url]
if !seen {
- nodeLink.checkExternal(wrk)
+ wrk.checkExternal(nodeLink)
resultq[nodeLink.url] = *nodeLink
}
}
@@ -459,3 +493,19 @@ func (wrk *worker) pushResult(resultq map[string]linkQueue) {
}
}
}
+
+// checkExternal sets the [linkQueue.isExternal] field to true if either
+//
+// (1) [linkQueue.url] does not start with [Options.Url], or
+// (2) linkQueue is not from scanPastResult, indicated by non-nil
+// [worker.pastResult].
+func (wrk *worker) checkExternal(linkq *linkQueue) {
+ if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) {
+ linkq.isExternal = true
+ return
+ }
+ if wrk.pastResult != nil {
+ linkq.isExternal = true
+ return
+ }
+}