aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Shulhan <ms@kilabit.info> 2026-02-11 04:38:20 +0700
committer: Shulhan <ms@kilabit.info> 2026-02-11 04:38:20 +0700
commit: 61eb5351087be894a4bfeb71c99346b8065bb7f1 (patch)
tree: 1f6d4650cb019c63cfbd8df777b19faedde679a2
parent: d8a892eb2f28b3ef4c2625c682d255f4f616cae2 (diff)
download: jarink-61eb5351087be894a4bfeb71c99346b8065bb7f1.tar.xz
brokenlinks: check if link has been seen before scan
Given the following queue of links and their parent pages: "/page2.html" => "/index.html"; "/brokenPage" => "/index.html"; "/brokenPage" => "/page2.html". Before scanning the second "/brokenPage", whose parent page is "/page2.html", check whether the link has already been seen so that its recorded status code can be reused instead of running the scan again. This allows jarink to report "/brokenPage" as a broken link on both pages, not just on "/index.html".
-rw-r--r-- brokenlinks/worker.go 20
1 files changed, 18 insertions, 2 deletions
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index 06bdcc6..cf41c49 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -141,6 +141,20 @@ func (wrk *worker) scanAll() (result *Result, err error) {
linkq = wrk.queue[x]
x++
wrk.log.Printf(`scan %d/%d: %s`, x, len(wrk.queue), linkq.Url)
+
+ var seen bool
+ linkq.StatusCode, seen = wrk.seenLink[linkq.Url]
+ if seen {
+ if linkq.StatusCode >= http.StatusBadRequest {
+ // Different pages may have the same broken
+ // link.
+ wrk.markAsBroken(linkq)
+ continue
+ }
+ if linkq.StatusCode != 0 {
+ continue
+ }
+ }
resultq = wrk.scan(linkq)
wrk.processResult(resultq)
}
@@ -200,11 +214,13 @@ func (wrk *worker) processResult(resultq map[string]jarink.Link) {
// Different pages may have the same broken
// link.
wrk.markAsBroken(result)
+ continue
+ }
+ if result.StatusCode != 0 {
+ continue
}
- continue
}
wrk.queue = append(wrk.queue, result)
- wrk.seenLink[result.Url] = result.StatusCode
wrk.log.Printf(`queue %d: %s`, len(wrk.queue), result.Url)
}
}