From 61eb5351087be894a4bfeb71c99346b8065bb7f1 Mon Sep 17 00:00:00 2001 From: Shulhan Date: Wed, 11 Feb 2026 04:38:20 +0700 Subject: brokenlinks: check if link has been seen before scan Given the following queued links and their parent pages, /page2.html => /index.html /brokenPage => /index.html /brokenPage => /page2.html Before scanning the second "/brokenPage" on parent page "/page2.html", first check whether it has already been seen, to get its status code before we run the scan. This allows jarink to report "/brokenPage" as a broken link on both pages, not just on "/index.html". --- brokenlinks/worker.go | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go index 06bdcc6..cf41c49 100644 --- a/brokenlinks/worker.go +++ b/brokenlinks/worker.go @@ -141,6 +141,20 @@ func (wrk *worker) scanAll() (result *Result, err error) { linkq = wrk.queue[x] x++ wrk.log.Printf(`scan %d/%d: %s`, x, len(wrk.queue), linkq.Url) + + var seen bool + linkq.StatusCode, seen = wrk.seenLink[linkq.Url] + if seen { + if linkq.StatusCode >= http.StatusBadRequest { + // Different pages may have the same broken + // link. + wrk.markAsBroken(linkq) + continue + } + if linkq.StatusCode != 0 { + continue + } + } resultq = wrk.scan(linkq) wrk.processResult(resultq) } @@ -200,11 +214,13 @@ func (wrk *worker) processResult(resultq map[string]jarink.Link) { // Different pages may have the same broken // link. wrk.markAsBroken(result) + continue + } + if result.StatusCode != 0 { + continue } - continue } wrk.queue = append(wrk.queue, result) - wrk.seenLink[result.Url] = result.StatusCode wrk.log.Printf(`queue %d: %s`, len(wrk.queue), result.Url) } } -- cgit v1.3