diff options
| author | Shulhan <ms@kilabit.info> | 2026-02-11 04:38:20 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2026-02-11 04:38:20 +0700 |
| commit | 61eb5351087be894a4bfeb71c99346b8065bb7f1 (patch) | |
| tree | 1f6d4650cb019c63cfbd8df777b19faedde679a2 | |
| parent | d8a892eb2f28b3ef4c2625c682d255f4f616cae2 (diff) | |
| download | jarink-61eb5351087be894a4bfeb71c99346b8065bb7f1.tar.xz | |
brokenlinks: check if link has been seen before scan
Given the following queued links and their parent pages (link => parent),
/page2.html => /index.html
/brokenPage => /index.html
/brokenPage => /page2.html
Before scanning the second "/brokenPage" on parent page "/page2.html",
check whether it has already been seen, to get its status code before we run the scan.
This allows jarink to report "/brokenPage" as a broken link for both pages,
not just in "/index.html".
| -rw-r--r-- | brokenlinks/worker.go | 20 |
1 files changed, 18 insertions, 2 deletions
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go index 06bdcc6..cf41c49 100644 --- a/brokenlinks/worker.go +++ b/brokenlinks/worker.go @@ -141,6 +141,20 @@ func (wrk *worker) scanAll() (result *Result, err error) { linkq = wrk.queue[x] x++ wrk.log.Printf(`scan %d/%d: %s`, x, len(wrk.queue), linkq.Url) + + var seen bool + linkq.StatusCode, seen = wrk.seenLink[linkq.Url] + if seen { + if linkq.StatusCode >= http.StatusBadRequest { + // Different pages may have the same broken + // link. + wrk.markAsBroken(linkq) + continue + } + if linkq.StatusCode != 0 { + continue + } + } resultq = wrk.scan(linkq) wrk.processResult(resultq) } @@ -200,11 +214,13 @@ func (wrk *worker) processResult(resultq map[string]jarink.Link) { // Different pages may have the same broken // link. wrk.markAsBroken(result) + continue + } + if result.StatusCode != 0 { + continue } - continue } wrk.queue = append(wrk.queue, result) - wrk.seenLink[result.Url] = result.StatusCode wrk.log.Printf(`queue %d: %s`, len(wrk.queue), result.Url) } } |
