aboutsummaryrefslogtreecommitdiff
path: root/brokenlinks/worker.go
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2026-01-22 00:27:18 +0700
committerShulhan <ms@kilabit.info>2026-01-22 00:27:18 +0700
commit26fc8bd3203dae6b4705ada227439c90129bbe36 (patch)
tree26ed784cdf2779c3a0848fbf6c1355094b216eea /brokenlinks/worker.go
parent6637e9d11a57c67510d79c00a1425f5e66d18280 (diff)
downloadjarink-26fc8bd3203dae6b4705ada227439c90129bbe36.tar.xz
brokenlinks: refactoring the logic, simplify the code
Previously, we made the scan logic run in multiple goroutines, with one channel to push and consume the results and another channel to push and pop links to be processed. The logic was very complicated code, making it hard to read and debug. These changes refactor it to use a single goroutine that pushes and pops links from/to a slice, used as a queue.
Diffstat (limited to 'brokenlinks/worker.go')
-rw-r--r--brokenlinks/worker.go345
1 file changed, 111 insertions(+), 234 deletions(-)
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index 3c6f97e..c0a33dd 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -1,5 +1,5 @@
-// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
// SPDX-License-Identifier: GPL-3.0-only
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
package brokenlinks
@@ -14,7 +14,6 @@ import (
"os"
"slices"
"strings"
- "sync"
"time"
"golang.org/x/net/html"
@@ -26,10 +25,11 @@ import (
type worker struct {
// seenLink store the URL being or has been scanned and its HTTP
// status code.
+ // It store all links, including the broken one.
seenLink map[string]int
- // resultq channel that collect result from scanning.
- resultq chan map[string]linkQueue
+ // queue contains list of link to be scanned.
+ queue []linkQueue
// result contains the final result after all of the pages has been
// scanned.
@@ -41,6 +41,7 @@ type worker struct {
// The base URL that will be joined to relative or absolute
// links or image.
+ // baseURL is set to "scheme://host" only.
baseUrl *url.URL
// cache of scanned links.
@@ -51,12 +52,10 @@ type worker struct {
httpc *http.Client
opts Options
-
- // wg sync the goroutine scanner.
- wg sync.WaitGroup
}
func newWorker(opts Options) (wrk *worker, err error) {
+ var logp = `newWorker`
var netDial = &net.Dialer{
Timeout: 30 * time.Second,
KeepAlive: 30 * time.Second,
@@ -68,7 +67,6 @@ func newWorker(opts Options) (wrk *worker, err error) {
wrk = &worker{
opts: opts,
seenLink: map[string]int{},
- resultq: make(chan map[string]linkQueue, 100),
result: newResult(),
log: log.New(os.Stderr, ``, log.LstdFlags),
httpc: &http.Client{
@@ -107,7 +105,8 @@ func newWorker(opts Options) (wrk *worker, err error) {
wrk.pastResult = newResult()
err = json.Unmarshal(pastresult, &wrk.pastResult)
if err != nil {
- return nil, err
+ log.Printf(`%s: invalid past result file format: %s`, logp, err)
+ log.Printf(`%s: ignoring past result`, logp)
}
return wrk, nil
@@ -125,184 +124,107 @@ func (wrk *worker) run() (result *Result, err error) {
// scanAll scan all pages start from [Options.Url].
func (wrk *worker) scanAll() (result *Result, err error) {
// Scan the first URL to make sure that the server is reachable.
- var firstLinkq = linkQueue{
+ var linkq = linkQueue{
parentUrl: nil,
url: wrk.opts.scanUrl.String(),
- status: http.StatusProcessing,
}
- wrk.seenLink[firstLinkq.url] = http.StatusProcessing
-
- wrk.wg.Add(1)
- go func() {
- var resultq = wrk.scan(firstLinkq)
- wrk.pushResult(resultq)
- }()
- wrk.wg.Wait()
-
- var resultq = <-wrk.resultq
- for _, linkq := range resultq {
- if linkq.url == firstLinkq.url {
- if linkq.errScan != nil {
- return nil, linkq.errScan
- }
- wrk.seenLink[linkq.url] = linkq.status
- continue
- }
- if linkq.isExternal {
- var scannedLink = wrk.cache.Get(linkq.url)
- if scannedLink != nil {
- linkq.status = scannedLink.ResponseCode
- wrk.seen(linkq)
- continue
- }
- }
+ var resultq = wrk.scan(linkq)
+ linkq = resultq[linkq.url]
+ if linkq.errScan != nil {
+ return nil, linkq.errScan
+ }
+ wrk.processResult(resultq)
- wrk.seenLink[linkq.url] = http.StatusProcessing
- wrk.wg.Add(1)
- go func() {
- var resultq = wrk.scan(linkq)
- wrk.pushResult(resultq)
- }()
+ var x int
+ for x < len(wrk.queue) {
+ linkq = wrk.queue[x]
+ x++
+ resultq = wrk.scan(linkq)
+ wrk.processResult(resultq)
}
- wrk.processAndWait()
+ wrk.result.sort()
return wrk.result, nil
}
-// scanPastResult scan only pages reported inside
-// [Result.BrokenLinks].
+// scanPastResult scan only pages reported inside [Result.BrokenLinks].
func (wrk *worker) scanPastResult() (result *Result, err error) {
for page := range wrk.pastResult.BrokenLinks {
- var linkq = linkQueue{
- parentUrl: nil,
- url: page,
- status: http.StatusProcessing,
+ wrk.opts.scanUrl, err = url.Parse(page)
+ if err != nil {
+ log.Printf(`scanPastResult: invalid URL %q: %s`, page, err)
+ continue
}
- wrk.seenLink[linkq.url] = http.StatusProcessing
- wrk.wg.Add(1)
- go func() {
- var resultq = wrk.scan(linkq)
- wrk.pushResult(resultq)
- }()
- }
-
- wrk.processAndWait()
- return wrk.result, nil
-}
-
-func (wrk *worker) processAndWait() {
- var tick = time.NewTicker(500 * time.Millisecond)
- var listWaitStatus []linkQueue
- var isScanning = true
- for isScanning {
- select {
- case resultq := <-wrk.resultq:
- listWaitStatus = wrk.processResult(resultq, listWaitStatus)
-
- case <-tick.C:
- wrk.wg.Wait()
- if len(wrk.resultq) != 0 {
- continue
- }
- if len(listWaitStatus) != 0 {
- // There are links that still waiting for
- // scanning to be completed.
- continue
- }
- isScanning = false
+ wrk.queue = nil
+ _, err = wrk.scanAll()
+ if err != nil {
+ log.Printf(`scanPastResult: %q: %s`, page, err)
}
}
+
wrk.result.sort()
+ return wrk.result, nil
}
-// processResult the resultq contains the original URL being scanned
-// and its child links.
-// For example, scanning "http://example.tld" result in
+// processResult process the scan result and push it to queue.
+// The resultq contains the original URL being scanned and its child links.
+// For each link item in resultq,
//
-// "http://example.tld": {status=200}
-// "http://example.tld/page": {status=0}
-// "http://example.tld/image.png": {status=0}
-// "http://bad:domain/image.png": {status=700}
-func (wrk *worker) processResult(
- resultq map[string]linkQueue, listWaitStatus []linkQueue,
-) (
- newList []linkQueue,
-) {
- for _, linkq := range resultq {
- // Process the scanned page first.
-
+// - For non-zero status code, mark it as seen and store to cache if
+// it external.
+// - Skip external link that has been checked before.
+// - Skip link that has been seen.
+// - Otherwise push it to queue.
+func (wrk *worker) processResult(resultq map[string]linkQueue) {
+ var linkq linkQueue
+ var seen bool
+ for _, linkq = range resultq {
if linkq.status != 0 {
wrk.seen(linkq)
- if linkq.isExternal && linkq.status != StatusBadLink {
- wrk.cache.Set(linkq.url, linkq.status, linkq.size)
- }
continue
}
- // Now process the links inside the page.
-
if linkq.isExternal {
var scannedLink = wrk.cache.Get(linkq.url)
if scannedLink != nil {
- linkq.status = scannedLink.ResponseCode
- wrk.seen(linkq)
+ // The external link has been scanned
+ // previously.
continue
}
}
- seenStatus, seen := wrk.seenLink[linkq.url]
- if !seen {
- wrk.seenLink[linkq.url] = http.StatusProcessing
- wrk.wg.Add(1)
- go func() {
- var resultq = wrk.scan(linkq)
- wrk.pushResult(resultq)
- }()
- continue
- }
- if seenStatus >= http.StatusBadRequest {
- linkq.status = seenStatus
- wrk.markBroken(linkq)
- continue
- }
- if seenStatus >= http.StatusOK {
- // The link has been processed and its
- // not an error.
- continue
- }
- // The link being processed by other goroutine.
- linkq.status = seenStatus
- newList = append(newList, linkq)
- }
- for _, linkq := range listWaitStatus {
- seenStatus := wrk.seenLink[linkq.url]
- if seenStatus >= http.StatusBadRequest {
- linkq.status = seenStatus
- wrk.markBroken(linkq)
- continue
- }
- if seenStatus >= http.StatusOK {
- continue
- }
- if seenStatus == http.StatusProcessing {
- // Scanning still in progress.
- newList = append(newList, linkq)
+ linkq.status, seen = wrk.seenLink[linkq.url]
+ if seen {
+ if linkq.status >= http.StatusBadRequest {
+ // Different pages may have the same broken
+ // link.
+ wrk.markAsBroken(linkq)
+ }
continue
}
+ wrk.queue = append(wrk.queue, linkq)
}
- return newList
}
func (wrk *worker) seen(linkq linkQueue) {
+ wrk.seenLink[linkq.url] = linkq.status
+
+ if linkq.isExternal {
+ if linkq.status != StatusBadLink {
+ wrk.cache.Set(linkq.url, linkq.status, linkq.size)
+ }
+ }
+
if linkq.status >= http.StatusBadRequest {
- wrk.markBroken(linkq)
- return
+ wrk.markAsBroken(linkq)
}
- wrk.seenLink[linkq.url] = linkq.status
}
-func (wrk *worker) markBroken(linkq linkQueue) {
+func (wrk *worker) markAsBroken(linkq linkQueue) {
+ if slices.Contains(wrk.opts.ignoreStatus, linkq.status) {
+ return
+ }
var parentUrl = linkq.parentUrl.String()
var listBroken = wrk.result.BrokenLinks[parentUrl]
var brokenLink = Broken{
@@ -314,20 +236,10 @@ func (wrk *worker) markBroken(linkq linkQueue) {
}
listBroken = append(listBroken, brokenLink)
wrk.result.BrokenLinks[parentUrl] = listBroken
-
- wrk.seenLink[linkq.url] = linkq.status
}
-// scan fetch the HTML page or image to check if its valid.
+// scan the link to HTML page or image.
func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
- defer func() {
- if wrk.opts.IsVerbose && linkq.errScan != nil {
- wrk.log.Printf("error: %d %s error=%v\n", linkq.status,
- linkq.url, linkq.errScan)
- }
- wrk.wg.Done()
- }()
-
resultq = make(map[string]linkQueue)
var (
httpResp *http.Response
@@ -346,10 +258,6 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
linkq.size = httpResp.ContentLength
resultq[linkq.url] = linkq
- if slices.Contains(wrk.opts.ignoreStatus, httpResp.StatusCode) {
- return nil
- }
-
if httpResp.StatusCode >= http.StatusBadRequest {
return resultq
}
@@ -357,21 +265,23 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
return resultq
}
- var doc *html.Node
- doc, _ = html.Parse(httpResp.Body)
-
- // After we check the code and test for [html.Parse] there are
- // no case actual cases where HTML content will return an error.
+ // After we check the code for [html.Parse] there are no cases where
+ // it will return an error.
// The only possible error is when reading from body (io.Reader), and
// that is also almost impossible.
//
// [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347
+ var doc *html.Node
+ doc, _ = html.Parse(httpResp.Body)
- var scanUrl *url.URL
+ var parentUrl *url.URL
- scanUrl, err = url.Parse(linkq.url)
+ parentUrl, err = url.Parse(linkq.url)
if err != nil {
- log.Fatal(err)
+ linkq.status = StatusBadLink
+ linkq.errScan = err
+ resultq[linkq.url] = linkq
+ return resultq
}
var node *html.Node
@@ -379,42 +289,42 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
if node.Type != html.ElementNode {
continue
}
+ if node.DataAtom != atom.A && node.DataAtom != atom.Img {
+ continue
+ }
var nodeLink *linkQueue
if node.DataAtom == atom.A {
for _, attr := range node.Attr {
if attr.Key != `href` {
continue
}
- nodeLink = wrk.processLink(scanUrl, attr.Val, atom.A)
+ nodeLink = wrk.processLink(parentUrl, attr.Val, atom.A)
break
}
- } else if node.DataAtom == atom.Img {
+ } else { // atom.Img
for _, attr := range node.Attr {
if attr.Key != `src` {
continue
}
- nodeLink = wrk.processLink(scanUrl, attr.Val, atom.Img)
+ nodeLink = wrk.processLink(parentUrl, attr.Val, atom.Img)
break
}
- } else {
- continue
}
if nodeLink == nil {
+ // Link is invalid.
continue
}
_, seen := resultq[nodeLink.url]
- if !seen {
- wrk.checkExternal(nodeLink)
- resultq[nodeLink.url] = *nodeLink
+ if seen {
+ // The same link already exist previously.
+ continue
}
+ resultq[nodeLink.url] = *nodeLink
}
return resultq
}
-func (wrk *worker) fetch(linkq linkQueue) (
- httpResp *http.Response,
- err error,
-) {
+func (wrk *worker) fetch(linkq linkQueue) (httpResp *http.Response, err error) {
const maxRetry = 5
var retry int
for retry < 5 {
@@ -446,6 +356,8 @@ func (wrk *worker) fetch(linkq linkQueue) (
return nil, err
}
+// processLink given a parentURL and link value `val`
+// check if link `val` is valid and return it as linkQueue.
func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
linkq *linkQueue,
) {
@@ -453,17 +365,19 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
return nil
}
+ linkq = &linkQueue{
+ parentUrl: parentUrl,
+ kind: kind,
+ }
+
var newUrl *url.URL
var err error
newUrl, err = url.Parse(val)
if err != nil {
- return &linkQueue{
- parentUrl: parentUrl,
- errScan: err,
- url: val,
- kind: kind,
- status: StatusBadLink,
- }
+ linkq.errScan = err
+ linkq.url = val
+ linkq.status = StatusBadLink
+ return linkq
}
newUrl.Fragment = ""
newUrl.RawFragment = ""
@@ -472,55 +386,18 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
// Ignore link to ID, like `href="#element_id"`.
return nil
}
- if strings.HasPrefix(val, `http`) {
- return &linkQueue{
- parentUrl: parentUrl,
- url: strings.TrimSuffix(newUrl.String(), `/`),
- kind: kind,
- }
- }
- if val[0] == '/' {
- // val is absolute to parent URL.
- newUrl = wrk.baseUrl.JoinPath(newUrl.Path)
- } else {
- // val is relative to parent URL.
- newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
- }
- linkq = &linkQueue{
- parentUrl: parentUrl,
- url: strings.TrimSuffix(newUrl.String(), `/`),
- kind: kind,
- }
- return linkq
-}
-
-func (wrk *worker) pushResult(resultq map[string]linkQueue) {
- if len(resultq) == 0 {
- return
- }
- var tick = time.NewTicker(100 * time.Millisecond)
- for {
- select {
- case wrk.resultq <- resultq:
- tick.Stop()
- return
- case <-tick.C:
+ if !strings.HasPrefix(val, `http`) {
+ if val[0] == '/' {
+ // val is absolute link.
+ newUrl = wrk.baseUrl.JoinPath(newUrl.Path)
+ } else {
+ // val is relative to parent URL.
+ newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
}
}
-}
-
-// checkExternal set the [linkQueue.isExternal] field to true if
-//
-// (1) [linkQueue.url] does not start with [Options.Url]
-// (2) linkQueue is not from scanPastResult, indicated by non-nil
-// [worker.pastResult].
-func (wrk *worker) checkExternal(linkq *linkQueue) {
+ linkq.url = strings.TrimSuffix(newUrl.String(), `/`)
if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) {
linkq.isExternal = true
- return
- }
- if wrk.pastResult != nil {
- linkq.isExternal = true
- return
}
+ return linkq
}