aboutsummaryrefslogtreecommitdiff
path: root/brokenlinks/worker.go
diff options
context:
space:
mode:
Diffstat (limited to 'brokenlinks/worker.go')
-rw-r--r--brokenlinks/worker.go345
1 files changed, 111 insertions, 234 deletions
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index 3c6f97e..c0a33dd 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -1,5 +1,5 @@
-// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
// SPDX-License-Identifier: GPL-3.0-only
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
package brokenlinks
@@ -14,7 +14,6 @@ import (
"os"
"slices"
"strings"
- "sync"
"time"
"golang.org/x/net/html"
@@ -26,10 +25,11 @@ import (
type worker struct {
// seenLink store the URL being or has been scanned and its HTTP
// status code.
+ // It stores all links, including the broken ones.
seenLink map[string]int
- // resultq channel that collect result from scanning.
- resultq chan map[string]linkQueue
+ // queue contains the list of links to be scanned.
+ queue []linkQueue
// result contains the final result after all of the pages has been
// scanned.
@@ -41,6 +41,7 @@ type worker struct {
// The base URL that will be joined to relative or absolute
// links or image.
+ // baseURL is set to "scheme://host" only.
baseUrl *url.URL
// cache of scanned links.
@@ -51,12 +52,10 @@ type worker struct {
httpc *http.Client
opts Options
-
- // wg sync the goroutine scanner.
- wg sync.WaitGroup
}
func newWorker(opts Options) (wrk *worker, err error) {
+ var logp = `newWorker`
var netDial = &net.Dialer{
Timeout: 30 * time.Second,
KeepAlive: 30 * time.Second,
@@ -68,7 +67,6 @@ func newWorker(opts Options) (wrk *worker, err error) {
wrk = &worker{
opts: opts,
seenLink: map[string]int{},
- resultq: make(chan map[string]linkQueue, 100),
result: newResult(),
log: log.New(os.Stderr, ``, log.LstdFlags),
httpc: &http.Client{
@@ -107,7 +105,8 @@ func newWorker(opts Options) (wrk *worker, err error) {
wrk.pastResult = newResult()
err = json.Unmarshal(pastresult, &wrk.pastResult)
if err != nil {
- return nil, err
+ log.Printf(`%s: invalid past result file format: %s`, logp, err)
+ log.Printf(`%s: ignoring past result`, logp)
}
return wrk, nil
@@ -125,184 +124,107 @@ func (wrk *worker) run() (result *Result, err error) {
// scanAll scan all pages start from [Options.Url].
func (wrk *worker) scanAll() (result *Result, err error) {
// Scan the first URL to make sure that the server is reachable.
- var firstLinkq = linkQueue{
+ var linkq = linkQueue{
parentUrl: nil,
url: wrk.opts.scanUrl.String(),
- status: http.StatusProcessing,
}
- wrk.seenLink[firstLinkq.url] = http.StatusProcessing
-
- wrk.wg.Add(1)
- go func() {
- var resultq = wrk.scan(firstLinkq)
- wrk.pushResult(resultq)
- }()
- wrk.wg.Wait()
-
- var resultq = <-wrk.resultq
- for _, linkq := range resultq {
- if linkq.url == firstLinkq.url {
- if linkq.errScan != nil {
- return nil, linkq.errScan
- }
- wrk.seenLink[linkq.url] = linkq.status
- continue
- }
- if linkq.isExternal {
- var scannedLink = wrk.cache.Get(linkq.url)
- if scannedLink != nil {
- linkq.status = scannedLink.ResponseCode
- wrk.seen(linkq)
- continue
- }
- }
+ var resultq = wrk.scan(linkq)
+ linkq = resultq[linkq.url]
+ if linkq.errScan != nil {
+ return nil, linkq.errScan
+ }
+ wrk.processResult(resultq)
- wrk.seenLink[linkq.url] = http.StatusProcessing
- wrk.wg.Add(1)
- go func() {
- var resultq = wrk.scan(linkq)
- wrk.pushResult(resultq)
- }()
+ var x int
+ for x < len(wrk.queue) {
+ linkq = wrk.queue[x]
+ x++
+ resultq = wrk.scan(linkq)
+ wrk.processResult(resultq)
}
- wrk.processAndWait()
+ wrk.result.sort()
return wrk.result, nil
}
-// scanPastResult scan only pages reported inside
-// [Result.BrokenLinks].
+// scanPastResult scan only pages reported inside [Result.BrokenLinks].
func (wrk *worker) scanPastResult() (result *Result, err error) {
for page := range wrk.pastResult.BrokenLinks {
- var linkq = linkQueue{
- parentUrl: nil,
- url: page,
- status: http.StatusProcessing,
+ wrk.opts.scanUrl, err = url.Parse(page)
+ if err != nil {
+ log.Printf(`scanPastResult: invalid URL %q: %s`, page, err)
+ continue
}
- wrk.seenLink[linkq.url] = http.StatusProcessing
- wrk.wg.Add(1)
- go func() {
- var resultq = wrk.scan(linkq)
- wrk.pushResult(resultq)
- }()
- }
-
- wrk.processAndWait()
- return wrk.result, nil
-}
-
-func (wrk *worker) processAndWait() {
- var tick = time.NewTicker(500 * time.Millisecond)
- var listWaitStatus []linkQueue
- var isScanning = true
- for isScanning {
- select {
- case resultq := <-wrk.resultq:
- listWaitStatus = wrk.processResult(resultq, listWaitStatus)
-
- case <-tick.C:
- wrk.wg.Wait()
- if len(wrk.resultq) != 0 {
- continue
- }
- if len(listWaitStatus) != 0 {
- // There are links that still waiting for
- // scanning to be completed.
- continue
- }
- isScanning = false
+ wrk.queue = nil
+ _, err = wrk.scanAll()
+ if err != nil {
+ log.Printf(`scanPastResult: %q: %s`, page, err)
}
}
+
wrk.result.sort()
+ return wrk.result, nil
}
-// processResult the resultq contains the original URL being scanned
-// and its child links.
-// For example, scanning "http://example.tld" result in
+// processResult process the scan result and push it to queue.
+// The resultq contains the original URL being scanned and its child links.
+// For each link item in resultq,
//
-// "http://example.tld": {status=200}
-// "http://example.tld/page": {status=0}
-// "http://example.tld/image.png": {status=0}
-// "http://bad:domain/image.png": {status=700}
-func (wrk *worker) processResult(
- resultq map[string]linkQueue, listWaitStatus []linkQueue,
-) (
- newList []linkQueue,
-) {
- for _, linkq := range resultq {
- // Process the scanned page first.
-
+// - For non-zero status code, mark it as seen and store it to the cache
+// if it is external.
+// - Skip external link that has been checked before.
+// - Skip link that has been seen.
+// - Otherwise push it to queue.
+func (wrk *worker) processResult(resultq map[string]linkQueue) {
+ var linkq linkQueue
+ var seen bool
+ for _, linkq = range resultq {
if linkq.status != 0 {
wrk.seen(linkq)
- if linkq.isExternal && linkq.status != StatusBadLink {
- wrk.cache.Set(linkq.url, linkq.status, linkq.size)
- }
continue
}
- // Now process the links inside the page.
-
if linkq.isExternal {
var scannedLink = wrk.cache.Get(linkq.url)
if scannedLink != nil {
- linkq.status = scannedLink.ResponseCode
- wrk.seen(linkq)
+ // The external link has been scanned
+ // previously.
continue
}
}
- seenStatus, seen := wrk.seenLink[linkq.url]
- if !seen {
- wrk.seenLink[linkq.url] = http.StatusProcessing
- wrk.wg.Add(1)
- go func() {
- var resultq = wrk.scan(linkq)
- wrk.pushResult(resultq)
- }()
- continue
- }
- if seenStatus >= http.StatusBadRequest {
- linkq.status = seenStatus
- wrk.markBroken(linkq)
- continue
- }
- if seenStatus >= http.StatusOK {
- // The link has been processed and its
- // not an error.
- continue
- }
- // The link being processed by other goroutine.
- linkq.status = seenStatus
- newList = append(newList, linkq)
- }
- for _, linkq := range listWaitStatus {
- seenStatus := wrk.seenLink[linkq.url]
- if seenStatus >= http.StatusBadRequest {
- linkq.status = seenStatus
- wrk.markBroken(linkq)
- continue
- }
- if seenStatus >= http.StatusOK {
- continue
- }
- if seenStatus == http.StatusProcessing {
- // Scanning still in progress.
- newList = append(newList, linkq)
+ linkq.status, seen = wrk.seenLink[linkq.url]
+ if seen {
+ if linkq.status >= http.StatusBadRequest {
+ // Different pages may have the same broken
+ // link.
+ wrk.markAsBroken(linkq)
+ }
continue
}
+ wrk.queue = append(wrk.queue, linkq)
}
- return newList
}
func (wrk *worker) seen(linkq linkQueue) {
+ wrk.seenLink[linkq.url] = linkq.status
+
+ if linkq.isExternal {
+ if linkq.status != StatusBadLink {
+ wrk.cache.Set(linkq.url, linkq.status, linkq.size)
+ }
+ }
+
if linkq.status >= http.StatusBadRequest {
- wrk.markBroken(linkq)
- return
+ wrk.markAsBroken(linkq)
}
- wrk.seenLink[linkq.url] = linkq.status
}
-func (wrk *worker) markBroken(linkq linkQueue) {
+func (wrk *worker) markAsBroken(linkq linkQueue) {
+ if slices.Contains(wrk.opts.ignoreStatus, linkq.status) {
+ return
+ }
var parentUrl = linkq.parentUrl.String()
var listBroken = wrk.result.BrokenLinks[parentUrl]
var brokenLink = Broken{
@@ -314,20 +236,10 @@ func (wrk *worker) markBroken(linkq linkQueue) {
}
listBroken = append(listBroken, brokenLink)
wrk.result.BrokenLinks[parentUrl] = listBroken
-
- wrk.seenLink[linkq.url] = linkq.status
}
-// scan fetch the HTML page or image to check if its valid.
+// scan the link to HTML page or image.
func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
- defer func() {
- if wrk.opts.IsVerbose && linkq.errScan != nil {
- wrk.log.Printf("error: %d %s error=%v\n", linkq.status,
- linkq.url, linkq.errScan)
- }
- wrk.wg.Done()
- }()
-
resultq = make(map[string]linkQueue)
var (
httpResp *http.Response
@@ -346,10 +258,6 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
linkq.size = httpResp.ContentLength
resultq[linkq.url] = linkq
- if slices.Contains(wrk.opts.ignoreStatus, httpResp.StatusCode) {
- return nil
- }
-
if httpResp.StatusCode >= http.StatusBadRequest {
return resultq
}
@@ -357,21 +265,23 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
return resultq
}
- var doc *html.Node
- doc, _ = html.Parse(httpResp.Body)
-
- // After we check the code and test for [html.Parse] there are
- // no case actual cases where HTML content will return an error.
+ // After we check the code for [html.Parse] there are no cases where
+ // it will return an error.
// The only possible error is when reading from body (io.Reader), and
// that is also almost impossible.
//
// [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347
+ var doc *html.Node
+ doc, _ = html.Parse(httpResp.Body)
- var scanUrl *url.URL
+ var parentUrl *url.URL
- scanUrl, err = url.Parse(linkq.url)
+ parentUrl, err = url.Parse(linkq.url)
if err != nil {
- log.Fatal(err)
+ linkq.status = StatusBadLink
+ linkq.errScan = err
+ resultq[linkq.url] = linkq
+ return resultq
}
var node *html.Node
@@ -379,42 +289,42 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
if node.Type != html.ElementNode {
continue
}
+ if node.DataAtom != atom.A && node.DataAtom != atom.Img {
+ continue
+ }
var nodeLink *linkQueue
if node.DataAtom == atom.A {
for _, attr := range node.Attr {
if attr.Key != `href` {
continue
}
- nodeLink = wrk.processLink(scanUrl, attr.Val, atom.A)
+ nodeLink = wrk.processLink(parentUrl, attr.Val, atom.A)
break
}
- } else if node.DataAtom == atom.Img {
+ } else { // atom.Img
for _, attr := range node.Attr {
if attr.Key != `src` {
continue
}
- nodeLink = wrk.processLink(scanUrl, attr.Val, atom.Img)
+ nodeLink = wrk.processLink(parentUrl, attr.Val, atom.Img)
break
}
- } else {
- continue
}
if nodeLink == nil {
+ // Link is invalid.
continue
}
_, seen := resultq[nodeLink.url]
- if !seen {
- wrk.checkExternal(nodeLink)
- resultq[nodeLink.url] = *nodeLink
+ if seen {
+ // The same link was already collected previously.
+ continue
}
+ resultq[nodeLink.url] = *nodeLink
}
return resultq
}
-func (wrk *worker) fetch(linkq linkQueue) (
- httpResp *http.Response,
- err error,
-) {
+func (wrk *worker) fetch(linkq linkQueue) (httpResp *http.Response, err error) {
const maxRetry = 5
var retry int
for retry < 5 {
@@ -446,6 +356,8 @@ func (wrk *worker) fetch(linkq linkQueue) (
return nil, err
}
+// processLink, given a parentUrl and link value `val`,
+// checks if link `val` is valid and returns it as a linkQueue.
func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
linkq *linkQueue,
) {
@@ -453,17 +365,19 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
return nil
}
+ linkq = &linkQueue{
+ parentUrl: parentUrl,
+ kind: kind,
+ }
+
var newUrl *url.URL
var err error
newUrl, err = url.Parse(val)
if err != nil {
- return &linkQueue{
- parentUrl: parentUrl,
- errScan: err,
- url: val,
- kind: kind,
- status: StatusBadLink,
- }
+ linkq.errScan = err
+ linkq.url = val
+ linkq.status = StatusBadLink
+ return linkq
}
newUrl.Fragment = ""
newUrl.RawFragment = ""
@@ -472,55 +386,18 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
// Ignore link to ID, like `href="#element_id"`.
return nil
}
- if strings.HasPrefix(val, `http`) {
- return &linkQueue{
- parentUrl: parentUrl,
- url: strings.TrimSuffix(newUrl.String(), `/`),
- kind: kind,
- }
- }
- if val[0] == '/' {
- // val is absolute to parent URL.
- newUrl = wrk.baseUrl.JoinPath(newUrl.Path)
- } else {
- // val is relative to parent URL.
- newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
- }
- linkq = &linkQueue{
- parentUrl: parentUrl,
- url: strings.TrimSuffix(newUrl.String(), `/`),
- kind: kind,
- }
- return linkq
-}
-
-func (wrk *worker) pushResult(resultq map[string]linkQueue) {
- if len(resultq) == 0 {
- return
- }
- var tick = time.NewTicker(100 * time.Millisecond)
- for {
- select {
- case wrk.resultq <- resultq:
- tick.Stop()
- return
- case <-tick.C:
+ if !strings.HasPrefix(val, `http`) {
+ if val[0] == '/' {
+ // val is an absolute link.
+ newUrl = wrk.baseUrl.JoinPath(newUrl.Path)
+ } else {
+ // val is relative to parent URL.
+ newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
}
}
-}
-
-// checkExternal set the [linkQueue.isExternal] field to true if
-//
-// (1) [linkQueue.url] does not start with [Options.Url]
-// (2) linkQueue is not from scanPastResult, indicated by non-nil
-// [worker.pastResult].
-func (wrk *worker) checkExternal(linkq *linkQueue) {
+ linkq.url = strings.TrimSuffix(newUrl.String(), `/`)
if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) {
linkq.isExternal = true
- return
- }
- if wrk.pastResult != nil {
- linkq.isExternal = true
- return
}
+ return linkq
}