1 files changed, 111 insertions, 234 deletions
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index 3c6f97e..c0a33dd 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -1,5 +1,5 @@
-// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
 // SPDX-License-Identifier: GPL-3.0-only
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
 
 package brokenlinks
 
@@ -14,7 +14,6 @@ import (
 	"os"
 	"slices"
 	"strings"
-	"sync"
 	"time"
 
 	"golang.org/x/net/html"
@@ -26,10 +25,11 @@ import (
 type worker struct {
 	// seenLink store the URL being or has been scanned and its HTTP
 	// status code.
+	// It stores all links, including the broken ones.
 	seenLink map[string]int
 
-	// resultq channel that collect result from scanning.
-	resultq chan map[string]linkQueue
+	// queue contains the list of links to be scanned.
+	queue []linkQueue
 
 	// result contains the final result after all of the pages has been
 	// scanned.
@@ -41,6 +41,7 @@ type worker struct {
 
 	// The base URL that will be joined to relative or absolute
 	// links or image.
+	// baseUrl is set to "scheme://host" only.
 	baseUrl *url.URL
 
 	// cache of scanned links.
@@ -51,12 +52,10 @@ type worker struct {
 	httpc *http.Client
 
 	opts Options
-
-	// wg sync the goroutine scanner.
-	wg sync.WaitGroup
 }
 
 func newWorker(opts Options) (wrk *worker, err error) {
+	var logp = `newWorker`
 	var netDial = &net.Dialer{
 		Timeout:   30 * time.Second,
 		KeepAlive: 30 * time.Second,
@@ -68,7 +67,6 @@ func newWorker(opts Options) (wrk *worker, err error) {
 	wrk = &worker{
 		opts:     opts,
 		seenLink: map[string]int{},
-		resultq:  make(chan map[string]linkQueue, 100),
 		result:   newResult(),
 		log:      log.New(os.Stderr, ``, log.LstdFlags),
 		httpc: &http.Client{
@@ -107,7 +105,8 @@ func newWorker(opts Options) (wrk *worker, err error) {
 	wrk.pastResult = newResult()
 	err = json.Unmarshal(pastresult, &wrk.pastResult)
 	if err != nil {
-		return nil, err
+		log.Printf(`%s: invalid past result file format: %s`, logp, err)
+		log.Printf(`%s: ignoring past result`, logp)
 	}
 
 	return wrk, nil
@@ -125,184 +124,107 @@ func (wrk *worker) run() (result *Result, err error) {
 // scanAll scan all pages start from [Options.Url].
 func (wrk *worker) scanAll() (result *Result, err error) {
 	// Scan the first URL to make sure that the server is reachable.
-	var firstLinkq = linkQueue{
+	var linkq = linkQueue{
 		parentUrl: nil,
 		url:       wrk.opts.scanUrl.String(),
-		status:    http.StatusProcessing,
 	}
-	wrk.seenLink[firstLinkq.url] = http.StatusProcessing
-
-	wrk.wg.Add(1)
-	go func() {
-		var resultq = wrk.scan(firstLinkq)
-		wrk.pushResult(resultq)
-	}()
-	wrk.wg.Wait()
-
-	var resultq = <-wrk.resultq
-	for _, linkq := range resultq {
-		if linkq.url == firstLinkq.url {
-			if linkq.errScan != nil {
-				return nil, linkq.errScan
-			}
-			wrk.seenLink[linkq.url] = linkq.status
-			continue
-		}
 
-		if linkq.isExternal {
-			var scannedLink = wrk.cache.Get(linkq.url)
-			if scannedLink != nil {
-				linkq.status = scannedLink.ResponseCode
-				wrk.seen(linkq)
-				continue
-			}
-		}
+	var resultq = wrk.scan(linkq)
+	linkq = resultq[linkq.url]
+	if linkq.errScan != nil {
+		return nil, linkq.errScan
+	}
+	wrk.processResult(resultq)
 
-		wrk.seenLink[linkq.url] = http.StatusProcessing
-		wrk.wg.Add(1)
-		go func() {
-			var resultq = wrk.scan(linkq)
-			wrk.pushResult(resultq)
-		}()
+	var x int
+	for x < len(wrk.queue) {
+		linkq = wrk.queue[x]
+		x++
+		resultq = wrk.scan(linkq)
+		wrk.processResult(resultq)
 	}
 
-	wrk.processAndWait()
+	wrk.result.sort()
 	return wrk.result, nil
 }
 
-// scanPastResult scan only pages reported inside
-// [Result.BrokenLinks].
+// scanPastResult scans only the pages reported inside [Result.BrokenLinks].
 func (wrk *worker) scanPastResult() (result *Result, err error) {
 	for page := range wrk.pastResult.BrokenLinks {
-		var linkq = linkQueue{
-			parentUrl: nil,
-			url:       page,
-			status:    http.StatusProcessing,
+		wrk.opts.scanUrl, err = url.Parse(page)
+		if err != nil {
+			log.Printf(`scanPastResult: invalid URL %q: %s`, page, err)
+			continue
 		}
-		wrk.seenLink[linkq.url] = http.StatusProcessing
-		wrk.wg.Add(1)
-		go func() {
-			var resultq = wrk.scan(linkq)
-			wrk.pushResult(resultq)
-		}()
-	}
-
-	wrk.processAndWait()
-	return wrk.result, nil
-}
-
-func (wrk *worker) processAndWait() {
-	var tick = time.NewTicker(500 * time.Millisecond)
-	var listWaitStatus []linkQueue
-	var isScanning = true
-	for isScanning {
-		select {
-		case resultq := <-wrk.resultq:
-			listWaitStatus = wrk.processResult(resultq, listWaitStatus)
-
-		case <-tick.C:
-			wrk.wg.Wait()
-			if len(wrk.resultq) != 0 {
-				continue
-			}
-			if len(listWaitStatus) != 0 {
-				// There are links that still waiting for
-				// scanning to be completed.
-				continue
-			}
-			isScanning = false
+		wrk.queue = nil
+		_, err = wrk.scanAll()
+		if err != nil {
+			log.Printf(`scanPastResult: %q: %s`, page, err)
 		}
 	}
+
 	wrk.result.sort()
+	return wrk.result, nil
 }
 
-// processResult the resultq contains the original URL being scanned
-// and its child links.
-// For example, scanning "http://example.tld" result in
+// processResult processes the scan result and pushes new links to the queue.
+// The resultq contains the original URL being scanned and its child links.
+// For each link item in resultq:
 //
-//	"http://example.tld": {status=200}
-//	"http://example.tld/page": {status=0}
-//	"http://example.tld/image.png": {status=0}
-//	"http://bad:domain/image.png": {status=700}
-func (wrk *worker) processResult(
-	resultq map[string]linkQueue, listWaitStatus []linkQueue,
-) (
-	newList []linkQueue,
-) {
-	for _, linkq := range resultq {
-		// Process the scanned page first.
-
+//   - For a non-zero status code, mark it as seen and store it to the
+//     cache if it is external.
+//   - Skip an external link that is already in the cache.
+//   - Skip a link that has been seen, reporting it again if it is broken.
+//   - Otherwise push it to the queue.
+func (wrk *worker) processResult(resultq map[string]linkQueue) {
+	var linkq linkQueue
+	var seen bool
+	for _, linkq = range resultq {
 		if linkq.status != 0 {
 			wrk.seen(linkq)
-			if linkq.isExternal && linkq.status != StatusBadLink {
-				wrk.cache.Set(linkq.url, linkq.status, linkq.size)
-			}
 			continue
 		}
 
-		// Now process the links inside the page.
-
 		if linkq.isExternal {
 			var scannedLink = wrk.cache.Get(linkq.url)
 			if scannedLink != nil {
-				linkq.status = scannedLink.ResponseCode
-				wrk.seen(linkq)
+				// The external link has been scanned
+				// previously.
 				continue
 			}
 		}
 
-		seenStatus, seen := wrk.seenLink[linkq.url]
-		if !seen {
-			wrk.seenLink[linkq.url] = http.StatusProcessing
-			wrk.wg.Add(1)
-			go func() {
-				var resultq = wrk.scan(linkq)
-				wrk.pushResult(resultq)
-			}()
-			continue
-		}
-		if seenStatus >= http.StatusBadRequest {
-			linkq.status = seenStatus
-			wrk.markBroken(linkq)
-			continue
-		}
-		if seenStatus >= http.StatusOK {
-			// The link has been processed and its
-			// not an error.
-			continue
-		}
-		// The link being processed by other goroutine.
-		linkq.status = seenStatus
-		newList = append(newList, linkq)
-	}
-	for _, linkq := range listWaitStatus {
-		seenStatus := wrk.seenLink[linkq.url]
-		if seenStatus >= http.StatusBadRequest {
-			linkq.status = seenStatus
-			wrk.markBroken(linkq)
-			continue
-		}
-		if seenStatus >= http.StatusOK {
-			continue
-		}
-		if seenStatus == http.StatusProcessing {
-			// Scanning still in progress.
-			newList = append(newList, linkq)
+		linkq.status, seen = wrk.seenLink[linkq.url]
+		if seen {
+			if linkq.status >= http.StatusBadRequest {
+				// Different pages may have the same broken
+				// link.
+				wrk.markAsBroken(linkq)
+			}
 			continue
 		}
+		wrk.queue = append(wrk.queue, linkq)
 	}
-	return newList
 }
 
 func (wrk *worker) seen(linkq linkQueue) {
+	wrk.seenLink[linkq.url] = linkq.status
+
+	if linkq.isExternal {
+		if linkq.status != StatusBadLink {
+			wrk.cache.Set(linkq.url, linkq.status, linkq.size)
+		}
+	}
+
 	if linkq.status >= http.StatusBadRequest {
-		wrk.markBroken(linkq)
-		return
+		wrk.markAsBroken(linkq)
 	}
-	wrk.seenLink[linkq.url] = linkq.status
 }
 
-func (wrk *worker) markBroken(linkq linkQueue) {
+func (wrk *worker) markAsBroken(linkq linkQueue) {
+	if slices.Contains(wrk.opts.ignoreStatus, linkq.status) {
+		return
+	}
 	var parentUrl = linkq.parentUrl.String()
 	var listBroken = wrk.result.BrokenLinks[parentUrl]
 	var brokenLink = Broken{
@@ -314,20 +236,10 @@ func (wrk *worker) markBroken(linkq linkQueue) {
 	}
 	listBroken = append(listBroken, brokenLink)
 	wrk.result.BrokenLinks[parentUrl] = listBroken
-
-	wrk.seenLink[linkq.url] = linkq.status
 }
 
-// scan fetch the HTML page or image to check if its valid.
+// scan scans the HTML page or image that linkq points to.
 func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
-	defer func() {
-		if wrk.opts.IsVerbose && linkq.errScan != nil {
-			wrk.log.Printf("error: %d %s error=%v\n", linkq.status,
-				linkq.url, linkq.errScan)
-		}
-		wrk.wg.Done()
-	}()
-
 	resultq = make(map[string]linkQueue)
 	var (
 		httpResp *http.Response
@@ -346,10 +258,6 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
 	linkq.size = httpResp.ContentLength
 	resultq[linkq.url] = linkq
 
-	if slices.Contains(wrk.opts.ignoreStatus, httpResp.StatusCode) {
-		return nil
-	}
-
 	if httpResp.StatusCode >= http.StatusBadRequest {
 		return resultq
 	}
@@ -357,21 +265,23 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
 		return resultq
 	}
 
-	var doc *html.Node
-	doc, _ = html.Parse(httpResp.Body)
-
-	// After we check the code and test for [html.Parse] there are
-	// no case actual cases where HTML content will return an error.
+	// After we check the code for [html.Parse] there are no cases where
+	// it will return an error.
 	// The only possible error is when reading from body (io.Reader), and
 	// that is also almost impossible.
 	//
 	// [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347
+	var doc *html.Node
+	doc, _ = html.Parse(httpResp.Body)
 
-	var scanUrl *url.URL
+	var parentUrl *url.URL
 
-	scanUrl, err = url.Parse(linkq.url)
+	parentUrl, err = url.Parse(linkq.url)
 	if err != nil {
-		log.Fatal(err)
+		linkq.status = StatusBadLink
+		linkq.errScan = err
+		resultq[linkq.url] = linkq
+		return resultq
 	}
 
 	var node *html.Node
@@ -379,42 +289,42 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
 		if node.Type != html.ElementNode {
 			continue
 		}
+		if node.DataAtom != atom.A && node.DataAtom != atom.Img {
+			continue
+		}
 		var nodeLink *linkQueue
 		if node.DataAtom == atom.A {
 			for _, attr := range node.Attr {
 				if attr.Key != `href` {
 					continue
 				}
-				nodeLink = wrk.processLink(scanUrl, attr.Val, atom.A)
+				nodeLink = wrk.processLink(parentUrl, attr.Val, atom.A)
 				break
 			}
-		} else if node.DataAtom == atom.Img {
+		} else { // atom.Img
 			for _, attr := range node.Attr {
 				if attr.Key != `src` {
 					continue
 				}
-				nodeLink = wrk.processLink(scanUrl, attr.Val, atom.Img)
+				nodeLink = wrk.processLink(parentUrl, attr.Val, atom.Img)
 				break
 			}
-		} else {
-			continue
 		}
 		if nodeLink == nil {
+			// No href/src attribute, or the link is ignored.
 			continue
 		}
 		_, seen := resultq[nodeLink.url]
-		if !seen {
-			wrk.checkExternal(nodeLink)
-			resultq[nodeLink.url] = *nodeLink
+		if seen {
+			// The same link already exists in resultq.
+			continue
 		}
+		resultq[nodeLink.url] = *nodeLink
 	}
 	return resultq
 }
 
-func (wrk *worker) fetch(linkq linkQueue) (
-	httpResp *http.Response,
-	err error,
-) {
+func (wrk *worker) fetch(linkq linkQueue) (httpResp *http.Response, err error) {
 	const maxRetry = 5
 	var retry int
 	for retry < 5 {
@@ -446,6 +356,8 @@ func (wrk *worker) fetch(linkq linkQueue) (
 	return nil, err
 }
 
+// processLink resolves the link value `val` found in the page parentUrl
+// and returns it as linkQueue, or nil if the link is to be ignored.
 func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
 	linkq *linkQueue,
 ) {
@@ -453,17 +365,19 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
 		return nil
 	}
 
+	linkq = &linkQueue{
+		parentUrl: parentUrl,
+		kind:      kind,
+	}
+
 	var newUrl *url.URL
 	var err error
 	newUrl, err = url.Parse(val)
 	if err != nil {
-		return &linkQueue{
-			parentUrl: parentUrl,
-			errScan:   err,
-			url:       val,
-			kind:      kind,
-			status:    StatusBadLink,
-		}
+		linkq.errScan = err
+		linkq.url = val
+		linkq.status = StatusBadLink
+		return linkq
 	}
 	newUrl.Fragment = ""
 	newUrl.RawFragment = ""
@@ -472,55 +386,18 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
 		// Ignore link to ID, like `href="#element_id"`.
 		return nil
 	}
-	if strings.HasPrefix(val, `http`) {
-		return &linkQueue{
-			parentUrl: parentUrl,
-			url:       strings.TrimSuffix(newUrl.String(), `/`),
-			kind:      kind,
-		}
-	}
-	if val[0] == '/' {
-		// val is absolute to parent URL.
-		newUrl = wrk.baseUrl.JoinPath(newUrl.Path)
-	} else {
-		// val is relative to parent URL.
-		newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
-	}
-	linkq = &linkQueue{
-		parentUrl: parentUrl,
-		url:       strings.TrimSuffix(newUrl.String(), `/`),
-		kind:      kind,
-	}
-	return linkq
-}
-
-func (wrk *worker) pushResult(resultq map[string]linkQueue) {
-	if len(resultq) == 0 {
-		return
-	}
-	var tick = time.NewTicker(100 * time.Millisecond)
-	for {
-		select {
-		case wrk.resultq <- resultq:
-			tick.Stop()
-			return
-		case <-tick.C:
+	if !strings.HasPrefix(val, `http`) {
+		if val[0] == '/' {
+			// val is an absolute path; join it to the base URL.
+			newUrl = wrk.baseUrl.JoinPath(newUrl.Path)
+		} else {
+			// val is relative to parent URL.
+			newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
 		}
 	}
-}
-
-// checkExternal set the [linkQueue.isExternal] field to true if
-//
-// (1) [linkQueue.url] does not start with [Options.Url]
-// (2) linkQueue is not from scanPastResult, indicated by non-nil
-// [worker.pastResult].
-func (wrk *worker) checkExternal(linkq *linkQueue) {
+	linkq.url = strings.TrimSuffix(newUrl.String(), `/`)
 	if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) {
 		linkq.isExternal = true
-		return
-	}
-	if wrk.pastResult != nil {
-		linkq.isExternal = true
-		return
 	}
+	return linkq
 }