aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2025-05-31 16:35:27 +0700
committerShulhan <ms@kilabit.info>2025-05-31 17:03:29 +0700
commitfea6f2115088c676902a22e154562e36acc3eaa1 (patch)
treeb70de00dabc73c0adc2cbb3e9ef01e29dc1196aa
parente332a3351ab1c82dc18b62e3c251447a7020a280 (diff)
downloadjarink-fea6f2115088c676902a22e154562e36acc3eaa1.tar.xz
all: refactoring the scan work without sync.Mutex
When using a goroutine to process a link, the result is then passed to the main goroutine through a channel. The main goroutine then processes the results one by one, checking whether each link has already been seen, is an error, or needs to be scanned. In that way, we don't need a mutex to guard whether a link has been seen or not.
-rw-r--r--deadlinks.go4
-rw-r--r--deadlinks_test.go121
-rw-r--r--link_queue.go19
-rw-r--r--worker.go321
4 files changed, 284 insertions, 181 deletions
diff --git a/deadlinks.go b/deadlinks.go
index 1d31b8f..365a6fb 100644
--- a/deadlinks.go
+++ b/deadlinks.go
@@ -7,6 +7,10 @@ import (
"fmt"
)
+// StatusBadLink status for link that is not parseable by [url.Parse] or not
+// reachable during GET or HEAD, either timeout or IP or domain not exist.
+const StatusBadLink = 700
+
// Scan the baseUrl for dead links.
func Scan(opts ScanOptions) (result *Result, err error) {
var logp = `Scan`
diff --git a/deadlinks_test.go b/deadlinks_test.go
index 2950db4..8b7d83e 100644
--- a/deadlinks_test.go
+++ b/deadlinks_test.go
@@ -89,64 +89,76 @@ func TestDeadLinks_Scan(t *testing.T) {
}, {
scanUrl: testUrl,
exp: map[string][]deadlinks.Broken{
- testUrl: []deadlinks.Broken{{
- Link: testUrl + `/broken.png`,
- Code: http.StatusNotFound,
- }, {
- Link: testUrl + `/brokenPage`,
- Code: http.StatusNotFound,
- }, {
- Link: `http://127.0.0.1:abc`,
- Code: 700,
- }, {
- Link: `http:/127.0.0.1:11836`,
- Code: http.StatusNotFound,
- }},
- testUrl + `/broken.html`: []deadlinks.Broken{{
- Link: testUrl + `/brokenPage`,
- Code: http.StatusNotFound,
- }},
- testUrl + `/page2`: []deadlinks.Broken{{
- Link: testUrl + `/broken.png`,
- Code: http.StatusNotFound,
- }, {
- Link: testUrl + `/page2/broken/relative`,
- Code: http.StatusNotFound,
- }, {
- Link: testUrl + `/page2/broken2.png`,
- Code: http.StatusNotFound,
- }},
+ testUrl: []deadlinks.Broken{
+ {
+ Link: testUrl + `/broken.png`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/brokenPage`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: `http://127.0.0.1:abc`,
+ Code: deadlinks.StatusBadLink,
+ }, {
+ Link: `http:/127.0.0.1:11836`,
+ Code: deadlinks.StatusBadLink,
+ },
+ },
+ testUrl + `/broken.html`: []deadlinks.Broken{
+ {
+ Link: testUrl + `/brokenPage`,
+ Code: http.StatusNotFound,
+ },
+ },
+ testUrl + `/page2`: []deadlinks.Broken{
+ {
+ Link: testUrl + `/broken.png`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken/relative`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken2.png`,
+ Code: http.StatusNotFound,
+ },
+ },
},
}, {
scanUrl: testUrl + `/page2`,
exp: map[string][]deadlinks.Broken{
- testUrl: []deadlinks.Broken{{
- Link: testUrl + `/broken.png`,
- Code: http.StatusNotFound,
- }, {
- Link: testUrl + `/brokenPage`,
- Code: http.StatusNotFound,
- }, {
- Link: `http://127.0.0.1:abc`,
- Code: 700,
- }, {
- Link: `http:/127.0.0.1:11836`,
- Code: http.StatusNotFound,
- }},
- testUrl + `/broken.html`: []deadlinks.Broken{{
- Link: testUrl + `/brokenPage`,
- Code: http.StatusNotFound,
- }},
- testUrl + `/page2`: []deadlinks.Broken{{
- Link: testUrl + `/broken.png`,
- Code: http.StatusNotFound,
- }, {
- Link: testUrl + `/page2/broken/relative`,
- Code: http.StatusNotFound,
- }, {
- Link: testUrl + `/page2/broken2.png`,
- Code: http.StatusNotFound,
- }},
+ testUrl: []deadlinks.Broken{
+ {
+ Link: testUrl + `/broken.png`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/brokenPage`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: `http://127.0.0.1:abc`,
+ Code: deadlinks.StatusBadLink,
+ }, {
+ Link: `http:/127.0.0.1:11836`,
+ Code: deadlinks.StatusBadLink,
+ },
+ },
+ testUrl + `/broken.html`: []deadlinks.Broken{
+ {
+ Link: testUrl + `/brokenPage`,
+ Code: http.StatusNotFound,
+ },
+ },
+ testUrl + `/page2`: []deadlinks.Broken{
+ {
+ Link: testUrl + `/broken.png`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken/relative`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken2.png`,
+ Code: http.StatusNotFound,
+ },
+ },
},
}}
@@ -155,6 +167,7 @@ func TestDeadLinks_Scan(t *testing.T) {
err error
)
for _, tcase := range listCase {
+ t.Logf(`--- Scan: %s`, tcase.scanUrl)
var scanOpts = deadlinks.ScanOptions{
Url: tcase.scanUrl,
}
diff --git a/link_queue.go b/link_queue.go
index 9987586..10dbfff 100644
--- a/link_queue.go
+++ b/link_queue.go
@@ -11,6 +11,21 @@ import (
type linkQueue struct {
parentUrl *url.URL
- url string
- kind atom.Atom
+
+ // The error from scan.
+ errScan error
+
+ // url being scanned.
+ url string
+
+ // kind of url, its either an anchor or image.
+ // It set to 0 if url is the first URL being scanned.
+ kind atom.Atom
+
+ // Status of link after scan, its mostly used the HTTP status code.
+ // 0: link is the result of scan, not processed yet.
+ // StatusBadLink: link is invalid, not parseable or unreachable.
+ // 200 - 211: OK.
+ // 400 - 511: Error.
+ status int
}
diff --git a/worker.go b/worker.go
index 99bef6f..6bc5c13 100644
--- a/worker.go
+++ b/worker.go
@@ -5,29 +5,27 @@ package deadlinks
import (
"fmt"
- "io"
"log"
"net/http"
"net/url"
"strings"
"sync"
+ "time"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
type worker struct {
- // seenLink store the page URL that has been scanned and its HTTP status
- // code.
+ // seenLink store the URL being or has been scanned and its HTTP
+ // status code.
seenLink map[string]int
- // linkq contains queue of page URL to be scanned.
- linkq chan linkQueue
+ // resultq channel that collect result from scanning.
+ resultq chan map[string]linkQueue
- // errq contains error from scanning a page URL.
- errq chan error
-
- // result contains map of page URL and its list of broken link.
+ // result contains the final result after all of the pages has been
+ // scanned.
result *Result
// The base URL that will be joined to relative or absolute
@@ -41,17 +39,13 @@ type worker struct {
// wg sync the goroutine scanner.
wg sync.WaitGroup
-
- // seenLinkMtx guard the seenLink field from concurrent read/write.
- seenLinkMtx sync.Mutex
}
func newWorker(opts ScanOptions) (wrk *worker, err error) {
wrk = &worker{
opts: opts,
seenLink: map[string]int{},
- linkq: make(chan linkQueue, 10000),
- errq: make(chan error, 1),
+ resultq: make(chan map[string]linkQueue, 100),
result: newResult(),
}
@@ -69,63 +63,151 @@ func newWorker(opts ScanOptions) (wrk *worker, err error) {
Host: wrk.scanUrl.Host,
}
- wrk.linkq <- linkQueue{
- parentUrl: nil,
- url: wrk.scanUrl.String(),
- }
return wrk, nil
}
func (wrk *worker) run() (result *Result, err error) {
- var ever bool = true
- for ever {
+ // Scan the first URL to make sure that the server is reachable.
+ var firstLinkq = linkQueue{
+ parentUrl: nil,
+ url: wrk.scanUrl.String(),
+ status: http.StatusProcessing,
+ }
+ wrk.seenLink[firstLinkq.url] = http.StatusProcessing
+
+ wrk.wg.Add(1)
+ go wrk.scan(firstLinkq)
+ wrk.wg.Wait()
+
+ var resultq = <-wrk.resultq
+ for _, linkq := range resultq {
+ if linkq.url == firstLinkq.url {
+ if linkq.errScan != nil {
+ return nil, linkq.errScan
+ }
+ wrk.seenLink[linkq.url] = linkq.status
+ continue
+ }
+ if linkq.status >= http.StatusBadRequest {
+ wrk.markDead(linkq)
+ continue
+ }
+
+ wrk.seenLink[linkq.url] = http.StatusProcessing
+ wrk.wg.Add(1)
+ go wrk.scan(linkq)
+ }
+
+ var listWaitStatus []linkQueue
+ var isScanning = true
+ for isScanning {
select {
- case linkq := <-wrk.linkq:
- wrk.wg.Add(1)
- go wrk.scan(linkq)
+ case resultq := <-wrk.resultq:
+
+ // The resultq contains the original URL being scanned
+ // and its child links.
+ // For example, scanning "http://example.tld" result
+ // in
+ //
+ // "http://example.tld": {status=200}
+ // "http://example.tld/page": {status=0}
+ // "http://example.tld/image.png": {status=0}
+ // "http://bad:domain/image.png": {status=700}
+
+ for _, linkq := range resultq {
+ if linkq.status >= http.StatusBadRequest {
+ wrk.markDead(linkq)
+ continue
+ }
+
+ seenStatus, seen := wrk.seenLink[linkq.url]
+ if !seen {
+ wrk.seenLink[linkq.url] = http.StatusProcessing
+ wrk.wg.Add(1)
+ go wrk.scan(linkq)
+ continue
+ }
+ if seenStatus >= http.StatusBadRequest {
+ linkq.status = seenStatus
+ wrk.markDead(linkq)
+ continue
+ }
+ if seenStatus >= http.StatusOK {
+ // The link has been processed and its
+ // not an error.
+ continue
+ }
+ if linkq.status != 0 {
+ // linkq is the result of scan with
+ // non error status.
+ wrk.seenLink[linkq.url] = linkq.status
+ continue
+ }
+
+ // The link being processed by other
+ // goroutine.
+ listWaitStatus = append(listWaitStatus, linkq)
+ }
default:
wrk.wg.Wait()
-
- select {
- case err = <-wrk.errq:
- return nil, err
- default:
- if len(wrk.linkq) == 0 {
- ever = false
+ if len(wrk.resultq) != 0 {
+ continue
+ }
+ var newList []linkQueue
+ for _, linkq := range listWaitStatus {
+ seenStatus := wrk.seenLink[linkq.url]
+ if seenStatus == http.StatusProcessing {
+ // Scanning still in progress.
+ newList = append(newList, linkq)
+ continue
+ }
+ if seenStatus >= http.StatusBadRequest {
+ linkq.status = seenStatus
+ wrk.markDead(linkq)
+ continue
}
}
+ if len(newList) != 0 {
+ // There are link that still waiting for
+ // scanning to be completed.
+ listWaitStatus = newList
+ continue
+ }
+ isScanning = false
}
}
wrk.result.sort()
return wrk.result, nil
}
+func (wrk *worker) markDead(linkq linkQueue) {
+ var parentUrl = linkq.parentUrl.String()
+ var listBroken = wrk.result.PageLinks[parentUrl]
+ var brokenLink = Broken{
+ Link: linkq.url,
+ Code: linkq.status,
+ }
+ listBroken = append(listBroken, brokenLink)
+ wrk.result.PageLinks[parentUrl] = listBroken
+ wrk.seenLink[linkq.url] = linkq.status
+}
+
// scan fetch the HTML page or image to check if its valid.
func (wrk *worker) scan(linkq linkQueue) {
- defer wrk.wg.Done()
-
- wrk.seenLinkMtx.Lock()
- statusCode, seen := wrk.seenLink[linkq.url]
- wrk.seenLinkMtx.Unlock()
- if seen {
- if statusCode >= http.StatusBadRequest {
- wrk.markDead(linkq, statusCode)
- }
+ defer func() {
if wrk.opts.IsVerbose {
- fmt.Printf("scan: %s %d\n", linkq.url, statusCode)
+ fmt.Printf(" done: %d %s\n", linkq.status, linkq.url)
}
- return
- }
- wrk.seenLinkMtx.Lock()
- wrk.seenLink[linkq.url] = http.StatusProcessing
- wrk.seenLinkMtx.Unlock()
+ wrk.wg.Done()
+ }()
if wrk.opts.IsVerbose {
- fmt.Printf("scan: %s %d\n", linkq.url, http.StatusProcessing)
+ fmt.Printf("scan: %d %s\n", linkq.status, linkq.url)
}
var (
+ resultq = map[string]linkQueue{}
httpResp *http.Response
err error
)
@@ -135,51 +217,34 @@ func (wrk *worker) scan(linkq linkQueue) {
httpResp, err = http.Get(linkq.url)
}
if err != nil {
- if linkq.parentUrl == nil {
- wrk.errq <- err
- } else {
- wrk.markDead(linkq, http.StatusNotFound)
- }
+ linkq.status = StatusBadLink
+ linkq.errScan = err
+ resultq[linkq.url] = linkq
+ go wrk.pushResult(resultq)
return
}
defer httpResp.Body.Close()
- if httpResp.StatusCode != http.StatusOK {
- wrk.markDead(linkq, httpResp.StatusCode)
+ linkq.status = httpResp.StatusCode
+ resultq[linkq.url] = linkq
+
+ if httpResp.StatusCode >= http.StatusBadRequest {
+ go wrk.pushResult(resultq)
return
}
- wrk.seenLinkMtx.Lock()
- wrk.seenLink[linkq.url] = http.StatusOK
- wrk.seenLinkMtx.Unlock()
-
if linkq.kind == atom.Img {
+ go wrk.pushResult(resultq)
return
}
if !strings.HasPrefix(linkq.url, wrk.baseUrl.String()) {
- // Do not parse the page from external domain.
+ // Do not parse the HTML page from external domain, only need
+ // its HTTP status code.
+ go wrk.pushResult(resultq)
return
}
- wrk.parseHTML(linkq.url, httpResp.Body)
-}
-
-func (wrk *worker) markDead(linkq linkQueue, httpStatusCode int) {
- var parentUrl = linkq.parentUrl.String()
-
- wrk.seenLinkMtx.Lock()
- var listBroken = wrk.result.PageLinks[parentUrl]
- listBroken = append(listBroken, Broken{
- Link: linkq.url,
- Code: httpStatusCode,
- })
- wrk.result.PageLinks[parentUrl] = listBroken
- wrk.seenLink[linkq.url] = httpStatusCode
- wrk.seenLinkMtx.Unlock()
-}
-func (wrk *worker) parseHTML(linkUrl string, body io.Reader) {
var doc *html.Node
-
- doc, _ = html.Parse(body)
+ doc, _ = html.Parse(httpResp.Body)
// After we check the code and test for [html.Parse] there are
// no case actual cases where HTML content will return an error.
@@ -188,90 +253,96 @@ func (wrk *worker) parseHTML(linkUrl string, body io.Reader) {
//
// [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347
+ var scanUrl *url.URL
+
+ scanUrl, err = url.Parse(linkq.url)
+ if err != nil {
+ log.Fatal(err)
+ }
+
var node *html.Node
+ var link string
+ var status int
for node = range doc.Descendants() {
if node.Type != html.ElementNode {
continue
}
+ link = ""
if node.DataAtom == atom.A {
for _, attr := range node.Attr {
if attr.Key != `href` {
continue
}
- wrk.processLink(linkUrl, attr.Val, atom.A)
+ link, status = wrk.processLink(scanUrl, attr.Val, atom.A)
+ break
}
- }
- if node.DataAtom == atom.Img {
+ } else if node.DataAtom == atom.Img {
for _, attr := range node.Attr {
if attr.Key != `src` {
continue
}
- wrk.processLink(linkUrl, attr.Val, atom.Img)
+ link, status = wrk.processLink(scanUrl, attr.Val, atom.Img)
+ break
}
+ } else {
+ continue
+ }
+ if link == "" {
+ continue
+ }
+ resultq[link] = linkQueue{
+ parentUrl: scanUrl,
+ url: link,
+ kind: node.DataAtom,
+ status: status,
}
}
+ go wrk.pushResult(resultq)
}
-func (wrk *worker) processLink(rawParentUrl, val string, kind atom.Atom) {
+func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
+ link string, status int,
+) {
if len(val) == 0 {
- return
- }
-
- var parentUrl *url.URL
- var err error
-
- parentUrl, err = url.Parse(rawParentUrl)
- if err != nil {
- log.Fatal(err)
+ return "", 0
}
var newUrl *url.URL
+ var err error
newUrl, err = url.Parse(val)
if err != nil {
- var linkq = linkQueue{
- parentUrl: parentUrl,
- url: val,
- kind: kind,
- }
- wrk.markDead(linkq, 700)
- return
+ return val, StatusBadLink
}
newUrl.Fragment = ""
newUrl.RawFragment = ""
- var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
-
if kind == atom.A && val[0] == '#' {
// Ignore link to ID, like `href="#element_id"`.
- return
+ return "", 0
+ }
+ if strings.HasPrefix(val, `http`) {
+ link = strings.TrimSuffix(newUrl.String(), `/`)
+ return link, 0
}
-
- // val is absolute to parent URL.
if val[0] == '/' {
- // Link to the same domain will queued for scanning.
+ // val is absolute to parent URL.
newUrl = wrk.baseUrl.JoinPath(newUrl.Path)
- newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
- wrk.linkq <- linkQueue{
- parentUrl: parentUrl,
- url: newUrlStr,
- kind: kind,
- }
- return
+ } else {
+ // val is relative to parent URL.
+ newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
}
- if strings.HasPrefix(val, `http`) {
- wrk.linkq <- linkQueue{
- parentUrl: parentUrl,
- url: newUrlStr,
- kind: kind,
+ link = strings.TrimSuffix(newUrl.String(), `/`)
+ return link, 0
+}
+
+func (wrk *worker) pushResult(resultq map[string]linkQueue) {
+ var tick = time.NewTicker(100 * time.Millisecond)
+ for {
+ select {
+ case wrk.resultq <- resultq:
+ tick.Stop()
+ return
+ case <-tick.C:
}
- return
- }
- // val is relative to parent URL.
- newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
- newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
- wrk.linkq <- linkQueue{
- parentUrl: parentUrl,
- url: newUrlStr,
- kind: kind,
}
}