4 files changed, 284 insertions, 181 deletions
diff --git a/deadlinks.go b/deadlinks.go
index 1d31b8f..365a6fb 100644
--- a/deadlinks.go
+++ b/deadlinks.go
@@ -7,6 +7,10 @@ import (
 	"fmt"
 )
 
+// StatusBadLink is the status for a link that cannot be parsed by [url.Parse]
+// or reached during GET or HEAD, due to timeout or a non-existent IP or domain.
+const StatusBadLink = 700
+
 // Scan the baseUrl for dead links.
 func Scan(opts ScanOptions) (result *Result, err error) {
 	var logp = `Scan`
diff --git a/deadlinks_test.go b/deadlinks_test.go
index 2950db4..8b7d83e 100644
--- a/deadlinks_test.go
+++ b/deadlinks_test.go
@@ -89,64 +89,76 @@ func TestDeadLinks_Scan(t *testing.T) {
 	}, {
 		scanUrl: testUrl,
 		exp: map[string][]deadlinks.Broken{
-			testUrl: []deadlinks.Broken{{
-				Link: testUrl + `/broken.png`,
-				Code: http.StatusNotFound,
-			}, {
-				Link: testUrl + `/brokenPage`,
-				Code: http.StatusNotFound,
-			}, {
-				Link: `http://127.0.0.1:abc`,
-				Code: 700,
-			}, {
-				Link: `http:/127.0.0.1:11836`,
-				Code: http.StatusNotFound,
-			}},
-			testUrl + `/broken.html`: []deadlinks.Broken{{
-				Link: testUrl + `/brokenPage`,
-				Code: http.StatusNotFound,
-			}},
-			testUrl + `/page2`: []deadlinks.Broken{{
-				Link: testUrl + `/broken.png`,
-				Code: http.StatusNotFound,
-			}, {
-				Link: testUrl + `/page2/broken/relative`,
-				Code: http.StatusNotFound,
-			}, {
-				Link: testUrl + `/page2/broken2.png`,
-				Code: http.StatusNotFound,
-			}},
+			testUrl: []deadlinks.Broken{
+				{
+					Link: testUrl + `/broken.png`,
+					Code: http.StatusNotFound,
+				}, {
+					Link: testUrl + `/brokenPage`,
+					Code: http.StatusNotFound,
+				}, {
+					Link: `http://127.0.0.1:abc`,
+					Code: deadlinks.StatusBadLink,
+				}, {
+					Link: `http:/127.0.0.1:11836`,
+					Code: deadlinks.StatusBadLink,
+				},
+			},
+			testUrl + `/broken.html`: []deadlinks.Broken{
+				{
+					Link: testUrl + `/brokenPage`,
+					Code: http.StatusNotFound,
+				},
+			},
+			testUrl + `/page2`: []deadlinks.Broken{
+				{
+					Link: testUrl + `/broken.png`,
+					Code: http.StatusNotFound,
+				}, {
+					Link: testUrl + `/page2/broken/relative`,
+					Code: http.StatusNotFound,
+				}, {
+					Link: testUrl + `/page2/broken2.png`,
+					Code: http.StatusNotFound,
+				},
+			},
 		},
 	}, {
 		scanUrl: testUrl + `/page2`,
 		exp: map[string][]deadlinks.Broken{
-			testUrl: []deadlinks.Broken{{
-				Link: testUrl + `/broken.png`,
-				Code: http.StatusNotFound,
-			}, {
-				Link: testUrl + `/brokenPage`,
-				Code: http.StatusNotFound,
-			}, {
-				Link: `http://127.0.0.1:abc`,
-				Code: 700,
-			}, {
-				Link: `http:/127.0.0.1:11836`,
-				Code: http.StatusNotFound,
-			}},
-			testUrl + `/broken.html`: []deadlinks.Broken{{
-				Link: testUrl + `/brokenPage`,
-				Code: http.StatusNotFound,
-			}},
-			testUrl + `/page2`: []deadlinks.Broken{{
-				Link: testUrl + `/broken.png`,
-				Code: http.StatusNotFound,
-			}, {
-				Link: testUrl + `/page2/broken/relative`,
-				Code: http.StatusNotFound,
-			}, {
-				Link: testUrl + `/page2/broken2.png`,
-				Code: http.StatusNotFound,
-			}},
+			testUrl: []deadlinks.Broken{
+				{
+					Link: testUrl + `/broken.png`,
+					Code: http.StatusNotFound,
+				}, {
+					Link: testUrl + `/brokenPage`,
+					Code: http.StatusNotFound,
+				}, {
+					Link: `http://127.0.0.1:abc`,
+					Code: deadlinks.StatusBadLink,
+				}, {
+					Link: `http:/127.0.0.1:11836`,
+					Code: deadlinks.StatusBadLink,
+				},
+			},
+			testUrl + `/broken.html`: []deadlinks.Broken{
+				{
+					Link: testUrl + `/brokenPage`,
+					Code: http.StatusNotFound,
+				},
+			},
+			testUrl + `/page2`: []deadlinks.Broken{
+				{
+					Link: testUrl + `/broken.png`,
+					Code: http.StatusNotFound,
+				}, {
+					Link: testUrl + `/page2/broken/relative`,
+					Code: http.StatusNotFound,
+				}, {
+					Link: testUrl + `/page2/broken2.png`,
+					Code: http.StatusNotFound,
+				},
+			},
 		},
 	}}
 
@@ -155,6 +167,7 @@ func TestDeadLinks_Scan(t *testing.T) {
 		err    error
 	)
 	for _, tcase := range listCase {
+		t.Logf(`--- Scan: %s`, tcase.scanUrl)
 		var scanOpts = deadlinks.ScanOptions{
 			Url: tcase.scanUrl,
 		}
diff --git a/link_queue.go b/link_queue.go
index 9987586..10dbfff 100644
--- a/link_queue.go
+++ b/link_queue.go
@@ -11,6 +11,21 @@ import (
 
 type linkQueue struct {
 	parentUrl *url.URL
-	url       string
-	kind      atom.Atom
+
+	// The error from scan.
+	errScan error
+
+	// url being scanned.
+	url string
+
+	// kind of url, it is either an anchor or an image.
+	// It is set to 0 if url is the first URL being scanned.
+	kind atom.Atom
+
+	// Status of link after scan, it is mostly the HTTP status code.
+	// 0: link is the result of scan, not processed yet.
+	// StatusBadLink: link is invalid, not parseable or unreachable.
+	// 200 - 399: OK.
+	// 400 - 511: Error.
+	status int
 }
diff --git a/worker.go b/worker.go
index 99bef6f..6bc5c13 100644
--- a/worker.go
+++ b/worker.go
@@ -5,29 +5,27 @@ package deadlinks
 
 import (
 	"fmt"
-	"io"
 	"log"
 	"net/http"
 	"net/url"
 	"strings"
 	"sync"
+	"time"
 
 	"golang.org/x/net/html"
 	"golang.org/x/net/html/atom"
 )
 
 type worker struct {
-	// seenLink store the page URL that has been scanned and its HTTP status
-	// code.
+	// seenLink stores the URLs that are being or have been scanned and
+	// their HTTP status codes.
 	seenLink map[string]int
 
-	// linkq contains queue of page URL to be scanned.
-	linkq chan linkQueue
+	// resultq is the channel that collects results from scanning.
+	resultq chan map[string]linkQueue
 
-	// errq contains error from scanning a page URL.
-	errq chan error
-
-	// result contains map of page URL and its list of broken link.
+	// result contains the final result after all of the pages have been
+	// scanned.
 	result *Result
 
 	// The base URL that will be joined to relative or absolute
@@ -41,17 +39,13 @@ type worker struct {
 
 	// wg sync the goroutine scanner.
 	wg sync.WaitGroup
-
-	// seenLinkMtx guard the seenLink field from concurrent read/write.
-	seenLinkMtx sync.Mutex
 }
 
 func newWorker(opts ScanOptions) (wrk *worker, err error) {
 	wrk = &worker{
 		opts:     opts,
 		seenLink: map[string]int{},
-		linkq:    make(chan linkQueue, 10000),
-		errq:     make(chan error, 1),
+		resultq:  make(chan map[string]linkQueue, 100),
 		result:   newResult(),
 	}
 
@@ -69,63 +63,151 @@ func newWorker(opts ScanOptions) (wrk *worker, err error) {
 		Host:   wrk.scanUrl.Host,
 	}
 
-	wrk.linkq <- linkQueue{
-		parentUrl: nil,
-		url:       wrk.scanUrl.String(),
-	}
 	return wrk, nil
 }
 
 func (wrk *worker) run() (result *Result, err error) {
-	var ever bool = true
-	for ever {
+	// Scan the first URL to make sure that the server is reachable.
+	var firstLinkq = linkQueue{
+		parentUrl: nil,
+		url:       wrk.scanUrl.String(),
+		status:    http.StatusProcessing,
+	}
+	wrk.seenLink[firstLinkq.url] = http.StatusProcessing
+
+	wrk.wg.Add(1)
+	go wrk.scan(firstLinkq)
+	wrk.wg.Wait()
+
+	var resultq = <-wrk.resultq
+	for _, linkq := range resultq {
+		if linkq.url == firstLinkq.url {
+			if linkq.errScan != nil {
+				return nil, linkq.errScan
+			}
+			wrk.seenLink[linkq.url] = linkq.status
+			continue
+		}
+		if linkq.status >= http.StatusBadRequest {
+			wrk.markDead(linkq)
+			continue
+		}
+
+		wrk.seenLink[linkq.url] = http.StatusProcessing
+		wrk.wg.Add(1)
+		go wrk.scan(linkq)
+	}
+
+	var listWaitStatus []linkQueue
+	var isScanning = true
+	for isScanning {
 		select {
-		case linkq := <-wrk.linkq:
-			wrk.wg.Add(1)
-			go wrk.scan(linkq)
+		case resultq := <-wrk.resultq:
+
+			// The resultq contains the original URL being scanned
+			// and its child links.
+			// For example, scanning "http://example.tld" results
+			// in
+			//
+			//	"http://example.tld": {status=200}
+			//	"http://example.tld/page": {status=0}
+			//	"http://example.tld/image.png": {status=0}
+			//	"http://bad:domain/image.png": {status=700}
+
+			for _, linkq := range resultq {
+				if linkq.status >= http.StatusBadRequest {
+					wrk.markDead(linkq)
+					continue
+				}
+
+				seenStatus, seen := wrk.seenLink[linkq.url]
+				if !seen {
+					wrk.seenLink[linkq.url] = http.StatusProcessing
+					wrk.wg.Add(1)
+					go wrk.scan(linkq)
+					continue
+				}
+				if seenStatus >= http.StatusBadRequest {
+					linkq.status = seenStatus
+					wrk.markDead(linkq)
+					continue
+				}
+				if seenStatus >= http.StatusOK {
+					// The link has been processed and it is
+					// not an error.
+					continue
+				}
+				if linkq.status != 0 {
+					// linkq is the result of a scan with a
+					// non-error status.
+					wrk.seenLink[linkq.url] = linkq.status
+					continue
+				}
+
+				// The link is being processed by another
+				// goroutine.
+				listWaitStatus = append(listWaitStatus, linkq)
+			}
 
 		default:
 			wrk.wg.Wait()
-
-			select {
-			case err = <-wrk.errq:
-				return nil, err
-			default:
-				if len(wrk.linkq) == 0 {
-					ever = false
+			if len(wrk.resultq) != 0 {
+				continue
+			}
+			var newList []linkQueue
+			for _, linkq := range listWaitStatus {
+				seenStatus := wrk.seenLink[linkq.url]
+				if seenStatus == http.StatusProcessing {
+					// Scanning still in progress.
+					newList = append(newList, linkq)
+					continue
+				}
+				if seenStatus >= http.StatusBadRequest {
+					linkq.status = seenStatus
+					wrk.markDead(linkq)
+					continue
 				}
 			}
+			if len(newList) != 0 {
+				// There are links that are still waiting for
+				// scanning to be completed.
+				listWaitStatus = newList
+				continue
+			}
+			isScanning = false
 		}
 	}
 	wrk.result.sort()
 	return wrk.result, nil
 }
 
+func (wrk *worker) markDead(linkq linkQueue) {
+	var parentUrl = linkq.parentUrl.String()
+	var listBroken = wrk.result.PageLinks[parentUrl]
+	var brokenLink = Broken{
+		Link: linkq.url,
+		Code: linkq.status,
+	}
+	listBroken = append(listBroken, brokenLink)
+	wrk.result.PageLinks[parentUrl] = listBroken
+	wrk.seenLink[linkq.url] = linkq.status
+}
+
 // scan fetch the HTML page or image to check if its valid.
 func (wrk *worker) scan(linkq linkQueue) {
-	defer wrk.wg.Done()
-
-	wrk.seenLinkMtx.Lock()
-	statusCode, seen := wrk.seenLink[linkq.url]
-	wrk.seenLinkMtx.Unlock()
-	if seen {
-		if statusCode >= http.StatusBadRequest {
-			wrk.markDead(linkq, statusCode)
-		}
+	defer func() {
 		if wrk.opts.IsVerbose {
-			fmt.Printf("scan: %s %d\n", linkq.url, statusCode)
+			fmt.Printf("  done: %d %s\n", linkq.status, linkq.url)
 		}
-		return
-	}
-	wrk.seenLinkMtx.Lock()
-	wrk.seenLink[linkq.url] = http.StatusProcessing
-	wrk.seenLinkMtx.Unlock()
+		wrk.wg.Done()
+	}()
 
 	if wrk.opts.IsVerbose {
-		fmt.Printf("scan: %s %d\n", linkq.url, http.StatusProcessing)
+		fmt.Printf("scan: %d %s\n", linkq.status, linkq.url)
 	}
 
 	var (
+		resultq  = map[string]linkQueue{}
 		httpResp *http.Response
 		err      error
 	)
@@ -135,51 +217,34 @@ func (wrk *worker) scan(linkq linkQueue) {
 		httpResp, err = http.Get(linkq.url)
 	}
 	if err != nil {
-		if linkq.parentUrl == nil {
-			wrk.errq <- err
-		} else {
-			wrk.markDead(linkq, http.StatusNotFound)
-		}
+		linkq.status = StatusBadLink
+		linkq.errScan = err
+		resultq[linkq.url] = linkq
+		go wrk.pushResult(resultq)
 		return
 	}
 	defer httpResp.Body.Close()
 
-	if httpResp.StatusCode != http.StatusOK {
-		wrk.markDead(linkq, httpResp.StatusCode)
+	linkq.status = httpResp.StatusCode
+	resultq[linkq.url] = linkq
+
+	if httpResp.StatusCode >= http.StatusBadRequest {
+		go wrk.pushResult(resultq)
 		return
 	}
-	wrk.seenLinkMtx.Lock()
-	wrk.seenLink[linkq.url] = http.StatusOK
-	wrk.seenLinkMtx.Unlock()
-
 	if linkq.kind == atom.Img {
+		go wrk.pushResult(resultq)
 		return
 	}
 	if !strings.HasPrefix(linkq.url, wrk.baseUrl.String()) {
-		// Do not parse the page from external domain.
+		// Do not parse HTML pages from external domains; we only need
+		// their HTTP status codes.
+		go wrk.pushResult(resultq)
 		return
 	}
-	wrk.parseHTML(linkq.url, httpResp.Body)
-}
-
-func (wrk *worker) markDead(linkq linkQueue, httpStatusCode int) {
-	var parentUrl = linkq.parentUrl.String()
-
-	wrk.seenLinkMtx.Lock()
-	var listBroken = wrk.result.PageLinks[parentUrl]
-	listBroken = append(listBroken, Broken{
-		Link: linkq.url,
-		Code: httpStatusCode,
-	})
-	wrk.result.PageLinks[parentUrl] = listBroken
-	wrk.seenLink[linkq.url] = httpStatusCode
-	wrk.seenLinkMtx.Unlock()
-}
 
-func (wrk *worker) parseHTML(linkUrl string, body io.Reader) {
 	var doc *html.Node
-
-	doc, _ = html.Parse(body)
+	doc, _ = html.Parse(httpResp.Body)
 
 	// After we check the code and test for [html.Parse] there are
 	// no case actual cases where HTML content will return an error.
@@ -188,90 +253,96 @@ func (wrk *worker) parseHTML(linkUrl string, body io.Reader) {
 	//
 	// [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347
 
+	var scanUrl *url.URL
+
+	scanUrl, err = url.Parse(linkq.url)
+	if err != nil {
+		log.Fatal(err)
+	}
+
 	var node *html.Node
+	var link string
+	var status int
 	for node = range doc.Descendants() {
 		if node.Type != html.ElementNode {
 			continue
 		}
+		link = ""
 		if node.DataAtom == atom.A {
 			for _, attr := range node.Attr {
 				if attr.Key != `href` {
 					continue
 				}
-				wrk.processLink(linkUrl, attr.Val, atom.A)
+				link, status = wrk.processLink(scanUrl, attr.Val, atom.A)
+				break
 			}
-		}
-		if node.DataAtom == atom.Img {
+		} else if node.DataAtom == atom.Img {
 			for _, attr := range node.Attr {
 				if attr.Key != `src` {
 					continue
 				}
-				wrk.processLink(linkUrl, attr.Val, atom.Img)
+				link, status = wrk.processLink(scanUrl, attr.Val, atom.Img)
+				break
 			}
+		} else {
+			continue
+		}
+		if link == "" {
+			continue
+		}
+		resultq[link] = linkQueue{
+			parentUrl: scanUrl,
+			url:       link,
+			kind:      node.DataAtom,
+			status:    status,
 		}
 	}
+	go wrk.pushResult(resultq)
 }
 
-func (wrk *worker) processLink(rawParentUrl, val string, kind atom.Atom) {
+func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
+	link string, status int,
+) {
 	if len(val) == 0 {
-		return
-	}
-
-	var parentUrl *url.URL
-	var err error
-
-	parentUrl, err = url.Parse(rawParentUrl)
-	if err != nil {
-		log.Fatal(err)
+		return "", 0
 	}
 
 	var newUrl *url.URL
+	var err error
 	newUrl, err = url.Parse(val)
 	if err != nil {
-		var linkq = linkQueue{
-			parentUrl: parentUrl,
-			url:       val,
-			kind:      kind,
-		}
-		wrk.markDead(linkq, 700)
-		return
+		return val, StatusBadLink
 	}
 	newUrl.Fragment = ""
 	newUrl.RawFragment = ""
 
-	var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
-
 	if kind == atom.A && val[0] == '#' {
 		// Ignore link to ID, like `href="#element_id"`.
-		return
+		return "", 0
+	}
+	if strings.HasPrefix(val, `http`) {
+		link = strings.TrimSuffix(newUrl.String(), `/`)
+		return link, 0
 	}
-
-	// val is absolute to parent URL.
 	if val[0] == '/' {
-		// Link to the same domain will queued for scanning.
+		// val is an absolute path; join it to the base URL.
 		newUrl = wrk.baseUrl.JoinPath(newUrl.Path)
-		newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
-		wrk.linkq <- linkQueue{
-			parentUrl: parentUrl,
-			url:       newUrlStr,
-			kind:      kind,
-		}
-		return
+	} else {
+		// val is relative to parent URL.
+		newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
 	}
-	if strings.HasPrefix(val, `http`) {
-		wrk.linkq <- linkQueue{
-			parentUrl: parentUrl,
-			url:       newUrlStr,
-			kind:      kind,
+	link = strings.TrimSuffix(newUrl.String(), `/`)
+	return link, 0
+}
+
+func (wrk *worker) pushResult(resultq map[string]linkQueue) {
+	var tick = time.NewTicker(100 * time.Millisecond)
+	for {
+		select {
+		case wrk.resultq <- resultq:
+			tick.Stop()
+			return
+		case <-tick.C:
 		}
-		return
-	}
-	// val is relative to parent URL.
-	newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
-	newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
-	wrk.linkq <- linkQueue{
-		parentUrl: parentUrl,
-		url:       newUrlStr,
-		kind:      kind,
 	}
 }