diff options
| author | Shulhan <ms@kilabit.info> | 2025-05-23 11:01:23 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2025-05-27 01:51:35 +0700 |
| commit | bcccb3d65bf1714bb03ef6342a85fe14e6209c51 (patch) | |
| tree | 4dfe3fd89737751228a13a712897b6bc07cd2d68 | |
| parent | 25c19699c84bdf77244edd1dda73652ac57376a6 (diff) | |
| download | jarink-bcccb3d65bf1714bb03ef6342a85fe14e6209c51.tar.xz | |
all: complete the first minimum working implementation
The current implementation covers at least 84% of the cases.
Todo,
* CLI for scan
* add more test cases for 100% coverage, including scan on invalid
base URL, scan on invalid HTML page, scan on invalid href or
src image
| -rw-r--r-- | .gitignore | 2 | ||||
| -rw-r--r-- | Makefile | 7 | ||||
| -rw-r--r-- | deadlinks.go | 14 | ||||
| -rw-r--r-- | deadlinks_test.go | 16 | ||||
| -rw-r--r-- | link_queue.go | 11 | ||||
| -rw-r--r-- | result.go | 13 | ||||
| -rw-r--r-- | testdata/web/index.html | 1 | ||||
| -rw-r--r-- | testdata/web/page2/index.html | 3 | ||||
| -rw-r--r-- | worker.go | 161 |
9 files changed, 183 insertions, 45 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2ad3c05 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/cover.html +/cover.out @@ -1,6 +1,9 @@ ## SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> ## SPDX-License-Identifier: GPL-3.0-only +COVER_OUT:=cover.out +COVER_HTML:=cover.html + .PHONY: all all: lint test @@ -11,4 +14,6 @@ lint: .PHONY: test test: - go test ./... + CGO_ENABLED=1 go test -failfast -timeout=1m -race \ + -coverprofile=$(COVER_OUT) ./... + go tool cover -html=$(COVER_OUT) -o $(COVER_HTML) diff --git a/deadlinks.go b/deadlinks.go index 0f83c8a..4c0f062 100644 --- a/deadlinks.go +++ b/deadlinks.go @@ -3,16 +3,24 @@ package deadlinks -import "fmt" +import ( + "fmt" +) // Scan the baseUrl for dead links. func Scan(baseUrl string) (result *Result, err error) { var logp = `Scan` - var worker = newWorker(baseUrl) + var wrk *worker - result, err = worker.run() + wrk, err = newWorker(baseUrl) if err != nil { return nil, fmt.Errorf(`%s: %s`, logp, err) } + + result, err = wrk.run() + if err != nil { + return nil, fmt.Errorf(`%s: %s`, logp, err) + } + return result, nil } diff --git a/deadlinks_test.go b/deadlinks_test.go index b449b22..9556803 100644 --- a/deadlinks_test.go +++ b/deadlinks_test.go @@ -38,7 +38,7 @@ func TestMain(m *testing.M) { } func TestDeadLinks_Scan(t *testing.T) { - var testUrl = `http://` + testListenAddress + var testUrl = `http://` + testListenAddress + `/` type testCase struct { exp map[string][]deadlinks.Broken @@ -49,18 +49,24 @@ func TestDeadLinks_Scan(t *testing.T) { baseUrl: testUrl, exp: map[string][]deadlinks.Broken{ testUrl: []deadlinks.Broken{{ - Code: http.StatusNotFound, Link: testUrl + `broken.png`, - }, { Code: http.StatusNotFound, + }, { Link: testUrl + `brokenPage`, + Code: http.StatusNotFound, + }, { + Link: `https://kilabit.info/brokenPage`, + Code: http.StatusNotFound, }}, testUrl + `page2`: []deadlinks.Broken{{ + Link: testUrl + `broken.png`, Code: http.StatusNotFound, - Link: 
testUrl + `broken2.png`, }, { - Code: http.StatusNotFound, Link: testUrl + `page2/broken/relative`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `page2/broken2.png`, + Code: http.StatusNotFound, }}, }, }} diff --git a/link_queue.go b/link_queue.go new file mode 100644 index 0000000..dfcba76 --- /dev/null +++ b/link_queue.go @@ -0,0 +1,11 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> +// SPDX-License-Identifier: GPL-3.0-only + +package deadlinks + +import "net/url" + +type linkQueue struct { + parentUrl *url.URL + url string +} @@ -3,6 +3,11 @@ package deadlinks +import ( + "slices" + "strings" +) + // Broken store the link with its HTTP status. type Broken struct { Link string @@ -20,3 +25,11 @@ func newResult() *Result { PageLinks: map[string][]Broken{}, } } + +func (result *Result) sort() { + for _, listBroken := range result.PageLinks { + slices.SortFunc(listBroken, func(a, b Broken) int { + return strings.Compare(a.Link, b.Link) + }) + } +} diff --git a/testdata/web/index.html b/testdata/web/index.html index 5d85bcf..1124813 100644 --- a/testdata/web/index.html +++ b/testdata/web/index.html @@ -7,5 +7,6 @@ SPDX-License-Identifier: GPL-3.0-only <img src="/broken.png" /> <a href="/brokenPage">Broken page</a> <a href="/page2">Page 2</a> + <a href="https://kilabit.info/brokenPage">Broken page at kilabit.info</a> </body> </html> diff --git a/testdata/web/page2/index.html b/testdata/web/page2/index.html index 933dede..0fc7601 100644 --- a/testdata/web/page2/index.html +++ b/testdata/web/page2/index.html @@ -4,7 +4,10 @@ SPDX-License-Identifier: GPL-3.0-only --> <html> <body> + <img src="/broken.png" /> <img src="broken2.png" /> <a href="broken/relative">broken relative link</a> + <a href="/">Back with absolute path</a> + <a href="../">Back with relative path</a> </body> </html> @@ -6,19 +6,23 @@ package deadlinks import ( "fmt" "io" + "log" "net/http" + "net/url" + "strings" "sync" "golang.org/x/net/html" + "golang.org/x/net/html/atom" ) 
type worker struct { - // seenPage store the page URL that has been scanned and its HTTP status + // seenLink store the page URL that has been scanned and its HTTP status // code. - seenPage map[string]int + seenLink map[string]int - // pageq contains queue of page URL to be scanned. - pageq chan string + // linkq contains queue of page URL to be scanned. + linkq chan linkQueue // errq contains error from scanning a page URL. errq chan error @@ -26,82 +30,121 @@ type worker struct { // result contains map of page URL and its list of broken link. result *Result + // The base URL to scan that will be joined to relative or absolute + // links or image. + baseUrl *url.URL + // wg sync the goroutine scanner. wg sync.WaitGroup - // seenPageMtx guard the seenPage field from concurrent read/write. - seenPageMtx sync.Mutex + // seenLinkMtx guard the seenLink field from concurrent read/write. + seenLinkMtx sync.Mutex } -func newWorker(baseUrl string) (wrk *worker) { +func newWorker(baseUrl string) (wrk *worker, err error) { wrk = &worker{ - pageq: make(chan string, 1), + seenLink: map[string]int{}, + linkq: make(chan linkQueue, 1000), errq: make(chan error, 1), - seenPage: map[string]int{}, result: newResult(), } - wrk.pageq <- baseUrl - return wrk + wrk.baseUrl, err = url.Parse(baseUrl) + if err != nil { + return nil, err + } + + wrk.baseUrl = wrk.baseUrl.JoinPath(`/`) + wrk.linkq <- linkQueue{ + parentUrl: nil, + url: wrk.baseUrl.String(), + } + return wrk, nil } func (wrk *worker) run() (result *Result, err error) { - for len(wrk.pageq) > 0 { + var ever bool = true + for ever { select { - case page := <-wrk.pageq: + case linkq := <-wrk.linkq: wrk.wg.Add(1) - go wrk.scan(page) + go wrk.scan(linkq) case err = <-wrk.errq: - close(wrk.pageq) return nil, err default: wrk.wg.Wait() + if len(wrk.linkq) == 0 { + ever = false + } } } + wrk.result.sort() return wrk.result, nil } // scan the function that fetch the HTML page and scan for broken links. 
-func (wrk *worker) scan(pageUrl string) { +func (wrk *worker) scan(linkq linkQueue) { var logp = `scan` defer wrk.wg.Done() - if wrk.hasSeen(pageUrl) { + wrk.seenLinkMtx.Lock() + statusCode, seen := wrk.seenLink[linkq.url] + wrk.seenLinkMtx.Unlock() + if seen { + if statusCode >= http.StatusBadRequest { + wrk.markDead(linkq, statusCode) + } return } - wrk.markSeen(pageUrl, http.StatusProcessing) + wrk.seenLinkMtx.Lock() + wrk.seenLink[linkq.url] = http.StatusProcessing + wrk.seenLinkMtx.Unlock() var ( httpResp *http.Response err error ) - httpResp, err = http.Get(pageUrl) + httpResp, err = http.Get(linkq.url) if err != nil { wrk.errq <- err return } if httpResp.StatusCode != http.StatusOK { - wrk.errq <- fmt.Errorf(`%s %s: return HTTP status code %d`, - logp, pageUrl, httpResp.StatusCode) + wrk.markDead(linkq, httpResp.StatusCode) return } defer httpResp.Body.Close() - err = wrk.parseHTML(pageUrl, httpResp.Body) + err = wrk.parseHTML(linkq.url, httpResp.Body) if err != nil { - wrk.errq <- fmt.Errorf(`%s %s: %w`, logp, pageUrl, err) + wrk.errq <- fmt.Errorf(`%s %s: %w`, logp, linkq.url, err) return } } -func (wrk *worker) parseHTML(pageUrl string, body io.Reader) (err error) { +func (wrk *worker) markDead(linkq linkQueue, httpStatusCode int) { + var parentUrl = linkq.parentUrl.String() + + wrk.seenLinkMtx.Lock() + var listBroken = wrk.result.PageLinks[parentUrl] + listBroken = append(listBroken, Broken{ + Link: linkq.url, + Code: httpStatusCode, + }) + wrk.result.PageLinks[parentUrl] = listBroken + wrk.seenLink[linkq.url] = httpStatusCode + wrk.seenLinkMtx.Unlock() +} + +func (wrk *worker) parseHTML(linkUrl string, body io.Reader) (err error) { var ( logp = `parseHTML` doc *html.Node ) + doc, err = html.Parse(body) if err != nil { return fmt.Errorf(`%s: %w`, logp, err) @@ -112,20 +155,66 @@ func (wrk *worker) parseHTML(pageUrl string, body io.Reader) (err error) { if node.Type != html.ElementNode { continue } + if node.DataAtom == atom.A { + for _, attr := range 
node.Attr { + if attr.Key != `href` { + continue + } + wrk.processLink(linkUrl, attr.Val) + } + } + if node.DataAtom == atom.Img { + for _, attr := range node.Attr { + if attr.Key != `src` { + continue + } + wrk.processLink(linkUrl, attr.Val) + } + } } return nil } -// hasSeen return true if the pageUrl has been scanned. -func (wrk *worker) hasSeen(pageUrl string) (ok bool) { - wrk.seenPageMtx.Lock() - _, ok = wrk.seenPage[pageUrl] - wrk.seenPageMtx.Unlock() - return ok -} - -func (wrk *worker) markSeen(pageUrl string, httpStatusCode int) { - wrk.seenPageMtx.Lock() - wrk.seenPage[pageUrl] = httpStatusCode - wrk.seenPageMtx.Unlock() +func (wrk *worker) processLink(rawParentUrl string, val string) { + if len(val) == 0 { + return + } + var parentUrl *url.URL + var err error + parentUrl, err = url.Parse(rawParentUrl) + if err != nil { + log.Fatal(err) + } + if val[0] == '/' { + // Link to the same domain will queued for scanning. + var newUrl = wrk.baseUrl.JoinPath(val) + wrk.linkq <- linkQueue{ + parentUrl: parentUrl, + url: newUrl.String(), + } + return + } + if strings.HasPrefix(val, `http`) { + var newUrl *url.URL + newUrl, err = url.Parse(val) + if err != nil { + var linkq = linkQueue{ + parentUrl: parentUrl, + url: val, + } + wrk.markDead(linkq, 700) + return + } + wrk.linkq <- linkQueue{ + parentUrl: parentUrl, + url: newUrl.String(), + } + return + } + // val is relative to parent URL. + var newUrl = parentUrl.JoinPath(`/`, val) + wrk.linkq <- linkQueue{ + parentUrl: parentUrl, + url: newUrl.String(), + } } |
