 .gitignore                    |   2
 Makefile                      |   7
 deadlinks.go                  |  14
 deadlinks_test.go             |  16
 link_queue.go                 |  11
 result.go                     |  13
 testdata/web/index.html       |   1
 testdata/web/page2/index.html |   3
 worker.go                     | 161
 9 files changed, 183 insertions(+), 45 deletions(-)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2ad3c05
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/cover.html
+/cover.out
diff --git a/Makefile b/Makefile
index e6d24fa..3699644 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,9 @@
## SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
## SPDX-License-Identifier: GPL-3.0-only
+COVER_OUT:=cover.out
+COVER_HTML:=cover.html
+
.PHONY: all
all: lint test
@@ -11,4 +14,6 @@ lint:
.PHONY: test
test:
- go test ./...
+ CGO_ENABLED=1 go test -failfast -timeout=1m -race \
+ -coverprofile=$(COVER_OUT) ./...
+ go tool cover -html=$(COVER_OUT) -o $(COVER_HTML)
diff --git a/deadlinks.go b/deadlinks.go
index 0f83c8a..4c0f062 100644
--- a/deadlinks.go
+++ b/deadlinks.go
@@ -3,16 +3,24 @@
package deadlinks
-import "fmt"
+import (
+ "fmt"
+)
// Scan the baseUrl for dead links.
func Scan(baseUrl string) (result *Result, err error) {
var logp = `Scan`
- var worker = newWorker(baseUrl)
+ var wrk *worker
- result, err = worker.run()
+ wrk, err = newWorker(baseUrl)
if err != nil {
return nil, fmt.Errorf(`%s: %s`, logp, err)
}
+
+ result, err = wrk.run()
+ if err != nil {
+ return nil, fmt.Errorf(`%s: %s`, logp, err)
+ }
+
return result, nil
}
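
For context, a minimal caller sketch for the updated Scan signature. The import path and target URL here are illustrative placeholders, not taken from this repository:

    package main

    import (
        "fmt"
        "log"

        "deadlinks" // Placeholder import path; use the module's real path.
    )

    func main() {
        result, err := deadlinks.Scan(`https://example.com/`)
        if err != nil {
            log.Fatal(err)
        }
        // PageLinks maps each scanned page to its list of broken links.
        for page, listBroken := range result.PageLinks {
            for _, broken := range listBroken {
                fmt.Printf("%s: %d %s\n", page, broken.Code, broken.Link)
            }
        }
    }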
diff --git a/deadlinks_test.go b/deadlinks_test.go
index b449b22..9556803 100644
--- a/deadlinks_test.go
+++ b/deadlinks_test.go
@@ -38,7 +38,7 @@ func TestMain(m *testing.M) {
}
func TestDeadLinks_Scan(t *testing.T) {
- var testUrl = `http://` + testListenAddress
+ var testUrl = `http://` + testListenAddress + `/`
type testCase struct {
exp map[string][]deadlinks.Broken
@@ -49,18 +49,24 @@ func TestDeadLinks_Scan(t *testing.T) {
baseUrl: testUrl,
exp: map[string][]deadlinks.Broken{
testUrl: []deadlinks.Broken{{
- Code: http.StatusNotFound,
Link: testUrl + `broken.png`,
- }, {
Code: http.StatusNotFound,
+ }, {
Link: testUrl + `brokenPage`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: `https://kilabit.info/brokenPage`,
+ Code: http.StatusNotFound,
}},
testUrl + `page2`: []deadlinks.Broken{{
+ Link: testUrl + `broken.png`,
Code: http.StatusNotFound,
- Link: testUrl + `broken2.png`,
}, {
- Code: http.StatusNotFound,
Link: testUrl + `page2/broken/relative`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `page2/broken2.png`,
+ Code: http.StatusNotFound,
}},
},
}}
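
The trailing slash added to testUrl mirrors what newWorker now produces: JoinPath(`/`) normalizes the base URL to end in a slash, so the expected links can be built by plain string concatenation. A small sketch of that behavior, with an illustrative address:

    package main

    import (
        "fmt"
        "net/url"
    )

    func main() {
        u, _ := url.Parse(`http://127.0.0.1:18080`)
        fmt.Println(u.JoinPath(`/`))           // http://127.0.0.1:18080/
        fmt.Println(u.JoinPath(`/broken.png`)) // http://127.0.0.1:18080/broken.png
    }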
diff --git a/link_queue.go b/link_queue.go
new file mode 100644
index 0000000..dfcba76
--- /dev/null
+++ b/link_queue.go
@@ -0,0 +1,11 @@
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+// SPDX-License-Identifier: GPL-3.0-only
+
+package deadlinks
+
+import "net/url"
+
+type linkQueue struct {
+ parentUrl *url.URL
+ url string
+}
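
Each linkQueue entry pairs a discovered link with the page that referenced it, so a dead link can be attributed to its parent page. A minimal in-package sketch; the helper and values are hypothetical:

    package deadlinks

    import "net/url"

    // exampleLinkQueue is a hypothetical helper showing how an entry is built.
    func exampleLinkQueue() linkQueue {
        parent, _ := url.Parse(`http://127.0.0.1:18080/`)
        return linkQueue{
            parentUrl: parent, // Page where the link was found; nil for the base URL.
            url:       `http://127.0.0.1:18080/broken.png`,
        }
    }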
diff --git a/result.go b/result.go
index a66114f..1fc11c5 100644
--- a/result.go
+++ b/result.go
@@ -3,6 +3,11 @@
package deadlinks
+import (
+ "slices"
+ "strings"
+)
+
// Broken stores the link with its HTTP status.
type Broken struct {
Link string
@@ -20,3 +25,11 @@ func newResult() *Result {
PageLinks: map[string][]Broken{},
}
}
+
+func (result *Result) sort() {
+ for _, listBroken := range result.PageLinks {
+ slices.SortFunc(listBroken, func(a, b Broken) int {
+ return strings.Compare(a.Link, b.Link)
+ })
+ }
+}
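
Because pages are scanned by concurrent goroutines, broken links are appended in nondeterministic order; sorting each list by Link keeps the output and the test expectations above stable. A minimal sketch of the effect:

    package main

    import (
        "fmt"
        "slices"
        "strings"
    )

    type Broken struct {
        Link string
        Code int
    }

    func main() {
        listBroken := []Broken{
            {Link: `/z.png`, Code: 404},
            {Link: `/a.png`, Code: 404},
        }
        slices.SortFunc(listBroken, func(a, b Broken) int {
            return strings.Compare(a.Link, b.Link)
        })
        fmt.Println(listBroken) // [{/a.png 404} {/z.png 404}], regardless of scan order.
    }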
diff --git a/testdata/web/index.html b/testdata/web/index.html
index 5d85bcf..1124813 100644
--- a/testdata/web/index.html
+++ b/testdata/web/index.html
@@ -7,5 +7,6 @@ SPDX-License-Identifier: GPL-3.0-only
<img src="/broken.png" />
<a href="/brokenPage">Broken page</a>
<a href="/page2">Page 2</a>
+ <a href="https://kilabit.info/brokenPage">Broken page at kilabit.info</a>
</body>
</html>
diff --git a/testdata/web/page2/index.html b/testdata/web/page2/index.html
index 933dede..0fc7601 100644
--- a/testdata/web/page2/index.html
+++ b/testdata/web/page2/index.html
@@ -4,7 +4,10 @@ SPDX-License-Identifier: GPL-3.0-only
-->
<html>
<body>
+ <img src="/broken.png" />
<img src="broken2.png" />
<a href="broken/relative">broken relative link</a>
+ <a href="/">Back with absolute path</a>
+ <a href="../">Back with relative path</a>
</body>
</html>
diff --git a/worker.go b/worker.go
index 8463c79..0b2ffc0 100644
--- a/worker.go
+++ b/worker.go
@@ -6,19 +6,23 @@ package deadlinks
import (
"fmt"
"io"
+ "log"
"net/http"
+ "net/url"
+ "strings"
"sync"
"golang.org/x/net/html"
+ "golang.org/x/net/html/atom"
)
type worker struct {
- // seenPage store the page URL that has been scanned and its HTTP status
+	// seenLink stores each link URL that has been scanned and its HTTP status
// code.
- seenPage map[string]int
+ seenLink map[string]int
- // pageq contains queue of page URL to be scanned.
- pageq chan string
+	// linkq contains the queue of links to be scanned.
+ linkq chan linkQueue
// errq contains error from scanning a page URL.
errq chan error
@@ -26,82 +30,121 @@ type worker struct {
// result contains a map of page URL to its list of broken links.
result *Result
+	// baseUrl is the base URL to scan; relative and absolute links and
+	// image sources are joined to it.
+ baseUrl *url.URL
+
// wg sync the goroutine scanner.
wg sync.WaitGroup
- // seenPageMtx guard the seenPage field from concurrent read/write.
- seenPageMtx sync.Mutex
+	// seenLinkMtx guards the seenLink field from concurrent read/write.
+ seenLinkMtx sync.Mutex
}
-func newWorker(baseUrl string) (wrk *worker) {
+func newWorker(baseUrl string) (wrk *worker, err error) {
wrk = &worker{
- pageq: make(chan string, 1),
+ seenLink: map[string]int{},
+ linkq: make(chan linkQueue, 1000),
errq: make(chan error, 1),
- seenPage: map[string]int{},
result: newResult(),
}
- wrk.pageq <- baseUrl
- return wrk
+ wrk.baseUrl, err = url.Parse(baseUrl)
+ if err != nil {
+ return nil, err
+ }
+
+ wrk.baseUrl = wrk.baseUrl.JoinPath(`/`)
+ wrk.linkq <- linkQueue{
+ parentUrl: nil,
+ url: wrk.baseUrl.String(),
+ }
+ return wrk, nil
}
func (wrk *worker) run() (result *Result, err error) {
- for len(wrk.pageq) > 0 {
+	var ever = true
+ for ever {
select {
- case page := <-wrk.pageq:
+ case linkq := <-wrk.linkq:
wrk.wg.Add(1)
- go wrk.scan(page)
+ go wrk.scan(linkq)
case err = <-wrk.errq:
- close(wrk.pageq)
return nil, err
default:
wrk.wg.Wait()
+			// All scanners are idle; stop once the queue is drained.
+			if len(wrk.linkq) == 0 {
+				ever = false
+			}
}
}
+ wrk.result.sort()
return wrk.result, nil
}
// scan fetches the HTML page and scans it for broken links.
-func (wrk *worker) scan(pageUrl string) {
+func (wrk *worker) scan(linkq linkQueue) {
var logp = `scan`
defer wrk.wg.Done()
- if wrk.hasSeen(pageUrl) {
+ wrk.seenLinkMtx.Lock()
+ statusCode, seen := wrk.seenLink[linkq.url]
+ wrk.seenLinkMtx.Unlock()
+ if seen {
+ if statusCode >= http.StatusBadRequest {
+ wrk.markDead(linkq, statusCode)
+ }
return
}
- wrk.markSeen(pageUrl, http.StatusProcessing)
+ wrk.seenLinkMtx.Lock()
+ wrk.seenLink[linkq.url] = http.StatusProcessing
+ wrk.seenLinkMtx.Unlock()
var (
httpResp *http.Response
err error
)
- httpResp, err = http.Get(pageUrl)
+ httpResp, err = http.Get(linkq.url)
if err != nil {
wrk.errq <- err
return
}
+	// Close the response body on every return path, including dead links.
+	defer httpResp.Body.Close()
+
	if httpResp.StatusCode != http.StatusOK {
-		wrk.errq <- fmt.Errorf(`%s %s: return HTTP status code %d`,
-			logp, pageUrl, httpResp.StatusCode)
+		wrk.markDead(linkq, httpResp.StatusCode)
		return
	}
-	defer httpResp.Body.Close()
- err = wrk.parseHTML(pageUrl, httpResp.Body)
+ err = wrk.parseHTML(linkq.url, httpResp.Body)
if err != nil {
- wrk.errq <- fmt.Errorf(`%s %s: %w`, logp, pageUrl, err)
+ wrk.errq <- fmt.Errorf(`%s %s: %w`, logp, linkq.url, err)
return
}
}
-func (wrk *worker) parseHTML(pageUrl string, body io.Reader) (err error) {
+func (wrk *worker) markDead(linkq linkQueue, httpStatusCode int) {
+ var parentUrl = linkq.parentUrl.String()
+
+ wrk.seenLinkMtx.Lock()
+ var listBroken = wrk.result.PageLinks[parentUrl]
+ listBroken = append(listBroken, Broken{
+ Link: linkq.url,
+ Code: httpStatusCode,
+ })
+ wrk.result.PageLinks[parentUrl] = listBroken
+ wrk.seenLink[linkq.url] = httpStatusCode
+ wrk.seenLinkMtx.Unlock()
+}
+
+func (wrk *worker) parseHTML(linkUrl string, body io.Reader) (err error) {
var (
logp = `parseHTML`
doc *html.Node
)
+
doc, err = html.Parse(body)
if err != nil {
return fmt.Errorf(`%s: %w`, logp, err)
@@ -112,20 +155,66 @@ func (wrk *worker) parseHTML(pageUrl string, body io.Reader) (err error) {
if node.Type != html.ElementNode {
continue
}
+ if node.DataAtom == atom.A {
+ for _, attr := range node.Attr {
+ if attr.Key != `href` {
+ continue
+ }
+ wrk.processLink(linkUrl, attr.Val)
+ }
+ }
+ if node.DataAtom == atom.Img {
+ for _, attr := range node.Attr {
+ if attr.Key != `src` {
+ continue
+ }
+ wrk.processLink(linkUrl, attr.Val)
+ }
+ }
}
return nil
}
-// hasSeen return true if the pageUrl has been scanned.
-func (wrk *worker) hasSeen(pageUrl string) (ok bool) {
- wrk.seenPageMtx.Lock()
- _, ok = wrk.seenPage[pageUrl]
- wrk.seenPageMtx.Unlock()
- return ok
-}
-
-func (wrk *worker) markSeen(pageUrl string, httpStatusCode int) {
- wrk.seenPageMtx.Lock()
- wrk.seenPage[pageUrl] = httpStatusCode
- wrk.seenPageMtx.Unlock()
+func (wrk *worker) processLink(rawParentUrl string, val string) {
+ if len(val) == 0 {
+ return
+ }
+ var parentUrl *url.URL
+ var err error
+ parentUrl, err = url.Parse(rawParentUrl)
+	if err != nil {
+		// Should not happen: rawParentUrl was already fetched by scan.
+		log.Fatal(err)
+	}
+ if val[0] == '/' {
+		// Link to the same domain will be queued for scanning.
+ var newUrl = wrk.baseUrl.JoinPath(val)
+ wrk.linkq <- linkQueue{
+ parentUrl: parentUrl,
+ url: newUrl.String(),
+ }
+ return
+ }
+ if strings.HasPrefix(val, `http`) {
+ var newUrl *url.URL
+ newUrl, err = url.Parse(val)
+ if err != nil {
+ var linkq = linkQueue{
+ parentUrl: parentUrl,
+ url: val,
+ }
+			// 700 is a sentinel code outside the HTTP range for an unparseable URL.
+			wrk.markDead(linkq, 700)
+ return
+ }
+ wrk.linkq <- linkQueue{
+ parentUrl: parentUrl,
+ url: newUrl.String(),
+ }
+ return
+ }
+	// val is relative to the parent URL.
+ var newUrl = parentUrl.JoinPath(`/`, val)
+ wrk.linkq <- linkQueue{
+ parentUrl: parentUrl,
+ url: newUrl.String(),
+ }
}
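
processLink handles three shapes of link: an absolute path is joined to the base URL, a full http(s) URL is parsed and queued as-is (or marked dead with the sentinel code 700 if it cannot be parsed), and anything else is joined to the parent URL. A sketch of the resolution each branch produces, with illustrative addresses:

    package main

    import (
        "fmt"
        "net/url"
    )

    func main() {
        base, _ := url.Parse(`http://127.0.0.1:18080/`)
        parent, _ := url.Parse(`http://127.0.0.1:18080/page2`)

        // Absolute path resolved against the base URL.
        fmt.Println(base.JoinPath(`/broken.png`)) // http://127.0.0.1:18080/broken.png

        // Relative path resolved against the parent URL.
        fmt.Println(parent.JoinPath(`/`, `broken/relative`)) // http://127.0.0.1:18080/page2/broken/relative
    }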