diff options
| author | Shulhan <ms@kilabit.info> | 2025-05-23 11:01:23 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2025-05-27 01:51:35 +0700 |
| commit | bcccb3d65bf1714bb03ef6342a85fe14e6209c51 (patch) | |
| tree | 4dfe3fd89737751228a13a712897b6bc07cd2d68 | |
| parent | 25c19699c84bdf77244edd1dda73652ac57376a6 (diff) | |
| download | jarink-bcccb3d65bf1714bb03ef6342a85fe14e6209c51.tar.xz | |
all: complete the first minimum working implementation
The current implementation covers at least 84% of the cases.
Todo,
* CLI for scan
* add more test cases for 100% coverage, including scan on invalid
base URL, scan on invalid HTML page, scan on invalid href or
src image
| -rw-r--r-- | .gitignore | 2 | ||||
| -rw-r--r-- | Makefile | 7 | ||||
| -rw-r--r-- | deadlinks.go | 14 | ||||
| -rw-r--r-- | deadlinks_test.go | 16 | ||||
| -rw-r--r-- | link_queue.go | 11 | ||||
| -rw-r--r-- | result.go | 13 | ||||
| -rw-r--r-- | testdata/web/index.html | 1 | ||||
| -rw-r--r-- | testdata/web/page2/index.html | 3 | ||||
| -rw-r--r-- | worker.go | 161 |
9 files changed, 183 insertions, 45 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2ad3c05 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/cover.html +/cover.out @@ -1,6 +1,9 @@ ## SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> ## SPDX-License-Identifier: GPL-3.0-only +COVER_OUT:=cover.out +COVER_HTML:=cover.html + .PHONY: all all: lint test @@ -11,4 +14,6 @@ lint: .PHONY: test test: - go test ./... + CGO_ENABLED=1 go test -failfast -timeout=1m -race \ + -coverprofile=$(COVER_OUT) ./... + go tool cover -html=$(COVER_OUT) -o $(COVER_HTML) diff --git a/deadlinks.go b/deadlinks.go index 0f83c8a..4c0f062 100644 --- a/deadlinks.go +++ b/deadlinks.go @@ -3,16 +3,24 @@ package deadlinks -import "fmt" +import ( + "fmt" +) // Scan the baseUrl for dead links. func Scan(baseUrl string) (result *Result, err error) { var logp = `Scan` - var worker = newWorker(baseUrl) + var wrk *worker - result, err = worker.run() + wrk, err = newWorker(baseUrl) if err != nil { return nil, fmt.Errorf(`%s: %s`, logp, err) } + + result, err = wrk.run() + if err != nil { + return nil, fmt.Errorf(`%s: %s`, logp, err) + } + return result, nil } diff --git a/deadlinks_test.go b/deadlinks_test.go index b449b22..9556803 100644 --- a/deadlinks_test.go +++ b/deadlinks_test.go @@ -38,7 +38,7 @@ func TestMain(m *testing.M) { } func TestDeadLinks_Scan(t *testing.T) { - var testUrl = `http://` + testListenAddress + var testUrl = `http://` + testListenAddress + `/` type testCase struct { exp map[string][]deadlinks.Broken @@ -49,18 +49,24 @@ func TestDeadLinks_Scan(t *testing.T) { baseUrl: testUrl, exp: map[string][]deadlinks.Broken{ testUrl: []deadlinks.Broken{{ - Code: http.StatusNotFound, Link: testUrl + `broken.png`, - }, { Code: http.StatusNotFound, + }, { Link: testUrl + `brokenPage`, + Code: http.StatusNotFound, + }, { + Link: `https://kilabit.info/brokenPage`, + Code: http.StatusNotFound, }}, testUrl + `page2`: []deadlinks.Broken{{ + Link: testUrl + `broken.png`, Code: http.StatusNotFound, - Link: 
testUrl + `broken2.png`, }, { - Code: http.StatusNotFound, Link: testUrl + `page2/broken/relative`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `page2/broken2.png`, + Code: http.StatusNotFound, }}, }, }} diff --git a/link_queue.go b/link_queue.go new file mode 100644 index 0000000..dfcba76 --- /dev/null +++ b/link_queue.go @@ -0,0 +1,11 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> +// SPDX-License-Identifier: GPL-3.0-only + +package deadlinks + +import "net/url" + +type linkQueue struct { + parentUrl *url.URL + url string +} @@ -3,6 +3,11 @@ package deadlinks +import ( + "slices" + "strings" +) + // Broken store the link with its HTTP status. type Broken struct { Link string @@ -20,3 +25,11 @@ func newResult() *Result { PageLinks: map[string][]Broken{}, } } + +func (result *Result) sort() { + for _, listBroken := range result.PageLinks { + slices.SortFunc(listBroken, func(a, b Broken) int { + return strings.Compare(a.Link, b.Link) + }) + } +} diff --git a/testdata/web/index.html b/testdata/web/index.html index 5d85bcf..1124813 100644 --- a/testdata/web/index.html +++ b/testdata/web/index.html @@ -7,5 +7,6 @@ SPDX-License-Identifier: GPL-3.0-only <img src="/broken.png" /> <a href="/brokenPage">Broken page</a> <a href="/page2">Page 2</a> + <a href="https://kilabit.info/brokenPage">Broken page at kilabit.info</a> </body> </html> diff --git a/testdata/web/page2/index.html b/testdata/web/page2/index.html index 933dede..0fc7601 100644 --- a/testdata/web/page2/index.html +++ b/testdata/web/page2/index.html @@ -4,7 +4,10 @@ SPDX-License-Identifier: GPL-3.0-only --> <html> <body> + <img src="/broken.png" /> <img src="broken2.png" /> <a href="broken/relative">broken relative link</a> + <a href="/">Back with absolute path</a> + <a href="../">Back with relative path</a> </body> </html> @@ -6,19 +6,23 @@ package deadlinks import ( "fmt" "io" + "log" "net/http" + "net/url" + "strings" "sync" "golang.org/x/net/html" + "golang.org/x/net/html/atom" ) 
type worker struct { - // seenPage store the page URL that has been scanned and its HTTP status + // seenLink store the page URL that has been scanned and its HTTP status // code. - seenPage map[string]int + seenLink map[string]int - // pageq contains queue of page URL to be scanned. - pageq chan string + // linkq contains queue of page URL to be scanned. + linkq chan linkQueue // errq contains error from scanning a page URL. errq chan error @@ -26,82 +30,121 @@ type worker struct { // result contains map of page URL and its list of broken link. result *Result + // The base URL to scan that will be joined to relative or absolute + // links or image. + baseUrl *url.URL + // wg sync the goroutine scanner. wg sync.WaitGroup - // seenPageMtx guard the seenPage field from concurrent read/write. - seenPageMtx sync.Mutex + // seenLinkMtx guard the seenLink field from concurrent read/write. + seenLinkMtx sync.Mutex } -func newWorker(baseUrl string) (wrk *worker) { +func newWorker(baseUrl string) (wrk *worker, err error) { wrk = &worker{ - pageq: make(chan string, 1), + seenLink: map[string]int{}, + linkq: make(chan linkQueue, 1000), errq: make(chan error, 1), - seenPage: map[string]int{}, result: newResult(), } - wrk.pageq <- baseUrl - return wrk + wrk.baseUrl, err = url.Parse(baseUrl) + if err != nil { + return nil, err + } + + wrk.baseUrl = wrk.baseUrl.JoinPath(`/`) + wrk.linkq <- linkQueue{ + parentUrl: nil, + url: wrk.baseUrl.String(), + } + return wrk, nil } func (wrk *worker) run() (result *Result, err error) { - for len(wrk.pageq) > 0 { + var ever bool = true + for ever { select { - case page := <-wrk.pageq: + case linkq := <-wrk.linkq: wrk.wg.Add(1) - go wrk.scan(page) + go wrk.scan(linkq) case err = <-wrk.errq: - close(wrk.pageq) return nil, err default: wrk.wg.Wait() + if len(wrk.linkq) == 0 { + ever = false + } } } + wrk.result.sort() return wrk.result, nil } // scan the function that fetch the HTML page and scan for broken links. 
-func (wrk *worker) scan(pageUrl string) { +func (wrk *worker) scan(linkq linkQueue) { var logp = `scan` defer wrk.wg.Done() - if wrk.hasSeen(pageUrl) { + wrk.seenLinkMtx.Lock() + statusCode, seen := wrk.seenLink[linkq.url] + wrk.seenLinkMtx.Unlock() + if seen { + if statusCode >= http.StatusBadRequest { + wrk.markDead(linkq, statusCode) + } return } - wrk.markSeen(pageUrl, http.StatusProcessing) + wrk.seenLinkMtx.Lock() + wrk.seenLink[linkq.url] = http.StatusProcessing + wrk.seenLinkMtx.Unlock() var ( httpResp *http.Response err error ) - httpResp, err = http.Get(pageUrl) + httpResp, err = http.Get(linkq.url) if err != nil { wrk.errq <- err return } if httpResp.StatusCode != http.StatusOK { - wrk.errq <- fmt.Errorf(`%s %s: return HTTP status code %d`, - logp, pageUrl, httpResp.StatusCode) + wrk.markDead(linkq, httpResp.StatusCode) return } defer httpResp.Body.Close() - err = wrk.parseHTML(pageUrl, httpResp.Body) + err = wrk.parseHTML(linkq.url, httpResp.Body) if err != nil { - wrk.errq <- fmt.Errorf(`%s %s: %w`, logp, pageUrl, err) + wrk.errq <- fmt.Errorf(`%s %s: %w`, logp, linkq.url, err) return } } -func (wrk *worker) parseHTML(pageUrl string, body io.Reader) (err error) { +func (wrk *worker) markDead(linkq linkQueue, httpStatusCode int) { + var parentUrl = linkq.parentUrl.String() + + wrk.seenLinkMtx.Lock() + var listBroken = wrk.result.PageLinks[parentUrl] + listBroken = append(listBroken, Broken{ + Link: linkq.url, + Code: httpStatusCode, + }) + wrk.result.PageLinks[parentUrl] = listBroken + wrk.seenLink[linkq.url] = httpStatusCode + wrk.seenLinkMtx.Unlock() +} + +func (wrk *worker) parseHTML(linkUrl string, body io.Reader) (err error) { var ( logp = `parseHTML` doc *html.Node ) + doc, err = html.Parse(body) if err != nil { return fmt.Errorf(`%s: %w`, logp, err) @@ -112,20 +155,66 @@ func (wrk *worker) parseHTML(pageUrl string, body io.Reader) (err error) { if node.Type != html.ElementNode { continue } + if node.DataAtom == atom.A { + for _, attr := range 
node.Attr { + if attr.Key != `href` { + continue + } + wrk.processLink(linkUrl, attr.Val) + } + } + if node.DataAtom == atom.Img { + for _, attr := range node.Attr { + if attr.Key != `src` { + continue + } + wrk.processLink(linkUrl, attr.Val) + } + } } return nil } -// hasSeen return true if the pageUrl has been scanned. -func (wrk *worker) hasSeen(pageUrl string) (ok bool) { - wrk.seenPageMtx.Lock() - _, ok = wrk.seenPage[pageUrl] - wrk.seenPageMtx.Unlock() - return ok -} - -func (wrk *worker) markSeen(pageUrl string, httpStatusCode int) { - wrk.seenPageMtx.Lock() - wrk.seenPage[pageUrl] = httpStatusCode - wrk.seenPageMtx.Unlock() +func (wrk *worker) processLink(rawParentUrl string, val string) { + if len(val) == 0 { + return + } + var parentUrl *url.URL + var err error + parentUrl, err = url.Parse(rawParentUrl) + if err != nil { + log.Fatal(err) + } + if val[0] == '/' { + // Link to the same domain will queued for scanning. + var newUrl = wrk.baseUrl.JoinPath(val) + wrk.linkq <- linkQueue{ + parentUrl: parentUrl, + url: newUrl.String(), + } + return + } + if strings.HasPrefix(val, `http`) { + var newUrl *url.URL + newUrl, err = url.Parse(val) + if err != nil { + var linkq = linkQueue{ + parentUrl: parentUrl, + url: val, + } + wrk.markDead(linkq, 700) + return + } + wrk.linkq <- linkQueue{ + parentUrl: parentUrl, + url: newUrl.String(), + } + return + } + // val is relative to parent URL. + var newUrl = parentUrl.JoinPath(`/`, val) + wrk.linkq <- linkQueue{ + parentUrl: parentUrl, + url: newUrl.String(), + } } |
