diff options
| author | Shulhan <ms@kilabit.info> | 2025-05-29 12:58:19 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2025-05-29 12:58:19 +0700 |
| commit | fa85558a0d1b20e4d203ddb537c3bde51bc5131f (patch) | |
| tree | e17db63c00d818a7d621574021055822d6c46479 | |
| parent | bcccb3d65bf1714bb03ef6342a85fe14e6209c51 (diff) | |
| download | jarink-fa85558a0d1b20e4d203ddb537c3bde51bc5131f.tar.xz | |
all: handle case for invalid URL, dead server, and on subpage
Scanning an invalid URL like "127.0.0.1:14594" (without an HTTP
scheme), or "http://127.0.0.1:14594" (server not available), should
return an error.
Scanning a subpage like "http://127.0.0.1:11836/page2" should return
the same result as scanning from the base URL
"http://127.0.0.1:11836".
| -rw-r--r-- | deadlinks_test.go | 59 | ||||
| -rw-r--r-- | url_test.go | 39 | ||||
| -rw-r--r-- | worker.go | 56 |
3 files changed, 122 insertions, 32 deletions
diff --git a/deadlinks_test.go b/deadlinks_test.go index 9556803..a43df3d 100644 --- a/deadlinks_test.go +++ b/deadlinks_test.go @@ -38,34 +38,65 @@ func TestMain(m *testing.M) { } func TestDeadLinks_Scan(t *testing.T) { - var testUrl = `http://` + testListenAddress + `/` + var testUrl = `http://` + testListenAddress type testCase struct { - exp map[string][]deadlinks.Broken - baseUrl string + exp map[string][]deadlinks.Broken + scanUrl string + expError string } listCase := []testCase{{ - baseUrl: testUrl, + scanUrl: `127.0.0.1:14594`, + expError: `Scan: invalid URL "127.0.0.1:14594"`, + }, { + scanUrl: `http://127.0.0.1:14594`, + expError: `Scan: Get "http://127.0.0.1:14594": dial tcp 127.0.0.1:14594: connect: connection refused`, + }, { + scanUrl: testUrl, exp: map[string][]deadlinks.Broken{ testUrl: []deadlinks.Broken{{ - Link: testUrl + `broken.png`, + Link: testUrl + `/broken.png`, Code: http.StatusNotFound, }, { - Link: testUrl + `brokenPage`, + Link: testUrl + `/brokenPage`, Code: http.StatusNotFound, }, { Link: `https://kilabit.info/brokenPage`, Code: http.StatusNotFound, }}, - testUrl + `page2`: []deadlinks.Broken{{ - Link: testUrl + `broken.png`, + testUrl + `/page2`: []deadlinks.Broken{{ + Link: testUrl + `/broken.png`, Code: http.StatusNotFound, }, { - Link: testUrl + `page2/broken/relative`, + Link: testUrl + `/page2/broken/relative`, Code: http.StatusNotFound, }, { - Link: testUrl + `page2/broken2.png`, + Link: testUrl + `/page2/broken2.png`, + Code: http.StatusNotFound, + }}, + }, + }, { + scanUrl: testUrl + `/page2`, + exp: map[string][]deadlinks.Broken{ + testUrl: []deadlinks.Broken{{ + Link: testUrl + `/broken.png`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/brokenPage`, + Code: http.StatusNotFound, + }, { + Link: `https://kilabit.info/brokenPage`, + Code: http.StatusNotFound, + }}, + testUrl + `/page2`: []deadlinks.Broken{{ + Link: testUrl + `/broken.png`, + Code: http.StatusNotFound, + }, { + Link: testUrl + 
`/page2/broken/relative`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/page2/broken2.png`, Code: http.StatusNotFound, }}, }, @@ -76,10 +107,12 @@ func TestDeadLinks_Scan(t *testing.T) { err error ) for _, tcase := range listCase { - result, err = deadlinks.Scan(tcase.baseUrl) + result, err = deadlinks.Scan(tcase.scanUrl) if err != nil { - t.Fatal(err) + test.Assert(t, tcase.scanUrl+` error`, + tcase.expError, err.Error()) + continue } - test.Assert(t, tcase.baseUrl, tcase.exp, result.PageLinks) + test.Assert(t, tcase.scanUrl, tcase.exp, result.PageLinks) } } diff --git a/url_test.go b/url_test.go new file mode 100644 index 0000000..698c49c --- /dev/null +++ b/url_test.go @@ -0,0 +1,39 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> +// SPDX-License-Identifier: GPL-3.0-only + +package deadlinks + +import ( + "net/url" + "testing" + + "git.sr.ht/~shulhan/pakakeh.go/lib/test" +) + +func TestUrlString(t *testing.T) { + type testCase struct { + rawUrl string + exp string + } + var listCase = []testCase{{ + rawUrl: `http://127.0.0.1`, + exp: `http://127.0.0.1`, + }, { + rawUrl: `http://127.0.0.1/`, + exp: `http://127.0.0.1/`, + }, { + rawUrl: `http://127.0.0.1/page`, + exp: `http://127.0.0.1/page`, + }, { + rawUrl: `http://127.0.0.1/page/`, + exp: `http://127.0.0.1/page/`, + }} + for _, tcase := range listCase { + gotUrl, err := url.Parse(tcase.rawUrl) + if err != nil { + t.Fatal(err) + } + var got = gotUrl.String() + test.Assert(t, tcase.rawUrl, tcase.exp, got) + } +} @@ -30,10 +30,13 @@ type worker struct { // result contains map of page URL and its list of broken link. result *Result - // The base URL to scan that will be joined to relative or absolute + // The base URL that will be joined to relative or absolute // links or image. baseUrl *url.URL + // The URL to scan. + scanUrl *url.URL + // wg sync the goroutine scanner. 
wg sync.WaitGroup @@ -41,22 +44,29 @@ type worker struct { seenLinkMtx sync.Mutex } -func newWorker(baseUrl string) (wrk *worker, err error) { +func newWorker(scanUrl string) (wrk *worker, err error) { wrk = &worker{ seenLink: map[string]int{}, linkq: make(chan linkQueue, 1000), errq: make(chan error, 1), result: newResult(), } - wrk.baseUrl, err = url.Parse(baseUrl) + + wrk.scanUrl, err = url.Parse(scanUrl) if err != nil { - return nil, err + return nil, fmt.Errorf(`invalid URL %q`, scanUrl) + } + + wrk.scanUrl.Path = strings.TrimSuffix(wrk.scanUrl.Path, `/`) + + wrk.baseUrl = &url.URL{ + Scheme: wrk.scanUrl.Scheme, + Host: wrk.scanUrl.Host, } - wrk.baseUrl = wrk.baseUrl.JoinPath(`/`) wrk.linkq <- linkQueue{ parentUrl: nil, - url: wrk.baseUrl.String(), + url: wrk.scanUrl.String(), } return wrk, nil } @@ -69,13 +79,16 @@ func (wrk *worker) run() (result *Result, err error) { wrk.wg.Add(1) go wrk.scan(linkq) - case err = <-wrk.errq: - return nil, err - default: wrk.wg.Wait() - if len(wrk.linkq) == 0 { - ever = false + + select { + case err = <-wrk.errq: + return nil, err + default: + if len(wrk.linkq) == 0 { + ever = false + } } } } @@ -83,7 +96,7 @@ func (wrk *worker) run() (result *Result, err error) { return wrk.result, nil } -// scan the function that fetch the HTML page and scan for broken links. +// scan fetch the HTML page or image to check if its valid. 
func (wrk *worker) scan(linkq linkQueue) { var logp = `scan` @@ -140,11 +153,9 @@ func (wrk *worker) markDead(linkq linkQueue, httpStatusCode int) { } func (wrk *worker) parseHTML(linkUrl string, body io.Reader) (err error) { - var ( - logp = `parseHTML` - doc *html.Node - ) + var logp = `parseHTML` + var doc *html.Node doc, err = html.Parse(body) if err != nil { return fmt.Errorf(`%s: %w`, logp, err) @@ -179,18 +190,23 @@ func (wrk *worker) processLink(rawParentUrl string, val string) { if len(val) == 0 { return } + var parentUrl *url.URL var err error + parentUrl, err = url.Parse(rawParentUrl) if err != nil { log.Fatal(err) } + + // val is absolute to parent URL. if val[0] == '/' { // Link to the same domain will queued for scanning. var newUrl = wrk.baseUrl.JoinPath(val) + var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`) wrk.linkq <- linkQueue{ parentUrl: parentUrl, - url: newUrl.String(), + url: newUrlStr, } return } @@ -205,16 +221,18 @@ func (wrk *worker) processLink(rawParentUrl string, val string) { wrk.markDead(linkq, 700) return } + var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`) wrk.linkq <- linkQueue{ parentUrl: parentUrl, - url: newUrl.String(), + url: newUrlStr, } return } // val is relative to parent URL. var newUrl = parentUrl.JoinPath(`/`, val) + var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`) wrk.linkq <- linkQueue{ parentUrl: parentUrl, - url: newUrl.String(), + url: newUrlStr, } } |
