summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2025-05-29 12:58:19 +0700
committerShulhan <ms@kilabit.info>2025-05-29 12:58:19 +0700
commitfa85558a0d1b20e4d203ddb537c3bde51bc5131f (patch)
treee17db63c00d818a7d621574021055822d6c46479
parentbcccb3d65bf1714bb03ef6342a85fe14e6209c51 (diff)
downloadjarink-fa85558a0d1b20e4d203ddb537c3bde51bc5131f.tar.xz
all: handle case for invalid URL, dead server, and on subpage
Scanning an invalid URL like "127.0.0.1:14594", without HTTP scheme, and "http://127.0.0.1:14594" (server not available) should return an error. Scanning a subpage like "http://127.0.0.1:11836/page2" should return the same result as scanning from the base URL "http://127.0.0.1:11836".
-rw-r--r--deadlinks_test.go59
-rw-r--r--url_test.go39
-rw-r--r--worker.go56
3 files changed, 122 insertions, 32 deletions
diff --git a/deadlinks_test.go b/deadlinks_test.go
index 9556803..a43df3d 100644
--- a/deadlinks_test.go
+++ b/deadlinks_test.go
@@ -38,34 +38,65 @@ func TestMain(m *testing.M) {
}
func TestDeadLinks_Scan(t *testing.T) {
- var testUrl = `http://` + testListenAddress + `/`
+ var testUrl = `http://` + testListenAddress
type testCase struct {
- exp map[string][]deadlinks.Broken
- baseUrl string
+ exp map[string][]deadlinks.Broken
+ scanUrl string
+ expError string
}
listCase := []testCase{{
- baseUrl: testUrl,
+ scanUrl: `127.0.0.1:14594`,
+ expError: `Scan: invalid URL "127.0.0.1:14594"`,
+ }, {
+ scanUrl: `http://127.0.0.1:14594`,
+ expError: `Scan: Get "http://127.0.0.1:14594": dial tcp 127.0.0.1:14594: connect: connection refused`,
+ }, {
+ scanUrl: testUrl,
exp: map[string][]deadlinks.Broken{
testUrl: []deadlinks.Broken{{
- Link: testUrl + `broken.png`,
+ Link: testUrl + `/broken.png`,
Code: http.StatusNotFound,
}, {
- Link: testUrl + `brokenPage`,
+ Link: testUrl + `/brokenPage`,
Code: http.StatusNotFound,
}, {
Link: `https://kilabit.info/brokenPage`,
Code: http.StatusNotFound,
}},
- testUrl + `page2`: []deadlinks.Broken{{
- Link: testUrl + `broken.png`,
+ testUrl + `/page2`: []deadlinks.Broken{{
+ Link: testUrl + `/broken.png`,
Code: http.StatusNotFound,
}, {
- Link: testUrl + `page2/broken/relative`,
+ Link: testUrl + `/page2/broken/relative`,
Code: http.StatusNotFound,
}, {
- Link: testUrl + `page2/broken2.png`,
+ Link: testUrl + `/page2/broken2.png`,
+ Code: http.StatusNotFound,
+ }},
+ },
+ }, {
+ scanUrl: testUrl + `/page2`,
+ exp: map[string][]deadlinks.Broken{
+ testUrl: []deadlinks.Broken{{
+ Link: testUrl + `/broken.png`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/brokenPage`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: `https://kilabit.info/brokenPage`,
+ Code: http.StatusNotFound,
+ }},
+ testUrl + `/page2`: []deadlinks.Broken{{
+ Link: testUrl + `/broken.png`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken/relative`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken2.png`,
Code: http.StatusNotFound,
}},
},
@@ -76,10 +107,12 @@ func TestDeadLinks_Scan(t *testing.T) {
err error
)
for _, tcase := range listCase {
- result, err = deadlinks.Scan(tcase.baseUrl)
+ result, err = deadlinks.Scan(tcase.scanUrl)
if err != nil {
- t.Fatal(err)
+ test.Assert(t, tcase.scanUrl+` error`,
+ tcase.expError, err.Error())
+ continue
}
- test.Assert(t, tcase.baseUrl, tcase.exp, result.PageLinks)
+ test.Assert(t, tcase.scanUrl, tcase.exp, result.PageLinks)
}
}
diff --git a/url_test.go b/url_test.go
new file mode 100644
index 0000000..698c49c
--- /dev/null
+++ b/url_test.go
@@ -0,0 +1,39 @@
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+// SPDX-License-Identifier: GPL-3.0-only
+
+package deadlinks
+
+import (
+ "net/url"
+ "testing"
+
+ "git.sr.ht/~shulhan/pakakeh.go/lib/test"
+)
+
+func TestUrlString(t *testing.T) {
+ type testCase struct {
+ rawUrl string
+ exp string
+ }
+ var listCase = []testCase{{
+ rawUrl: `http://127.0.0.1`,
+ exp: `http://127.0.0.1`,
+ }, {
+ rawUrl: `http://127.0.0.1/`,
+ exp: `http://127.0.0.1/`,
+ }, {
+ rawUrl: `http://127.0.0.1/page`,
+ exp: `http://127.0.0.1/page`,
+ }, {
+ rawUrl: `http://127.0.0.1/page/`,
+ exp: `http://127.0.0.1/page/`,
+ }}
+ for _, tcase := range listCase {
+ gotUrl, err := url.Parse(tcase.rawUrl)
+ if err != nil {
+ t.Fatal(err)
+ }
+ var got = gotUrl.String()
+ test.Assert(t, tcase.rawUrl, tcase.exp, got)
+ }
+}
diff --git a/worker.go b/worker.go
index 0b2ffc0..206fbd3 100644
--- a/worker.go
+++ b/worker.go
@@ -30,10 +30,13 @@ type worker struct {
// result contains map of page URL and its list of broken link.
result *Result
- // The base URL to scan that will be joined to relative or absolute
+ // The base URL that will be joined to relative or absolute
// links or image.
baseUrl *url.URL
+ // The URL to scan.
+ scanUrl *url.URL
+
// wg sync the goroutine scanner.
wg sync.WaitGroup
@@ -41,22 +44,29 @@ type worker struct {
seenLinkMtx sync.Mutex
}
-func newWorker(baseUrl string) (wrk *worker, err error) {
+func newWorker(scanUrl string) (wrk *worker, err error) {
wrk = &worker{
seenLink: map[string]int{},
linkq: make(chan linkQueue, 1000),
errq: make(chan error, 1),
result: newResult(),
}
- wrk.baseUrl, err = url.Parse(baseUrl)
+
+ wrk.scanUrl, err = url.Parse(scanUrl)
if err != nil {
- return nil, err
+ return nil, fmt.Errorf(`invalid URL %q`, scanUrl)
+ }
+
+ wrk.scanUrl.Path = strings.TrimSuffix(wrk.scanUrl.Path, `/`)
+
+ wrk.baseUrl = &url.URL{
+ Scheme: wrk.scanUrl.Scheme,
+ Host: wrk.scanUrl.Host,
}
- wrk.baseUrl = wrk.baseUrl.JoinPath(`/`)
wrk.linkq <- linkQueue{
parentUrl: nil,
- url: wrk.baseUrl.String(),
+ url: wrk.scanUrl.String(),
}
return wrk, nil
}
@@ -69,13 +79,16 @@ func (wrk *worker) run() (result *Result, err error) {
wrk.wg.Add(1)
go wrk.scan(linkq)
- case err = <-wrk.errq:
- return nil, err
-
default:
wrk.wg.Wait()
- if len(wrk.linkq) == 0 {
- ever = false
+
+ select {
+ case err = <-wrk.errq:
+ return nil, err
+ default:
+ if len(wrk.linkq) == 0 {
+ ever = false
+ }
}
}
}
@@ -83,7 +96,7 @@ func (wrk *worker) run() (result *Result, err error) {
return wrk.result, nil
}
-// scan the function that fetch the HTML page and scan for broken links.
+// scan fetch the HTML page or image to check if its valid.
func (wrk *worker) scan(linkq linkQueue) {
var logp = `scan`
@@ -140,11 +153,9 @@ func (wrk *worker) markDead(linkq linkQueue, httpStatusCode int) {
}
func (wrk *worker) parseHTML(linkUrl string, body io.Reader) (err error) {
- var (
- logp = `parseHTML`
- doc *html.Node
- )
+ var logp = `parseHTML`
+ var doc *html.Node
doc, err = html.Parse(body)
if err != nil {
return fmt.Errorf(`%s: %w`, logp, err)
@@ -179,18 +190,23 @@ func (wrk *worker) processLink(rawParentUrl string, val string) {
if len(val) == 0 {
return
}
+
var parentUrl *url.URL
var err error
+
parentUrl, err = url.Parse(rawParentUrl)
if err != nil {
log.Fatal(err)
}
+
+ // val is absolute to parent URL.
if val[0] == '/' {
// Link to the same domain will queued for scanning.
var newUrl = wrk.baseUrl.JoinPath(val)
+ var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
wrk.linkq <- linkQueue{
parentUrl: parentUrl,
- url: newUrl.String(),
+ url: newUrlStr,
}
return
}
@@ -205,16 +221,18 @@ func (wrk *worker) processLink(rawParentUrl string, val string) {
wrk.markDead(linkq, 700)
return
}
+ var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
wrk.linkq <- linkQueue{
parentUrl: parentUrl,
- url: newUrl.String(),
+ url: newUrlStr,
}
return
}
// val is relative to parent URL.
var newUrl = parentUrl.JoinPath(`/`, val)
+ var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
wrk.linkq <- linkQueue{
parentUrl: parentUrl,
- url: newUrl.String(),
+ url: newUrlStr,
}
}