diff options
| author | Shulhan <ms@kilabit.info> | 2025-06-01 16:02:46 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2025-06-01 16:02:46 +0700 |
| commit | ff0fb55d60f4c29ffb6b8a69c2be3469f794990a (patch) | |
| tree | a3a28c2b1fe241dc1378e237407a2db1cb433779 | |
| parent | 19b2e2d1ac0fd8a88af6b0da3d00017e4ee31c06 (diff) | |
| download | jarink-ff0fb55d60f4c29ffb6b8a69c2be3469f794990a.tar.xz | |
all: brokenlinks should scan only URLs under the given path
Previously, if we passed a URL with a path to brokenlinks, for example
"web.tld/path", it would scan all of the pages in the website "web.tld".
Now, it only scans "/path" and its sub paths.
| -rw-r--r-- | README | 35 | ||||
| -rw-r--r-- | brokenlinks_test.go | 25 | ||||
| -rw-r--r-- | brokenlinks_worker.go | 2 |
3 files changed, 37 insertions, 25 deletions
@@ -20,8 +20,41 @@ Available commands, Links will be scanned on anchor href attribute ("<a href=...>") or on the image src attribute ("<img src=..."). + The URL can be start from base or from sub path. + Scanning from path only report brokenlinks on that path and their + sub paths. + For example, given a website that have the following pages, + + - web.tld (base) + - web.tld/page1 + - web.tld/page1/sub1 + - web.tld/page2 + - web.tld/page2/sub1 + + Invoking brokenlinks with + + $ jarink brokenlinks https://web.tld + + will scan all of the pages, but invoking brokenlinks on path + "/page2" + + $ jarink brokenlinks https://web.tld/page2 + + only scan "/page2" and "/page2/sub1". + Once finished it will print the page and list of broken links in - JSON format to standard output. + JSON format to standard output, + + { + "$PAGE": [{ + "Link": <string>, + "Error": <string>, + "Code": <integer> + }, + ... + ], + ... + } This command accept the following options, diff --git a/brokenlinks_test.go b/brokenlinks_test.go index c1a607f..1c43937 100644 --- a/brokenlinks_test.go +++ b/brokenlinks_test.go @@ -126,31 +126,10 @@ func TestBrokenlinks(t *testing.T) { }, }, }, { + // Scanning on "/path" should not scan the the "/" or other + // pages other than below of "/path" itself. 
scanUrl: testUrl + `/page2`, exp: map[string][]jarink.Broken{ - testUrl: []jarink.Broken{ - { - Link: testUrl + `/broken.png`, - Code: http.StatusNotFound, - }, { - Link: testUrl + `/brokenPage`, - Code: http.StatusNotFound, - }, { - Link: `http://127.0.0.1:abc`, - Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`, - Code: jarink.StatusBadLink, - }, { - Link: `http:/127.0.0.1:11836`, - Error: `Head "http:/127.0.0.1:11836": http: no Host in request URL`, - Code: jarink.StatusBadLink, - }, - }, - testUrl + `/broken.html`: []jarink.Broken{ - { - Link: testUrl + `/brokenPage`, - Code: http.StatusNotFound, - }, - }, testUrl + `/page2`: []jarink.Broken{ { Link: testUrl + `/broken.png`, diff --git a/brokenlinks_worker.go b/brokenlinks_worker.go index a196d2f..5cc8c25 100644 --- a/brokenlinks_worker.go +++ b/brokenlinks_worker.go @@ -303,7 +303,7 @@ func (wrk *brokenlinksWorker) scan(linkq linkQueue) { } _, seen := resultq[nodeLink.url] if !seen { - if !strings.HasPrefix(nodeLink.url, wrk.baseUrl.String()) { + if !strings.HasPrefix(nodeLink.url, wrk.scanUrl.String()) { nodeLink.isExternal = true } resultq[nodeLink.url] = *nodeLink |
