aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2025-06-01 16:02:46 +0700
committerShulhan <ms@kilabit.info>2025-06-01 16:02:46 +0700
commitff0fb55d60f4c29ffb6b8a69c2be3469f794990a (patch)
treea3a28c2b1fe241dc1378e237407a2db1cb433779
parent19b2e2d1ac0fd8a88af6b0da3d00017e4ee31c06 (diff)
downloadjarink-ff0fb55d60f4c29ffb6b8a69c2be3469f794990a.tar.xz
all: brokenlinks should scan only URL on given path
Previously, if we passed a URL with a path to brokenlinks, for example "web.tld/path", it would scan all of the pages in the website "web.tld". Now, it only scans "/path" and its sub paths.
-rw-r--r--README35
-rw-r--r--brokenlinks_test.go25
-rw-r--r--brokenlinks_worker.go2
3 files changed, 37 insertions, 25 deletions
diff --git a/README b/README
index a26722f..2d8b273 100644
--- a/README
+++ b/README
@@ -20,8 +20,41 @@ Available commands,
Links will be scanned on anchor href attribute ("<a href=...>") or
on the image src attribute ("<img src=...").
+ The URL can start from the base or from a sub path.
+ Scanning from a path only reports broken links on that path and its
+ sub paths.
+ For example, given a website that has the following pages,
+
+ - web.tld (base)
+ - web.tld/page1
+ - web.tld/page1/sub1
+ - web.tld/page2
+ - web.tld/page2/sub1
+
+ Invoking brokenlinks with
+
+ $ jarink brokenlinks https://web.tld
+
+ will scan all of the pages, but invoking brokenlinks on path
+ "/page2"
+
+ $ jarink brokenlinks https://web.tld/page2
+
+ only scans "/page2" and "/page2/sub1".
+
Once finished it will print the page and list of broken links in
- JSON format to standard output.
+ JSON format to standard output,
+
+ {
+ "$PAGE": [{
+ "Link": <string>,
+ "Error": <string>,
+ "Code": <integer>
+ },
+ ...
+ ],
+ ...
+ }
This command accepts the following options,
diff --git a/brokenlinks_test.go b/brokenlinks_test.go
index c1a607f..1c43937 100644
--- a/brokenlinks_test.go
+++ b/brokenlinks_test.go
@@ -126,31 +126,10 @@ func TestBrokenlinks(t *testing.T) {
},
},
}, {
+ // Scanning on "/path" should not scan "/" or any other
+ // pages except those below "/path" itself.
scanUrl: testUrl + `/page2`,
exp: map[string][]jarink.Broken{
- testUrl: []jarink.Broken{
- {
- Link: testUrl + `/broken.png`,
- Code: http.StatusNotFound,
- }, {
- Link: testUrl + `/brokenPage`,
- Code: http.StatusNotFound,
- }, {
- Link: `http://127.0.0.1:abc`,
- Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`,
- Code: jarink.StatusBadLink,
- }, {
- Link: `http:/127.0.0.1:11836`,
- Error: `Head "http:/127.0.0.1:11836": http: no Host in request URL`,
- Code: jarink.StatusBadLink,
- },
- },
- testUrl + `/broken.html`: []jarink.Broken{
- {
- Link: testUrl + `/brokenPage`,
- Code: http.StatusNotFound,
- },
- },
testUrl + `/page2`: []jarink.Broken{
{
Link: testUrl + `/broken.png`,
diff --git a/brokenlinks_worker.go b/brokenlinks_worker.go
index a196d2f..5cc8c25 100644
--- a/brokenlinks_worker.go
+++ b/brokenlinks_worker.go
@@ -303,7 +303,7 @@ func (wrk *brokenlinksWorker) scan(linkq linkQueue) {
}
_, seen := resultq[nodeLink.url]
if !seen {
- if !strings.HasPrefix(nodeLink.url, wrk.baseUrl.String()) {
+ if !strings.HasPrefix(nodeLink.url, wrk.scanUrl.String()) {
nodeLink.isExternal = true
}
resultq[nodeLink.url] = *nodeLink