aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2026-02-12 01:04:40 +0700
committerShulhan <ms@kilabit.info>2026-02-12 01:04:40 +0700
commit9c7ee77376294e9abd70ca356e26d0ab16ad7466 (patch)
tree2fc279d233575571389ea89d2ba0be16314bc7dd
parent8100b3be0730173a77f1a64f9ac6bc8862a159ac (diff)
downloadjarink-9c7ee77376294e9abd70ca356e26d0ab16ad7466.tar.xz
brokenlinks: store the anchor or image source in link
In the struct Link, we add the field Value that stores the href from an A element or the src from an IMG element. This allows us to debug any error during scan, especially when joining path and link.
-rw-r--r--brokenlinks/brokenlinks_test.go15
-rw-r--r--brokenlinks/testdata/exp_cache.json3
-rw-r--r--brokenlinks/worker.go12
-rw-r--r--link.go3
4 files changed, 28 insertions, 5 deletions
diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go
index ab857cd..460e5ac 100644
--- a/brokenlinks/brokenlinks_test.go
+++ b/brokenlinks/brokenlinks_test.go
@@ -281,16 +281,19 @@ func TestScan(t *testing.T) {
testUrl + `/`: []jarink.Link{
{
ParentUrl: parsedTestUrl,
+ Value: `/broken.png`,
Url: testUrl + `/broken.png`,
StatusCode: http.StatusNotFound,
Kind: int(atom.Img),
}, {
ParentUrl: parsedTestUrl,
+ Value: `/brokenPage`,
Url: testUrl + `/brokenPage`,
StatusCode: http.StatusNotFound,
Kind: int(atom.A),
}, {
ParentUrl: parsedTestUrl,
+ Value: `http://127.0.0.1:abc`,
Url: `http://127.0.0.1:abc`,
ErrScan: errScanPort,
Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`,
@@ -298,6 +301,7 @@ func TestScan(t *testing.T) {
Kind: int(atom.A),
}, {
ParentUrl: parsedTestUrl,
+ Value: `http:/127.0.0.1:11836`,
Url: `http:/127.0.0.1:11836`,
ErrScan: &url.Error{
Op: `Get`,
@@ -310,6 +314,7 @@ func TestScan(t *testing.T) {
IsExternal: true,
}, {
ParentUrl: parsedTestUrl,
+ Value: `https://domain`,
Url: `https://domain`,
ErrScan: &url.Error{
Op: `Get`,
@@ -333,6 +338,7 @@ func TestScan(t *testing.T) {
testUrl + `/broken.html`: []jarink.Link{
{
ParentUrl: parsedUrlBrokenHtml,
+ Value: `/brokenPage`,
Url: testUrl + `/brokenPage`,
StatusCode: http.StatusNotFound,
Kind: int(atom.A),
@@ -341,16 +347,19 @@ func TestScan(t *testing.T) {
testUrl + `/page2/`: []jarink.Link{
{
ParentUrl: parsedUrlPage2,
+ Value: `/broken.png`,
Url: testUrl + `/broken.png`,
StatusCode: http.StatusNotFound,
Kind: int(atom.Img),
}, {
ParentUrl: parsedUrlPage2,
+ Value: `broken/relative`,
Url: testUrl + `/page2/broken/relative`,
StatusCode: http.StatusNotFound,
Kind: int(atom.A),
}, {
ParentUrl: parsedUrlPage2,
+ Value: `broken2.png`,
Url: testUrl + `/page2/broken2.png`,
StatusCode: http.StatusNotFound,
Kind: int(atom.Img),
@@ -368,17 +377,20 @@ func TestScan(t *testing.T) {
testUrl + `/page2/`: []jarink.Link{
{
ParentUrl: parsedUrlPage2,
+ Value: `/broken.png`,
Url: testUrl + `/broken.png`,
StatusCode: http.StatusNotFound,
Kind: int(atom.Img),
IsExternal: true,
}, {
ParentUrl: parsedUrlPage2,
+ Value: `broken/relative`,
Url: testUrl + `/page2/broken/relative`,
StatusCode: http.StatusNotFound,
Kind: int(atom.A),
}, {
ParentUrl: parsedUrlPage2,
+ Value: `broken2.png`,
Url: testUrl + `/page2/broken2.png`,
StatusCode: http.StatusNotFound,
Kind: int(atom.Img),
@@ -442,17 +454,20 @@ func TestScan_pastResult(t *testing.T) {
testUrl + `/page2/`: []jarink.Link{
{
ParentUrl: parsedUrlPage2,
+ Value: `/broken.png`,
Url: testUrl + `/broken.png`,
StatusCode: http.StatusNotFound,
Kind: int(atom.Img),
IsExternal: true,
}, {
ParentUrl: parsedUrlPage2,
+ Value: `broken/relative`,
Url: testUrl + `/page2/broken/relative`,
StatusCode: http.StatusNotFound,
Kind: int(atom.A),
}, {
ParentUrl: parsedUrlPage2,
+ Value: `broken2.png`,
Url: testUrl + `/page2/broken2.png`,
StatusCode: http.StatusNotFound,
Kind: int(atom.Img),
diff --git a/brokenlinks/testdata/exp_cache.json b/brokenlinks/testdata/exp_cache.json
index e80202f..41828bc 100644
--- a/brokenlinks/testdata/exp_cache.json
+++ b/brokenlinks/testdata/exp_cache.json
@@ -1,16 +1,19 @@
{
"scanned_links": {
"http://127.0.0.1:11900": {
+ "value": "http://127.0.0.1:11900",
"url": "http://127.0.0.1:11900",
"size": 1214,
"status_code": 200
},
"http://127.0.0.1:11900/page2": {
+ "value": "http://127.0.0.1:11900/page2",
"url": "http://127.0.0.1:11900/page2",
"size": 483,
"status_code": 200
},
"https://127.0.0.1:11838": {
+ "value": "https://127.0.0.1:11838",
"url": "https://127.0.0.1:11838",
"size": 1214,
"status_code": 200
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index 387cc05..79ae796 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -89,8 +89,10 @@ func newWorker(opts Options) (wrk *worker, err error) {
}
wrk.baseUrl = &url.URL{
- Scheme: wrk.opts.scanUrl.Scheme,
- Host: wrk.opts.scanUrl.Host,
+ Scheme: wrk.opts.scanUrl.Scheme,
+ Host: wrk.opts.scanUrl.Host,
+ Path: `/`,
+ RawPath: `/`,
}
if opts.PastResultFile == "" {
@@ -304,8 +306,6 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) {
return resultq
}
- var parentUrl *url.URL
-
// Check and get the redirect location or use the original URL.
location := httpResp.Header.Get(`Location`)
if location == `` {
@@ -314,6 +314,7 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) {
linkq.Url = location
}
+ var parentUrl *url.URL
parentUrl, err = url.Parse(location)
if err != nil {
linkq.StatusCode = StatusBadLink
@@ -415,6 +416,7 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) (
linkq = &jarink.Link{
ParentUrl: parentUrl,
+ Value: val,
Kind: kind,
}
@@ -452,7 +454,7 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) (
// "parent/page.ext" + "val" => "parent/val"
tmp, _ := url.Parse(parentUrl.String())
tmp.Path = path.Dir(tmp.Path)
- linkq.Url = parentUrl.JoinPath(tmp.Path).String()
+ linkq.Url = tmp.JoinPath(val).String()
}
return linkq
}
diff --git a/link.go b/link.go
index 266bcdd..0d43e2c 100644
--- a/link.go
+++ b/link.go
@@ -14,6 +14,9 @@ type Link struct {
// The error from scan.
ErrScan error `json:"-"`
+ // Value contains the original URL inside the anchor or image element.
+ Value string `json:"value"`
+
Url string `json:"url"`
Error string `json:"error,omitempty"`
Size int64 `json:"size,omitempty"`