From 9c7ee77376294e9abd70ca356e26d0ab16ad7466 Mon Sep 17 00:00:00 2001 From: Shulhan Date: Thu, 12 Feb 2026 01:04:40 +0700 Subject: brokenlinks: store the anchor or image source in link In the struct Link, we add a field Value that stores the href from an A element or the src from an IMG element. This allows us to debug any error during scan, especially when joining path and link. --- brokenlinks/brokenlinks_test.go | 15 +++++++++++++++ brokenlinks/testdata/exp_cache.json | 3 +++ brokenlinks/worker.go | 12 +++++++----- link.go | 3 +++ 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go index ab857cd..460e5ac 100644 --- a/brokenlinks/brokenlinks_test.go +++ b/brokenlinks/brokenlinks_test.go @@ -281,16 +281,19 @@ func TestScan(t *testing.T) { testUrl + `/`: []jarink.Link{ { ParentUrl: parsedTestUrl, + Value: `/broken.png`, Url: testUrl + `/broken.png`, StatusCode: http.StatusNotFound, Kind: int(atom.Img), }, { ParentUrl: parsedTestUrl, + Value: `/brokenPage`, Url: testUrl + `/brokenPage`, StatusCode: http.StatusNotFound, Kind: int(atom.A), }, { ParentUrl: parsedTestUrl, + Value: `http://127.0.0.1:abc`, Url: `http://127.0.0.1:abc`, ErrScan: errScanPort, Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`, @@ -298,6 +301,7 @@ func TestScan(t *testing.T) { Kind: int(atom.A), }, { ParentUrl: parsedTestUrl, + Value: `http:/127.0.0.1:11836`, Url: `http:/127.0.0.1:11836`, ErrScan: &url.Error{ Op: `Get`, @@ -310,6 +314,7 @@ func TestScan(t *testing.T) { IsExternal: true, }, { ParentUrl: parsedTestUrl, + Value: `https://domain`, Url: `https://domain`, ErrScan: &url.Error{ Op: `Get`, @@ -333,6 +338,7 @@ func TestScan(t *testing.T) { testUrl + `/broken.html`: []jarink.Link{ { ParentUrl: parsedUrlBrokenHtml, + Value: `/brokenPage`, Url: testUrl + `/brokenPage`, StatusCode: http.StatusNotFound, Kind: int(atom.A), @@ -341,16 +347,19 @@ func TestScan(t *testing.T) { testUrl + `/page2/`: []jarink.Link{ { 
ParentUrl: parsedUrlPage2, + Value: `/broken.png`, Url: testUrl + `/broken.png`, StatusCode: http.StatusNotFound, Kind: int(atom.Img), }, { ParentUrl: parsedUrlPage2, + Value: `broken/relative`, Url: testUrl + `/page2/broken/relative`, StatusCode: http.StatusNotFound, Kind: int(atom.A), }, { ParentUrl: parsedUrlPage2, + Value: `broken2.png`, Url: testUrl + `/page2/broken2.png`, StatusCode: http.StatusNotFound, Kind: int(atom.Img), @@ -368,17 +377,20 @@ func TestScan(t *testing.T) { testUrl + `/page2/`: []jarink.Link{ { ParentUrl: parsedUrlPage2, + Value: `/broken.png`, Url: testUrl + `/broken.png`, StatusCode: http.StatusNotFound, Kind: int(atom.Img), IsExternal: true, }, { ParentUrl: parsedUrlPage2, + Value: `broken/relative`, Url: testUrl + `/page2/broken/relative`, StatusCode: http.StatusNotFound, Kind: int(atom.A), }, { ParentUrl: parsedUrlPage2, + Value: `broken2.png`, Url: testUrl + `/page2/broken2.png`, StatusCode: http.StatusNotFound, Kind: int(atom.Img), @@ -442,17 +454,20 @@ func TestScan_pastResult(t *testing.T) { testUrl + `/page2/`: []jarink.Link{ { ParentUrl: parsedUrlPage2, + Value: `/broken.png`, Url: testUrl + `/broken.png`, StatusCode: http.StatusNotFound, Kind: int(atom.Img), IsExternal: true, }, { ParentUrl: parsedUrlPage2, + Value: `broken/relative`, Url: testUrl + `/page2/broken/relative`, StatusCode: http.StatusNotFound, Kind: int(atom.A), }, { ParentUrl: parsedUrlPage2, + Value: `broken2.png`, Url: testUrl + `/page2/broken2.png`, StatusCode: http.StatusNotFound, Kind: int(atom.Img), diff --git a/brokenlinks/testdata/exp_cache.json b/brokenlinks/testdata/exp_cache.json index e80202f..41828bc 100644 --- a/brokenlinks/testdata/exp_cache.json +++ b/brokenlinks/testdata/exp_cache.json @@ -1,16 +1,19 @@ { "scanned_links": { "http://127.0.0.1:11900": { + "value": "http://127.0.0.1:11900", "url": "http://127.0.0.1:11900", "size": 1214, "status_code": 200 }, "http://127.0.0.1:11900/page2": { + "value": "http://127.0.0.1:11900/page2", "url": 
"http://127.0.0.1:11900/page2", "size": 483, "status_code": 200 }, "https://127.0.0.1:11838": { + "value": "https://127.0.0.1:11838", "url": "https://127.0.0.1:11838", "size": 1214, "status_code": 200 diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go index 387cc05..79ae796 100644 --- a/brokenlinks/worker.go +++ b/brokenlinks/worker.go @@ -89,8 +89,10 @@ func newWorker(opts Options) (wrk *worker, err error) { } wrk.baseUrl = &url.URL{ - Scheme: wrk.opts.scanUrl.Scheme, - Host: wrk.opts.scanUrl.Host, + Scheme: wrk.opts.scanUrl.Scheme, + Host: wrk.opts.scanUrl.Host, + Path: `/`, + RawPath: `/`, } if opts.PastResultFile == "" { @@ -304,8 +306,6 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) { return resultq } - var parentUrl *url.URL - // Check and get the redirect location or use the original URL. location := httpResp.Header.Get(`Location`) if location == `` { @@ -314,6 +314,7 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) { linkq.Url = location } + var parentUrl *url.URL parentUrl, err = url.Parse(location) if err != nil { linkq.StatusCode = StatusBadLink @@ -415,6 +416,7 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) ( linkq = &jarink.Link{ ParentUrl: parentUrl, + Value: val, Kind: kind, } @@ -452,7 +454,7 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) ( // "parent/page.ext" + "val" => "parent/val" tmp, _ := url.Parse(parentUrl.String()) tmp.Path = path.Dir(tmp.Path) - linkq.Url = parentUrl.JoinPath(tmp.Path).String() + linkq.Url = tmp.JoinPath(val).String() } return linkq } diff --git a/link.go b/link.go index 266bcdd..0d43e2c 100644 --- a/link.go +++ b/link.go @@ -14,6 +14,9 @@ type Link struct { // The error from scan. ErrScan error `json:"-"` + // Value contains the original URL inside the anchor or image element. 
+ Value string `json:"value"` + Url string `json:"url"` Error string `json:"error,omitempty"` Size int64 `json:"size,omitempty"` -- cgit v1.3