diff options
| author | Shulhan <ms@kilabit.info> | 2026-02-12 01:04:40 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2026-02-12 01:04:40 +0700 |
| commit | 9c7ee77376294e9abd70ca356e26d0ab16ad7466 (patch) | |
| tree | 2fc279d233575571389ea89d2ba0be16314bc7dd | |
| parent | 8100b3be0730173a77f1a64f9ac6bc8862a159ac (diff) | |
| download | jarink-9c7ee77376294e9abd70ca356e26d0ab16ad7466.tar.xz | |
brokenlinks: store the anchor or image source in link
In the struct Link, we add the field Value that stores the href from an A
element or the src from an IMG element.
This allows us to debug any error during scan, especially when joining the
path and the link.
| -rw-r--r-- | brokenlinks/brokenlinks_test.go | 15 | ||||
| -rw-r--r-- | brokenlinks/testdata/exp_cache.json | 3 | ||||
| -rw-r--r-- | brokenlinks/worker.go | 12 | ||||
| -rw-r--r-- | link.go | 3 |
4 files changed, 28 insertions, 5 deletions
diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go index ab857cd..460e5ac 100644 --- a/brokenlinks/brokenlinks_test.go +++ b/brokenlinks/brokenlinks_test.go @@ -281,16 +281,19 @@ func TestScan(t *testing.T) { testUrl + `/`: []jarink.Link{ { ParentUrl: parsedTestUrl, + Value: `/broken.png`, Url: testUrl + `/broken.png`, StatusCode: http.StatusNotFound, Kind: int(atom.Img), }, { ParentUrl: parsedTestUrl, + Value: `/brokenPage`, Url: testUrl + `/brokenPage`, StatusCode: http.StatusNotFound, Kind: int(atom.A), }, { ParentUrl: parsedTestUrl, + Value: `http://127.0.0.1:abc`, Url: `http://127.0.0.1:abc`, ErrScan: errScanPort, Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`, @@ -298,6 +301,7 @@ func TestScan(t *testing.T) { Kind: int(atom.A), }, { ParentUrl: parsedTestUrl, + Value: `http:/127.0.0.1:11836`, Url: `http:/127.0.0.1:11836`, ErrScan: &url.Error{ Op: `Get`, @@ -310,6 +314,7 @@ func TestScan(t *testing.T) { IsExternal: true, }, { ParentUrl: parsedTestUrl, + Value: `https://domain`, Url: `https://domain`, ErrScan: &url.Error{ Op: `Get`, @@ -333,6 +338,7 @@ func TestScan(t *testing.T) { testUrl + `/broken.html`: []jarink.Link{ { ParentUrl: parsedUrlBrokenHtml, + Value: `/brokenPage`, Url: testUrl + `/brokenPage`, StatusCode: http.StatusNotFound, Kind: int(atom.A), @@ -341,16 +347,19 @@ func TestScan(t *testing.T) { testUrl + `/page2/`: []jarink.Link{ { ParentUrl: parsedUrlPage2, + Value: `/broken.png`, Url: testUrl + `/broken.png`, StatusCode: http.StatusNotFound, Kind: int(atom.Img), }, { ParentUrl: parsedUrlPage2, + Value: `broken/relative`, Url: testUrl + `/page2/broken/relative`, StatusCode: http.StatusNotFound, Kind: int(atom.A), }, { ParentUrl: parsedUrlPage2, + Value: `broken2.png`, Url: testUrl + `/page2/broken2.png`, StatusCode: http.StatusNotFound, Kind: int(atom.Img), @@ -368,17 +377,20 @@ func TestScan(t *testing.T) { testUrl + `/page2/`: []jarink.Link{ { ParentUrl: parsedUrlPage2, + Value: `/broken.png`, 
Url: testUrl + `/broken.png`, StatusCode: http.StatusNotFound, Kind: int(atom.Img), IsExternal: true, }, { ParentUrl: parsedUrlPage2, + Value: `broken/relative`, Url: testUrl + `/page2/broken/relative`, StatusCode: http.StatusNotFound, Kind: int(atom.A), }, { ParentUrl: parsedUrlPage2, + Value: `broken2.png`, Url: testUrl + `/page2/broken2.png`, StatusCode: http.StatusNotFound, Kind: int(atom.Img), @@ -442,17 +454,20 @@ func TestScan_pastResult(t *testing.T) { testUrl + `/page2/`: []jarink.Link{ { ParentUrl: parsedUrlPage2, + Value: `/broken.png`, Url: testUrl + `/broken.png`, StatusCode: http.StatusNotFound, Kind: int(atom.Img), IsExternal: true, }, { ParentUrl: parsedUrlPage2, + Value: `broken/relative`, Url: testUrl + `/page2/broken/relative`, StatusCode: http.StatusNotFound, Kind: int(atom.A), }, { ParentUrl: parsedUrlPage2, + Value: `broken2.png`, Url: testUrl + `/page2/broken2.png`, StatusCode: http.StatusNotFound, Kind: int(atom.Img), diff --git a/brokenlinks/testdata/exp_cache.json b/brokenlinks/testdata/exp_cache.json index e80202f..41828bc 100644 --- a/brokenlinks/testdata/exp_cache.json +++ b/brokenlinks/testdata/exp_cache.json @@ -1,16 +1,19 @@ { "scanned_links": { "http://127.0.0.1:11900": { + "value": "http://127.0.0.1:11900", "url": "http://127.0.0.1:11900", "size": 1214, "status_code": 200 }, "http://127.0.0.1:11900/page2": { + "value": "http://127.0.0.1:11900/page2", "url": "http://127.0.0.1:11900/page2", "size": 483, "status_code": 200 }, "https://127.0.0.1:11838": { + "value": "https://127.0.0.1:11838", "url": "https://127.0.0.1:11838", "size": 1214, "status_code": 200 diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go index 387cc05..79ae796 100644 --- a/brokenlinks/worker.go +++ b/brokenlinks/worker.go @@ -89,8 +89,10 @@ func newWorker(opts Options) (wrk *worker, err error) { } wrk.baseUrl = &url.URL{ - Scheme: wrk.opts.scanUrl.Scheme, - Host: wrk.opts.scanUrl.Host, + Scheme: wrk.opts.scanUrl.Scheme, + Host: wrk.opts.scanUrl.Host, + 
Path: `/`, + RawPath: `/`, } if opts.PastResultFile == "" { @@ -304,8 +306,6 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) { return resultq } - var parentUrl *url.URL - // Check and get the redirect location or use the original URL. location := httpResp.Header.Get(`Location`) if location == `` { @@ -314,6 +314,7 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) { linkq.Url = location } + var parentUrl *url.URL parentUrl, err = url.Parse(location) if err != nil { linkq.StatusCode = StatusBadLink @@ -415,6 +416,7 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) ( linkq = &jarink.Link{ ParentUrl: parentUrl, + Value: val, Kind: kind, } @@ -452,7 +454,7 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) ( // "parent/page.ext" + "val" => "parent/val" tmp, _ := url.Parse(parentUrl.String()) tmp.Path = path.Dir(tmp.Path) - linkq.Url = parentUrl.JoinPath(tmp.Path).String() + linkq.Url = tmp.JoinPath(val).String() } return linkq } @@ -14,6 +14,9 @@ type Link struct { // The error from scan. ErrScan error `json:"-"` + // Value contains the original URL inside the anchor or image element. + Value string `json:"value"` + Url string `json:"url"` Error string `json:"error,omitempty"` Size int64 `json:"size,omitempty"` |
