diff options
| author | Shulhan <ms@kilabit.info> | 2025-05-31 21:50:12 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2025-05-31 21:50:12 +0700 |
| commit | 0e4126ad99a216a08896156d31aafe3ab5611ba2 (patch) | |
| tree | 2304028638850b0730ffcfdc3d7b7628fa448780 | |
| parent | 95c71232fc977d68aeae0c14aab1e91c9d3e9605 (diff) | |
| download | jarink-0e4126ad99a216a08896156d31aafe3ab5611ba2.tar.xz | |
all: record an error due to broken link in HTML anchor or image
| -rw-r--r-- | deadlinks_test.go | 10 |
| -rw-r--r-- | worker.go | 53 |
2 files changed, 35 insertions, 28 deletions
diff --git a/deadlinks_test.go b/deadlinks_test.go index c93e384..c219aa0 100644 --- a/deadlinks_test.go +++ b/deadlinks_test.go @@ -97,8 +97,9 @@ func TestDeadLinks_Scan(t *testing.T) { Link: testUrl + `/brokenPage`, Code: http.StatusNotFound, }, { - Link: `http://127.0.0.1:abc`, - Code: deadlinks.StatusBadLink, + Link: `http://127.0.0.1:abc`, + Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`, + Code: deadlinks.StatusBadLink, }, { Link: `http:/127.0.0.1:11836`, Error: `Head "http:/127.0.0.1:11836": http: no Host in request URL`, @@ -135,8 +136,9 @@ func TestDeadLinks_Scan(t *testing.T) { Link: testUrl + `/brokenPage`, Code: http.StatusNotFound, }, { - Link: `http://127.0.0.1:abc`, - Code: deadlinks.StatusBadLink, + Link: `http://127.0.0.1:abc`, + Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`, + Code: deadlinks.StatusBadLink, }, { Link: `http:/127.0.0.1:11836`, Error: `Head "http:/127.0.0.1:11836": http: no Host in request URL`, @@ -270,19 +270,17 @@ func (wrk *worker) scan(linkq linkQueue) { } var node *html.Node - var link string - var status int for node = range doc.Descendants() { if node.Type != html.ElementNode { continue } - link = "" + var nodeLink *linkQueue if node.DataAtom == atom.A { for _, attr := range node.Attr { if attr.Key != `href` { continue } - link, status = wrk.processLink(scanUrl, attr.Val, atom.A) + nodeLink = wrk.processLink(scanUrl, attr.Val, atom.A) break } } else if node.DataAtom == atom.Img { @@ -290,55 +288,58 @@ func (wrk *worker) scan(linkq linkQueue) { if attr.Key != `src` { continue } - link, status = wrk.processLink(scanUrl, attr.Val, atom.Img) + nodeLink = wrk.processLink(scanUrl, attr.Val, atom.Img) break } } else { continue } - if link == "" { + if nodeLink == nil { continue } - _, seen := resultq[link] + _, seen := resultq[nodeLink.url] if !seen { - var childLink = linkQueue{ - parentUrl: scanUrl, - url: link, - kind: node.DataAtom, - status: status, + if 
!strings.HasPrefix(nodeLink.url, wrk.baseUrl.String()) { + nodeLink.isExternal = true } - if !strings.HasPrefix(childLink.url, wrk.baseUrl.String()) { - childLink.isExternal = true - } - resultq[link] = childLink + resultq[nodeLink.url] = *nodeLink } } go wrk.pushResult(resultq) } func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) ( - link string, status int, + linkq *linkQueue, ) { if len(val) == 0 { - return "", 0 + return nil } var newUrl *url.URL var err error newUrl, err = url.Parse(val) if err != nil { - return val, StatusBadLink + return &linkQueue{ + parentUrl: parentUrl, + errScan: err, + url: val, + kind: kind, + status: StatusBadLink, + } } newUrl.Fragment = "" newUrl.RawFragment = "" if kind == atom.A && val[0] == '#' { // Ignore link to ID, like `href="#element_id"`. - return "", 0 + return nil } if strings.HasPrefix(val, `http`) { - link = strings.TrimSuffix(newUrl.String(), `/`) - return link, 0 + return &linkQueue{ + parentUrl: parentUrl, + url: strings.TrimSuffix(newUrl.String(), `/`), + kind: kind, + } } if val[0] == '/' { // val is absolute to parent URL. @@ -347,8 +348,12 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) ( // val is relative to parent URL. newUrl = parentUrl.JoinPath(`/`, newUrl.Path) } - link = strings.TrimSuffix(newUrl.String(), `/`) - return link, 0 + linkq = &linkQueue{ + parentUrl: parentUrl, + url: strings.TrimSuffix(newUrl.String(), `/`), + kind: kind, + } + return linkq } func (wrk *worker) pushResult(resultq map[string]linkQueue) { |
