From 0e4126ad99a216a08896156d31aafe3ab5611ba2 Mon Sep 17 00:00:00 2001 From: Shulhan Date: Sat, 31 May 2025 21:50:12 +0700 Subject: all: record an error due to broken link in HTML anchor or image --- deadlinks_test.go | 10 ++++++---- worker.go | 53 +++++++++++++++++++++++++++++------------------------ 2 files changed, 35 insertions(+), 28 deletions(-) diff --git a/deadlinks_test.go b/deadlinks_test.go index c93e384..c219aa0 100644 --- a/deadlinks_test.go +++ b/deadlinks_test.go @@ -97,8 +97,9 @@ func TestDeadLinks_Scan(t *testing.T) { Link: testUrl + `/brokenPage`, Code: http.StatusNotFound, }, { - Link: `http://127.0.0.1:abc`, - Code: deadlinks.StatusBadLink, + Link: `http://127.0.0.1:abc`, + Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`, + Code: deadlinks.StatusBadLink, }, { Link: `http:/127.0.0.1:11836`, Error: `Head "http:/127.0.0.1:11836": http: no Host in request URL`, @@ -135,8 +136,9 @@ func TestDeadLinks_Scan(t *testing.T) { Link: testUrl + `/brokenPage`, Code: http.StatusNotFound, }, { - Link: `http://127.0.0.1:abc`, - Code: deadlinks.StatusBadLink, + Link: `http://127.0.0.1:abc`, + Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`, + Code: deadlinks.StatusBadLink, }, { Link: `http:/127.0.0.1:11836`, Error: `Head "http:/127.0.0.1:11836": http: no Host in request URL`, diff --git a/worker.go b/worker.go index bc66a98..817ff3b 100644 --- a/worker.go +++ b/worker.go @@ -270,19 +270,17 @@ func (wrk *worker) scan(linkq linkQueue) { } var node *html.Node - var link string - var status int for node = range doc.Descendants() { if node.Type != html.ElementNode { continue } - link = "" + var nodeLink *linkQueue if node.DataAtom == atom.A { for _, attr := range node.Attr { if attr.Key != `href` { continue } - link, status = wrk.processLink(scanUrl, attr.Val, atom.A) + nodeLink = wrk.processLink(scanUrl, attr.Val, atom.A) break } } else if node.DataAtom == atom.Img { @@ -290,55 +288,58 @@ func (wrk *worker) scan(linkq linkQueue) { if attr.Key != `src` { continue } - link, status = wrk.processLink(scanUrl, attr.Val, atom.Img) + nodeLink = wrk.processLink(scanUrl, attr.Val, atom.Img) break } } else { continue } - if link == "" { + if nodeLink == nil { continue } - _, seen := resultq[link] + _, seen := resultq[nodeLink.url] if !seen { - var childLink = linkQueue{ - parentUrl: scanUrl, - url: link, - kind: node.DataAtom, - status: status, + if !strings.HasPrefix(nodeLink.url, wrk.baseUrl.String()) { + nodeLink.isExternal = true } - if !strings.HasPrefix(childLink.url, wrk.baseUrl.String()) { - childLink.isExternal = true - } - resultq[link] = childLink + resultq[nodeLink.url] = *nodeLink } } go wrk.pushResult(resultq) } func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) ( - link string, status int, + linkq *linkQueue, ) { if len(val) == 0 { - return "", 0 + return nil } var newUrl *url.URL var err error newUrl, err = url.Parse(val) if err != nil { - return val, StatusBadLink + return &linkQueue{ + parentUrl: parentUrl, + errScan: err, + url: val, + kind: kind, + status: StatusBadLink, + } } newUrl.Fragment = "" newUrl.RawFragment = "" if kind == atom.A && val[0] == '#' { // Ignore link to ID, like `href="#element_id"`. - return "", 0 + return nil } if strings.HasPrefix(val, `http`) { - link = strings.TrimSuffix(newUrl.String(), `/`) - return link, 0 + return &linkQueue{ + parentUrl: parentUrl, + url: strings.TrimSuffix(newUrl.String(), `/`), + kind: kind, + } } if val[0] == '/' { // val is absolute to parent URL. @@ -347,8 +348,12 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) ( // val is relative to parent URL. newUrl = parentUrl.JoinPath(`/`, newUrl.Path) } - link = strings.TrimSuffix(newUrl.String(), `/`) - return link, 0 + linkq = &linkQueue{ + parentUrl: parentUrl, + url: strings.TrimSuffix(newUrl.String(), `/`), + kind: kind, + } + return linkq } func (wrk *worker) pushResult(resultq map[string]linkQueue) { -- cgit v1.3