summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Shulhan <ms@kilabit.info> 2025-05-31 21:50:12 +0700
committer: Shulhan <ms@kilabit.info> 2025-05-31 21:50:12 +0700
commit: 0e4126ad99a216a08896156d31aafe3ab5611ba2 (patch)
tree: 2304028638850b0730ffcfdc3d7b7628fa448780
parent: 95c71232fc977d68aeae0c14aab1e91c9d3e9605 (diff)
download: jarink-0e4126ad99a216a08896156d31aafe3ab5611ba2.tar.xz
all: record an error due to broken link in HTML anchor or image
-rw-r--r-- deadlinks_test.go 10
-rw-r--r-- worker.go 53
2 files changed, 35 insertions, 28 deletions
diff --git a/deadlinks_test.go b/deadlinks_test.go
index c93e384..c219aa0 100644
--- a/deadlinks_test.go
+++ b/deadlinks_test.go
@@ -97,8 +97,9 @@ func TestDeadLinks_Scan(t *testing.T) {
Link: testUrl + `/brokenPage`,
Code: http.StatusNotFound,
}, {
- Link: `http://127.0.0.1:abc`,
- Code: deadlinks.StatusBadLink,
+ Link: `http://127.0.0.1:abc`,
+ Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`,
+ Code: deadlinks.StatusBadLink,
}, {
Link: `http:/127.0.0.1:11836`,
Error: `Head "http:/127.0.0.1:11836": http: no Host in request URL`,
@@ -135,8 +136,9 @@ func TestDeadLinks_Scan(t *testing.T) {
Link: testUrl + `/brokenPage`,
Code: http.StatusNotFound,
}, {
- Link: `http://127.0.0.1:abc`,
- Code: deadlinks.StatusBadLink,
+ Link: `http://127.0.0.1:abc`,
+ Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`,
+ Code: deadlinks.StatusBadLink,
}, {
Link: `http:/127.0.0.1:11836`,
Error: `Head "http:/127.0.0.1:11836": http: no Host in request URL`,
diff --git a/worker.go b/worker.go
index bc66a98..817ff3b 100644
--- a/worker.go
+++ b/worker.go
@@ -270,19 +270,17 @@ func (wrk *worker) scan(linkq linkQueue) {
}
var node *html.Node
- var link string
- var status int
for node = range doc.Descendants() {
if node.Type != html.ElementNode {
continue
}
- link = ""
+ var nodeLink *linkQueue
if node.DataAtom == atom.A {
for _, attr := range node.Attr {
if attr.Key != `href` {
continue
}
- link, status = wrk.processLink(scanUrl, attr.Val, atom.A)
+ nodeLink = wrk.processLink(scanUrl, attr.Val, atom.A)
break
}
} else if node.DataAtom == atom.Img {
@@ -290,55 +288,58 @@ func (wrk *worker) scan(linkq linkQueue) {
if attr.Key != `src` {
continue
}
- link, status = wrk.processLink(scanUrl, attr.Val, atom.Img)
+ nodeLink = wrk.processLink(scanUrl, attr.Val, atom.Img)
break
}
} else {
continue
}
- if link == "" {
+ if nodeLink == nil {
continue
}
- _, seen := resultq[link]
+ _, seen := resultq[nodeLink.url]
if !seen {
- var childLink = linkQueue{
- parentUrl: scanUrl,
- url: link,
- kind: node.DataAtom,
- status: status,
+ if !strings.HasPrefix(nodeLink.url, wrk.baseUrl.String()) {
+ nodeLink.isExternal = true
}
- if !strings.HasPrefix(childLink.url, wrk.baseUrl.String()) {
- childLink.isExternal = true
- }
- resultq[link] = childLink
+ resultq[nodeLink.url] = *nodeLink
}
}
go wrk.pushResult(resultq)
}
func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
- link string, status int,
+ linkq *linkQueue,
) {
if len(val) == 0 {
- return "", 0
+ return nil
}
var newUrl *url.URL
var err error
newUrl, err = url.Parse(val)
if err != nil {
- return val, StatusBadLink
+ return &linkQueue{
+ parentUrl: parentUrl,
+ errScan: err,
+ url: val,
+ kind: kind,
+ status: StatusBadLink,
+ }
}
newUrl.Fragment = ""
newUrl.RawFragment = ""
if kind == atom.A && val[0] == '#' {
// Ignore link to ID, like `href="#element_id"`.
- return "", 0
+ return nil
}
if strings.HasPrefix(val, `http`) {
- link = strings.TrimSuffix(newUrl.String(), `/`)
- return link, 0
+ return &linkQueue{
+ parentUrl: parentUrl,
+ url: strings.TrimSuffix(newUrl.String(), `/`),
+ kind: kind,
+ }
}
if val[0] == '/' {
// val is absolute to parent URL.
@@ -347,8 +348,12 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
// val is relative to parent URL.
newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
}
- link = strings.TrimSuffix(newUrl.String(), `/`)
- return link, 0
+ linkq = &linkQueue{
+ parentUrl: parentUrl,
+ url: strings.TrimSuffix(newUrl.String(), `/`),
+ kind: kind,
+ }
+ return linkq
}
func (wrk *worker) pushResult(resultq map[string]linkQueue) {