summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2025-05-29 13:37:53 +0700
committerShulhan <ms@kilabit.info>2025-05-29 13:37:53 +0700
commit3d39941514395137610fb1c58768814a390b7c35 (patch)
tree2eb960820410f04f2b32e3f317a0731de54676fa
parent0148e3b3fd94e4c8359e97b402cc09c85768b2c3 (diff)
downloadjarink-3d39941514395137610fb1c58768814a390b7c35.tar.xz
all: parse only links to HTML pages
For a link to an image, we can skip parsing the response body.
-rw-r--r--link_queue.go7
-rw-r--r--testdata/web/gopher.pngbin0 -> 32775 bytes
-rw-r--r--testdata/web/index.html1
-rw-r--r--worker.go20
4 files changed, 22 insertions, 6 deletions
diff --git a/link_queue.go b/link_queue.go
index dfcba76..9987586 100644
--- a/link_queue.go
+++ b/link_queue.go
@@ -3,9 +3,14 @@
package deadlinks
-import "net/url"
+import (
+ "net/url"
+
+ "golang.org/x/net/html/atom"
+)
type linkQueue struct {
parentUrl *url.URL
url string
+ kind atom.Atom
}
diff --git a/testdata/web/gopher.png b/testdata/web/gopher.png
new file mode 100644
index 0000000..79352be
--- /dev/null
+++ b/testdata/web/gopher.png
Binary files differ
diff --git a/testdata/web/index.html b/testdata/web/index.html
index c6ac324..e4d8bd0 100644
--- a/testdata/web/index.html
+++ b/testdata/web/index.html
@@ -6,6 +6,7 @@ SPDX-License-Identifier: GPL-3.0-only
<body>
<img src="/broken.png" />
<a href="/brokenPage">Broken page</a>
+ <img src="/gopher.png" />
<a href="/page2">Page 2</a>
<a href="/broken.html">Broken HTML</a>
</body>
diff --git a/worker.go b/worker.go
index 206fbd3..ac25bf4 100644
--- a/worker.go
+++ b/worker.go
@@ -124,13 +124,19 @@ func (wrk *worker) scan(linkq linkQueue) {
wrk.errq <- err
return
}
+ defer httpResp.Body.Close()
+
if httpResp.StatusCode != http.StatusOK {
wrk.markDead(linkq, httpResp.StatusCode)
return
}
+ wrk.seenLinkMtx.Lock()
+ wrk.seenLink[linkq.url] = http.StatusOK
+ wrk.seenLinkMtx.Unlock()
- defer httpResp.Body.Close()
-
+ if linkq.kind == atom.Img {
+ return
+ }
err = wrk.parseHTML(linkq.url, httpResp.Body)
if err != nil {
wrk.errq <- fmt.Errorf(`%s %s: %w`, logp, linkq.url, err)
@@ -171,7 +177,7 @@ func (wrk *worker) parseHTML(linkUrl string, body io.Reader) (err error) {
if attr.Key != `href` {
continue
}
- wrk.processLink(linkUrl, attr.Val)
+ wrk.processLink(linkUrl, attr.Val, atom.A)
}
}
if node.DataAtom == atom.Img {
@@ -179,14 +185,14 @@ func (wrk *worker) parseHTML(linkUrl string, body io.Reader) (err error) {
if attr.Key != `src` {
continue
}
- wrk.processLink(linkUrl, attr.Val)
+ wrk.processLink(linkUrl, attr.Val, atom.Img)
}
}
}
return nil
}
-func (wrk *worker) processLink(rawParentUrl string, val string) {
+func (wrk *worker) processLink(rawParentUrl string, val string, kind atom.Atom) {
if len(val) == 0 {
return
}
@@ -207,6 +213,7 @@ func (wrk *worker) processLink(rawParentUrl string, val string) {
wrk.linkq <- linkQueue{
parentUrl: parentUrl,
url: newUrlStr,
+ kind: kind,
}
return
}
@@ -217,6 +224,7 @@ func (wrk *worker) processLink(rawParentUrl string, val string) {
var linkq = linkQueue{
parentUrl: parentUrl,
url: val,
+ kind: kind,
}
wrk.markDead(linkq, 700)
return
@@ -225,6 +233,7 @@ func (wrk *worker) processLink(rawParentUrl string, val string) {
wrk.linkq <- linkQueue{
parentUrl: parentUrl,
url: newUrlStr,
+ kind: kind,
}
return
}
@@ -234,5 +243,6 @@ func (wrk *worker) processLink(rawParentUrl string, val string) {
wrk.linkq <- linkQueue{
parentUrl: parentUrl,
url: newUrlStr,
+ kind: kind,
}
}