aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Shulhan <ms@kilabit.info> 2026-02-04 21:53:00 +0700
committer Shulhan <ms@kilabit.info> 2026-02-04 21:53:00 +0700
commit b36d6e1f423bc405895d1b72e9a5915c4aa74ecc (patch)
tree 5bfa5c5377578a98c8f0e568f2a977da23f99adb
parent 6b5ed409a5f11ed437586c8b046bcfc43749361d (diff)
download jarink-b36d6e1f423bc405895d1b72e9a5915c4aa74ecc.tar.xz
brokenlinks: skip parsing non-HTML page
If the response Content-Type is anything other than "text/html", skip parsing the content and return immediately.
-rw-r--r-- brokenlinks/testdata/exp_cache.json 4
l--------- brokenlinks/testdata/web/brokenlinks.go 1
-rw-r--r-- brokenlinks/testdata/web/index.html 3
-rw-r--r-- brokenlinks/worker.go 18
4 files changed, 17 insertions, 9 deletions
diff --git a/brokenlinks/testdata/exp_cache.json b/brokenlinks/testdata/exp_cache.json
index f9aa32a..bb8fa94 100644
--- a/brokenlinks/testdata/exp_cache.json
+++ b/brokenlinks/testdata/exp_cache.json
@@ -2,7 +2,7 @@
"scanned_links": {
"http://127.0.0.1:11900": {
"url": "http://127.0.0.1:11900",
- "size": 1064,
+ "size": 1141,
"status_code": 200
},
"http://127.0.0.1:11900/page2": {
@@ -12,7 +12,7 @@
},
"https://127.0.0.1:11838": {
"url": "https://127.0.0.1:11838",
- "size": 1064,
+ "size": 1141,
"status_code": 200
}
}
diff --git a/brokenlinks/testdata/web/brokenlinks.go b/brokenlinks/testdata/web/brokenlinks.go
new file mode 120000
index 0000000..019f032
--- /dev/null
+++ b/brokenlinks/testdata/web/brokenlinks.go
@@ -0,0 +1 @@
+../../brokenlinks.go
\ No newline at end of file
diff --git a/brokenlinks/testdata/web/index.html b/brokenlinks/testdata/web/index.html
index 88f1184..4eaf3ed 100644
--- a/brokenlinks/testdata/web/index.html
+++ b/brokenlinks/testdata/web/index.html
@@ -32,5 +32,8 @@ SPDX-License-Identifier: GPL-3.0-only
<!-- Pages with invalid domain -->
<a href="https://domain">Invalid domain</a>
+
+ <!-- Non-HTML file -->
+ <a href="/brokenlinks.go">brokenlinks.go</a>
</body>
</html>
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index 09c8b12..3e089fc 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -266,14 +266,18 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) {
return resultq
}
- // After we check the code for [html.Parse] there are no cases where
- // it will return an error.
- // The only possible error is when reading from body (io.Reader), and
- // that is also almost impossible.
- //
- // [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347
+ contentType := httpResp.Header.Get(`Content-Type`)
+ if !strings.HasPrefix(contentType, `text/html`) {
+ return resultq
+ }
+
var doc *html.Node
- doc, _ = html.Parse(httpResp.Body)
+ doc, err = html.Parse(httpResp.Body)
+ if err != nil {
+ linkq.ErrScan = err
+ resultq[linkq.Url] = linkq
+ return resultq
+ }
var parentUrl *url.URL