about summary refs log tree commit diff
path: root/brokenlinks/worker.go
diff options
context:
space:
mode:
author Shulhan <ms@kilabit.info> 2026-02-04 21:53:00 +0700
committer Shulhan <ms@kilabit.info> 2026-02-04 21:53:00 +0700
commit b36d6e1f423bc405895d1b72e9a5915c4aa74ecc (patch)
tree 5bfa5c5377578a98c8f0e568f2a977da23f99adb /brokenlinks/worker.go
parent 6b5ed409a5f11ed437586c8b046bcfc43749361d (diff)
download jarink-b36d6e1f423bc405895d1b72e9a5915c4aa74ecc.tar.xz
brokenlinks: skip parsing non-HTML page
If the response Content-Type is anything other than "text/html", skip parsing the content and return immediately.
Diffstat (limited to 'brokenlinks/worker.go')
-rw-r--r-- brokenlinks/worker.go 18
1 files changed, 11 insertions, 7 deletions
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index 09c8b12..3e089fc 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -266,14 +266,18 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) {
return resultq
}
- // After we check the code for [html.Parse] there are no cases where
- // it will return an error.
- // The only possible error is when reading from body (io.Reader), and
- // that is also almost impossible.
- //
- // [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347
+ contentType := httpResp.Header.Get(`Content-Type`)
+ if !strings.HasPrefix(contentType, `text/html`) {
+ return resultq
+ }
+
var doc *html.Node
- doc, _ = html.Parse(httpResp.Body)
+ doc, err = html.Parse(httpResp.Body)
+ if err != nil {
+ linkq.ErrScan = err
+ resultq[linkq.Url] = linkq
+ return resultq
+ }
var parentUrl *url.URL