brokenlinks: skip parsing non-HTML pages

If the response Content-Type is anything other than "text/html", skip parsing the content and return immediately.
author: Shulhan <ms@kilabit.info> 2026-02-04 21:53:00 +0700
committer: Shulhan <ms@kilabit.info> 2026-02-04 21:53:00 +0700
commit: b36d6e1f423bc405895d1b72e9a5915c4aa74ecc (patch)
tree: 5bfa5c5377578a98c8f0e568f2a977da23f99adb /brokenlinks/worker.go
parent: 6b5ed409a5f11ed437586c8b046bcfc43749361d (diff)
download: jarink-b36d6e1f423bc405895d1b72e9a5915c4aa74ecc.tar.xz
1 files changed, 11 insertions, 7 deletions
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index 09c8b12..3e089fc 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -266,14 +266,18 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) {
 		return resultq
 	}
 
-	// After we check the code for [html.Parse] there are no cases where
-	// it will return an error.
-	// The only possible error is when reading from body (io.Reader), and
-	// that is also almost impossible.
-	//
-	// [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347
+	contentType := httpResp.Header.Get(`Content-Type`)
+	if !strings.HasPrefix(contentType, `text/html`) {
+		return resultq
+	}
+
 	var doc *html.Node
-	doc, _ = html.Parse(httpResp.Body)
+	doc, err = html.Parse(httpResp.Body)
+	if err != nil {
+		linkq.ErrScan = err
+		resultq[linkq.Url] = linkq
+		return resultq
+	}
 
 	var parentUrl *url.URL
author	Shulhan <ms@kilabit.info>	2026-02-04 21:53:00 +0700
committer	Shulhan <ms@kilabit.info>	2026-02-04 21:53:00 +0700
commit	b36d6e1f423bc405895d1b72e9a5915c4aa74ecc (patch)
tree	5bfa5c5377578a98c8f0e568f2a977da23f99adb /brokenlinks/worker.go
parent	6b5ed409a5f11ed437586c8b046bcfc43749361d (diff)
download	jarink-b36d6e1f423bc405895d1b72e9a5915c4aa74ecc.tar.xz