diff options
| author | Shulhan <ms@kilabit.info> | 2026-02-04 21:53:00 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2026-02-04 21:53:00 +0700 |
| commit | b36d6e1f423bc405895d1b72e9a5915c4aa74ecc (patch) | |
| tree | 5bfa5c5377578a98c8f0e568f2a977da23f99adb /brokenlinks | |
| parent | 6b5ed409a5f11ed437586c8b046bcfc43749361d (diff) | |
| download | jarink-b36d6e1f423bc405895d1b72e9a5915c4aa74ecc.tar.xz | |
brokenlinks: skip parsing non-HTML pages
If the response Content-Type is anything other than "text/html", skip
parsing the content and return immediately.
Diffstat (limited to 'brokenlinks')
| -rw-r--r-- | brokenlinks/testdata/exp_cache.json | 4 | ||||
| l--------- | brokenlinks/testdata/web/brokenlinks.go | 1 | ||||
| -rw-r--r-- | brokenlinks/testdata/web/index.html | 3 | ||||
| -rw-r--r-- | brokenlinks/worker.go | 18 |
4 files changed, 17 insertions, 9 deletions
diff --git a/brokenlinks/testdata/exp_cache.json b/brokenlinks/testdata/exp_cache.json index f9aa32a..bb8fa94 100644 --- a/brokenlinks/testdata/exp_cache.json +++ b/brokenlinks/testdata/exp_cache.json @@ -2,7 +2,7 @@ "scanned_links": { "http://127.0.0.1:11900": { "url": "http://127.0.0.1:11900", - "size": 1064, + "size": 1141, "status_code": 200 }, "http://127.0.0.1:11900/page2": { @@ -12,7 +12,7 @@ }, "https://127.0.0.1:11838": { "url": "https://127.0.0.1:11838", - "size": 1064, + "size": 1141, "status_code": 200 } } diff --git a/brokenlinks/testdata/web/brokenlinks.go b/brokenlinks/testdata/web/brokenlinks.go new file mode 120000 index 0000000..019f032 --- /dev/null +++ b/brokenlinks/testdata/web/brokenlinks.go @@ -0,0 +1 @@ +../../brokenlinks.go
\ No newline at end of file diff --git a/brokenlinks/testdata/web/index.html b/brokenlinks/testdata/web/index.html index 88f1184..4eaf3ed 100644 --- a/brokenlinks/testdata/web/index.html +++ b/brokenlinks/testdata/web/index.html @@ -32,5 +32,8 @@ SPDX-License-Identifier: GPL-3.0-only <!-- Pages with invalid domain --> <a href="https://domain">Invalid domain</a> + + <!-- Non-HTML file --> + <a href="/brokenlinks.go">brokenlinks.go</a> </body> </html> diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go index 09c8b12..3e089fc 100644 --- a/brokenlinks/worker.go +++ b/brokenlinks/worker.go @@ -266,14 +266,18 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) { return resultq } - // After we check the code for [html.Parse] there are no cases where - // it will return an error. - // The only possible error is when reading from body (io.Reader), and - // that is also almost impossible. - // - // [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347 + contentType := httpResp.Header.Get(`Content-Type`) + if !strings.HasPrefix(contentType, `text/html`) { + return resultq + } + var doc *html.Node - doc, _ = html.Parse(httpResp.Body) + doc, err = html.Parse(httpResp.Body) + if err != nil { + linkq.ErrScan = err + resultq[linkq.Url] = linkq + return resultq + } var parentUrl *url.URL |
