diff options
| author | Shulhan <ms@kilabit.info> | 2026-02-04 21:53:00 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2026-02-04 21:53:00 +0700 |
| commit | b36d6e1f423bc405895d1b72e9a5915c4aa74ecc (patch) | |
| tree | 5bfa5c5377578a98c8f0e568f2a977da23f99adb /brokenlinks/worker.go | |
| parent | 6b5ed409a5f11ed437586c8b046bcfc43749361d (diff) | |
| download | jarink-b36d6e1f423bc405895d1b72e9a5915c4aa74ecc.tar.xz | |
brokenlinks: skip parsing non-HTML page
If the response Content-Type is anything other than "text/html", skip parsing
the content and return immediately.
Diffstat (limited to 'brokenlinks/worker.go')
| -rw-r--r-- | brokenlinks/worker.go | 18 |
1 files changed, 11 insertions, 7 deletions
```diff
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index 09c8b12..3e089fc 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -266,14 +266,18 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) {
 		return resultq
 	}
 
-	// After we check the code for [html.Parse] there are no cases where
-	// it will return an error.
-	// The only possible error is when reading from body (io.Reader), and
-	// that is also almost impossible.
-	//
-	// [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347
+	contentType := httpResp.Header.Get(`Content-Type`)
+	if !strings.HasPrefix(contentType, `text/html`) {
+		return resultq
+	}
+
 	var doc *html.Node
-	doc, _ = html.Parse(httpResp.Body)
+	doc, err = html.Parse(httpResp.Body)
+	if err != nil {
+		linkq.ErrScan = err
+		resultq[linkq.Url] = linkq
+		return resultq
+	}
 
 	var parentUrl *url.URL
 
```
