From 3d39941514395137610fb1c58768814a390b7c35 Mon Sep 17 00:00:00 2001 From: Shulhan Date: Thu, 29 May 2025 13:37:53 +0700 Subject: all: parse only links to HTML pages For a link to an image, we can skip parsing it. --- link_queue.go | 7 ++++++- testdata/web/gopher.png | Bin 0 -> 32775 bytes testdata/web/index.html | 1 + worker.go | 20 +++++++++++++++----- 4 files changed, 22 insertions(+), 6 deletions(-) create mode 100644 testdata/web/gopher.png diff --git a/link_queue.go b/link_queue.go index dfcba76..9987586 100644 --- a/link_queue.go +++ b/link_queue.go @@ -3,9 +3,14 @@ package deadlinks -import "net/url" +import ( + "net/url" + + "golang.org/x/net/html/atom" +) type linkQueue struct { parentUrl *url.URL url string + kind atom.Atom } diff --git a/testdata/web/gopher.png b/testdata/web/gopher.png new file mode 100644 index 0000000..79352be Binary files /dev/null and b/testdata/web/gopher.png differ diff --git a/testdata/web/index.html b/testdata/web/index.html index c6ac324..e4d8bd0 100644 --- a/testdata/web/index.html +++ b/testdata/web/index.html @@ -6,6 +6,7 @@ SPDX-License-Identifier: GPL-3.0-only Broken page + Page 2 Broken HTML diff --git a/worker.go b/worker.go index 206fbd3..ac25bf4 100644 --- a/worker.go +++ b/worker.go @@ -124,13 +124,19 @@ func (wrk *worker) scan(linkq linkQueue) { wrk.errq <- err return } + defer httpResp.Body.Close() + if httpResp.StatusCode != http.StatusOK { wrk.markDead(linkq, httpResp.StatusCode) return } + wrk.seenLinkMtx.Lock() + wrk.seenLink[linkq.url] = http.StatusOK + wrk.seenLinkMtx.Unlock() - defer httpResp.Body.Close() - + if linkq.kind == atom.Img { + return + } err = wrk.parseHTML(linkq.url, httpResp.Body) if err != nil { wrk.errq <- fmt.Errorf(`%s %s: %w`, logp, linkq.url, err) @@ -171,7 +177,7 @@ func (wrk *worker) parseHTML(linkUrl string, body io.Reader) (err error) { if attr.Key != `href` { continue } - wrk.processLink(linkUrl, attr.Val) + wrk.processLink(linkUrl, attr.Val, atom.A) } } if node.DataAtom == atom.Img { @@
-179,14 +185,14 @@ func (wrk *worker) parseHTML(linkUrl string, body io.Reader) (err error) { if attr.Key != `src` { continue } - wrk.processLink(linkUrl, attr.Val) + wrk.processLink(linkUrl, attr.Val, atom.Img) } } } return nil } -func (wrk *worker) processLink(rawParentUrl string, val string) { +func (wrk *worker) processLink(rawParentUrl string, val string, kind atom.Atom) { if len(val) == 0 { return } @@ -207,6 +213,7 @@ func (wrk *worker) processLink(rawParentUrl string, val string) { wrk.linkq <- linkQueue{ parentUrl: parentUrl, url: newUrlStr, + kind: kind, } return } @@ -217,6 +224,7 @@ func (wrk *worker) processLink(rawParentUrl string, val string) { var linkq = linkQueue{ parentUrl: parentUrl, url: val, + kind: kind, } wrk.markDead(linkq, 700) return @@ -225,6 +233,7 @@ func (wrk *worker) processLink(rawParentUrl string, val string) { wrk.linkq <- linkQueue{ parentUrl: parentUrl, url: newUrlStr, + kind: kind, } return } @@ -234,5 +243,6 @@ func (wrk *worker) processLink(rawParentUrl string, val string) { wrk.linkq <- linkQueue{ parentUrl: parentUrl, url: newUrlStr, + kind: kind, } } -- cgit v1.3