diff options
| -rw-r--r-- | link_queue.go | 7 |
| -rw-r--r-- | testdata/web/gopher.png | bin 0 -> 32775 bytes |
| -rw-r--r-- | testdata/web/index.html | 1 |
| -rw-r--r-- | worker.go | 20 |
4 files changed, 22 insertions, 6 deletions
diff --git a/link_queue.go b/link_queue.go
index dfcba76..9987586 100644
--- a/link_queue.go
+++ b/link_queue.go
@@ -3,9 +3,14 @@
 
 package deadlinks
 
-import "net/url"
+import (
+	"net/url"
+
+	"golang.org/x/net/html/atom"
+)
 
 type linkQueue struct {
 	parentUrl *url.URL
 	url       string
+	kind      atom.Atom
 }
diff --git a/testdata/web/gopher.png b/testdata/web/gopher.png
Binary files differ
new file mode 100644
index 0000000..79352be
--- /dev/null
+++ b/testdata/web/gopher.png
diff --git a/testdata/web/index.html b/testdata/web/index.html
index c6ac324..e4d8bd0 100644
--- a/testdata/web/index.html
+++ b/testdata/web/index.html
@@ -6,6 +6,7 @@ SPDX-License-Identifier: GPL-3.0-only
 <body>
 	<img src="/broken.png" />
 	<a href="/brokenPage">Broken page</a>
+	<img src="/gopher.png" />
 	<a href="/page2">Page 2</a>
 	<a href="/broken.html">Broken HTML</a>
 </body>
diff --git a/worker.go b/worker.go
--- a/worker.go
+++ b/worker.go
@@ -124,13 +124,19 @@ func (wrk *worker) scan(linkq linkQueue) {
 		wrk.errq <- err
 		return
 	}
+	defer httpResp.Body.Close()
+
 	if httpResp.StatusCode != http.StatusOK {
 		wrk.markDead(linkq, httpResp.StatusCode)
 		return
 	}
+	wrk.seenLinkMtx.Lock()
+	wrk.seenLink[linkq.url] = http.StatusOK
+	wrk.seenLinkMtx.Unlock()
 
-	defer httpResp.Body.Close()
-
+	if linkq.kind == atom.Img {
+		return
+	}
 	err = wrk.parseHTML(linkq.url, httpResp.Body)
 	if err != nil {
 		wrk.errq <- fmt.Errorf(`%s %s: %w`, logp, linkq.url, err)
@@ -171,7 +177,7 @@ func (wrk *worker) parseHTML(linkUrl string, body io.Reader) (err error) {
 				if attr.Key != `href` {
 					continue
 				}
-				wrk.processLink(linkUrl, attr.Val)
+				wrk.processLink(linkUrl, attr.Val, atom.A)
 			}
 		}
 		if node.DataAtom == atom.Img {
@@ -179,14 +185,14 @@ func (wrk *worker) parseHTML(linkUrl string, body io.Reader) (err error) {
 				if attr.Key != `src` {
 					continue
 				}
-				wrk.processLink(linkUrl, attr.Val)
+				wrk.processLink(linkUrl, attr.Val, atom.Img)
 			}
 		}
 	}
 	return nil
 }
 
-func (wrk *worker) processLink(rawParentUrl string, val string) {
+func (wrk *worker) processLink(rawParentUrl string, val string, kind atom.Atom) {
 	if len(val) == 0 {
 		return
 	}
@@ -207,6 +213,7 @@ func (wrk *worker) processLink(rawParentUrl string, val string) {
 		wrk.linkq <- linkQueue{
 			parentUrl: parentUrl,
 			url:       newUrlStr,
+			kind:      kind,
 		}
 		return
 	}
@@ -217,6 +224,7 @@ func (wrk *worker) processLink(rawParentUrl string, val string) {
 			var linkq = linkQueue{
 				parentUrl: parentUrl,
 				url:       val,
+				kind:      kind,
 			}
 			wrk.markDead(linkq, 700)
 			return
@@ -225,6 +233,7 @@ func (wrk *worker) processLink(rawParentUrl string, val string) {
 		wrk.linkq <- linkQueue{
 			parentUrl: parentUrl,
 			url:       newUrlStr,
+			kind:      kind,
 		}
 		return
 	}
@@ -234,5 +243,6 @@ func (wrk *worker) processLink(rawParentUrl string, val string) {
 	wrk.linkq <- linkQueue{
 		parentUrl: parentUrl,
 		url:       newUrlStr,
+		kind:      kind,
 	}
 }
