From 180f44b7f89f61f89528f7c113cf5c5e15a3e1a0 Mon Sep 17 00:00:00 2001 From: Shulhan Date: Sat, 31 May 2025 18:44:05 +0700 Subject: all: use HTTP method HEAD to check external domains For links that are not from the same domain as the one being scanned, use the HTTP method HEAD to minimize the amount of data being transferred. --- link_queue.go | 4 ++++ worker.go | 42 +++++++++++++++++++++++------------------- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/link_queue.go b/link_queue.go index 10dbfff..63940cc 100644 --- a/link_queue.go +++ b/link_queue.go @@ -22,6 +22,10 @@ type linkQueue struct { // It set to 0 if url is the first URL being scanned. kind atom.Atom + // isExternal if true the scan will issue HTTP method HEAD instead of + // GET. + isExternal bool + // Status of link after scan, its mostly used the HTTP status code. // 0: link is the result of scan, not processed yet. // StatusBadLink: link is invalid, not parseable or unreachable. diff --git a/worker.go b/worker.go index e9ff6bd..75e39d0 100644 --- a/worker.go +++ b/worker.go @@ -197,24 +197,27 @@ func (wrk *worker) markDead(linkq linkQueue) { // scan fetch the HTML page or image to check if its valid. 
func (wrk *worker) scan(linkq linkQueue) { defer func() { - if wrk.opts.IsVerbose { - fmt.Printf(" done: %d %s\n", linkq.status, linkq.url) + if wrk.opts.IsVerbose && linkq.errScan != nil { + fmt.Printf("error: %d %s error=%v\n", linkq.status, + linkq.url, linkq.errScan) } wrk.wg.Done() }() - if wrk.opts.IsVerbose { - fmt.Printf("scan: %d %s\n", linkq.status, linkq.url) - } - var ( resultq = map[string]linkQueue{} httpResp *http.Response err error ) - if linkq.kind == atom.Img { + if linkq.kind == atom.Img || linkq.isExternal { + if wrk.opts.IsVerbose { + fmt.Printf("scan: HEAD %s\n", linkq.url) + } httpResp, err = http.Head(linkq.url) } else { + if wrk.opts.IsVerbose { + fmt.Printf("scan: GET %s\n", linkq.url) + } httpResp, err = http.Get(linkq.url) } if err != nil { @@ -233,13 +236,7 @@ func (wrk *worker) scan(linkq linkQueue) { go wrk.pushResult(resultq) return } - if linkq.kind == atom.Img { - go wrk.pushResult(resultq) - return - } - if !strings.HasPrefix(linkq.url, wrk.baseUrl.String()) { - // Do not parse the HTML page from external domain, only need - // its HTTP status code. + if linkq.kind == atom.Img || linkq.isExternal { go wrk.pushResult(resultq) return } @@ -291,11 +288,18 @@ func (wrk *worker) scan(linkq linkQueue) { if link == "" { continue } - resultq[link] = linkQueue{ - parentUrl: scanUrl, - url: link, - kind: node.DataAtom, - status: status, + _, seen := resultq[link] + if !seen { + var childLink = linkQueue{ + parentUrl: scanUrl, + url: link, + kind: node.DataAtom, + status: status, + } + if !strings.HasPrefix(childLink.url, wrk.baseUrl.String()) { + childLink.isExternal = true + } + resultq[link] = childLink } } go wrk.pushResult(resultq) -- cgit v1.3