diff options
| author | Shulhan <ms@kilabit.info> | 2025-05-31 18:44:05 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2025-05-31 18:44:05 +0700 |
| commit | 180f44b7f89f61f89528f7c113cf5c5e15a3e1a0 (patch) | |
| tree | 4913db6b0e8a25b16f07b81786d9f264314397c6 | |
| parent | 2e3c8f4cdc8d81b6a2f4842d849a64d7ce3d3926 (diff) | |
| download | jarink-180f44b7f89f61f89528f7c113cf5c5e15a3e1a0.tar.xz | |
all: use HTTP method HEAD to check external domains
For links that are not on the same domain as the one being scanned, use the
HTTP HEAD method to minimize the amount of data transferred.
| -rw-r--r-- | link_queue.go | 4 |
| -rw-r--r-- | worker.go | 42 |
2 files changed, 27 insertions, 19 deletions
diff --git a/link_queue.go b/link_queue.go index 10dbfff..63940cc 100644 --- a/link_queue.go +++ b/link_queue.go @@ -22,6 +22,10 @@ type linkQueue struct { // It set to 0 if url is the first URL being scanned. kind atom.Atom + // isExternal if true the scan will issue HTTP method HEAD instead of + // GET. + isExternal bool + // Status of link after scan, its mostly used the HTTP status code. // 0: link is the result of scan, not processed yet. // StatusBadLink: link is invalid, not parseable or unreachable. @@ -197,24 +197,27 @@ func (wrk *worker) markDead(linkq linkQueue) { // scan fetch the HTML page or image to check if its valid. func (wrk *worker) scan(linkq linkQueue) { defer func() { - if wrk.opts.IsVerbose { - fmt.Printf(" done: %d %s\n", linkq.status, linkq.url) + if wrk.opts.IsVerbose && linkq.errScan != nil { + fmt.Printf("error: %d %s error=%v\n", linkq.status, + linkq.url, linkq.errScan) } wrk.wg.Done() }() - if wrk.opts.IsVerbose { - fmt.Printf("scan: %d %s\n", linkq.status, linkq.url) - } - var ( resultq = map[string]linkQueue{} httpResp *http.Response err error ) - if linkq.kind == atom.Img { + if linkq.kind == atom.Img || linkq.isExternal { + if wrk.opts.IsVerbose { + fmt.Printf("scan: HEAD %s\n", linkq.url) + } httpResp, err = http.Head(linkq.url) } else { + if wrk.opts.IsVerbose { + fmt.Printf("scan: GET %s\n", linkq.url) + } httpResp, err = http.Get(linkq.url) } if err != nil { @@ -233,13 +236,7 @@ func (wrk *worker) scan(linkq linkQueue) { go wrk.pushResult(resultq) return } - if linkq.kind == atom.Img { - go wrk.pushResult(resultq) - return - } - if !strings.HasPrefix(linkq.url, wrk.baseUrl.String()) { - // Do not parse the HTML page from external domain, only need - // its HTTP status code. 
+ if linkq.kind == atom.Img || linkq.isExternal { go wrk.pushResult(resultq) return } @@ -291,11 +288,18 @@ func (wrk *worker) scan(linkq linkQueue) { if link == "" { continue } - resultq[link] = linkQueue{ - parentUrl: scanUrl, - url: link, - kind: node.DataAtom, - status: status, + _, seen := resultq[link] + if !seen { + var childLink = linkQueue{ + parentUrl: scanUrl, + url: link, + kind: node.DataAtom, + status: status, + } + if !strings.HasPrefix(childLink.url, wrk.baseUrl.String()) { + childLink.isExternal = true + } + resultq[link] = childLink } } go wrk.pushResult(resultq) |
