summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Shulhan <ms@kilabit.info> 2025-05-31 18:44:05 +0700
committer Shulhan <ms@kilabit.info> 2025-05-31 18:44:05 +0700
commit 180f44b7f89f61f89528f7c113cf5c5e15a3e1a0 (patch)
tree 4913db6b0e8a25b16f07b81786d9f264314397c6
parent 2e3c8f4cdc8d81b6a2f4842d849a64d7ce3d3926 (diff)
download jarink-180f44b7f89f61f89528f7c113cf5c5e15a3e1a0.tar.xz
all: use HTTP method HEAD to check external domains
For links that are not on the same domain as the one being scanned, use the HTTP method HEAD to minimize the resources being transferred.
-rw-r--r-- link_queue.go 4
-rw-r--r-- worker.go 42
2 files changed, 27 insertions, 19 deletions
diff --git a/link_queue.go b/link_queue.go
index 10dbfff..63940cc 100644
--- a/link_queue.go
+++ b/link_queue.go
@@ -22,6 +22,10 @@ type linkQueue struct {
// It set to 0 if url is the first URL being scanned.
kind atom.Atom
+ // isExternal if true the scan will issue HTTP method HEAD instead of
+ // GET.
+ isExternal bool
+
// Status of link after scan, its mostly used the HTTP status code.
// 0: link is the result of scan, not processed yet.
// StatusBadLink: link is invalid, not parseable or unreachable.
diff --git a/worker.go b/worker.go
index e9ff6bd..75e39d0 100644
--- a/worker.go
+++ b/worker.go
@@ -197,24 +197,27 @@ func (wrk *worker) markDead(linkq linkQueue) {
// scan fetch the HTML page or image to check if its valid.
func (wrk *worker) scan(linkq linkQueue) {
defer func() {
- if wrk.opts.IsVerbose {
- fmt.Printf(" done: %d %s\n", linkq.status, linkq.url)
+ if wrk.opts.IsVerbose && linkq.errScan != nil {
+ fmt.Printf("error: %d %s error=%v\n", linkq.status,
+ linkq.url, linkq.errScan)
}
wrk.wg.Done()
}()
- if wrk.opts.IsVerbose {
- fmt.Printf("scan: %d %s\n", linkq.status, linkq.url)
- }
-
var (
resultq = map[string]linkQueue{}
httpResp *http.Response
err error
)
- if linkq.kind == atom.Img {
+ if linkq.kind == atom.Img || linkq.isExternal {
+ if wrk.opts.IsVerbose {
+ fmt.Printf("scan: HEAD %s\n", linkq.url)
+ }
httpResp, err = http.Head(linkq.url)
} else {
+ if wrk.opts.IsVerbose {
+ fmt.Printf("scan: GET %s\n", linkq.url)
+ }
httpResp, err = http.Get(linkq.url)
}
if err != nil {
@@ -233,13 +236,7 @@ func (wrk *worker) scan(linkq linkQueue) {
go wrk.pushResult(resultq)
return
}
- if linkq.kind == atom.Img {
- go wrk.pushResult(resultq)
- return
- }
- if !strings.HasPrefix(linkq.url, wrk.baseUrl.String()) {
- // Do not parse the HTML page from external domain, only need
- // its HTTP status code.
+ if linkq.kind == atom.Img || linkq.isExternal {
go wrk.pushResult(resultq)
return
}
@@ -291,11 +288,18 @@ func (wrk *worker) scan(linkq linkQueue) {
if link == "" {
continue
}
- resultq[link] = linkQueue{
- parentUrl: scanUrl,
- url: link,
- kind: node.DataAtom,
- status: status,
+ _, seen := resultq[link]
+ if !seen {
+ var childLink = linkQueue{
+ parentUrl: scanUrl,
+ url: link,
+ kind: node.DataAtom,
+ status: status,
+ }
+ if !strings.HasPrefix(childLink.url, wrk.baseUrl.String()) {
+ childLink.isExternal = true
+ }
+ resultq[link] = childLink
}
}
go wrk.pushResult(resultq)