aboutsummaryrefslogtreecommitdiff
path: root/brokenlinks/worker.go
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2026-01-22 01:39:41 +0700
committerShulhan <ms@kilabit.info>2026-01-22 01:39:41 +0700
commit79eaccc81b85eb92dab9cf18d52662f367903652 (patch)
treeaf58482138c2ba9211029174ab579e951cd7fff6 /brokenlinks/worker.go
parent26fc8bd3203dae6b4705ada227439c90129bbe36 (diff)
downloadjarink-79eaccc81b85eb92dab9cf18d52662f367903652.tar.xz
all: refactoring, use single struct to represent Link
Previously, we had [jarink.Link], [brokenlinks.Broken], and [brokenlinks.linkQueue] to store the metadata for a link. These changes unified them into the single struct [jarink.Link].
Diffstat (limited to 'brokenlinks/worker.go')
-rw-r--r--brokenlinks/worker.go138
1 files changed, 68 insertions, 70 deletions
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index c0a33dd..07bda88 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -29,7 +29,7 @@ type worker struct {
seenLink map[string]int
// queue contains list of link to be scanned.
- queue []linkQueue
+ queue []jarink.Link
// result contains the final result after all of the pages has been
// scanned.
@@ -124,15 +124,14 @@ func (wrk *worker) run() (result *Result, err error) {
// scanAll scan all pages start from [Options.Url].
func (wrk *worker) scanAll() (result *Result, err error) {
// Scan the first URL to make sure that the server is reachable.
- var linkq = linkQueue{
- parentUrl: nil,
- url: wrk.opts.scanUrl.String(),
+ var linkq = jarink.Link{
+ Url: wrk.opts.scanUrl.String(),
}
var resultq = wrk.scan(linkq)
- linkq = resultq[linkq.url]
- if linkq.errScan != nil {
- return nil, linkq.errScan
+ linkq = resultq[linkq.Url]
+ if linkq.ErrScan != nil {
+ return nil, linkq.ErrScan
}
wrk.processResult(resultq)
@@ -176,17 +175,16 @@ func (wrk *worker) scanPastResult() (result *Result, err error) {
// - Skip external link that has been checked before.
// - Skip link that has been seen.
// - Otherwise push it to queue.
-func (wrk *worker) processResult(resultq map[string]linkQueue) {
- var linkq linkQueue
+func (wrk *worker) processResult(resultq map[string]jarink.Link) {
var seen bool
- for _, linkq = range resultq {
- if linkq.status != 0 {
+ for _, linkq := range resultq {
+ if linkq.StatusCode != 0 {
wrk.seen(linkq)
continue
}
- if linkq.isExternal {
- var scannedLink = wrk.cache.Get(linkq.url)
+ if linkq.IsExternal {
+ var scannedLink = wrk.cache.Get(linkq.Url)
if scannedLink != nil {
// The external link has been scanned
// previously.
@@ -194,9 +192,9 @@ func (wrk *worker) processResult(resultq map[string]linkQueue) {
}
}
- linkq.status, seen = wrk.seenLink[linkq.url]
+ linkq.StatusCode, seen = wrk.seenLink[linkq.Url]
if seen {
- if linkq.status >= http.StatusBadRequest {
+ if linkq.StatusCode >= http.StatusBadRequest {
// Different pages may have the same broken
// link.
wrk.markAsBroken(linkq)
@@ -207,61 +205,61 @@ func (wrk *worker) processResult(resultq map[string]linkQueue) {
}
}
-func (wrk *worker) seen(linkq linkQueue) {
- wrk.seenLink[linkq.url] = linkq.status
+func (wrk *worker) seen(linkq jarink.Link) {
+ wrk.seenLink[linkq.Url] = linkq.StatusCode
- if linkq.isExternal {
- if linkq.status != StatusBadLink {
- wrk.cache.Set(linkq.url, linkq.status, linkq.size)
+ if linkq.IsExternal {
+ if linkq.StatusCode != StatusBadLink {
+ wrk.cache.Set(linkq)
}
}
- if linkq.status >= http.StatusBadRequest {
+ if linkq.StatusCode >= http.StatusBadRequest {
wrk.markAsBroken(linkq)
}
}
-func (wrk *worker) markAsBroken(linkq linkQueue) {
- if slices.Contains(wrk.opts.ignoreStatus, linkq.status) {
+func (wrk *worker) markAsBroken(linkq jarink.Link) {
+ if slices.Contains(wrk.opts.ignoreStatus, linkq.StatusCode) {
return
}
- var parentUrl = linkq.parentUrl.String()
+ var parentUrl = linkq.ParentUrl.String()
var listBroken = wrk.result.BrokenLinks[parentUrl]
- var brokenLink = Broken{
- Link: linkq.url,
- Code: linkq.status,
+ if linkq.ErrScan != nil {
+ linkq.Error = linkq.ErrScan.Error()
}
- if linkq.errScan != nil {
- brokenLink.Error = linkq.errScan.Error()
- }
- listBroken = append(listBroken, brokenLink)
+ listBroken = append(listBroken, linkq)
wrk.result.BrokenLinks[parentUrl] = listBroken
}
// scan the link to HTML page or image.
-func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
- resultq = make(map[string]linkQueue)
+func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) {
+ resultq = make(map[string]jarink.Link)
var (
httpResp *http.Response
err error
)
httpResp, err = wrk.fetch(linkq)
if err != nil {
- linkq.status = StatusBadLink
- linkq.errScan = err
- resultq[linkq.url] = linkq
+ linkq.StatusCode = StatusBadLink
+ linkq.ErrScan = err
+ resultq[linkq.Url] = linkq
return resultq
}
defer httpResp.Body.Close()
- linkq.status = httpResp.StatusCode
- linkq.size = httpResp.ContentLength
- resultq[linkq.url] = linkq
+ linkq.StatusCode = httpResp.StatusCode
+ resultq[linkq.Url] = linkq
if httpResp.StatusCode >= http.StatusBadRequest {
return resultq
}
- if linkq.kind == atom.Img || linkq.isExternal {
+ if linkq.Kind == int(atom.Img) {
+ return resultq
+ }
+ linkq.Size = httpResp.ContentLength
+ if linkq.IsExternal {
+ resultq[linkq.Url] = linkq
return resultq
}
@@ -276,11 +274,11 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
var parentUrl *url.URL
- parentUrl, err = url.Parse(linkq.url)
+ parentUrl, err = url.Parse(linkq.Url)
if err != nil {
- linkq.status = StatusBadLink
- linkq.errScan = err
- resultq[linkq.url] = linkq
+ linkq.StatusCode = StatusBadLink
+ linkq.ErrScan = err
+ resultq[linkq.Url] = linkq
return resultq
}
@@ -292,13 +290,13 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
if node.DataAtom != atom.A && node.DataAtom != atom.Img {
continue
}
- var nodeLink *linkQueue
+ var nodeLink *jarink.Link
if node.DataAtom == atom.A {
for _, attr := range node.Attr {
if attr.Key != `href` {
continue
}
- nodeLink = wrk.processLink(parentUrl, attr.Val, atom.A)
+ nodeLink = wrk.processLink(parentUrl, attr.Val, int(atom.A))
break
}
} else { // atom.Img
@@ -306,7 +304,7 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
if attr.Key != `src` {
continue
}
- nodeLink = wrk.processLink(parentUrl, attr.Val, atom.Img)
+ nodeLink = wrk.processLink(parentUrl, attr.Val, int(atom.Img))
break
}
}
@@ -314,30 +312,30 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
// Link is invalid.
continue
}
- _, seen := resultq[nodeLink.url]
+ _, seen := resultq[nodeLink.Url]
if seen {
// The same link already exist previously.
continue
}
- resultq[nodeLink.url] = *nodeLink
+ resultq[nodeLink.Url] = *nodeLink
}
return resultq
}
-func (wrk *worker) fetch(linkq linkQueue) (httpResp *http.Response, err error) {
+func (wrk *worker) fetch(linkq jarink.Link) (httpResp *http.Response, err error) {
const maxRetry = 5
var retry int
for retry < 5 {
- if linkq.kind == atom.Img {
+ if linkq.Kind == int(atom.Img) {
if wrk.opts.IsVerbose {
- wrk.log.Printf("fetch: HEAD %s", linkq.url)
+ wrk.log.Printf("fetch: HEAD %s", linkq.Url)
}
- httpResp, err = wrk.httpc.Head(linkq.url)
+ httpResp, err = wrk.httpc.Head(linkq.Url)
} else {
if wrk.opts.IsVerbose {
- wrk.log.Printf("fetch: GET %s", linkq.url)
+ wrk.log.Printf("fetch: GET %s", linkq.Url)
}
- httpResp, err = wrk.httpc.Get(linkq.url)
+ httpResp, err = wrk.httpc.Get(linkq.Url)
}
if err == nil {
return httpResp, nil
@@ -348,7 +346,7 @@ func (wrk *worker) fetch(linkq linkQueue) (httpResp *http.Response, err error) {
}
if errDNS.Timeout() {
retry++
- wrk.log.Printf(`fetch %s: %s (%d/%d)`, linkq.url, err, retry, maxRetry)
+ wrk.log.Printf(`fetch %s: %s (%d/%d)`, linkq.Url, err, retry, maxRetry)
continue
}
break
@@ -356,33 +354,33 @@ func (wrk *worker) fetch(linkq linkQueue) (httpResp *http.Response, err error) {
return nil, err
}
-// processLink given a parentURL and link value `val`
-// check if link `val` is valid and return it as linkQueue.
-func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
- linkq *linkQueue,
+// processLink given a parentURL, check if link `val` is valid, and return it
+// as [jarink.Link].
+func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) (
+ linkq *jarink.Link,
) {
if len(val) == 0 {
return nil
}
- linkq = &linkQueue{
- parentUrl: parentUrl,
- kind: kind,
+ linkq = &jarink.Link{
+ ParentUrl: parentUrl,
+ Kind: kind,
}
var newUrl *url.URL
var err error
newUrl, err = url.Parse(val)
if err != nil {
- linkq.errScan = err
- linkq.url = val
- linkq.status = StatusBadLink
+ linkq.ErrScan = err
+ linkq.Url = val
+ linkq.StatusCode = StatusBadLink
return linkq
}
newUrl.Fragment = ""
newUrl.RawFragment = ""
- if kind == atom.A && val[0] == '#' {
+ if kind == int(atom.A) && val[0] == '#' {
// Ignore link to ID, like `href="#element_id"`.
return nil
}
@@ -395,9 +393,9 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
}
}
- linkq.url = strings.TrimSuffix(newUrl.String(), `/`)
- if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) {
- linkq.isExternal = true
+ linkq.Url = strings.TrimSuffix(newUrl.String(), `/`)
+ if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) {
+ linkq.IsExternal = true
}
return linkq
}