summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2026-01-22 00:27:18 +0700
committerShulhan <ms@kilabit.info>2026-01-22 00:27:18 +0700
commit26fc8bd3203dae6b4705ada227439c90129bbe36 (patch)
tree26ed784cdf2779c3a0848fbf6c1355094b216eea
parent6637e9d11a57c67510d79c00a1425f5e66d18280 (diff)
downloadjarink-26fc8bd3203dae6b4705ada227439c90129bbe36.tar.xz
brokenlinks: refactoring the logic, simplify the code
Previously, we made the scan logic run in multiple goroutines with one channel to push and consume the result and another channel to push and pop links to be processed. The logic was very complicated code, making it hard to read and debug. These changes refactor it to use a single goroutine that pushes and pops links from/to a slice, used as a queue.
-rw-r--r--CHANGELOG.adoc14
-rw-r--r--brokenlinks/brokenlinks.go2
-rw-r--r--brokenlinks/brokenlinks_test.go87
-rw-r--r--brokenlinks/worker.go345
-rw-r--r--cache.go17
-rw-r--r--cmd/jarink/main.go4
-rw-r--r--link.go11
7 files changed, 197 insertions, 283 deletions
diff --git a/CHANGELOG.adoc b/CHANGELOG.adoc
index 06ea7a1..9d08249 100644
--- a/CHANGELOG.adoc
+++ b/CHANGELOG.adoc
@@ -14,6 +14,20 @@ Legend,
* 🌼: Enhancement
* 💧: Chores
+[#jarink_v0_2_2]
+== jarink 0.2.2 (2026-xx-xx)
+
+**🌼 brokenlinks: refactoring the logic, simplify the code**
+
+Previously, we made the scan logic run in multiple goroutines with
+one channel to push and consume the result and another channel to push
+and pop links to be processed.
+The logic is a very complicated code, making it hard to read and debug.
+
+These changes refactor it to use a single goroutine that pushes and
+pops links from/to a slice, used as a queue.
+
+
[#jarink_v0_2_1]
== jarink 0.2.1 (2025-12-27)
diff --git a/brokenlinks/brokenlinks.go b/brokenlinks/brokenlinks.go
index 7b2e282..af8ce63 100644
--- a/brokenlinks/brokenlinks.go
+++ b/brokenlinks/brokenlinks.go
@@ -1,5 +1,5 @@
-// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
// SPDX-License-Identifier: GPL-3.0-only
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
package brokenlinks
diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go
index b49bcc5..db3775a 100644
--- a/brokenlinks/brokenlinks_test.go
+++ b/brokenlinks/brokenlinks_test.go
@@ -1,10 +1,9 @@
-// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
// SPDX-License-Identifier: GPL-3.0-only
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
package brokenlinks_test
import (
- "encoding/json"
"log"
"net/http"
"os"
@@ -59,7 +58,7 @@ func TestMain(m *testing.M) {
go testServer(fshandle)
go testExternalServer(fshandle)
go testInsecureServer(fshandle)
- go runServerSlow(testAddressSlow)
+ go runServerSlow()
var err = libnet.WaitAlive(`tcp`, testAddress, 5*time.Second)
if err != nil {
@@ -136,7 +135,7 @@ func testInsecureServer(fshandle http.Handler) {
}
}
-func runServerSlow(addr string) {
+func runServerSlow() {
var mux = http.NewServeMux()
mux.HandleFunc(`/`, func(resp http.ResponseWriter, req *http.Request) {
resp.WriteHeader(http.StatusOK)
@@ -171,7 +170,7 @@ func runServerSlow(addr string) {
})
mux.HandleFunc(`/slow3`,
func(resp http.ResponseWriter, req *http.Request) {
- time.Sleep(2 * time.Second)
+ time.Sleep(1 * time.Second)
resp.WriteHeader(http.StatusOK)
var body = []byte(`<html><body>
<a href="/slow1/sub">Slow 1, sub</a>
@@ -183,22 +182,22 @@ func runServerSlow(addr string) {
mux.HandleFunc(`/slow1/sub`,
func(resp http.ResponseWriter, req *http.Request) {
- time.Sleep(4 * time.Second)
+ time.Sleep(1 * time.Second)
resp.WriteHeader(http.StatusOK)
})
mux.HandleFunc(`/slow2/sub`,
func(resp http.ResponseWriter, req *http.Request) {
- time.Sleep(5 * time.Second)
+ time.Sleep(1 * time.Second)
resp.WriteHeader(http.StatusOK)
})
mux.HandleFunc(`/slow3/sub`,
func(resp http.ResponseWriter, req *http.Request) {
- time.Sleep(6 * time.Second)
+ time.Sleep(1 * time.Second)
resp.WriteHeader(http.StatusForbidden)
})
var httpServer = &http.Server{
- Addr: addr,
+ Addr: testAddressSlow,
Handler: mux,
ReadTimeout: 10 * time.Second,
WriteTimeout: 10 * time.Second,
@@ -216,32 +215,38 @@ func TestScan(t *testing.T) {
type testCase struct {
exp map[string][]brokenlinks.Broken
expError string
+ desc string
opts brokenlinks.Options
}
listCase := []testCase{{
+ desc: `With invalid URL`,
opts: brokenlinks.Options{
Url: `127.0.0.1:14594`,
},
expError: `Scan: Options: invalid URL "127.0.0.1:14594"`,
}, {
+ desc: `With non-open port`,
opts: brokenlinks.Options{
Url: `http://127.0.0.1:14594`,
},
expError: `Scan: Get "http://127.0.0.1:14594": dial tcp 127.0.0.1:14594: connect: connection refused`,
}, {
+ desc: `With invalid IgnoreStatus`,
opts: brokenlinks.Options{
Url: testUrl,
IgnoreStatus: "abc",
},
expError: `Scan: Options: invalid status code "abc"`,
}, {
+ desc: `With unknown IgnoreStatus code`,
opts: brokenlinks.Options{
Url: testUrl,
IgnoreStatus: "50",
},
expError: `Scan: Options: unknown status code "50"`,
}, {
+ desc: `With ` + testUrl,
opts: brokenlinks.Options{
Url: testUrl,
IgnoreStatus: `403`,
@@ -290,6 +295,7 @@ func TestScan(t *testing.T) {
},
},
}, {
+ desc: `With ` + testUrl + `/page2`,
// Scanning on "/page2" should not scan the the "/" or other
// pages other than below of "/page2" itself.
opts: brokenlinks.Options{
@@ -317,16 +323,23 @@ func TestScan(t *testing.T) {
err error
)
for _, tcase := range listCase {
- t.Logf(`--- brokenlinks: %s`, tcase.opts.Url)
- result, err = brokenlinks.Scan(tcase.opts)
- if err != nil {
- test.Assert(t, tcase.opts.Url+` error`,
- tcase.expError, err.Error())
- continue
- }
- //got, _ := json.MarshalIndent(result.BrokenLinks, ``, ` `)
- //t.Logf(`got=%s`, got)
- test.Assert(t, tcase.opts.Url, tcase.exp, result.BrokenLinks)
+ t.Run(tcase.desc, func(tt *testing.T) {
+ internal.CacheFile = func() (string, error) {
+ return tt.TempDir() + `/cache.json`, nil
+ }
+
+ result, err = brokenlinks.Scan(tcase.opts)
+ if err != nil {
+ test.Assert(tt, tcase.opts.Url+` error`,
+ tcase.expError, err.Error())
+ return
+ }
+
+ //got, _ := json.MarshalIndent(result.BrokenLinks, ``, ` `)
+ //tt.Logf(`got=%s`, got)
+
+ test.Assert(tt, tcase.opts.Url, tcase.exp, result.BrokenLinks)
+ })
}
}
@@ -337,19 +350,20 @@ func TestScan_pastResult(t *testing.T) {
type testCase struct {
exp map[string][]brokenlinks.Broken
+ desc string
expError string
opts brokenlinks.Options
}
listCase := []testCase{{
- // With invalid file.
+ desc: `With invalid file`,
opts: brokenlinks.Options{
Url: testUrl,
PastResultFile: `testdata/invalid`,
},
expError: `Scan: open testdata/invalid: no such file or directory`,
}, {
- // With valid file.
+ desc: `With valid file`,
opts: brokenlinks.Options{
Url: testUrl,
PastResultFile: `testdata/past_result.json`,
@@ -376,16 +390,20 @@ func TestScan_pastResult(t *testing.T) {
err error
)
for _, tcase := range listCase {
- t.Logf(`--- brokenlinks: %s`, tcase.opts.Url)
- result, err = brokenlinks.Scan(tcase.opts)
- if err != nil {
- test.Assert(t, tcase.opts.Url+` error`,
- tcase.expError, err.Error())
- continue
- }
- //got, _ := json.MarshalIndent(result.BrokenLinks, ``, ` `)
- //t.Logf(`got=%s`, got)
- test.Assert(t, tcase.opts.Url, tcase.exp, result.BrokenLinks)
+ t.Run(tcase.desc, func(tt *testing.T) {
+ internal.CacheFile = func() (string, error) {
+ return tt.TempDir() + `/cache.json`, nil
+ }
+
+ result, err = brokenlinks.Scan(tcase.opts)
+ if err != nil {
+ test.Assert(tt, tcase.opts.Url+` error`, tcase.expError, err.Error())
+ return
+ }
+ //got, _ := json.MarshalIndent(result.BrokenLinks, ``, ` `)
+ //tt.Logf(`got=%s`, got)
+ test.Assert(tt, tcase.opts.Url, tcase.exp, result.BrokenLinks)
+ })
}
}
@@ -393,7 +411,8 @@ func TestScan_slow(t *testing.T) {
const testUrl = `http://` + testAddressSlow
var opts = brokenlinks.Options{
- Url: testUrl,
+ Url: testUrl,
+ IsVerbose: true,
}
var gotResult *brokenlinks.Result
@@ -403,8 +422,8 @@ func TestScan_slow(t *testing.T) {
t.Fatal(err)
}
- got, _ := json.MarshalIndent(gotResult, ``, ` `)
- t.Logf(`got=%s`, got)
+ //got, _ := json.MarshalIndent(gotResult, ``, ` `)
+ //t.Logf(`got=%s`, got)
var expResult = &brokenlinks.Result{
BrokenLinks: map[string][]brokenlinks.Broken{
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index 3c6f97e..c0a33dd 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -1,5 +1,5 @@
-// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
// SPDX-License-Identifier: GPL-3.0-only
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
package brokenlinks
@@ -14,7 +14,6 @@ import (
"os"
"slices"
"strings"
- "sync"
"time"
"golang.org/x/net/html"
@@ -26,10 +25,11 @@ import (
type worker struct {
// seenLink store the URL being or has been scanned and its HTTP
// status code.
+ // It stores all links, including the broken ones.
seenLink map[string]int
- // resultq channel that collect result from scanning.
- resultq chan map[string]linkQueue
+ // queue contains list of link to be scanned.
+ queue []linkQueue
// result contains the final result after all of the pages has been
// scanned.
@@ -41,6 +41,7 @@ type worker struct {
// The base URL that will be joined to relative or absolute
// links or image.
+ // baseUrl is set to "scheme://host" only.
baseUrl *url.URL
// cache of scanned links.
@@ -51,12 +52,10 @@ type worker struct {
httpc *http.Client
opts Options
-
- // wg sync the goroutine scanner.
- wg sync.WaitGroup
}
func newWorker(opts Options) (wrk *worker, err error) {
+ var logp = `newWorker`
var netDial = &net.Dialer{
Timeout: 30 * time.Second,
KeepAlive: 30 * time.Second,
@@ -68,7 +67,6 @@ func newWorker(opts Options) (wrk *worker, err error) {
wrk = &worker{
opts: opts,
seenLink: map[string]int{},
- resultq: make(chan map[string]linkQueue, 100),
result: newResult(),
log: log.New(os.Stderr, ``, log.LstdFlags),
httpc: &http.Client{
@@ -107,7 +105,8 @@ func newWorker(opts Options) (wrk *worker, err error) {
wrk.pastResult = newResult()
err = json.Unmarshal(pastresult, &wrk.pastResult)
if err != nil {
- return nil, err
+ log.Printf(`%s: invalid past result file format: %s`, logp, err)
+ log.Printf(`%s: ignoring past result`, logp)
}
return wrk, nil
@@ -125,184 +124,107 @@ func (wrk *worker) run() (result *Result, err error) {
// scanAll scan all pages start from [Options.Url].
func (wrk *worker) scanAll() (result *Result, err error) {
// Scan the first URL to make sure that the server is reachable.
- var firstLinkq = linkQueue{
+ var linkq = linkQueue{
parentUrl: nil,
url: wrk.opts.scanUrl.String(),
- status: http.StatusProcessing,
}
- wrk.seenLink[firstLinkq.url] = http.StatusProcessing
-
- wrk.wg.Add(1)
- go func() {
- var resultq = wrk.scan(firstLinkq)
- wrk.pushResult(resultq)
- }()
- wrk.wg.Wait()
-
- var resultq = <-wrk.resultq
- for _, linkq := range resultq {
- if linkq.url == firstLinkq.url {
- if linkq.errScan != nil {
- return nil, linkq.errScan
- }
- wrk.seenLink[linkq.url] = linkq.status
- continue
- }
- if linkq.isExternal {
- var scannedLink = wrk.cache.Get(linkq.url)
- if scannedLink != nil {
- linkq.status = scannedLink.ResponseCode
- wrk.seen(linkq)
- continue
- }
- }
+ var resultq = wrk.scan(linkq)
+ linkq = resultq[linkq.url]
+ if linkq.errScan != nil {
+ return nil, linkq.errScan
+ }
+ wrk.processResult(resultq)
- wrk.seenLink[linkq.url] = http.StatusProcessing
- wrk.wg.Add(1)
- go func() {
- var resultq = wrk.scan(linkq)
- wrk.pushResult(resultq)
- }()
+ var x int
+ for x < len(wrk.queue) {
+ linkq = wrk.queue[x]
+ x++
+ resultq = wrk.scan(linkq)
+ wrk.processResult(resultq)
}
- wrk.processAndWait()
+ wrk.result.sort()
return wrk.result, nil
}
-// scanPastResult scan only pages reported inside
-// [Result.BrokenLinks].
+// scanPastResult scans only pages reported inside [Result.BrokenLinks].
func (wrk *worker) scanPastResult() (result *Result, err error) {
for page := range wrk.pastResult.BrokenLinks {
- var linkq = linkQueue{
- parentUrl: nil,
- url: page,
- status: http.StatusProcessing,
+ wrk.opts.scanUrl, err = url.Parse(page)
+ if err != nil {
+ log.Printf(`scanPastResult: invalid URL %q: %s`, page, err)
+ continue
}
- wrk.seenLink[linkq.url] = http.StatusProcessing
- wrk.wg.Add(1)
- go func() {
- var resultq = wrk.scan(linkq)
- wrk.pushResult(resultq)
- }()
- }
-
- wrk.processAndWait()
- return wrk.result, nil
-}
-
-func (wrk *worker) processAndWait() {
- var tick = time.NewTicker(500 * time.Millisecond)
- var listWaitStatus []linkQueue
- var isScanning = true
- for isScanning {
- select {
- case resultq := <-wrk.resultq:
- listWaitStatus = wrk.processResult(resultq, listWaitStatus)
-
- case <-tick.C:
- wrk.wg.Wait()
- if len(wrk.resultq) != 0 {
- continue
- }
- if len(listWaitStatus) != 0 {
- // There are links that still waiting for
- // scanning to be completed.
- continue
- }
- isScanning = false
+ wrk.queue = nil
+ _, err = wrk.scanAll()
+ if err != nil {
+ log.Printf(`scanPastResult: %q: %s`, page, err)
}
}
+
wrk.result.sort()
+ return wrk.result, nil
}
-// processResult the resultq contains the original URL being scanned
-// and its child links.
-// For example, scanning "http://example.tld" result in
+// processResult processes the scan result and pushes it to the queue.
+// The resultq contains the original URL being scanned and its child links.
+// For each link item in resultq,
//
-// "http://example.tld": {status=200}
-// "http://example.tld/page": {status=0}
-// "http://example.tld/image.png": {status=0}
-// "http://bad:domain/image.png": {status=700}
-func (wrk *worker) processResult(
- resultq map[string]linkQueue, listWaitStatus []linkQueue,
-) (
- newList []linkQueue,
-) {
- for _, linkq := range resultq {
- // Process the scanned page first.
-
+// - For non-zero status code, mark it as seen and store to cache if
+//   it is external.
+// - Skip external link that has been checked before.
+// - Skip link that has been seen.
+// - Otherwise push it to queue.
+func (wrk *worker) processResult(resultq map[string]linkQueue) {
+ var linkq linkQueue
+ var seen bool
+ for _, linkq = range resultq {
if linkq.status != 0 {
wrk.seen(linkq)
- if linkq.isExternal && linkq.status != StatusBadLink {
- wrk.cache.Set(linkq.url, linkq.status, linkq.size)
- }
continue
}
- // Now process the links inside the page.
-
if linkq.isExternal {
var scannedLink = wrk.cache.Get(linkq.url)
if scannedLink != nil {
- linkq.status = scannedLink.ResponseCode
- wrk.seen(linkq)
+ // The external link has been scanned
+ // previously.
continue
}
}
- seenStatus, seen := wrk.seenLink[linkq.url]
- if !seen {
- wrk.seenLink[linkq.url] = http.StatusProcessing
- wrk.wg.Add(1)
- go func() {
- var resultq = wrk.scan(linkq)
- wrk.pushResult(resultq)
- }()
- continue
- }
- if seenStatus >= http.StatusBadRequest {
- linkq.status = seenStatus
- wrk.markBroken(linkq)
- continue
- }
- if seenStatus >= http.StatusOK {
- // The link has been processed and its
- // not an error.
- continue
- }
- // The link being processed by other goroutine.
- linkq.status = seenStatus
- newList = append(newList, linkq)
- }
- for _, linkq := range listWaitStatus {
- seenStatus := wrk.seenLink[linkq.url]
- if seenStatus >= http.StatusBadRequest {
- linkq.status = seenStatus
- wrk.markBroken(linkq)
- continue
- }
- if seenStatus >= http.StatusOK {
- continue
- }
- if seenStatus == http.StatusProcessing {
- // Scanning still in progress.
- newList = append(newList, linkq)
+ linkq.status, seen = wrk.seenLink[linkq.url]
+ if seen {
+ if linkq.status >= http.StatusBadRequest {
+ // Different pages may have the same broken
+ // link.
+ wrk.markAsBroken(linkq)
+ }
continue
}
+ wrk.queue = append(wrk.queue, linkq)
}
- return newList
}
func (wrk *worker) seen(linkq linkQueue) {
+ wrk.seenLink[linkq.url] = linkq.status
+
+ if linkq.isExternal {
+ if linkq.status != StatusBadLink {
+ wrk.cache.Set(linkq.url, linkq.status, linkq.size)
+ }
+ }
+
if linkq.status >= http.StatusBadRequest {
- wrk.markBroken(linkq)
- return
+ wrk.markAsBroken(linkq)
}
- wrk.seenLink[linkq.url] = linkq.status
}
-func (wrk *worker) markBroken(linkq linkQueue) {
+func (wrk *worker) markAsBroken(linkq linkQueue) {
+ if slices.Contains(wrk.opts.ignoreStatus, linkq.status) {
+ return
+ }
var parentUrl = linkq.parentUrl.String()
var listBroken = wrk.result.BrokenLinks[parentUrl]
var brokenLink = Broken{
@@ -314,20 +236,10 @@ func (wrk *worker) markBroken(linkq linkQueue) {
}
listBroken = append(listBroken, brokenLink)
wrk.result.BrokenLinks[parentUrl] = listBroken
-
- wrk.seenLink[linkq.url] = linkq.status
}
-// scan fetch the HTML page or image to check if its valid.
+// scan the link to HTML page or image.
func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
- defer func() {
- if wrk.opts.IsVerbose && linkq.errScan != nil {
- wrk.log.Printf("error: %d %s error=%v\n", linkq.status,
- linkq.url, linkq.errScan)
- }
- wrk.wg.Done()
- }()
-
resultq = make(map[string]linkQueue)
var (
httpResp *http.Response
@@ -346,10 +258,6 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
linkq.size = httpResp.ContentLength
resultq[linkq.url] = linkq
- if slices.Contains(wrk.opts.ignoreStatus, httpResp.StatusCode) {
- return nil
- }
-
if httpResp.StatusCode >= http.StatusBadRequest {
return resultq
}
@@ -357,21 +265,23 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
return resultq
}
- var doc *html.Node
- doc, _ = html.Parse(httpResp.Body)
-
- // After we check the code and test for [html.Parse] there are
- // no case actual cases where HTML content will return an error.
+ // After we check the code for [html.Parse] there are no cases where
+ // it will return an error.
// The only possible error is when reading from body (io.Reader), and
// that is also almost impossible.
//
// [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347
+ var doc *html.Node
+ doc, _ = html.Parse(httpResp.Body)
- var scanUrl *url.URL
+ var parentUrl *url.URL
- scanUrl, err = url.Parse(linkq.url)
+ parentUrl, err = url.Parse(linkq.url)
if err != nil {
- log.Fatal(err)
+ linkq.status = StatusBadLink
+ linkq.errScan = err
+ resultq[linkq.url] = linkq
+ return resultq
}
var node *html.Node
@@ -379,42 +289,42 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
if node.Type != html.ElementNode {
continue
}
+ if node.DataAtom != atom.A && node.DataAtom != atom.Img {
+ continue
+ }
var nodeLink *linkQueue
if node.DataAtom == atom.A {
for _, attr := range node.Attr {
if attr.Key != `href` {
continue
}
- nodeLink = wrk.processLink(scanUrl, attr.Val, atom.A)
+ nodeLink = wrk.processLink(parentUrl, attr.Val, atom.A)
break
}
- } else if node.DataAtom == atom.Img {
+ } else { // atom.Img
for _, attr := range node.Attr {
if attr.Key != `src` {
continue
}
- nodeLink = wrk.processLink(scanUrl, attr.Val, atom.Img)
+ nodeLink = wrk.processLink(parentUrl, attr.Val, atom.Img)
break
}
- } else {
- continue
}
if nodeLink == nil {
+ // Link is invalid.
continue
}
_, seen := resultq[nodeLink.url]
- if !seen {
- wrk.checkExternal(nodeLink)
- resultq[nodeLink.url] = *nodeLink
+ if seen {
+ // The same link already exist previously.
+ continue
}
+ resultq[nodeLink.url] = *nodeLink
}
return resultq
}
-func (wrk *worker) fetch(linkq linkQueue) (
- httpResp *http.Response,
- err error,
-) {
+func (wrk *worker) fetch(linkq linkQueue) (httpResp *http.Response, err error) {
const maxRetry = 5
var retry int
for retry < 5 {
@@ -446,6 +356,8 @@ func (wrk *worker) fetch(linkq linkQueue) (
return nil, err
}
+// processLink, given a parentUrl and link value `val`,
+// checks if link `val` is valid and returns it as a linkQueue.
func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
linkq *linkQueue,
) {
@@ -453,17 +365,19 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
return nil
}
+ linkq = &linkQueue{
+ parentUrl: parentUrl,
+ kind: kind,
+ }
+
var newUrl *url.URL
var err error
newUrl, err = url.Parse(val)
if err != nil {
- return &linkQueue{
- parentUrl: parentUrl,
- errScan: err,
- url: val,
- kind: kind,
- status: StatusBadLink,
- }
+ linkq.errScan = err
+ linkq.url = val
+ linkq.status = StatusBadLink
+ return linkq
}
newUrl.Fragment = ""
newUrl.RawFragment = ""
@@ -472,55 +386,18 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
// Ignore link to ID, like `href="#element_id"`.
return nil
}
- if strings.HasPrefix(val, `http`) {
- return &linkQueue{
- parentUrl: parentUrl,
- url: strings.TrimSuffix(newUrl.String(), `/`),
- kind: kind,
- }
- }
- if val[0] == '/' {
- // val is absolute to parent URL.
- newUrl = wrk.baseUrl.JoinPath(newUrl.Path)
- } else {
- // val is relative to parent URL.
- newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
- }
- linkq = &linkQueue{
- parentUrl: parentUrl,
- url: strings.TrimSuffix(newUrl.String(), `/`),
- kind: kind,
- }
- return linkq
-}
-
-func (wrk *worker) pushResult(resultq map[string]linkQueue) {
- if len(resultq) == 0 {
- return
- }
- var tick = time.NewTicker(100 * time.Millisecond)
- for {
- select {
- case wrk.resultq <- resultq:
- tick.Stop()
- return
- case <-tick.C:
+ if !strings.HasPrefix(val, `http`) {
+ if val[0] == '/' {
+ // val is absolute link.
+ newUrl = wrk.baseUrl.JoinPath(newUrl.Path)
+ } else {
+ // val is relative to parent URL.
+ newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
}
}
-}
-
-// checkExternal set the [linkQueue.isExternal] field to true if
-//
-// (1) [linkQueue.url] does not start with [Options.Url]
-// (2) linkQueue is not from scanPastResult, indicated by non-nil
-// [worker.pastResult].
-func (wrk *worker) checkExternal(linkq *linkQueue) {
+ linkq.url = strings.TrimSuffix(newUrl.String(), `/`)
if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) {
linkq.isExternal = true
- return
- }
- if wrk.pastResult != nil {
- linkq.isExternal = true
- return
}
+ return linkq
}
diff --git a/cache.go b/cache.go
index 12c7b74..73b4d58 100644
--- a/cache.go
+++ b/cache.go
@@ -1,5 +1,5 @@
-// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
// SPDX-License-Identifier: GPL-3.0-only
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
package jarink
@@ -12,13 +12,6 @@ import (
"git.sr.ht/~shulhan/jarink/internal"
)
-// ScannedLink store information about the link.
-type ScannedLink struct {
- Url string `json:"url"`
- Size int64 `json:"size"`
- ResponseCode int `json:"response_code"`
-}
-
// Cache store external links that has been scanned, to minize
// request to the same URL in the future.
// The cache is stored as JSON file under user's cache directory, inside
@@ -26,7 +19,7 @@ type ScannedLink struct {
// For example, in Linux it should be "$HOME/.cache/jarink/cache.json".
// See [os.UserCacheDir] for location specific to operating system.
type Cache struct {
- ScannedLinks map[string]*ScannedLink `json:"scanned_links"`
+ ScannedLinks map[string]*Link `json:"scanned_links"`
file string
mtx sync.Mutex
}
@@ -36,7 +29,7 @@ func LoadCache() (cache *Cache, err error) {
var logp = `LoadCache`
cache = &Cache{
- ScannedLinks: map[string]*ScannedLink{},
+ ScannedLinks: map[string]*Link{},
}
cache.file, err = internal.CacheFile()
@@ -62,7 +55,7 @@ func LoadCache() (cache *Cache, err error) {
}
// Get return the scanned link information by url.
-func (cache *Cache) Get(url string) (scannedLink *ScannedLink) {
+func (cache *Cache) Get(url string) (scannedLink *Link) {
cache.mtx.Lock()
scannedLink = cache.ScannedLinks[url]
cache.mtx.Unlock()
@@ -95,7 +88,7 @@ func (cache *Cache) Set(url string, respCode int, size int64) {
if scannedLink != nil {
return
}
- scannedLink = &ScannedLink{
+ scannedLink = &Link{
Url: url,
Size: size,
ResponseCode: respCode,
diff --git a/cmd/jarink/main.go b/cmd/jarink/main.go
index afd1dbd..c2dc5b6 100644
--- a/cmd/jarink/main.go
+++ b/cmd/jarink/main.go
@@ -1,5 +1,5 @@
-// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
// SPDX-License-Identifier: GPL-3.0-only
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
package main
@@ -70,7 +70,7 @@ func main() {
if err != nil {
log.Fatal(err.Error())
}
- fmt.Printf("%s\n", resultJson)
+ fmt.Println(string(resultJson))
return
case `help`:
diff --git a/link.go b/link.go
new file mode 100644
index 0000000..a03808b
--- /dev/null
+++ b/link.go
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-3.0-only
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+
+package jarink
+
+// Link store information about the link.
+type Link struct {
+ Url string `json:"url"`
+ Size int64 `json:"size"`
+ ResponseCode int `json:"response_code"`
+}