aboutsummaryrefslogtreecommitdiff
path: root/brokenlinks
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2025-06-21 15:20:01 +0700
committerShulhan <ms@kilabit.info>2025-06-27 12:19:23 +0700
commit1ca561ed0ecfa59b70a10191ac8e58cde90d126e (patch)
tree80f0c65f7e9321ad92dfc1a53a444226cee4be3d /brokenlinks
parent8bc8fce1bd80b5a25c452ac5a24b1a1e3f5a4feb (diff)
downloadjarink-1ca561ed0ecfa59b70a10191ac8e58cde90d126e.tar.xz
brokenlinks: implement caching for external URLs
Any successful fetch of an external URL will be recorded into the jarink cache file, located in the user's home cache directory. For example, on Linux it would be `$HOME/.cache/jarink/cache.json`. This helps improve future rescanning of the same or a different target URL, minimizing network requests.
Diffstat (limited to 'brokenlinks')
-rw-r--r--brokenlinks/brokenlinks.go6
-rw-r--r--brokenlinks/brokenlinks_test.go47
-rw-r--r--brokenlinks/link_queue.go21
-rw-r--r--brokenlinks/testdata/exp_cache.json19
-rw-r--r--brokenlinks/testdata/exp_cache.json.license2
-rw-r--r--brokenlinks/testdata/web/index.html5
-rw-r--r--brokenlinks/worker.go72
7 files changed, 141 insertions, 31 deletions
diff --git a/brokenlinks/brokenlinks.go b/brokenlinks/brokenlinks.go
index 5ba25d9..7b2e282 100644
--- a/brokenlinks/brokenlinks.go
+++ b/brokenlinks/brokenlinks.go
@@ -5,6 +5,7 @@ package brokenlinks
import (
"fmt"
+ "log"
)
const Version = `0.1.0`
@@ -34,5 +35,10 @@ func Scan(opts Options) (result *Result, err error) {
return nil, fmt.Errorf(`%s: %w`, logp, err)
}
+ err = wrk.cache.Save()
+ if err != nil {
+ log.Printf(`%s: %s`, logp, err)
+ }
+
return result, nil
}
diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go
index d9f9a59..f957ae3 100644
--- a/brokenlinks/brokenlinks_test.go
+++ b/brokenlinks/brokenlinks_test.go
@@ -8,6 +8,7 @@ import (
"log"
"net/http"
"os"
+ "path/filepath"
"testing"
"time"
@@ -15,6 +16,7 @@ import (
"git.sr.ht/~shulhan/pakakeh.go/lib/test"
"git.sr.ht/~shulhan/jarink/brokenlinks"
+ "git.sr.ht/~shulhan/jarink/internal"
)
// The test run four web servers.
@@ -41,6 +43,16 @@ const testAddressSlow = `127.0.0.1:11839`
func TestMain(m *testing.M) {
log.SetFlags(0)
+
+ var orgCacheFile = internal.CacheFile
+ var tmpCacheFile = filepath.Join(os.TempDir(), `cache.json`)
+ internal.CacheFile = func() (string, error) {
+ return tmpCacheFile, nil
+ }
+ defer func() {
+ internal.CacheFile = orgCacheFile
+ }()
+
var httpDirWeb = http.Dir(`testdata/web`)
var fshandle = http.FileServer(httpDirWeb)
@@ -234,6 +246,7 @@ func TestScan(t *testing.T) {
Url: testUrl,
IgnoreStatus: `403`,
Insecure: true,
+ IsVerbose: true,
},
exp: map[string][]brokenlinks.Broken{
testUrl: []brokenlinks.Broken{
@@ -276,7 +289,8 @@ func TestScan(t *testing.T) {
// Scanning on "/page2" should not scan the the "/" or other
// pages other than below of "/page2" itself.
opts: brokenlinks.Options{
- Url: testUrl + `/page2`,
+ Url: testUrl + `/page2`,
+ IsVerbose: true,
},
exp: map[string][]brokenlinks.Broken{
testUrl + `/page2`: []brokenlinks.Broken{
@@ -406,3 +420,34 @@ func TestScan_slow(t *testing.T) {
}
test.Assert(t, `TestScan_slow`, expResult, gotResult)
}
+
+func TestBrokenlinks_cache(t *testing.T) {
+ var orgCacheFile = internal.CacheFile
+ var gotCacheFile = filepath.Join(t.TempDir(), `cache.json`)
+ var expCacheFile = filepath.Join(`testdata`, `exp_cache.json`)
+ defer func() {
+ internal.CacheFile = orgCacheFile
+ }()
+ internal.CacheFile = func() (string, error) {
+ return gotCacheFile, nil
+ }
+
+ var testUrl = `http://` + testAddress
+ var opts = brokenlinks.Options{
+ Url: testUrl,
+ IgnoreStatus: `403`,
+ Insecure: true,
+ }
+
+ var err error
+ _, err = brokenlinks.Scan(opts)
+ gotCache, err := os.ReadFile(gotCacheFile)
+ if err != nil {
+ t.Fatal(err)
+ }
+ expCache, err := os.ReadFile(expCacheFile)
+ if err != nil {
+ t.Fatal(err)
+ }
+ test.Assert(t, `cache`, string(gotCache), string(expCache))
+}
diff --git a/brokenlinks/link_queue.go b/brokenlinks/link_queue.go
index 6a7dd32..14bf8c7 100644
--- a/brokenlinks/link_queue.go
+++ b/brokenlinks/link_queue.go
@@ -5,7 +5,6 @@ package brokenlinks
import (
"net/url"
- "strings"
"golang.org/x/net/html/atom"
)
@@ -33,23 +32,7 @@ type linkQueue struct {
// 200 - 211: OK.
// 400 - 511: Error.
status int
-}
-// checkExternal set the isExternal field to be true if
-//
-// (1) [linkQueue.url] does not start with [Options.Url]
-//
-// (2) linkQueue is from scanPastResult, indicated by non-nil
-// [worker.pastResult].
-// In this case, we did not want to scan the other pages from the same scanUrl
-// domain.
-func (linkq *linkQueue) checkExternal(wrk *worker) {
- if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) {
- linkq.isExternal = true
- return
- }
- if wrk.pastResult != nil {
- linkq.isExternal = true
- return
- }
+ // Size of the page, derived from HTTP response ContentLength.
+ size int64
}
diff --git a/brokenlinks/testdata/exp_cache.json b/brokenlinks/testdata/exp_cache.json
new file mode 100644
index 0000000..563164d
--- /dev/null
+++ b/brokenlinks/testdata/exp_cache.json
@@ -0,0 +1,19 @@
+{
+ "scanned_links": {
+ "http://127.0.0.1:11900": {
+ "url": "http://127.0.0.1:11900",
+ "size": 976,
+ "response_code": 200
+ },
+ "http://127.0.0.1:11900/page2": {
+ "url": "http://127.0.0.1:11900/page2",
+ "size": 410,
+ "response_code": 200
+ },
+ "https://127.0.0.1:11838": {
+ "url": "https://127.0.0.1:11838",
+ "size": 976,
+ "response_code": 200
+ }
+ }
+}
diff --git a/brokenlinks/testdata/exp_cache.json.license b/brokenlinks/testdata/exp_cache.json.license
new file mode 100644
index 0000000..22616a9
--- /dev/null
+++ b/brokenlinks/testdata/exp_cache.json.license
@@ -0,0 +1,2 @@
+SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+SPDX-License-Identifier: GPL-3.0-only
diff --git a/brokenlinks/testdata/web/index.html b/brokenlinks/testdata/web/index.html
index 7b9101c..596d374 100644
--- a/brokenlinks/testdata/web/index.html
+++ b/brokenlinks/testdata/web/index.html
@@ -10,11 +10,16 @@ SPDX-License-Identifier: GPL-3.0-only
<img width="200" src="" />
<a href="/page2">Page 2</a>
<a href="/broken.html">Broken HTML</a>
+
+ <!-- External link -->
<a href="http://127.0.0.1:11900">External URL</a>
+
<!-- Error when fetching with GET -->
<a href="http:/127.0.0.1:11836">Invalid external URL</a>
+
<!-- Error when parsing URL -->
<a href="http://127.0.0.1:abc">Invalid URL port</a>
+
<!-- Fragment should be skipped and cleaned up -->
<a href="#goto_a">Same with href to "/"</a>
<a href="/page2#goto_a">Same with href to "/page2"</a>
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index 3eacf01..8d7918f 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -19,6 +19,8 @@ import (
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
+
+ "git.sr.ht/~shulhan/jarink"
)
type worker struct {
@@ -41,6 +43,9 @@ type worker struct {
// links or image.
baseUrl *url.URL
+ // cache of scanned links.
+ cache *jarink.Cache
+
log *log.Logger
httpc *http.Client
@@ -79,6 +84,11 @@ func newWorker(opts Options) (wrk *worker, err error) {
},
}
+ wrk.cache, err = jarink.LoadCache()
+ if err != nil {
+ return nil, err
+ }
+
wrk.baseUrl = &url.URL{
Scheme: wrk.opts.scanUrl.Scheme,
Host: wrk.opts.scanUrl.Host,
@@ -135,9 +145,14 @@ func (wrk *worker) scanAll() (result *Result, err error) {
wrk.seenLink[linkq.url] = linkq.status
continue
}
- if linkq.status >= http.StatusBadRequest {
- wrk.markBroken(linkq)
- continue
+
+ if linkq.isExternal {
+ var scannedLink = wrk.cache.Get(linkq.url)
+ if scannedLink != nil {
+ linkq.status = scannedLink.ResponseCode
+ wrk.seen(linkq)
+ continue
+ }
}
wrk.seenLink[linkq.url] = http.StatusProcessing
@@ -206,17 +221,27 @@ func (wrk *worker) processResult(
newList []linkQueue,
) {
for _, linkq := range resultq {
- if linkq.status >= http.StatusBadRequest {
- wrk.markBroken(linkq)
- continue
- }
+ // Process the scanned page first.
+
if linkq.status != 0 {
- // linkq is the result of scan with
- // non error status.
- wrk.seenLink[linkq.url] = linkq.status
+ wrk.seen(linkq)
+ if linkq.isExternal && linkq.status != StatusBadLink {
+ wrk.cache.Set(linkq.url, linkq.status, linkq.size)
+ }
continue
}
+ // Now process the links inside the page.
+
+ if linkq.isExternal {
+ var scannedLink = wrk.cache.Get(linkq.url)
+ if scannedLink != nil {
+ linkq.status = scannedLink.ResponseCode
+ wrk.seen(linkq)
+ continue
+ }
+ }
+
seenStatus, seen := wrk.seenLink[linkq.url]
if !seen {
wrk.seenLink[linkq.url] = http.StatusProcessing
@@ -257,6 +282,14 @@ func (wrk *worker) processResult(
return newList
}
+func (wrk *worker) seen(linkq linkQueue) {
+ if linkq.status >= http.StatusBadRequest {
+ wrk.markBroken(linkq)
+ return
+ }
+ wrk.seenLink[linkq.url] = linkq.status
+}
+
func (wrk *worker) markBroken(linkq linkQueue) {
var parentUrl = linkq.parentUrl.String()
var listBroken = wrk.result.BrokenLinks[parentUrl]
@@ -299,6 +332,7 @@ func (wrk *worker) scan(linkq linkQueue) {
defer httpResp.Body.Close()
linkq.status = httpResp.StatusCode
+ linkq.size = httpResp.ContentLength
resultq[linkq.url] = linkq
if slices.Contains(wrk.opts.ignoreStatus, httpResp.StatusCode) {
@@ -361,7 +395,7 @@ func (wrk *worker) scan(linkq linkQueue) {
}
_, seen := resultq[nodeLink.url]
if !seen {
- nodeLink.checkExternal(wrk)
+ wrk.checkExternal(nodeLink)
resultq[nodeLink.url] = *nodeLink
}
}
@@ -459,3 +493,19 @@ func (wrk *worker) pushResult(resultq map[string]linkQueue) {
}
}
}
+
+// checkExternal sets the [linkQueue.isExternal] field to true if
+//
+// (1) [linkQueue.url] does not start with [Options.Url], or
+// (2) linkQueue is from scanPastResult, indicated by non-nil
+// [worker.pastResult].
+func (wrk *worker) checkExternal(linkq *linkQueue) {
+ if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) {
+ linkq.isExternal = true
+ return
+ }
+ if wrk.pastResult != nil {
+ linkq.isExternal = true
+ return
+ }
+}