summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--brokenlinks/brokenlinks.go6
-rw-r--r--brokenlinks/brokenlinks_test.go47
-rw-r--r--brokenlinks/link_queue.go21
-rw-r--r--brokenlinks/testdata/exp_cache.json19
-rw-r--r--brokenlinks/testdata/exp_cache.json.license2
-rw-r--r--brokenlinks/testdata/web/index.html5
-rw-r--r--brokenlinks/worker.go72
-rw-r--r--cache.go104
-rw-r--r--internal/internal.go34
9 files changed, 279 insertions, 31 deletions
diff --git a/brokenlinks/brokenlinks.go b/brokenlinks/brokenlinks.go
index 5ba25d9..7b2e282 100644
--- a/brokenlinks/brokenlinks.go
+++ b/brokenlinks/brokenlinks.go
@@ -5,6 +5,7 @@ package brokenlinks
import (
"fmt"
+ "log"
)
const Version = `0.1.0`
@@ -34,5 +35,10 @@ func Scan(opts Options) (result *Result, err error) {
return nil, fmt.Errorf(`%s: %w`, logp, err)
}
+ err = wrk.cache.Save()
+ if err != nil {
+ log.Printf(`%s: %s`, logp, err)
+ }
+
return result, nil
}
diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go
index d9f9a59..f957ae3 100644
--- a/brokenlinks/brokenlinks_test.go
+++ b/brokenlinks/brokenlinks_test.go
@@ -8,6 +8,7 @@ import (
"log"
"net/http"
"os"
+ "path/filepath"
"testing"
"time"
@@ -15,6 +16,7 @@ import (
"git.sr.ht/~shulhan/pakakeh.go/lib/test"
"git.sr.ht/~shulhan/jarink/brokenlinks"
+ "git.sr.ht/~shulhan/jarink/internal"
)
// The test run four web servers.
@@ -41,6 +43,16 @@ const testAddressSlow = `127.0.0.1:11839`
func TestMain(m *testing.M) {
log.SetFlags(0)
+
+ var orgCacheFile = internal.CacheFile
+ var tmpCacheFile = filepath.Join(os.TempDir(), `cache.json`)
+ internal.CacheFile = func() (string, error) {
+ return tmpCacheFile, nil
+ }
+ defer func() {
+ internal.CacheFile = orgCacheFile
+ }()
+
var httpDirWeb = http.Dir(`testdata/web`)
var fshandle = http.FileServer(httpDirWeb)
@@ -234,6 +246,7 @@ func TestScan(t *testing.T) {
Url: testUrl,
IgnoreStatus: `403`,
Insecure: true,
+ IsVerbose: true,
},
exp: map[string][]brokenlinks.Broken{
testUrl: []brokenlinks.Broken{
@@ -276,7 +289,8 @@ func TestScan(t *testing.T) {
// Scanning on "/page2" should not scan the "/" or other
// pages other than below of "/page2" itself.
opts: brokenlinks.Options{
- Url: testUrl + `/page2`,
+ Url: testUrl + `/page2`,
+ IsVerbose: true,
},
exp: map[string][]brokenlinks.Broken{
testUrl + `/page2`: []brokenlinks.Broken{
@@ -406,3 +420,34 @@ func TestScan_slow(t *testing.T) {
}
test.Assert(t, `TestScan_slow`, expResult, gotResult)
}
+
+// TestBrokenlinks_cache scans the test web server with the cache file
+// redirected into the test's temporary directory, then compares the cache
+// file written by the scan with testdata/exp_cache.json.
+func TestBrokenlinks_cache(t *testing.T) {
+	var orgCacheFile = internal.CacheFile
+	var gotCacheFile = filepath.Join(t.TempDir(), `cache.json`)
+	var expCacheFile = filepath.Join(`testdata`, `exp_cache.json`)
+	defer func() {
+		internal.CacheFile = orgCacheFile
+	}()
+	internal.CacheFile = func() (string, error) {
+		return gotCacheFile, nil
+	}
+
+	var testUrl = `http://` + testAddress
+	var opts = brokenlinks.Options{
+		Url:          testUrl,
+		IgnoreStatus: `403`,
+		Insecure:     true,
+	}
+
+	// Stop on a failed scan; otherwise the error would be overwritten by
+	// the ReadFile call below and never reported.
+	_, err := brokenlinks.Scan(opts)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	gotCache, err := os.ReadFile(gotCacheFile)
+	if err != nil {
+		t.Fatal(err)
+	}
+	expCache, err := os.ReadFile(expCacheFile)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Expected value first, then the actual one, matching the order used
+	// by the other test.Assert calls in this file.
+	test.Assert(t, `cache`, string(expCache), string(gotCache))
+}
diff --git a/brokenlinks/link_queue.go b/brokenlinks/link_queue.go
index 6a7dd32..14bf8c7 100644
--- a/brokenlinks/link_queue.go
+++ b/brokenlinks/link_queue.go
@@ -5,7 +5,6 @@ package brokenlinks
import (
"net/url"
- "strings"
"golang.org/x/net/html/atom"
)
@@ -33,23 +32,7 @@ type linkQueue struct {
// 200 - 211: OK.
// 400 - 511: Error.
status int
-}
-// checkExternal set the isExternal field to be true if
-//
-// (1) [linkQueue.url] does not start with [Options.Url]
-//
-// (2) linkQueue is from scanPastResult, indicated by non-nil
-// [worker.pastResult].
-// In this case, we did not want to scan the other pages from the same scanUrl
-// domain.
-func (linkq *linkQueue) checkExternal(wrk *worker) {
- if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) {
- linkq.isExternal = true
- return
- }
- if wrk.pastResult != nil {
- linkq.isExternal = true
- return
- }
+ // Size of the page, derived from HTTP response ContentLength.
+ size int64
}
diff --git a/brokenlinks/testdata/exp_cache.json b/brokenlinks/testdata/exp_cache.json
new file mode 100644
index 0000000..563164d
--- /dev/null
+++ b/brokenlinks/testdata/exp_cache.json
@@ -0,0 +1,19 @@
+{
+ "scanned_links": {
+ "http://127.0.0.1:11900": {
+ "url": "http://127.0.0.1:11900",
+ "size": 976,
+ "response_code": 200
+ },
+ "http://127.0.0.1:11900/page2": {
+ "url": "http://127.0.0.1:11900/page2",
+ "size": 410,
+ "response_code": 200
+ },
+ "https://127.0.0.1:11838": {
+ "url": "https://127.0.0.1:11838",
+ "size": 976,
+ "response_code": 200
+ }
+ }
+}
diff --git a/brokenlinks/testdata/exp_cache.json.license b/brokenlinks/testdata/exp_cache.json.license
new file mode 100644
index 0000000..22616a9
--- /dev/null
+++ b/brokenlinks/testdata/exp_cache.json.license
@@ -0,0 +1,2 @@
+SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+SPDX-License-Identifier: GPL-3.0-only
diff --git a/brokenlinks/testdata/web/index.html b/brokenlinks/testdata/web/index.html
index 7b9101c..596d374 100644
--- a/brokenlinks/testdata/web/index.html
+++ b/brokenlinks/testdata/web/index.html
@@ -10,11 +10,16 @@ SPDX-License-Identifier: GPL-3.0-only
<img width="200" src="" />
<a href="/page2">Page 2</a>
<a href="/broken.html">Broken HTML</a>
+
+ <!-- External link -->
<a href="http://127.0.0.1:11900">External URL</a>
+
<!-- Error when fetching with GET -->
<a href="http:/127.0.0.1:11836">Invalid external URL</a>
+
<!-- Error when parsing URL -->
<a href="http://127.0.0.1:abc">Invalid URL port</a>
+
<!-- Fragment should be skipped and cleaned up -->
<a href="#goto_a">Same with href to "/"</a>
<a href="/page2#goto_a">Same with href to "/page2"</a>
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index 3eacf01..8d7918f 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -19,6 +19,8 @@ import (
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
+
+ "git.sr.ht/~shulhan/jarink"
)
type worker struct {
@@ -41,6 +43,9 @@ type worker struct {
// links or image.
baseUrl *url.URL
+ // cache of scanned links.
+ cache *jarink.Cache
+
log *log.Logger
httpc *http.Client
@@ -79,6 +84,11 @@ func newWorker(opts Options) (wrk *worker, err error) {
},
}
+ wrk.cache, err = jarink.LoadCache()
+ if err != nil {
+ return nil, err
+ }
+
wrk.baseUrl = &url.URL{
Scheme: wrk.opts.scanUrl.Scheme,
Host: wrk.opts.scanUrl.Host,
@@ -135,9 +145,14 @@ func (wrk *worker) scanAll() (result *Result, err error) {
wrk.seenLink[linkq.url] = linkq.status
continue
}
- if linkq.status >= http.StatusBadRequest {
- wrk.markBroken(linkq)
- continue
+
+ if linkq.isExternal {
+ var scannedLink = wrk.cache.Get(linkq.url)
+ if scannedLink != nil {
+ linkq.status = scannedLink.ResponseCode
+ wrk.seen(linkq)
+ continue
+ }
}
wrk.seenLink[linkq.url] = http.StatusProcessing
@@ -206,17 +221,27 @@ func (wrk *worker) processResult(
newList []linkQueue,
) {
for _, linkq := range resultq {
- if linkq.status >= http.StatusBadRequest {
- wrk.markBroken(linkq)
- continue
- }
+ // Process the scanned page first.
+
if linkq.status != 0 {
- // linkq is the result of scan with
- // non error status.
- wrk.seenLink[linkq.url] = linkq.status
+ wrk.seen(linkq)
+ if linkq.isExternal && linkq.status != StatusBadLink {
+ wrk.cache.Set(linkq.url, linkq.status, linkq.size)
+ }
continue
}
+ // Now process the links inside the page.
+
+ if linkq.isExternal {
+ var scannedLink = wrk.cache.Get(linkq.url)
+ if scannedLink != nil {
+ linkq.status = scannedLink.ResponseCode
+ wrk.seen(linkq)
+ continue
+ }
+ }
+
seenStatus, seen := wrk.seenLink[linkq.url]
if !seen {
wrk.seenLink[linkq.url] = http.StatusProcessing
@@ -257,6 +282,14 @@ func (wrk *worker) processResult(
return newList
}
+func (wrk *worker) seen(linkq linkQueue) {
+ if linkq.status >= http.StatusBadRequest {
+ wrk.markBroken(linkq)
+ return
+ }
+ wrk.seenLink[linkq.url] = linkq.status
+}
+
func (wrk *worker) markBroken(linkq linkQueue) {
var parentUrl = linkq.parentUrl.String()
var listBroken = wrk.result.BrokenLinks[parentUrl]
@@ -299,6 +332,7 @@ func (wrk *worker) scan(linkq linkQueue) {
defer httpResp.Body.Close()
linkq.status = httpResp.StatusCode
+ linkq.size = httpResp.ContentLength
resultq[linkq.url] = linkq
if slices.Contains(wrk.opts.ignoreStatus, httpResp.StatusCode) {
@@ -361,7 +395,7 @@ func (wrk *worker) scan(linkq linkQueue) {
}
_, seen := resultq[nodeLink.url]
if !seen {
- nodeLink.checkExternal(wrk)
+ wrk.checkExternal(nodeLink)
resultq[nodeLink.url] = *nodeLink
}
}
@@ -459,3 +493,19 @@ func (wrk *worker) pushResult(resultq map[string]linkQueue) {
}
}
}
+
+// checkExternal set the [linkQueue.isExternal] field to true if
+//
+// (1) [linkQueue.url] does not start with [Options.Url]
+// (2) linkQueue is not from scanPastResult, indicated by non-nil
+// [worker.pastResult].
+func (wrk *worker) checkExternal(linkq *linkQueue) {
+ if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) {
+ linkq.isExternal = true
+ return
+ }
+ if wrk.pastResult != nil {
+ linkq.isExternal = true
+ return
+ }
+}
diff --git a/cache.go b/cache.go
new file mode 100644
index 0000000..12c7b74
--- /dev/null
+++ b/cache.go
@@ -0,0 +1,104 @@
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+// SPDX-License-Identifier: GPL-3.0-only
+
+package jarink
+
+import (
+ "encoding/json"
+ "fmt"
+ "os"
+ "sync"
+
+ "git.sr.ht/~shulhan/jarink/internal"
+)
+
+// ScannedLink stores the information recorded for one scanned link.
+// The fields are encoded to JSON in declaration order.
+type ScannedLink struct {
+ // Url is the address of the link, as passed to [Cache.Set].
+ Url string `json:"url"`
+ // Size is the size value passed to [Cache.Set].
+ Size int64 `json:"size"`
+ // ResponseCode is the response code passed to [Cache.Set].
+ ResponseCode int `json:"response_code"`
+}
+
+// Cache stores external links that have been scanned, to minimize
+// requests to the same URL in the future.
+// The cache is stored as a JSON file under the user's cache directory,
+// inside the "jarink" directory.
+// For example, in Linux it should be "$HOME/.cache/jarink/cache.json".
+// See [os.UserCacheDir] for the location specific to each operating system.
+type Cache struct {
+ // ScannedLinks maps a link URL to its recorded information.
+ ScannedLinks map[string]*ScannedLink `json:"scanned_links"`
+ // file is the path of the cache file; it is set by LoadCache from
+ // internal.CacheFile and written by Save.
+ file string
+ // mtx is held by Get and Set while they access ScannedLinks.
+ mtx sync.Mutex
+}
+
+// LoadCache reads the cache from the file named by internal.CacheFile.
+//
+// A missing cache file is not an error: an empty, ready to use Cache is
+// returned instead.
+// Any other read failure, or content that cannot be decoded as JSON, is
+// returned as an error prefixed with "LoadCache".
+func LoadCache() (cache *Cache, err error) {
+	var logp = `LoadCache`
+
+	cache = &Cache{
+		ScannedLinks: map[string]*ScannedLink{},
+	}
+
+	cache.file, err = internal.CacheFile()
+	if err != nil {
+		return nil, fmt.Errorf(`%s: %w`, logp, err)
+	}
+
+	var cacheJson []byte
+	cacheJson, err = os.ReadFile(cache.file)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return cache, nil
+		}
+		return nil, fmt.Errorf(`%s: %w`, logp, err)
+	}
+
+	// Decode into the Cache value itself, not into the address of the
+	// pointer variable: with a **Cache target a JSON "null" document
+	// sets cache to nil, which would then be returned without an error.
+	err = json.Unmarshal(cacheJson, cache)
+	if err != nil {
+		return nil, fmt.Errorf(`%s: %w`, logp, err)
+	}
+	if cache.ScannedLinks == nil {
+		// A `"scanned_links": null` member replaces the map with nil;
+		// restore it so Set does not write into a nil map.
+		cache.ScannedLinks = map[string]*ScannedLink{}
+	}
+
+	return cache, nil
+}
+
+// Get returns the scanned link information stored under url, or nil if
+// url has no entry in the cache.
+func (cache *Cache) Get(url string) (scannedLink *ScannedLink) {
+	cache.mtx.Lock()
+	defer cache.mtx.Unlock()
+
+	return cache.ScannedLinks[url]
+}
+
+// Save writes the cache as indented JSON, followed by a newline, into the
+// cache file using permission 0600.
+// Errors from encoding or writing are returned prefixed with "Save".
+func (cache *Cache) Save() (err error) {
+	var logp = `Save`
+	var cacheJson []byte
+
+	// Encode under the same lock that Get and Set take, so the map is
+	// not read while another goroutine is inserting into it.
+	// The lock is released before the file is written.
+	cache.mtx.Lock()
+	cacheJson, err = json.MarshalIndent(cache, ``, ` `)
+	cache.mtx.Unlock()
+	if err != nil {
+		return fmt.Errorf(`%s: %w`, logp, err)
+	}
+
+	cacheJson = append(cacheJson, '\n')
+
+	err = os.WriteFile(cache.file, cacheJson, 0600)
+	if err != nil {
+		return fmt.Errorf(`%s: %w`, logp, err)
+	}
+	return nil
+}
+
+func (cache *Cache) Set(url string, respCode int, size int64) {
+ cache.mtx.Lock()
+ defer cache.mtx.Unlock()
+
+ var scannedLink = cache.ScannedLinks[url]
+ if scannedLink != nil {
+ return
+ }
+ scannedLink = &ScannedLink{
+ Url: url,
+ Size: size,
+ ResponseCode: respCode,
+ }
+ cache.ScannedLinks[url] = scannedLink
+}
diff --git a/internal/internal.go b/internal/internal.go
new file mode 100644
index 0000000..7127932
--- /dev/null
+++ b/internal/internal.go
@@ -0,0 +1,34 @@
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+// SPDX-License-Identifier: GPL-3.0-only
+
+package internal
+
+import (
+ "fmt"
+ "os"
+ "path/filepath"
+)
+
+// CacheFile returns the path to the cache file under [os.UserCacheDir] +
+// "jarink" directory.
+// This variable is defined here so the test files can override it.
+var CacheFile = DefaultCacheFile
+
+// DefaultCacheFile returns the path of "cache.json" inside the "jarink"
+// directory under [os.UserCacheDir].
+// The "jarink" directory, including any missing parents, is created with
+// mode 0700 before the path is returned.
+// Errors are returned prefixed with "DefaultCacheFile".
+func DefaultCacheFile() (cacheFile string, err error) {
+	const logp = `DefaultCacheFile`
+
+	userCacheDir, err := os.UserCacheDir()
+	if err != nil {
+		return ``, fmt.Errorf(`%s: %w`, logp, err)
+	}
+
+	var jarinkDir = filepath.Join(userCacheDir, `jarink`)
+	if err = os.MkdirAll(jarinkDir, 0700); err != nil {
+		return ``, fmt.Errorf(`%s: %w`, logp, err)
+	}
+
+	return filepath.Join(jarinkDir, `cache.json`), nil
+}