diff options
| author | Shulhan <ms@kilabit.info> | 2025-06-21 15:20:01 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2025-06-27 12:19:23 +0700 |
| commit | 1ca561ed0ecfa59b70a10191ac8e58cde90d126e (patch) | |
| tree | 80f0c65f7e9321ad92dfc1a53a444226cee4be3d | |
| parent | 8bc8fce1bd80b5a25c452ac5a24b1a1e3f5a4feb (diff) | |
| download | jarink-1ca561ed0ecfa59b70a10191ac8e58cde90d126e.tar.xz | |
brokenlinks: implement caching for external URLs
Any successful fetch of an external URL will be recorded in the jarink
cache file, located in the user's home cache directory.
For example, in Linux it would be `$HOME/.cache/jarink/cache.json`.
This helps speed up future rescans of the same or a different target
URL by minimizing network requests.
| -rw-r--r-- | brokenlinks/brokenlinks.go | 6 | ||||
| -rw-r--r-- | brokenlinks/brokenlinks_test.go | 47 | ||||
| -rw-r--r-- | brokenlinks/link_queue.go | 21 | ||||
| -rw-r--r-- | brokenlinks/testdata/exp_cache.json | 19 | ||||
| -rw-r--r-- | brokenlinks/testdata/exp_cache.json.license | 2 | ||||
| -rw-r--r-- | brokenlinks/testdata/web/index.html | 5 | ||||
| -rw-r--r-- | brokenlinks/worker.go | 72 | ||||
| -rw-r--r-- | cache.go | 104 | ||||
| -rw-r--r-- | internal/internal.go | 34 |
9 files changed, 279 insertions, 31 deletions
diff --git a/brokenlinks/brokenlinks.go b/brokenlinks/brokenlinks.go index 5ba25d9..7b2e282 100644 --- a/brokenlinks/brokenlinks.go +++ b/brokenlinks/brokenlinks.go @@ -5,6 +5,7 @@ package brokenlinks import ( "fmt" + "log" ) const Version = `0.1.0` @@ -34,5 +35,10 @@ func Scan(opts Options) (result *Result, err error) { return nil, fmt.Errorf(`%s: %w`, logp, err) } + err = wrk.cache.Save() + if err != nil { + log.Printf(`%s: %s`, logp, err) + } + return result, nil } diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go index d9f9a59..f957ae3 100644 --- a/brokenlinks/brokenlinks_test.go +++ b/brokenlinks/brokenlinks_test.go @@ -8,6 +8,7 @@ import ( "log" "net/http" "os" + "path/filepath" "testing" "time" @@ -15,6 +16,7 @@ import ( "git.sr.ht/~shulhan/pakakeh.go/lib/test" "git.sr.ht/~shulhan/jarink/brokenlinks" + "git.sr.ht/~shulhan/jarink/internal" ) // The test run four web servers. @@ -41,6 +43,16 @@ const testAddressSlow = `127.0.0.1:11839` func TestMain(m *testing.M) { log.SetFlags(0) + + var orgCacheFile = internal.CacheFile + var tmpCacheFile = filepath.Join(os.TempDir(), `cache.json`) + internal.CacheFile = func() (string, error) { + return tmpCacheFile, nil + } + defer func() { + internal.CacheFile = orgCacheFile + }() + var httpDirWeb = http.Dir(`testdata/web`) var fshandle = http.FileServer(httpDirWeb) @@ -234,6 +246,7 @@ func TestScan(t *testing.T) { Url: testUrl, IgnoreStatus: `403`, Insecure: true, + IsVerbose: true, }, exp: map[string][]brokenlinks.Broken{ testUrl: []brokenlinks.Broken{ @@ -276,7 +289,8 @@ func TestScan(t *testing.T) { // Scanning on "/page2" should not scan the the "/" or other // pages other than below of "/page2" itself. 
opts: brokenlinks.Options{ - Url: testUrl + `/page2`, + Url: testUrl + `/page2`, + IsVerbose: true, }, exp: map[string][]brokenlinks.Broken{ testUrl + `/page2`: []brokenlinks.Broken{ @@ -406,3 +420,34 @@ func TestScan_slow(t *testing.T) { } test.Assert(t, `TestScan_slow`, expResult, gotResult) } + +func TestBrokenlinks_cache(t *testing.T) { + var orgCacheFile = internal.CacheFile + var gotCacheFile = filepath.Join(t.TempDir(), `cache.json`) + var expCacheFile = filepath.Join(`testdata`, `exp_cache.json`) + defer func() { + internal.CacheFile = orgCacheFile + }() + internal.CacheFile = func() (string, error) { + return gotCacheFile, nil + } + + var testUrl = `http://` + testAddress + var opts = brokenlinks.Options{ + Url: testUrl, + IgnoreStatus: `403`, + Insecure: true, + } + + var err error + _, err = brokenlinks.Scan(opts) + gotCache, err := os.ReadFile(gotCacheFile) + if err != nil { + t.Fatal(err) + } + expCache, err := os.ReadFile(expCacheFile) + if err != nil { + t.Fatal(err) + } + test.Assert(t, `cache`, string(gotCache), string(expCache)) +} diff --git a/brokenlinks/link_queue.go b/brokenlinks/link_queue.go index 6a7dd32..14bf8c7 100644 --- a/brokenlinks/link_queue.go +++ b/brokenlinks/link_queue.go @@ -5,7 +5,6 @@ package brokenlinks import ( "net/url" - "strings" "golang.org/x/net/html/atom" ) @@ -33,23 +32,7 @@ type linkQueue struct { // 200 - 211: OK. // 400 - 511: Error. status int -} -// checkExternal set the isExternal field to be true if -// -// (1) [linkQueue.url] does not start with [Options.Url] -// -// (2) linkQueue is from scanPastResult, indicated by non-nil -// [worker.pastResult]. -// In this case, we did not want to scan the other pages from the same scanUrl -// domain. 
-func (linkq *linkQueue) checkExternal(wrk *worker) { - if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) { - linkq.isExternal = true - return - } - if wrk.pastResult != nil { - linkq.isExternal = true - return - } + // Size of the page, derived from HTTP response ContentLength. + size int64 } diff --git a/brokenlinks/testdata/exp_cache.json b/brokenlinks/testdata/exp_cache.json new file mode 100644 index 0000000..563164d --- /dev/null +++ b/brokenlinks/testdata/exp_cache.json @@ -0,0 +1,19 @@ +{ + "scanned_links": { + "http://127.0.0.1:11900": { + "url": "http://127.0.0.1:11900", + "size": 976, + "response_code": 200 + }, + "http://127.0.0.1:11900/page2": { + "url": "http://127.0.0.1:11900/page2", + "size": 410, + "response_code": 200 + }, + "https://127.0.0.1:11838": { + "url": "https://127.0.0.1:11838", + "size": 976, + "response_code": 200 + } + } +} diff --git a/brokenlinks/testdata/exp_cache.json.license b/brokenlinks/testdata/exp_cache.json.license new file mode 100644 index 0000000..22616a9 --- /dev/null +++ b/brokenlinks/testdata/exp_cache.json.license @@ -0,0 +1,2 @@ +SPDX-FileCopyrightText: 2025 M. 
Shulhan <ms@kilabit.info> +SPDX-License-Identifier: GPL-3.0-only diff --git a/brokenlinks/testdata/web/index.html b/brokenlinks/testdata/web/index.html index 7b9101c..596d374 100644 --- a/brokenlinks/testdata/web/index.html +++ b/brokenlinks/testdata/web/index.html @@ -10,11 +10,16 @@ SPDX-License-Identifier: GPL-3.0-only <img width="200" src="" /> <a href="/page2">Page 2</a> <a href="/broken.html">Broken HTML</a> + + <!-- External link --> <a href="http://127.0.0.1:11900">External URL</a> + <!-- Error when fetching with GET --> <a href="http:/127.0.0.1:11836">Invalid external URL</a> + <!-- Error when parsing URL --> <a href="http://127.0.0.1:abc">Invalid URL port</a> + <!-- Fragment should be skipped and cleaned up --> <a href="#goto_a">Same with href to "/"</a> <a href="/page2#goto_a">Same with href to "/page2"</a> diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go index 3eacf01..8d7918f 100644 --- a/brokenlinks/worker.go +++ b/brokenlinks/worker.go @@ -19,6 +19,8 @@ import ( "golang.org/x/net/html" "golang.org/x/net/html/atom" + + "git.sr.ht/~shulhan/jarink" ) type worker struct { @@ -41,6 +43,9 @@ type worker struct { // links or image. baseUrl *url.URL + // cache of scanned links. 
+ cache *jarink.Cache + log *log.Logger httpc *http.Client @@ -79,6 +84,11 @@ func newWorker(opts Options) (wrk *worker, err error) { }, } + wrk.cache, err = jarink.LoadCache() + if err != nil { + return nil, err + } + wrk.baseUrl = &url.URL{ Scheme: wrk.opts.scanUrl.Scheme, Host: wrk.opts.scanUrl.Host, @@ -135,9 +145,14 @@ func (wrk *worker) scanAll() (result *Result, err error) { wrk.seenLink[linkq.url] = linkq.status continue } - if linkq.status >= http.StatusBadRequest { - wrk.markBroken(linkq) - continue + + if linkq.isExternal { + var scannedLink = wrk.cache.Get(linkq.url) + if scannedLink != nil { + linkq.status = scannedLink.ResponseCode + wrk.seen(linkq) + continue + } } wrk.seenLink[linkq.url] = http.StatusProcessing @@ -206,17 +221,27 @@ func (wrk *worker) processResult( newList []linkQueue, ) { for _, linkq := range resultq { - if linkq.status >= http.StatusBadRequest { - wrk.markBroken(linkq) - continue - } + // Process the scanned page first. + if linkq.status != 0 { - // linkq is the result of scan with - // non error status. - wrk.seenLink[linkq.url] = linkq.status + wrk.seen(linkq) + if linkq.isExternal && linkq.status != StatusBadLink { + wrk.cache.Set(linkq.url, linkq.status, linkq.size) + } continue } + // Now process the links inside the page. 
+ + if linkq.isExternal { + var scannedLink = wrk.cache.Get(linkq.url) + if scannedLink != nil { + linkq.status = scannedLink.ResponseCode + wrk.seen(linkq) + continue + } + } + seenStatus, seen := wrk.seenLink[linkq.url] if !seen { wrk.seenLink[linkq.url] = http.StatusProcessing @@ -257,6 +282,14 @@ func (wrk *worker) processResult( return newList } +func (wrk *worker) seen(linkq linkQueue) { + if linkq.status >= http.StatusBadRequest { + wrk.markBroken(linkq) + return + } + wrk.seenLink[linkq.url] = linkq.status +} + func (wrk *worker) markBroken(linkq linkQueue) { var parentUrl = linkq.parentUrl.String() var listBroken = wrk.result.BrokenLinks[parentUrl] @@ -299,6 +332,7 @@ func (wrk *worker) scan(linkq linkQueue) { defer httpResp.Body.Close() linkq.status = httpResp.StatusCode + linkq.size = httpResp.ContentLength resultq[linkq.url] = linkq if slices.Contains(wrk.opts.ignoreStatus, httpResp.StatusCode) { @@ -361,7 +395,7 @@ func (wrk *worker) scan(linkq linkQueue) { } _, seen := resultq[nodeLink.url] if !seen { - nodeLink.checkExternal(wrk) + wrk.checkExternal(nodeLink) resultq[nodeLink.url] = *nodeLink } } @@ -459,3 +493,19 @@ func (wrk *worker) pushResult(resultq map[string]linkQueue) { } } } + +// checkExternal set the [linkQueue.isExternal] field to true if +// +// (1) [linkQueue.url] does not start with [Options.Url] +// (2) linkQueue is not from scanPastResult, indicated by non-nil +// [worker.pastResult]. +func (wrk *worker) checkExternal(linkq *linkQueue) { + if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) { + linkq.isExternal = true + return + } + if wrk.pastResult != nil { + linkq.isExternal = true + return + } +} diff --git a/cache.go b/cache.go new file mode 100644 index 0000000..12c7b74 --- /dev/null +++ b/cache.go @@ -0,0 +1,104 @@ +// SPDX-FileCopyrightText: 2025 M. 
Shulhan <ms@kilabit.info> +// SPDX-License-Identifier: GPL-3.0-only + +package jarink + +import ( + "encoding/json" + "fmt" + "os" + "sync" + + "git.sr.ht/~shulhan/jarink/internal" +) + +// ScannedLink store information about the link. +type ScannedLink struct { + Url string `json:"url"` + Size int64 `json:"size"` + ResponseCode int `json:"response_code"` +} + +// Cache store external links that has been scanned, to minize +// request to the same URL in the future. +// The cache is stored as JSON file under user's cache directory, inside +// "jarink" directory. +// For example, in Linux it should be "$HOME/.cache/jarink/cache.json". +// See [os.UserCacheDir] for location specific to operating system. +type Cache struct { + ScannedLinks map[string]*ScannedLink `json:"scanned_links"` + file string + mtx sync.Mutex +} + +// LoadCache from local storage. +func LoadCache() (cache *Cache, err error) { + var logp = `LoadCache` + + cache = &Cache{ + ScannedLinks: map[string]*ScannedLink{}, + } + + cache.file, err = internal.CacheFile() + if err != nil { + return nil, fmt.Errorf(`%s: %w`, logp, err) + } + + var cacheJson []byte + cacheJson, err = os.ReadFile(cache.file) + if err != nil { + if os.IsNotExist(err) { + return cache, nil + } + return nil, fmt.Errorf(`%s: %w`, logp, err) + } + + err = json.Unmarshal(cacheJson, &cache) + if err != nil { + return nil, fmt.Errorf(`%s: %w`, logp, err) + } + + return cache, nil +} + +// Get return the scanned link information by url. +func (cache *Cache) Get(url string) (scannedLink *ScannedLink) { + cache.mtx.Lock() + scannedLink = cache.ScannedLinks[url] + cache.mtx.Unlock() + return scannedLink +} + +// Save the cache into local storage. 
+func (cache *Cache) Save() (err error) { + var logp = `Save` + var cacheJson []byte + cacheJson, err = json.MarshalIndent(cache, ``, ` `) + if err != nil { + return fmt.Errorf(`%s: %w`, logp, err) + } + + cacheJson = append(cacheJson, '\n') + + err = os.WriteFile(cache.file, cacheJson, 0600) + if err != nil { + return fmt.Errorf(`%s: %w`, logp, err) + } + return nil +} + +func (cache *Cache) Set(url string, respCode int, size int64) { + cache.mtx.Lock() + defer cache.mtx.Unlock() + + var scannedLink = cache.ScannedLinks[url] + if scannedLink != nil { + return + } + scannedLink = &ScannedLink{ + Url: url, + Size: size, + ResponseCode: respCode, + } + cache.ScannedLinks[url] = scannedLink +} diff --git a/internal/internal.go b/internal/internal.go new file mode 100644 index 0000000..7127932 --- /dev/null +++ b/internal/internal.go @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> +// SPDX-License-Identifier: GPL-3.0-only + +package internal + +import ( + "fmt" + "os" + "path/filepath" +) + +// CacheFile return the path to cache file under [os.UserCacheDir] + +// "jarink" directory. +// This variable defined here so the test file can override it. +var CacheFile = DefaultCacheFile + +func DefaultCacheFile() (cacheFile string, err error) { + var logp = `DefaultCacheFile` + var cacheDir string + + cacheDir, err = os.UserCacheDir() + if err != nil { + return ``, fmt.Errorf(`%s: %w`, logp, err) + } + cacheDir = filepath.Join(cacheDir, `jarink`) + + err = os.MkdirAll(cacheDir, 0700) + if err != nil { + return ``, fmt.Errorf(`%s: %w`, logp, err) + } + + cacheFile = filepath.Join(cacheDir, `cache.json`) + return cacheFile, nil +} |
