From 1ca561ed0ecfa59b70a10191ac8e58cde90d126e Mon Sep 17 00:00:00 2001 From: Shulhan Date: Sat, 21 Jun 2025 15:20:01 +0700 Subject: brokenlinks: implement caching for external URLs Any successful fetch on external URLs will be recorded into jarink cache file, located in user's home cache directory. For example, in Linux it would be `$HOME/.cache/jarink/cache.json`. This helps improve the future rescanning on the same or different target URL, minimizing network requests. --- brokenlinks/brokenlinks.go | 6 ++ brokenlinks/brokenlinks_test.go | 47 ++++++++++++- brokenlinks/link_queue.go | 21 +----- brokenlinks/testdata/exp_cache.json | 19 +++++ brokenlinks/testdata/exp_cache.json.license | 2 + brokenlinks/testdata/web/index.html | 5 ++ brokenlinks/worker.go | 72 ++++++++++++++++--- cache.go | 104 ++++++++++++++++++++++++++++ internal/internal.go | 34 +++++++++ 9 files changed, 279 insertions(+), 31 deletions(-) create mode 100644 brokenlinks/testdata/exp_cache.json create mode 100644 brokenlinks/testdata/exp_cache.json.license create mode 100644 cache.go create mode 100644 internal/internal.go diff --git a/brokenlinks/brokenlinks.go b/brokenlinks/brokenlinks.go index 5ba25d9..7b2e282 100644 --- a/brokenlinks/brokenlinks.go +++ b/brokenlinks/brokenlinks.go @@ -5,6 +5,7 @@ package brokenlinks import ( "fmt" + "log" ) const Version = `0.1.0` @@ -34,5 +35,10 @@ func Scan(opts Options) (result *Result, err error) { return nil, fmt.Errorf(`%s: %w`, logp, err) } + err = wrk.cache.Save() + if err != nil { + log.Printf(`%s: %s`, logp, err) + } + return result, nil } diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go index d9f9a59..f957ae3 100644 --- a/brokenlinks/brokenlinks_test.go +++ b/brokenlinks/brokenlinks_test.go @@ -8,6 +8,7 @@ import ( "log" "net/http" "os" + "path/filepath" "testing" "time" @@ -15,6 +16,7 @@ import ( "git.sr.ht/~shulhan/pakakeh.go/lib/test" "git.sr.ht/~shulhan/jarink/brokenlinks" + "git.sr.ht/~shulhan/jarink/internal" ) // 
The test run four web servers. @@ -41,6 +43,16 @@ const testAddressSlow = `127.0.0.1:11839` func TestMain(m *testing.M) { log.SetFlags(0) + + var orgCacheFile = internal.CacheFile + var tmpCacheFile = filepath.Join(os.TempDir(), `cache.json`) + internal.CacheFile = func() (string, error) { + return tmpCacheFile, nil + } + defer func() { + internal.CacheFile = orgCacheFile + }() + var httpDirWeb = http.Dir(`testdata/web`) var fshandle = http.FileServer(httpDirWeb) @@ -234,6 +246,7 @@ func TestScan(t *testing.T) { Url: testUrl, IgnoreStatus: `403`, Insecure: true, + IsVerbose: true, }, exp: map[string][]brokenlinks.Broken{ testUrl: []brokenlinks.Broken{ @@ -276,7 +289,8 @@ func TestScan(t *testing.T) { // Scanning on "/page2" should not scan the the "/" or other // pages other than below of "/page2" itself. opts: brokenlinks.Options{ - Url: testUrl + `/page2`, + Url: testUrl + `/page2`, + IsVerbose: true, }, exp: map[string][]brokenlinks.Broken{ testUrl + `/page2`: []brokenlinks.Broken{ @@ -406,3 +420,34 @@ func TestScan_slow(t *testing.T) { } test.Assert(t, `TestScan_slow`, expResult, gotResult) } + +func TestBrokenlinks_cache(t *testing.T) { + var orgCacheFile = internal.CacheFile + var gotCacheFile = filepath.Join(t.TempDir(), `cache.json`) + var expCacheFile = filepath.Join(`testdata`, `exp_cache.json`) + defer func() { + internal.CacheFile = orgCacheFile + }() + internal.CacheFile = func() (string, error) { + return gotCacheFile, nil + } + + var testUrl = `http://` + testAddress + var opts = brokenlinks.Options{ + Url: testUrl, + IgnoreStatus: `403`, + Insecure: true, + } + + var err error + _, err = brokenlinks.Scan(opts) + gotCache, err := os.ReadFile(gotCacheFile) + if err != nil { + t.Fatal(err) + } + expCache, err := os.ReadFile(expCacheFile) + if err != nil { + t.Fatal(err) + } + test.Assert(t, `cache`, string(gotCache), string(expCache)) +} diff --git a/brokenlinks/link_queue.go b/brokenlinks/link_queue.go index 6a7dd32..14bf8c7 100644 --- 
a/brokenlinks/link_queue.go +++ b/brokenlinks/link_queue.go @@ -5,7 +5,6 @@ package brokenlinks import ( "net/url" - "strings" "golang.org/x/net/html/atom" ) @@ -33,23 +32,7 @@ type linkQueue struct { // 200 - 211: OK. // 400 - 511: Error. status int -} -// checkExternal set the isExternal field to be true if -// -// (1) [linkQueue.url] does not start with [Options.Url] -// -// (2) linkQueue is from scanPastResult, indicated by non-nil -// [worker.pastResult]. -// In this case, we did not want to scan the other pages from the same scanUrl -// domain. -func (linkq *linkQueue) checkExternal(wrk *worker) { - if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) { - linkq.isExternal = true - return - } - if wrk.pastResult != nil { - linkq.isExternal = true - return - } + // Size of the page, derived from HTTP response ContentLength. + size int64 } diff --git a/brokenlinks/testdata/exp_cache.json b/brokenlinks/testdata/exp_cache.json new file mode 100644 index 0000000..563164d --- /dev/null +++ b/brokenlinks/testdata/exp_cache.json @@ -0,0 +1,19 @@ +{ + "scanned_links": { + "http://127.0.0.1:11900": { + "url": "http://127.0.0.1:11900", + "size": 976, + "response_code": 200 + }, + "http://127.0.0.1:11900/page2": { + "url": "http://127.0.0.1:11900/page2", + "size": 410, + "response_code": 200 + }, + "https://127.0.0.1:11838": { + "url": "https://127.0.0.1:11838", + "size": 976, + "response_code": 200 + } + } +} diff --git a/brokenlinks/testdata/exp_cache.json.license b/brokenlinks/testdata/exp_cache.json.license new file mode 100644 index 0000000..22616a9 --- /dev/null +++ b/brokenlinks/testdata/exp_cache.json.license @@ -0,0 +1,2 @@ +SPDX-FileCopyrightText: 2025 M. 
Shulhan +SPDX-License-Identifier: GPL-3.0-only diff --git a/brokenlinks/testdata/web/index.html b/brokenlinks/testdata/web/index.html index 7b9101c..596d374 100644 --- a/brokenlinks/testdata/web/index.html +++ b/brokenlinks/testdata/web/index.html @@ -10,11 +10,16 @@ SPDX-License-Identifier: GPL-3.0-only Page 2 Broken HTML + + External URL + Invalid external URL + Invalid URL port + Same with href to "/" Same with href to "/page2" diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go index 3eacf01..8d7918f 100644 --- a/brokenlinks/worker.go +++ b/brokenlinks/worker.go @@ -19,6 +19,8 @@ import ( "golang.org/x/net/html" "golang.org/x/net/html/atom" + + "git.sr.ht/~shulhan/jarink" ) type worker struct { @@ -41,6 +43,9 @@ type worker struct { // links or image. baseUrl *url.URL + // cache of scanned links. + cache *jarink.Cache + log *log.Logger httpc *http.Client @@ -79,6 +84,11 @@ func newWorker(opts Options) (wrk *worker, err error) { }, } + wrk.cache, err = jarink.LoadCache() + if err != nil { + return nil, err + } + wrk.baseUrl = &url.URL{ Scheme: wrk.opts.scanUrl.Scheme, Host: wrk.opts.scanUrl.Host, @@ -135,9 +145,14 @@ func (wrk *worker) scanAll() (result *Result, err error) { wrk.seenLink[linkq.url] = linkq.status continue } - if linkq.status >= http.StatusBadRequest { - wrk.markBroken(linkq) - continue + + if linkq.isExternal { + var scannedLink = wrk.cache.Get(linkq.url) + if scannedLink != nil { + linkq.status = scannedLink.ResponseCode + wrk.seen(linkq) + continue + } } wrk.seenLink[linkq.url] = http.StatusProcessing @@ -206,17 +221,27 @@ func (wrk *worker) processResult( newList []linkQueue, ) { for _, linkq := range resultq { - if linkq.status >= http.StatusBadRequest { - wrk.markBroken(linkq) - continue - } + // Process the scanned page first. + if linkq.status != 0 { - // linkq is the result of scan with - // non error status. 
- wrk.seenLink[linkq.url] = linkq.status + wrk.seen(linkq) + if linkq.isExternal && linkq.status != StatusBadLink { + wrk.cache.Set(linkq.url, linkq.status, linkq.size) + } continue } + // Now process the links inside the page. + + if linkq.isExternal { + var scannedLink = wrk.cache.Get(linkq.url) + if scannedLink != nil { + linkq.status = scannedLink.ResponseCode + wrk.seen(linkq) + continue + } + } + seenStatus, seen := wrk.seenLink[linkq.url] if !seen { wrk.seenLink[linkq.url] = http.StatusProcessing @@ -257,6 +282,14 @@ func (wrk *worker) processResult( return newList } +func (wrk *worker) seen(linkq linkQueue) { + if linkq.status >= http.StatusBadRequest { + wrk.markBroken(linkq) + return + } + wrk.seenLink[linkq.url] = linkq.status +} + func (wrk *worker) markBroken(linkq linkQueue) { var parentUrl = linkq.parentUrl.String() var listBroken = wrk.result.BrokenLinks[parentUrl] @@ -299,6 +332,7 @@ func (wrk *worker) scan(linkq linkQueue) { defer httpResp.Body.Close() linkq.status = httpResp.StatusCode + linkq.size = httpResp.ContentLength resultq[linkq.url] = linkq if slices.Contains(wrk.opts.ignoreStatus, httpResp.StatusCode) { @@ -361,7 +395,7 @@ func (wrk *worker) scan(linkq linkQueue) { } _, seen := resultq[nodeLink.url] if !seen { - nodeLink.checkExternal(wrk) + wrk.checkExternal(nodeLink) resultq[nodeLink.url] = *nodeLink } } @@ -459,3 +493,19 @@ func (wrk *worker) pushResult(resultq map[string]linkQueue) { } } } + +// checkExternal sets the [linkQueue.isExternal] field to true if +// +// (1) [linkQueue.url] does not start with [Options.Url] +// (2) linkQueue is from scanPastResult, indicated by non-nil +// [worker.pastResult]. 
+func (wrk *worker) checkExternal(linkq *linkQueue) { + if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) { + linkq.isExternal = true + return + } + if wrk.pastResult != nil { + linkq.isExternal = true + return + } +} diff --git a/cache.go b/cache.go new file mode 100644 index 0000000..12c7b74 --- /dev/null +++ b/cache.go @@ -0,0 +1,104 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan +// SPDX-License-Identifier: GPL-3.0-only + +package jarink + +import ( + "encoding/json" + "fmt" + "os" + "sync" + + "git.sr.ht/~shulhan/jarink/internal" +) + +// ScannedLink stores information about the link. +type ScannedLink struct { + Url string `json:"url"` + Size int64 `json:"size"` + ResponseCode int `json:"response_code"` +} + +// Cache stores external links that have been scanned, to minimize +// requests to the same URL in the future. +// The cache is stored as JSON file under user's cache directory, inside +// "jarink" directory. +// For example, in Linux it should be "$HOME/.cache/jarink/cache.json". +// See [os.UserCacheDir] for location specific to operating system. +type Cache struct { + ScannedLinks map[string]*ScannedLink `json:"scanned_links"` + file string + mtx sync.Mutex +} + +// LoadCache from local storage. +func LoadCache() (cache *Cache, err error) { + var logp = `LoadCache` + + cache = &Cache{ + ScannedLinks: map[string]*ScannedLink{}, + } + + cache.file, err = internal.CacheFile() + if err != nil { + return nil, fmt.Errorf(`%s: %w`, logp, err) + } + + var cacheJson []byte + cacheJson, err = os.ReadFile(cache.file) + if err != nil { + if os.IsNotExist(err) { + return cache, nil + } + return nil, fmt.Errorf(`%s: %w`, logp, err) + } + + err = json.Unmarshal(cacheJson, &cache) + if err != nil { + return nil, fmt.Errorf(`%s: %w`, logp, err) + } + + return cache, nil +} + +// Get returns the scanned link information by url. 
+func (cache *Cache) Get(url string) (scannedLink *ScannedLink) { + cache.mtx.Lock() + scannedLink = cache.ScannedLinks[url] + cache.mtx.Unlock() + return scannedLink +} + +// Save the cache into local storage. +func (cache *Cache) Save() (err error) { + var logp = `Save` + var cacheJson []byte + cacheJson, err = json.MarshalIndent(cache, ``, ` `) + if err != nil { + return fmt.Errorf(`%s: %w`, logp, err) + } + + cacheJson = append(cacheJson, '\n') + + err = os.WriteFile(cache.file, cacheJson, 0600) + if err != nil { + return fmt.Errorf(`%s: %w`, logp, err) + } + return nil +} + +func (cache *Cache) Set(url string, respCode int, size int64) { + cache.mtx.Lock() + defer cache.mtx.Unlock() + + var scannedLink = cache.ScannedLinks[url] + if scannedLink != nil { + return + } + scannedLink = &ScannedLink{ + Url: url, + Size: size, + ResponseCode: respCode, + } + cache.ScannedLinks[url] = scannedLink +} diff --git a/internal/internal.go b/internal/internal.go new file mode 100644 index 0000000..7127932 --- /dev/null +++ b/internal/internal.go @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan +// SPDX-License-Identifier: GPL-3.0-only + +package internal + +import ( + "fmt" + "os" + "path/filepath" +) + +// CacheFile return the path to cache file under [os.UserCacheDir] + +// "jarink" directory. +// This variable defined here so the test file can override it. +var CacheFile = DefaultCacheFile + +func DefaultCacheFile() (cacheFile string, err error) { + var logp = `DefaultCacheFile` + var cacheDir string + + cacheDir, err = os.UserCacheDir() + if err != nil { + return ``, fmt.Errorf(`%s: %w`, logp, err) + } + cacheDir = filepath.Join(cacheDir, `jarink`) + + err = os.MkdirAll(cacheDir, 0700) + if err != nil { + return ``, fmt.Errorf(`%s: %w`, logp, err) + } + + cacheFile = filepath.Join(cacheDir, `cache.json`) + return cacheFile, nil +} -- cgit v1.3