diff options
| -rw-r--r-- | brokenlinks/brokenlinks.go | 6 | ||||
| -rw-r--r-- | brokenlinks/brokenlinks_test.go | 47 | ||||
| -rw-r--r-- | brokenlinks/link_queue.go | 21 | ||||
| -rw-r--r-- | brokenlinks/testdata/exp_cache.json | 19 | ||||
| -rw-r--r-- | brokenlinks/testdata/exp_cache.json.license | 2 | ||||
| -rw-r--r-- | brokenlinks/testdata/web/index.html | 5 | ||||
| -rw-r--r-- | brokenlinks/worker.go | 72 | ||||
| -rw-r--r-- | cache.go | 104 | ||||
| -rw-r--r-- | internal/internal.go | 34 |
9 files changed, 279 insertions, 31 deletions
diff --git a/brokenlinks/brokenlinks.go b/brokenlinks/brokenlinks.go index 5ba25d9..7b2e282 100644 --- a/brokenlinks/brokenlinks.go +++ b/brokenlinks/brokenlinks.go @@ -5,6 +5,7 @@ package brokenlinks import ( "fmt" + "log" ) const Version = `0.1.0` @@ -34,5 +35,10 @@ func Scan(opts Options) (result *Result, err error) { return nil, fmt.Errorf(`%s: %w`, logp, err) } + err = wrk.cache.Save() + if err != nil { + log.Printf(`%s: %s`, logp, err) + } + return result, nil } diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go index d9f9a59..f957ae3 100644 --- a/brokenlinks/brokenlinks_test.go +++ b/brokenlinks/brokenlinks_test.go @@ -8,6 +8,7 @@ import ( "log" "net/http" "os" + "path/filepath" "testing" "time" @@ -15,6 +16,7 @@ import ( "git.sr.ht/~shulhan/pakakeh.go/lib/test" "git.sr.ht/~shulhan/jarink/brokenlinks" + "git.sr.ht/~shulhan/jarink/internal" ) // The test run four web servers. @@ -41,6 +43,16 @@ const testAddressSlow = `127.0.0.1:11839` func TestMain(m *testing.M) { log.SetFlags(0) + + var orgCacheFile = internal.CacheFile + var tmpCacheFile = filepath.Join(os.TempDir(), `cache.json`) + internal.CacheFile = func() (string, error) { + return tmpCacheFile, nil + } + defer func() { + internal.CacheFile = orgCacheFile + }() + var httpDirWeb = http.Dir(`testdata/web`) var fshandle = http.FileServer(httpDirWeb) @@ -234,6 +246,7 @@ func TestScan(t *testing.T) { Url: testUrl, IgnoreStatus: `403`, Insecure: true, + IsVerbose: true, }, exp: map[string][]brokenlinks.Broken{ testUrl: []brokenlinks.Broken{ @@ -276,7 +289,8 @@ func TestScan(t *testing.T) { // Scanning on "/page2" should not scan the the "/" or other // pages other than below of "/page2" itself. 
opts: brokenlinks.Options{ - Url: testUrl + `/page2`, + Url: testUrl + `/page2`, + IsVerbose: true, }, exp: map[string][]brokenlinks.Broken{ testUrl + `/page2`: []brokenlinks.Broken{ @@ -406,3 +420,34 @@ func TestScan_slow(t *testing.T) { } test.Assert(t, `TestScan_slow`, expResult, gotResult) } + +func TestBrokenlinks_cache(t *testing.T) { + var orgCacheFile = internal.CacheFile + var gotCacheFile = filepath.Join(t.TempDir(), `cache.json`) + var expCacheFile = filepath.Join(`testdata`, `exp_cache.json`) + defer func() { + internal.CacheFile = orgCacheFile + }() + internal.CacheFile = func() (string, error) { + return gotCacheFile, nil + } + + var testUrl = `http://` + testAddress + var opts = brokenlinks.Options{ + Url: testUrl, + IgnoreStatus: `403`, + Insecure: true, + } + + var err error + _, err = brokenlinks.Scan(opts) + gotCache, err := os.ReadFile(gotCacheFile) + if err != nil { + t.Fatal(err) + } + expCache, err := os.ReadFile(expCacheFile) + if err != nil { + t.Fatal(err) + } + test.Assert(t, `cache`, string(gotCache), string(expCache)) +} diff --git a/brokenlinks/link_queue.go b/brokenlinks/link_queue.go index 6a7dd32..14bf8c7 100644 --- a/brokenlinks/link_queue.go +++ b/brokenlinks/link_queue.go @@ -5,7 +5,6 @@ package brokenlinks import ( "net/url" - "strings" "golang.org/x/net/html/atom" ) @@ -33,23 +32,7 @@ type linkQueue struct { // 200 - 211: OK. // 400 - 511: Error. status int -} -// checkExternal set the isExternal field to be true if -// -// (1) [linkQueue.url] does not start with [Options.Url] -// -// (2) linkQueue is from scanPastResult, indicated by non-nil -// [worker.pastResult]. -// In this case, we did not want to scan the other pages from the same scanUrl -// domain. 
-func (linkq *linkQueue) checkExternal(wrk *worker) { - if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) { - linkq.isExternal = true - return - } - if wrk.pastResult != nil { - linkq.isExternal = true - return - } + // Size of the page, derived from HTTP response ContentLength. + size int64 } diff --git a/brokenlinks/testdata/exp_cache.json b/brokenlinks/testdata/exp_cache.json new file mode 100644 index 0000000..563164d --- /dev/null +++ b/brokenlinks/testdata/exp_cache.json @@ -0,0 +1,19 @@ +{ + "scanned_links": { + "http://127.0.0.1:11900": { + "url": "http://127.0.0.1:11900", + "size": 976, + "response_code": 200 + }, + "http://127.0.0.1:11900/page2": { + "url": "http://127.0.0.1:11900/page2", + "size": 410, + "response_code": 200 + }, + "https://127.0.0.1:11838": { + "url": "https://127.0.0.1:11838", + "size": 976, + "response_code": 200 + } + } +} diff --git a/brokenlinks/testdata/exp_cache.json.license b/brokenlinks/testdata/exp_cache.json.license new file mode 100644 index 0000000..22616a9 --- /dev/null +++ b/brokenlinks/testdata/exp_cache.json.license @@ -0,0 +1,2 @@ +SPDX-FileCopyrightText: 2025 M. 
Shulhan <ms@kilabit.info> +SPDX-License-Identifier: GPL-3.0-only diff --git a/brokenlinks/testdata/web/index.html b/brokenlinks/testdata/web/index.html index 7b9101c..596d374 100644 --- a/brokenlinks/testdata/web/index.html +++ b/brokenlinks/testdata/web/index.html @@ -10,11 +10,16 @@ SPDX-License-Identifier: GPL-3.0-only <img width="200" src="" /> <a href="/page2">Page 2</a> <a href="/broken.html">Broken HTML</a> + + <!-- External link --> <a href="http://127.0.0.1:11900">External URL</a> + <!-- Error when fetching with GET --> <a href="http:/127.0.0.1:11836">Invalid external URL</a> + <!-- Error when parsing URL --> <a href="http://127.0.0.1:abc">Invalid URL port</a> + <!-- Fragment should be skipped and cleaned up --> <a href="#goto_a">Same with href to "/"</a> <a href="/page2#goto_a">Same with href to "/page2"</a> diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go index 3eacf01..8d7918f 100644 --- a/brokenlinks/worker.go +++ b/brokenlinks/worker.go @@ -19,6 +19,8 @@ import ( "golang.org/x/net/html" "golang.org/x/net/html/atom" + + "git.sr.ht/~shulhan/jarink" ) type worker struct { @@ -41,6 +43,9 @@ type worker struct { // links or image. baseUrl *url.URL + // cache of scanned links. 
+ cache *jarink.Cache + log *log.Logger httpc *http.Client @@ -79,6 +84,11 @@ func newWorker(opts Options) (wrk *worker, err error) { }, } + wrk.cache, err = jarink.LoadCache() + if err != nil { + return nil, err + } + wrk.baseUrl = &url.URL{ Scheme: wrk.opts.scanUrl.Scheme, Host: wrk.opts.scanUrl.Host, @@ -135,9 +145,14 @@ func (wrk *worker) scanAll() (result *Result, err error) { wrk.seenLink[linkq.url] = linkq.status continue } - if linkq.status >= http.StatusBadRequest { - wrk.markBroken(linkq) - continue + + if linkq.isExternal { + var scannedLink = wrk.cache.Get(linkq.url) + if scannedLink != nil { + linkq.status = scannedLink.ResponseCode + wrk.seen(linkq) + continue + } } wrk.seenLink[linkq.url] = http.StatusProcessing @@ -206,17 +221,27 @@ func (wrk *worker) processResult( newList []linkQueue, ) { for _, linkq := range resultq { - if linkq.status >= http.StatusBadRequest { - wrk.markBroken(linkq) - continue - } + // Process the scanned page first. + if linkq.status != 0 { - // linkq is the result of scan with - // non error status. - wrk.seenLink[linkq.url] = linkq.status + wrk.seen(linkq) + if linkq.isExternal && linkq.status != StatusBadLink { + wrk.cache.Set(linkq.url, linkq.status, linkq.size) + } continue } + // Now process the links inside the page. 
+ + if linkq.isExternal { + var scannedLink = wrk.cache.Get(linkq.url) + if scannedLink != nil { + linkq.status = scannedLink.ResponseCode + wrk.seen(linkq) + continue + } + } + seenStatus, seen := wrk.seenLink[linkq.url] if !seen { wrk.seenLink[linkq.url] = http.StatusProcessing @@ -257,6 +282,14 @@ func (wrk *worker) processResult( return newList } +func (wrk *worker) seen(linkq linkQueue) { + if linkq.status >= http.StatusBadRequest { + wrk.markBroken(linkq) + return + } + wrk.seenLink[linkq.url] = linkq.status +} + func (wrk *worker) markBroken(linkq linkQueue) { var parentUrl = linkq.parentUrl.String() var listBroken = wrk.result.BrokenLinks[parentUrl] @@ -299,6 +332,7 @@ func (wrk *worker) scan(linkq linkQueue) { defer httpResp.Body.Close() linkq.status = httpResp.StatusCode + linkq.size = httpResp.ContentLength resultq[linkq.url] = linkq if slices.Contains(wrk.opts.ignoreStatus, httpResp.StatusCode) { @@ -361,7 +395,7 @@ func (wrk *worker) scan(linkq linkQueue) { } _, seen := resultq[nodeLink.url] if !seen { - nodeLink.checkExternal(wrk) + wrk.checkExternal(nodeLink) resultq[nodeLink.url] = *nodeLink } } @@ -459,3 +493,19 @@ func (wrk *worker) pushResult(resultq map[string]linkQueue) { } } } + +// checkExternal set the [linkQueue.isExternal] field to true if +// +// (1) [linkQueue.url] does not start with [Options.Url] +// (2) linkQueue is from scanPastResult, indicated by non-nil +// [worker.pastResult]. +func (wrk *worker) checkExternal(linkq *linkQueue) { + if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) { + linkq.isExternal = true + return + } + if wrk.pastResult != nil { + linkq.isExternal = true + return + } +} diff --git a/cache.go b/cache.go new file mode 100644 index 0000000..12c7b74 --- /dev/null +++ b/cache.go @@ -0,0 +1,104 @@ +// SPDX-FileCopyrightText: 2025 M. 
Shulhan <ms@kilabit.info> +// SPDX-License-Identifier: GPL-3.0-only + +package jarink + +import ( + "encoding/json" + "fmt" + "os" + "sync" + + "git.sr.ht/~shulhan/jarink/internal" +) + +// ScannedLink store information about the link. +type ScannedLink struct { + Url string `json:"url"` + Size int64 `json:"size"` + ResponseCode int `json:"response_code"` +} + +// Cache store external links that have been scanned, to minimize +// request to the same URL in the future. +// The cache is stored as JSON file under user's cache directory, inside +// "jarink" directory. +// For example, in Linux it should be "$HOME/.cache/jarink/cache.json". +// See [os.UserCacheDir] for location specific to operating system. +type Cache struct { + ScannedLinks map[string]*ScannedLink `json:"scanned_links"` + file string + mtx sync.Mutex +} + +// LoadCache from local storage. +func LoadCache() (cache *Cache, err error) { + var logp = `LoadCache` + + cache = &Cache{ + ScannedLinks: map[string]*ScannedLink{}, + } + + cache.file, err = internal.CacheFile() + if err != nil { + return nil, fmt.Errorf(`%s: %w`, logp, err) + } + + var cacheJson []byte + cacheJson, err = os.ReadFile(cache.file) + if err != nil { + if os.IsNotExist(err) { + return cache, nil + } + return nil, fmt.Errorf(`%s: %w`, logp, err) + } + + err = json.Unmarshal(cacheJson, &cache) + if err != nil { + return nil, fmt.Errorf(`%s: %w`, logp, err) + } + + return cache, nil +} + +// Get return the scanned link information by url. +func (cache *Cache) Get(url string) (scannedLink *ScannedLink) { + cache.mtx.Lock() + scannedLink = cache.ScannedLinks[url] + cache.mtx.Unlock() + return scannedLink +} + +// Save the cache into local storage. 
+func (cache *Cache) Save() (err error) { + var logp = `Save` + var cacheJson []byte + cacheJson, err = json.MarshalIndent(cache, ``, ` `) + if err != nil { + return fmt.Errorf(`%s: %w`, logp, err) + } + + cacheJson = append(cacheJson, '\n') + + err = os.WriteFile(cache.file, cacheJson, 0600) + if err != nil { + return fmt.Errorf(`%s: %w`, logp, err) + } + return nil +} + +func (cache *Cache) Set(url string, respCode int, size int64) { + cache.mtx.Lock() + defer cache.mtx.Unlock() + + var scannedLink = cache.ScannedLinks[url] + if scannedLink != nil { + return + } + scannedLink = &ScannedLink{ + Url: url, + Size: size, + ResponseCode: respCode, + } + cache.ScannedLinks[url] = scannedLink +} diff --git a/internal/internal.go b/internal/internal.go new file mode 100644 index 0000000..7127932 --- /dev/null +++ b/internal/internal.go @@ -0,0 +1,34 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> +// SPDX-License-Identifier: GPL-3.0-only + +package internal + +import ( + "fmt" + "os" + "path/filepath" +) + +// CacheFile return the path to cache file under [os.UserCacheDir] + +// "jarink" directory. +// This variable defined here so the test file can override it. +var CacheFile = DefaultCacheFile + +func DefaultCacheFile() (cacheFile string, err error) { + var logp = `DefaultCacheFile` + var cacheDir string + + cacheDir, err = os.UserCacheDir() + if err != nil { + return ``, fmt.Errorf(`%s: %w`, logp, err) + } + cacheDir = filepath.Join(cacheDir, `jarink`) + + err = os.MkdirAll(cacheDir, 0700) + if err != nil { + return ``, fmt.Errorf(`%s: %w`, logp, err) + } + + cacheFile = filepath.Join(cacheDir, `cache.json`) + return cacheFile, nil +} |
