diff options
Diffstat (limited to 'brokenlinks')
| -rw-r--r-- | brokenlinks/brokenlinks.go | 6 | ||||
| -rw-r--r-- | brokenlinks/brokenlinks_test.go | 47 | ||||
| -rw-r--r-- | brokenlinks/link_queue.go | 21 | ||||
| -rw-r--r-- | brokenlinks/testdata/exp_cache.json | 19 | ||||
| -rw-r--r-- | brokenlinks/testdata/exp_cache.json.license | 2 | ||||
| -rw-r--r-- | brokenlinks/testdata/web/index.html | 5 | ||||
| -rw-r--r-- | brokenlinks/worker.go | 72 |
7 files changed, 141 insertions, 31 deletions
diff --git a/brokenlinks/brokenlinks.go b/brokenlinks/brokenlinks.go index 5ba25d9..7b2e282 100644 --- a/brokenlinks/brokenlinks.go +++ b/brokenlinks/brokenlinks.go @@ -5,6 +5,7 @@ package brokenlinks import ( "fmt" + "log" ) const Version = `0.1.0` @@ -34,5 +35,10 @@ func Scan(opts Options) (result *Result, err error) { return nil, fmt.Errorf(`%s: %w`, logp, err) } + err = wrk.cache.Save() + if err != nil { + log.Printf(`%s: %s`, logp, err) + } + return result, nil } diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go index d9f9a59..f957ae3 100644 --- a/brokenlinks/brokenlinks_test.go +++ b/brokenlinks/brokenlinks_test.go @@ -8,6 +8,7 @@ import ( "log" "net/http" "os" + "path/filepath" "testing" "time" @@ -15,6 +16,7 @@ import ( "git.sr.ht/~shulhan/pakakeh.go/lib/test" "git.sr.ht/~shulhan/jarink/brokenlinks" + "git.sr.ht/~shulhan/jarink/internal" ) // The test run four web servers. @@ -41,6 +43,16 @@ const testAddressSlow = `127.0.0.1:11839` func TestMain(m *testing.M) { log.SetFlags(0) + + var orgCacheFile = internal.CacheFile + var tmpCacheFile = filepath.Join(os.TempDir(), `cache.json`) + internal.CacheFile = func() (string, error) { + return tmpCacheFile, nil + } + defer func() { + internal.CacheFile = orgCacheFile + }() + var httpDirWeb = http.Dir(`testdata/web`) var fshandle = http.FileServer(httpDirWeb) @@ -234,6 +246,7 @@ func TestScan(t *testing.T) { Url: testUrl, IgnoreStatus: `403`, Insecure: true, + IsVerbose: true, }, exp: map[string][]brokenlinks.Broken{ testUrl: []brokenlinks.Broken{ @@ -276,7 +289,8 @@ func TestScan(t *testing.T) { // Scanning on "/page2" should not scan the the "/" or other // pages other than below of "/page2" itself. 
opts: brokenlinks.Options{ - Url: testUrl + `/page2`, + Url: testUrl + `/page2`, + IsVerbose: true, }, exp: map[string][]brokenlinks.Broken{ testUrl + `/page2`: []brokenlinks.Broken{ @@ -406,3 +420,34 @@ func TestScan_slow(t *testing.T) { } test.Assert(t, `TestScan_slow`, expResult, gotResult) } + +func TestBrokenlinks_cache(t *testing.T) { + var orgCacheFile = internal.CacheFile + var gotCacheFile = filepath.Join(t.TempDir(), `cache.json`) + var expCacheFile = filepath.Join(`testdata`, `exp_cache.json`) + defer func() { + internal.CacheFile = orgCacheFile + }() + internal.CacheFile = func() (string, error) { + return gotCacheFile, nil + } + + var testUrl = `http://` + testAddress + var opts = brokenlinks.Options{ + Url: testUrl, + IgnoreStatus: `403`, + Insecure: true, + } + + var err error + _, err = brokenlinks.Scan(opts) + gotCache, err := os.ReadFile(gotCacheFile) + if err != nil { + t.Fatal(err) + } + expCache, err := os.ReadFile(expCacheFile) + if err != nil { + t.Fatal(err) + } + test.Assert(t, `cache`, string(gotCache), string(expCache)) +} diff --git a/brokenlinks/link_queue.go b/brokenlinks/link_queue.go index 6a7dd32..14bf8c7 100644 --- a/brokenlinks/link_queue.go +++ b/brokenlinks/link_queue.go @@ -5,7 +5,6 @@ package brokenlinks import ( "net/url" - "strings" "golang.org/x/net/html/atom" ) @@ -33,23 +32,7 @@ type linkQueue struct { // 200 - 211: OK. // 400 - 511: Error. status int -} -// checkExternal set the isExternal field to be true if -// -// (1) [linkQueue.url] does not start with [Options.Url] -// -// (2) linkQueue is from scanPastResult, indicated by non-nil -// [worker.pastResult]. -// In this case, we did not want to scan the other pages from the same scanUrl -// domain. 
-func (linkq *linkQueue) checkExternal(wrk *worker) { - if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) { - linkq.isExternal = true - return - } - if wrk.pastResult != nil { - linkq.isExternal = true - return - } + // Size of the page, derived from HTTP response ContentLength. + size int64 } diff --git a/brokenlinks/testdata/exp_cache.json b/brokenlinks/testdata/exp_cache.json new file mode 100644 index 0000000..563164d --- /dev/null +++ b/brokenlinks/testdata/exp_cache.json @@ -0,0 +1,19 @@ +{ + "scanned_links": { + "http://127.0.0.1:11900": { + "url": "http://127.0.0.1:11900", + "size": 976, + "response_code": 200 + }, + "http://127.0.0.1:11900/page2": { + "url": "http://127.0.0.1:11900/page2", + "size": 410, + "response_code": 200 + }, + "https://127.0.0.1:11838": { + "url": "https://127.0.0.1:11838", + "size": 976, + "response_code": 200 + } + } +} diff --git a/brokenlinks/testdata/exp_cache.json.license b/brokenlinks/testdata/exp_cache.json.license new file mode 100644 index 0000000..22616a9 --- /dev/null +++ b/brokenlinks/testdata/exp_cache.json.license @@ -0,0 +1,2 @@ +SPDX-FileCopyrightText: 2025 M. 
Shulhan <ms@kilabit.info> +SPDX-License-Identifier: GPL-3.0-only diff --git a/brokenlinks/testdata/web/index.html b/brokenlinks/testdata/web/index.html index 7b9101c..596d374 100644 --- a/brokenlinks/testdata/web/index.html +++ b/brokenlinks/testdata/web/index.html @@ -10,11 +10,16 @@ SPDX-License-Identifier: GPL-3.0-only <img width="200" src="" /> <a href="/page2">Page 2</a> <a href="/broken.html">Broken HTML</a> + + <!-- External link --> <a href="http://127.0.0.1:11900">External URL</a> + <!-- Error when fetching with GET --> <a href="http:/127.0.0.1:11836">Invalid external URL</a> + <!-- Error when parsing URL --> <a href="http://127.0.0.1:abc">Invalid URL port</a> + <!-- Fragment should be skipped and cleaned up --> <a href="#goto_a">Same with href to "/"</a> <a href="/page2#goto_a">Same with href to "/page2"</a> diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go index 3eacf01..8d7918f 100644 --- a/brokenlinks/worker.go +++ b/brokenlinks/worker.go @@ -19,6 +19,8 @@ import ( "golang.org/x/net/html" "golang.org/x/net/html/atom" + + "git.sr.ht/~shulhan/jarink" ) type worker struct { @@ -41,6 +43,9 @@ type worker struct { // links or image. baseUrl *url.URL + // cache of scanned links. 
+ cache *jarink.Cache + log *log.Logger httpc *http.Client @@ -79,6 +84,11 @@ func newWorker(opts Options) (wrk *worker, err error) { }, } + wrk.cache, err = jarink.LoadCache() + if err != nil { + return nil, err + } + wrk.baseUrl = &url.URL{ Scheme: wrk.opts.scanUrl.Scheme, Host: wrk.opts.scanUrl.Host, @@ -135,9 +145,14 @@ func (wrk *worker) scanAll() (result *Result, err error) { wrk.seenLink[linkq.url] = linkq.status continue } - if linkq.status >= http.StatusBadRequest { - wrk.markBroken(linkq) - continue + + if linkq.isExternal { + var scannedLink = wrk.cache.Get(linkq.url) + if scannedLink != nil { + linkq.status = scannedLink.ResponseCode + wrk.seen(linkq) + continue + } } wrk.seenLink[linkq.url] = http.StatusProcessing @@ -206,17 +221,27 @@ func (wrk *worker) processResult( newList []linkQueue, ) { for _, linkq := range resultq { - if linkq.status >= http.StatusBadRequest { - wrk.markBroken(linkq) - continue - } + // Process the scanned page first. + if linkq.status != 0 { - // linkq is the result of scan with - // non error status. - wrk.seenLink[linkq.url] = linkq.status + wrk.seen(linkq) + if linkq.isExternal && linkq.status != StatusBadLink { + wrk.cache.Set(linkq.url, linkq.status, linkq.size) + } continue } + // Now process the links inside the page. 
+ + if linkq.isExternal { + var scannedLink = wrk.cache.Get(linkq.url) + if scannedLink != nil { + linkq.status = scannedLink.ResponseCode + wrk.seen(linkq) + continue + } + } + seenStatus, seen := wrk.seenLink[linkq.url] if !seen { wrk.seenLink[linkq.url] = http.StatusProcessing @@ -257,6 +282,14 @@ func (wrk *worker) processResult( return newList } +func (wrk *worker) seen(linkq linkQueue) { + if linkq.status >= http.StatusBadRequest { + wrk.markBroken(linkq) + return + } + wrk.seenLink[linkq.url] = linkq.status +} + func (wrk *worker) markBroken(linkq linkQueue) { var parentUrl = linkq.parentUrl.String() var listBroken = wrk.result.BrokenLinks[parentUrl] @@ -299,6 +332,7 @@ func (wrk *worker) scan(linkq linkQueue) { defer httpResp.Body.Close() linkq.status = httpResp.StatusCode + linkq.size = httpResp.ContentLength resultq[linkq.url] = linkq if slices.Contains(wrk.opts.ignoreStatus, httpResp.StatusCode) { @@ -361,7 +395,7 @@ func (wrk *worker) scan(linkq linkQueue) { } _, seen := resultq[nodeLink.url] if !seen { - nodeLink.checkExternal(wrk) + wrk.checkExternal(nodeLink) resultq[nodeLink.url] = *nodeLink } } @@ -459,3 +493,19 @@ func (wrk *worker) pushResult(resultq map[string]linkQueue) { } } } + +// checkExternal sets the [linkQueue.isExternal] field to true if +// +// (1) [linkQueue.url] does not start with [Options.Url] +// (2) linkQueue is from scanPastResult, indicated by non-nil +// [worker.pastResult]. +func (wrk *worker) checkExternal(linkq *linkQueue) { + if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) { + linkq.isExternal = true + return + } + if wrk.pastResult != nil { + linkq.isExternal = true + return + } +} |
