diff options
| -rw-r--r-- | README | 81 | ||||
| -rw-r--r-- | brokenlinks.go | 5 | ||||
| -rw-r--r-- | brokenlinks_test.go | 59 | ||||
| -rw-r--r-- | brokenlinks_worker.go | 206 | ||||
| -rw-r--r-- | cmd/jarink/main.go | 14 | ||||
| -rw-r--r-- | link_queue.go | 20 | ||||
| -rw-r--r-- | testdata/past_result.json | 8 | ||||
| -rw-r--r-- | testdata/past_result.json.license | 2 |
8 files changed, 268 insertions, 127 deletions
@@ -7,40 +7,22 @@ and maintains their website. jarink [OPTIONS] <COMMAND> <args...> -Available commands, +Available command, brokenlinks - scan the website for broken links (page and images). - help - print the usage of the command. + help - print the usage of the command. == Usage -[OPTIONS] brokenlinks URL +[OPTIONS] brokenlinks <URL> - Start scanning for broken links on the web server pointed by URL. + Scan for broken links on the web server pointed by URL. Links will be scanned on anchor href attribute ("<a href=...>") or on the image src attribute ("<img src=..."). The URL can be start from base or from sub path. Scanning from path only report brokenlinks on that path and their sub paths. - For example, given a website that have the following pages, - - - web.tld (base) - - web.tld/page1 - - web.tld/page1/sub1 - - web.tld/page2 - - web.tld/page2/sub1 - - Invoking brokenlinks with - - $ jarink brokenlinks https://web.tld - - will scan all of the pages, but invoking brokenlinks on path - "/page2" - - $ jarink brokenlinks https://web.tld/page2 - - only scan "/page2" and "/page2/sub1". Once finished it will print the page and list of broken links in JSON format to standard output, @@ -58,36 +40,35 @@ Available commands, This command accept the following options, - -verbose : print the page that being scanned to standard - error. + -verbose + + Print the page that being scanned to standard error. + + -past-result=<path to JSON file> - Example, + Scan only the pages reported by result from past scan based + on the content in JSON file. This minimize the time to + re-scan the pages once we have fixed the URLs. + + For example, given a website that have the following pages, + + - web.tld (base) + - web.tld/page1 + - web.tld/page1/sub1 + - web.tld/page2 + - web.tld/page2/sub1 + + Invoking brokenlinks with + + $ jarink brokenlinks https://web.tld + + will scan all of the pages, but invoking brokenlinks on path + "/page2" + + $ jarink brokenlinks https://web.tld/page2 + + only scan "/page2" and "/page2/sub1". - $ jarink brokenlinks https://kilabit.info - { - "https://kilabit.info/some/page": [ - { - "link": "https://kilabit.info/some/page/image.png", - "code": 404 - }, - { - "link": "https://external.com/link", - "error": "Internal server error", - "code": 500 - } - ], - "https://kilabit.info/another/page": [ - { - "link": "https://kilabit.info/another/page/image.png", - "code": 404 - }, - { - "link": "https://external.org/link", - "error": "Internal server error", - "code": 500 - } - ] - } == Notes diff --git a/brokenlinks.go b/brokenlinks.go index 69a14ad..833e2d2 100644 --- a/brokenlinks.go +++ b/brokenlinks.go @@ -25,8 +25,9 @@ type Broken struct { // BrokenlinksOptions define the options for scanning broken links. type BrokenlinksOptions struct { - Url string - IsVerbose bool + Url string + PastResultFile string + IsVerbose bool } // BrokenlinksResult store the result of scanning for broken links. diff --git a/brokenlinks_test.go b/brokenlinks_test.go index 2c8ad15..b2d5c80 100644 --- a/brokenlinks_test.go +++ b/brokenlinks_test.go @@ -4,6 +4,7 @@ package jarink_test import ( + "encoding/json" "net/http" "testing" @@ -105,3 +106,61 @@ func TestBrokenlinks(t *testing.T) { test.Assert(t, tcase.scanUrl, tcase.exp, result.PageLinks) } } + +// Test running Brokenlinks with file PastResultFile is set. +// The PastResultFile is modified to only report errors on "/page2". +func TestBrokenlinks_pastResult(t *testing.T) { + var testUrl = `http://` + testAddress + + type testCase struct { + exp map[string][]jarink.Broken + expError string + opts jarink.BrokenlinksOptions + } + + listCase := []testCase{{ + // With invalid file. + opts: jarink.BrokenlinksOptions{ + Url: testUrl, + PastResultFile: `testdata/invalid`, + }, + expError: `brokenlinks: open testdata/invalid: no such file or directory`, + }, { + // With valid file. + opts: jarink.BrokenlinksOptions{ + Url: testUrl, + PastResultFile: `testdata/past_result.json`, + }, + exp: map[string][]jarink.Broken{ + testUrl + `/page2`: []jarink.Broken{ + { + Link: testUrl + `/broken.png`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/page2/broken/relative`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/page2/broken2.png`, + Code: http.StatusNotFound, + }, + }, + }, + }} + + var ( + result *jarink.BrokenlinksResult + err error + ) + for _, tcase := range listCase { + t.Logf(`--- brokenlinks: %s`, tcase.opts.Url) + result, err = jarink.Brokenlinks(tcase.opts) + if err != nil { + test.Assert(t, tcase.opts.Url+` error`, + tcase.expError, err.Error()) + continue + } + got, _ := json.MarshalIndent(result.PageLinks, ``, ` `) + t.Logf(`got=%s`, got) + test.Assert(t, tcase.opts.Url, tcase.exp, result.PageLinks) + } +} diff --git a/brokenlinks_worker.go b/brokenlinks_worker.go index 5cc8c25..dbb3453 100644 --- a/brokenlinks_worker.go +++ b/brokenlinks_worker.go @@ -4,6 +4,7 @@ package jarink import ( + "encoding/json" "fmt" "log" "net/http" @@ -29,6 +30,10 @@ type brokenlinksWorker struct { // scanned. result *BrokenlinksResult + // pastResult containts the past scan result, loaded from file + // [BrokenlinksOptions.PastResultFile]. + pastResult *BrokenlinksResult + // The base URL that will be joined to relative or absolute // links or image. baseUrl *url.URL @@ -57,7 +62,6 @@ func newWorker(opts BrokenlinksOptions) (wrk *brokenlinksWorker, err error) { if err != nil { return nil, fmt.Errorf(`invalid URL %q`, opts.Url) } - wrk.scanUrl.Path = strings.TrimSuffix(wrk.scanUrl.Path, `/`) wrk.scanUrl.Fragment = "" wrk.scanUrl.RawFragment = "" @@ -67,10 +71,36 @@ func newWorker(opts BrokenlinksOptions) (wrk *brokenlinksWorker, err error) { Host: wrk.scanUrl.Host, } + if opts.PastResultFile == "" { + // Run with normal scan. + return wrk, nil + } + + pastresult, err := os.ReadFile(opts.PastResultFile) + if err != nil { + return nil, err + } + + wrk.pastResult = newBrokenlinksResult() + err = json.Unmarshal(pastresult, &wrk.pastResult.PageLinks) + if err != nil { + return nil, err + } + return wrk, nil } func (wrk *brokenlinksWorker) run() (result *BrokenlinksResult, err error) { + if wrk.pastResult == nil { + result, err = wrk.scanAll() + } else { + result, err = wrk.scanPastResult() + } + return result, err +} + +// scanAll scan all pages start from [BrokenlinksOptions.Url]. +func (wrk *brokenlinksWorker) scanAll() (result *BrokenlinksResult, err error) { // Scan the first URL to make sure that the server is reachable. var firstLinkq = linkQueue{ parentUrl: nil, @@ -108,73 +138,50 @@ func (wrk *brokenlinksWorker) run() (result *BrokenlinksResult, err error) { for isScanning { select { case resultq := <-wrk.resultq: + listWaitStatus = wrk.processResult(resultq, listWaitStatus) - // The resultq contains the original URL being scanned - // and its child links. - // For example, scanning "http://example.tld" result - // in - // - // "http://example.tld": {status=200} - // "http://example.tld/page": {status=0} - // "http://example.tld/image.png": {status=0} - // "http://bad:domain/image.png": {status=700} - - var newList []linkQueue - for _, linkq := range resultq { - if linkq.status >= http.StatusBadRequest { - wrk.markBroken(linkq) - continue - } - if linkq.status != 0 { - // linkq is the result of scan with - // non error status. - wrk.seenLink[linkq.url] = linkq.status - continue - } - - seenStatus, seen := wrk.seenLink[linkq.url] - if !seen { - wrk.seenLink[linkq.url] = http.StatusProcessing - wrk.wg.Add(1) - go wrk.scan(linkq) - continue - } - if seenStatus >= http.StatusBadRequest { - linkq.status = seenStatus - wrk.markBroken(linkq) - continue - } - if seenStatus >= http.StatusOK { - // The link has been processed and its - // not an error. - continue - } - if seenStatus == http.StatusProcessing { - // The link being processed by other - // goroutine. - linkq.status = seenStatus - newList = append(newList, linkq) - continue - } - log.Fatalf("link=%s status=%d", linkq.url, linkq.status) + case <-tick.C: + wrk.wg.Wait() + if len(wrk.resultq) != 0 { + continue } - for _, linkq := range listWaitStatus { - seenStatus := wrk.seenLink[linkq.url] - if seenStatus >= http.StatusBadRequest { - linkq.status = seenStatus - wrk.markBroken(linkq) - continue - } - if seenStatus >= http.StatusOK { - continue - } - if seenStatus == http.StatusProcessing { - // Scanning still in progress. - newList = append(newList, linkq) - continue - } + if len(listWaitStatus) != 0 { + // There are links that still waiting for + // scanning to be completed. + continue + } + isScanning = false + } + } + wrk.result.sort() + return wrk.result, nil +} + +// scanPastResult scan only pages reported inside +// [BrokenlinksResult.PageLinks]. +func (wrk *brokenlinksWorker) scanPastResult() ( + result *BrokenlinksResult, err error, +) { + go func() { + for page := range wrk.pastResult.PageLinks { + var linkq = linkQueue{ + parentUrl: nil, + url: page, + status: http.StatusProcessing, } - listWaitStatus = newList + wrk.seenLink[linkq.url] = http.StatusProcessing + wrk.wg.Add(1) + go wrk.scan(linkq) + } + }() + + var tick = time.NewTicker(500 * time.Millisecond) + var listWaitStatus []linkQueue + var isScanning = true + for isScanning { + select { + case resultq := <-wrk.resultq: + listWaitStatus = wrk.processResult(resultq, listWaitStatus) case <-tick.C: wrk.wg.Wait() @@ -193,6 +200,71 @@ func (wrk *brokenlinksWorker) run() (result *BrokenlinksResult, err error) { return wrk.result, nil } +// processResult the resultq contains the original URL being scanned +// and its child links. +// For example, scanning "http://example.tld" result in +// +// "http://example.tld": {status=200} +// "http://example.tld/page": {status=0} +// "http://example.tld/image.png": {status=0} +// "http://bad:domain/image.png": {status=700} +func (wrk *brokenlinksWorker) processResult( + resultq map[string]linkQueue, listWaitStatus []linkQueue, +) ( + newList []linkQueue, +) { + for _, linkq := range resultq { + if linkq.status >= http.StatusBadRequest { + wrk.markBroken(linkq) + continue + } + if linkq.status != 0 { + // linkq is the result of scan with + // non error status. + wrk.seenLink[linkq.url] = linkq.status + continue + } + + seenStatus, seen := wrk.seenLink[linkq.url] + if !seen { + wrk.seenLink[linkq.url] = http.StatusProcessing + wrk.wg.Add(1) + go wrk.scan(linkq) + continue + } + if seenStatus >= http.StatusBadRequest { + linkq.status = seenStatus + wrk.markBroken(linkq) + continue + } + if seenStatus >= http.StatusOK { + // The link has been processed and its + // not an error. + continue + } + // The link being processed by other goroutine. + linkq.status = seenStatus + newList = append(newList, linkq) + } + for _, linkq := range listWaitStatus { + seenStatus := wrk.seenLink[linkq.url] + if seenStatus >= http.StatusBadRequest { + linkq.status = seenStatus + wrk.markBroken(linkq) + continue + } + if seenStatus >= http.StatusOK { + continue + } + if seenStatus == http.StatusProcessing { + // Scanning still in progress. + newList = append(newList, linkq) + continue + } + } + return newList +} + func (wrk *brokenlinksWorker) markBroken(linkq linkQueue) { var parentUrl = linkq.parentUrl.String() var listBroken = wrk.result.PageLinks[parentUrl] @@ -303,9 +375,7 @@ func (wrk *brokenlinksWorker) scan(linkq linkQueue) { } _, seen := resultq[nodeLink.url] if !seen { - if !strings.HasPrefix(nodeLink.url, wrk.scanUrl.String()) { - nodeLink.isExternal = true - } + nodeLink.checkExternal(wrk) resultq[nodeLink.url] = *nodeLink } } diff --git a/cmd/jarink/main.go b/cmd/jarink/main.go index 4f4d206..c8ba2e2 100644 --- a/cmd/jarink/main.go +++ b/cmd/jarink/main.go @@ -17,10 +17,13 @@ import ( func main() { log.SetFlags(0) - var optVerbose bool + var brokenlinksOpts = jarink.BrokenlinksOptions{} - flag.BoolVar(&optVerbose, `verbose`, false, - `print additional information while running`) + flag.BoolVar(&brokenlinksOpts.IsVerbose, `verbose`, false, + `Print additional information while running.`) + + flag.StringVar(&brokenlinksOpts.PastResultFile, `past-result`, ``, + `Scan only pages with broken links from the past JSON result.`) flag.Parse() @@ -28,10 +31,7 @@ func main() { cmd = strings.ToLower(cmd) switch cmd { case `brokenlinks`: - var brokenlinksOpts = jarink.BrokenlinksOptions{ - Url: flag.Arg(1), - IsVerbose: optVerbose, - } + brokenlinksOpts.Url = flag.Arg(1) if brokenlinksOpts.Url == "" { log.Printf(`Missing argument URL to be scanned.`) goto invalid_command diff --git a/link_queue.go b/link_queue.go index 0b419b8..1470115 100644 --- a/link_queue.go +++ b/link_queue.go @@ -5,6 +5,7 @@ package jarink import ( "net/url" + "strings" "golang.org/x/net/html/atom" ) @@ -33,3 +34,22 @@ type linkQueue struct { // 400 - 511: Error. status int } + +// checkExternal set the isExternal field to be true if +// +// (1) [linkQueue.url] does not start with [brokenlinksWorker.scanUrl] +// +// (2) linkQueue is from scanPastResult, indicated by non-nil +// [brokenlinksWorker.pastResult]. +// In this case, we did not want to scan the other pages from the same scanUrl +// domain. +func (linkq *linkQueue) checkExternal(wrk *brokenlinksWorker) { + if !strings.HasPrefix(linkq.url, wrk.scanUrl.String()) { + linkq.isExternal = true + return + } + if wrk.pastResult != nil { + linkq.isExternal = true + return + } +} diff --git a/testdata/past_result.json b/testdata/past_result.json new file mode 100644 index 0000000..3ba37c1 --- /dev/null +++ b/testdata/past_result.json @@ -0,0 +1,8 @@ +{ + "http://127.0.0.1:11836/page2": [ + { + "link": "http://127.0.0.1:11836/", + "code": 404 + } + ] +} diff --git a/testdata/past_result.json.license b/testdata/past_result.json.license new file mode 100644 index 0000000..22616a9 --- /dev/null +++ b/testdata/past_result.json.license @@ -0,0 +1,2 @@ +SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> +SPDX-License-Identifier: GPL-3.0-only |
