diff options
| -rw-r--r-- | README.adoc | 13 | ||||
| -rw-r--r-- | brokenlinks/brokenlinks.go | 19 | ||||
| -rw-r--r-- | brokenlinks/brokenlinks_test.go | 111 | ||||
| -rw-r--r-- | brokenlinks/options.go | 53 | ||||
| -rw-r--r-- | brokenlinks/testdata/web/index.html | 3 | ||||
| -rw-r--r-- | brokenlinks/worker.go | 5 | ||||
| -rw-r--r-- | cmd/jarink/main.go | 11 |
7 files changed, 153 insertions, 62 deletions
diff --git a/README.adoc b/README.adoc index fc7ea3b..f85fa45 100644 --- a/README.adoc +++ b/README.adoc @@ -46,14 +46,17 @@ JSON format to standard output, This command accept the following options, -`-verbose`:: -Print the page that being scanned to standard error. +`-ignore-status=<comma separated HTTP status code>`:: +List of HTTP status code that will be ignored during scan. `-past-result=<path to JSON file>`:: Scan only the pages reported by result from past scan based on the content in JSON file. This minimize the time to re-scan the pages once we have fixed the URLs. +`-verbose`:: +Print the page that being scanned to standard error. + == Examples @@ -78,6 +81,12 @@ Invoking brokenlinks on path "/page2" only scan "/page2" and "/page2/sub1". $ jarink brokenlinks https://web.tld/page2 ---- +Ignore HTTP status code 403 and 418, + +---- +$ jarink -ignore-status=403,418 brokenlinks https://web.tld/page2 +---- + == Notes diff --git a/brokenlinks/brokenlinks.go b/brokenlinks/brokenlinks.go index 8ac458f..5ba25d9 100644 --- a/brokenlinks/brokenlinks.go +++ b/brokenlinks/brokenlinks.go @@ -13,26 +13,25 @@ const Version = `0.1.0` // reachable during GET or HEAD, either timeout or IP or domain not exist. const StatusBadLink = 700 -// Options define the options for scanning broken links. -type Options struct { - Url string - PastResultFile string - IsVerbose bool -} - // Scan the URL for broken links. func Scan(opts Options) (result *Result, err error) { - var logp = `brokenlinks` + var logp = `Scan` + + err = opts.init() + if err != nil { + return nil, fmt.Errorf(`%s: %w`, logp, err) + } + var wrk *worker wrk, err = newWorker(opts) if err != nil { - return nil, fmt.Errorf(`%s: %s`, logp, err) + return nil, fmt.Errorf(`%s: %w`, logp, err) } result, err = wrk.run() if err != nil { - return nil, fmt.Errorf(`%s: %s`, logp, err) + return nil, fmt.Errorf(`%s: %w`, logp, err) } return result, nil diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go index 367ae6c..b868942 100644 --- a/brokenlinks/brokenlinks_test.go +++ b/brokenlinks/brokenlinks_test.go @@ -4,7 +4,6 @@ package brokenlinks_test import ( - "encoding/json" "log" "net/http" "os" @@ -30,36 +29,8 @@ func TestMain(m *testing.M) { var httpDirWeb = http.Dir(`testdata/web`) var fshandle = http.FileServer(httpDirWeb) - go func() { - var mux = http.NewServeMux() - mux.Handle(`/`, fshandle) - var testServer = &http.Server{ - Addr: testAddress, - Handler: mux, - ReadTimeout: 10 * time.Second, - WriteTimeout: 10 * time.Second, - MaxHeaderBytes: 1 << 20, - } - var err = testServer.ListenAndServe() - if err != nil { - log.Fatal(err) - } - }() - go func() { - var mux = http.NewServeMux() - mux.Handle(`/`, fshandle) - var testServer = &http.Server{ - Addr: testExternalAddress, - Handler: mux, - ReadTimeout: 10 * time.Second, - WriteTimeout: 10 * time.Second, - MaxHeaderBytes: 1 << 20, - } - var err = testServer.ListenAndServe() - if err != nil { - log.Fatal(err) - } - }() + go testServer(fshandle) + go testExternalServer(fshandle) var err = libnet.WaitAlive(`tcp`, testAddress, 5*time.Second) if err != nil { @@ -73,23 +44,67 @@ func TestMain(m *testing.M) { os.Exit(m.Run()) } +func testServer(fshandle http.Handler) { + var mux = http.NewServeMux() + mux.HandleFunc(`/page403`, page403) + mux.Handle(`/`, fshandle) + var testServer = &http.Server{ + Addr: testAddress, + Handler: mux, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + MaxHeaderBytes: 1 << 20, + } + var err = testServer.ListenAndServe() + if err != nil { + log.Fatal(err) + } +} + +func page403(resp http.ResponseWriter, req *http.Request) { + resp.WriteHeader(http.StatusForbidden) +} + +func testExternalServer(fshandle http.Handler) { + var mux = http.NewServeMux() + mux.Handle(`/`, fshandle) + var testServer = &http.Server{ + Addr: testExternalAddress, + Handler: mux, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + MaxHeaderBytes: 1 << 20, + } + var err = testServer.ListenAndServe() + if err != nil { + log.Fatal(err) + } +} + func TestBrokenlinks(t *testing.T) { var testUrl = `http://` + testAddress type testCase struct { exp map[string][]brokenlinks.Broken - scanUrl string expError string + opts brokenlinks.Options } listCase := []testCase{{ - scanUrl: `127.0.0.1:14594`, - expError: `brokenlinks: invalid URL "127.0.0.1:14594"`, + opts: brokenlinks.Options{ + Url: `127.0.0.1:14594`, + }, + expError: `Scan: invalid URL "127.0.0.1:14594"`, }, { - scanUrl: `http://127.0.0.1:14594`, - expError: `brokenlinks: Get "http://127.0.0.1:14594": dial tcp 127.0.0.1:14594: connect: connection refused`, + opts: brokenlinks.Options{ + Url: `http://127.0.0.1:14594`, + }, + expError: `Scan: Get "http://127.0.0.1:14594": dial tcp 127.0.0.1:14594: connect: connection refused`, }, { - scanUrl: testUrl, + opts: brokenlinks.Options{ + Url: testUrl, + IgnoreStatus: `403`, + }, exp: map[string][]brokenlinks.Broken{ testUrl: []brokenlinks.Broken{ { @@ -130,7 +145,9 @@ func TestBrokenlinks(t *testing.T) { }, { // Scanning on "/path" should not scan the the "/" or other // pages other than below of "/path" itself. - scanUrl: testUrl + `/page2`, + opts: brokenlinks.Options{ + Url: testUrl + `/page2`, + }, exp: map[string][]brokenlinks.Broken{ testUrl + `/page2`: []brokenlinks.Broken{ { @@ -152,19 +169,16 @@ func TestBrokenlinks(t *testing.T) { err error ) for _, tcase := range listCase { - t.Logf(`--- brokenlinks: %s`, tcase.scanUrl) - var opts = brokenlinks.Options{ - Url: tcase.scanUrl, - } - result, err = brokenlinks.Scan(opts) + t.Logf(`--- brokenlinks: %s`, tcase.opts.Url) + result, err = brokenlinks.Scan(tcase.opts) if err != nil { - test.Assert(t, tcase.scanUrl+` error`, + test.Assert(t, tcase.opts.Url+` error`, tcase.expError, err.Error()) continue } //got, _ := json.MarshalIndent(result.BrokenLinks, ``, ` `) //t.Logf(`got=%s`, got) - test.Assert(t, tcase.scanUrl, tcase.exp, result.BrokenLinks) + test.Assert(t, tcase.opts.Url, tcase.exp, result.BrokenLinks) } } @@ -185,12 +199,13 @@ func TestBrokenlinks_pastResult(t *testing.T) { Url: testUrl, PastResultFile: `testdata/invalid`, }, - expError: `brokenlinks: open testdata/invalid: no such file or directory`, + expError: `Scan: open testdata/invalid: no such file or directory`, }, { // With valid file. opts: brokenlinks.Options{ Url: testUrl, PastResultFile: `testdata/past_result.json`, + IgnoreStatus: `403`, }, exp: map[string][]brokenlinks.Broken{ testUrl + `/page2`: []brokenlinks.Broken{ @@ -220,8 +235,8 @@ func TestBrokenlinks_pastResult(t *testing.T) { tcase.expError, err.Error()) continue } - got, _ := json.MarshalIndent(result.BrokenLinks, ``, ` `) - t.Logf(`got=%s`, got) + //got, _ := json.MarshalIndent(result.BrokenLinks, ``, ` `) + //t.Logf(`got=%s`, got) test.Assert(t, tcase.opts.Url, tcase.exp, result.BrokenLinks) } } diff --git a/brokenlinks/options.go b/brokenlinks/options.go new file mode 100644 index 0000000..9c4022b --- /dev/null +++ b/brokenlinks/options.go @@ -0,0 +1,53 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> +// SPDX-License-Identifier: GPL-3.0-only + +package brokenlinks + +import ( + "fmt" + "net/http" + "strconv" + "strings" +) + +// Options define the options for scanning broken links. +type Options struct { + Url string + PastResultFile string + + // IgnoreStatus comma separated list HTTP status code that will be + // ignored on scan. + // Page that return one of the IgnoreStatus will be assumed as + // passed and not get processed. + // The status code must in between 100-511. + IgnoreStatus string + ignoreStatus []int + + IsVerbose bool +} + +func (opts *Options) init() (err error) { + var ( + logp = `Options` + listCode = strings.Split(opts.IgnoreStatus, ",") + val string + ) + for _, val = range listCode { + val = strings.TrimSpace(val) + if val == "" { + continue + } + var code int64 + code, err = strconv.ParseInt(val, 10, 64) + if err != nil { + return fmt.Errorf(`%s: invalid status code %q`, logp, val) + } + if code < http.StatusContinue || + code > http.StatusNetworkAuthenticationRequired { + return fmt.Errorf(`%s: status code %s out of range`, logp, val) + } + + opts.ignoreStatus = append(opts.ignoreStatus, int(code)) + } + return nil +} diff --git a/brokenlinks/testdata/web/index.html b/brokenlinks/testdata/web/index.html index 61a1f39..cc11a2f 100644 --- a/brokenlinks/testdata/web/index.html +++ b/brokenlinks/testdata/web/index.html @@ -18,5 +18,8 @@ SPDX-License-Identifier: GPL-3.0-only <!-- Fragment should be skipped and cleaned up --> <a href="#goto_a">Same with href to "/"</a> <a href="/page2#goto_a">Same with href to "/page2"</a> + + <!-- Pages that return custom HTTP status code --> + <a href="/page403">Page 403</a> </body> </html> diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go index 4ed56d2..e3e0c45 100644 --- a/brokenlinks/worker.go +++ b/brokenlinks/worker.go @@ -12,6 +12,7 @@ import ( "net/http" "net/url" "os" + "slices" "strings" "sync" "time" @@ -311,6 +312,10 @@ func (wrk *worker) scan(linkq linkQueue) { linkq.status = httpResp.StatusCode resultq[linkq.url] = linkq + if slices.Contains(wrk.opts.ignoreStatus, httpResp.StatusCode) { + return + } + if httpResp.StatusCode >= http.StatusBadRequest { go wrk.pushResult(resultq) return diff --git a/cmd/jarink/main.go b/cmd/jarink/main.go index b384032..9d6d1e8 100644 --- a/cmd/jarink/main.go +++ b/cmd/jarink/main.go @@ -18,8 +18,14 @@ import ( func main() { log.SetFlags(0) - var optIsVerbose bool - var optPastResult string + var ( + optIgnoreStatus string + optIsVerbose bool + optPastResult string + ) + + flag.StringVar(&optIgnoreStatus, `ignore-status`, ``, + `Comma separated HTTP response status code to be ignored.`) flag.BoolVar(&optIsVerbose, `verbose`, false, `Print additional information while running.`) @@ -34,6 +40,7 @@ func main() { switch cmd { case `brokenlinks`: var opts = brokenlinks.Options{ + IgnoreStatus: optIgnoreStatus, IsVerbose: optIsVerbose, PastResultFile: optPastResult, } |
