aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2025-06-13 01:33:43 +0700
committerShulhan <ms@kilabit.info>2025-06-13 01:33:43 +0700
commitfb9937797a07816fbc8b25fc03893f74bf7c7663 (patch)
tree068d60e378e33716af7426e507b937523bbe38b3
parentaf0fc1518eb9b0ec7ba72d5dcdd78d5df2213968 (diff)
downloadjarink-fb9937797a07816fbc8b25fc03893f74bf7c7663.tar.xz
brokenlinks: add option to ignore a list of HTTP status codes
When a link is known to have issues, one can ignore its status code while scanning for broken links by using the "-ignore-status" option.
-rw-r--r--README.adoc13
-rw-r--r--brokenlinks/brokenlinks.go19
-rw-r--r--brokenlinks/brokenlinks_test.go111
-rw-r--r--brokenlinks/options.go53
-rw-r--r--brokenlinks/testdata/web/index.html3
-rw-r--r--brokenlinks/worker.go5
-rw-r--r--cmd/jarink/main.go11
7 files changed, 153 insertions, 62 deletions
diff --git a/README.adoc b/README.adoc
index fc7ea3b..f85fa45 100644
--- a/README.adoc
+++ b/README.adoc
@@ -46,14 +46,17 @@ JSON format to standard output,
This command accepts the following options,
-`-verbose`::
-Print the page that being scanned to standard error.
+`-ignore-status=<comma separated HTTP status codes>`::
+List of HTTP status codes that will be ignored during the scan.
`-past-result=<path to JSON file>`::
Scan only the pages reported by the result of a past scan, based
on the content of the JSON file.
This minimizes the time to re-scan the pages once we have fixed the URLs.
+`-verbose`::
+Print the page that is being scanned to standard error.
+
== Examples
@@ -78,6 +81,12 @@ Invoking brokenlinks on path "/page2" only scan "/page2" and "/page2/sub1".
$ jarink brokenlinks https://web.tld/page2
----
+Ignore HTTP status codes 403 and 418,
+
+----
+$ jarink -ignore-status=403,418 brokenlinks https://web.tld/page2
+----
+
== Notes
diff --git a/brokenlinks/brokenlinks.go b/brokenlinks/brokenlinks.go
index 8ac458f..5ba25d9 100644
--- a/brokenlinks/brokenlinks.go
+++ b/brokenlinks/brokenlinks.go
@@ -13,26 +13,25 @@ const Version = `0.1.0`
// reachable during GET or HEAD, either timeout or IP or domain not exist.
const StatusBadLink = 700
-// Options define the options for scanning broken links.
-type Options struct {
- Url string
- PastResultFile string
- IsVerbose bool
-}
-
// Scan the URL for broken links.
func Scan(opts Options) (result *Result, err error) {
- var logp = `brokenlinks`
+ var logp = `Scan`
+
+ err = opts.init()
+ if err != nil {
+ return nil, fmt.Errorf(`%s: %w`, logp, err)
+ }
+
var wrk *worker
wrk, err = newWorker(opts)
if err != nil {
- return nil, fmt.Errorf(`%s: %s`, logp, err)
+ return nil, fmt.Errorf(`%s: %w`, logp, err)
}
result, err = wrk.run()
if err != nil {
- return nil, fmt.Errorf(`%s: %s`, logp, err)
+ return nil, fmt.Errorf(`%s: %w`, logp, err)
}
return result, nil
diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go
index 367ae6c..b868942 100644
--- a/brokenlinks/brokenlinks_test.go
+++ b/brokenlinks/brokenlinks_test.go
@@ -4,7 +4,6 @@
package brokenlinks_test
import (
- "encoding/json"
"log"
"net/http"
"os"
@@ -30,36 +29,8 @@ func TestMain(m *testing.M) {
var httpDirWeb = http.Dir(`testdata/web`)
var fshandle = http.FileServer(httpDirWeb)
- go func() {
- var mux = http.NewServeMux()
- mux.Handle(`/`, fshandle)
- var testServer = &http.Server{
- Addr: testAddress,
- Handler: mux,
- ReadTimeout: 10 * time.Second,
- WriteTimeout: 10 * time.Second,
- MaxHeaderBytes: 1 << 20,
- }
- var err = testServer.ListenAndServe()
- if err != nil {
- log.Fatal(err)
- }
- }()
- go func() {
- var mux = http.NewServeMux()
- mux.Handle(`/`, fshandle)
- var testServer = &http.Server{
- Addr: testExternalAddress,
- Handler: mux,
- ReadTimeout: 10 * time.Second,
- WriteTimeout: 10 * time.Second,
- MaxHeaderBytes: 1 << 20,
- }
- var err = testServer.ListenAndServe()
- if err != nil {
- log.Fatal(err)
- }
- }()
+ go testServer(fshandle)
+ go testExternalServer(fshandle)
var err = libnet.WaitAlive(`tcp`, testAddress, 5*time.Second)
if err != nil {
@@ -73,23 +44,67 @@ func TestMain(m *testing.M) {
os.Exit(m.Run())
}
+func testServer(fshandle http.Handler) {
+ var mux = http.NewServeMux()
+ mux.HandleFunc(`/page403`, page403)
+ mux.Handle(`/`, fshandle)
+ var testServer = &http.Server{
+ Addr: testAddress,
+ Handler: mux,
+ ReadTimeout: 10 * time.Second,
+ WriteTimeout: 10 * time.Second,
+ MaxHeaderBytes: 1 << 20,
+ }
+ var err = testServer.ListenAndServe()
+ if err != nil {
+ log.Fatal(err)
+ }
+}
+
+func page403(resp http.ResponseWriter, req *http.Request) {
+ resp.WriteHeader(http.StatusForbidden)
+}
+
+func testExternalServer(fshandle http.Handler) {
+ var mux = http.NewServeMux()
+ mux.Handle(`/`, fshandle)
+ var testServer = &http.Server{
+ Addr: testExternalAddress,
+ Handler: mux,
+ ReadTimeout: 10 * time.Second,
+ WriteTimeout: 10 * time.Second,
+ MaxHeaderBytes: 1 << 20,
+ }
+ var err = testServer.ListenAndServe()
+ if err != nil {
+ log.Fatal(err)
+ }
+}
+
func TestBrokenlinks(t *testing.T) {
var testUrl = `http://` + testAddress
type testCase struct {
exp map[string][]brokenlinks.Broken
- scanUrl string
expError string
+ opts brokenlinks.Options
}
listCase := []testCase{{
- scanUrl: `127.0.0.1:14594`,
- expError: `brokenlinks: invalid URL "127.0.0.1:14594"`,
+ opts: brokenlinks.Options{
+ Url: `127.0.0.1:14594`,
+ },
+ expError: `Scan: invalid URL "127.0.0.1:14594"`,
}, {
- scanUrl: `http://127.0.0.1:14594`,
- expError: `brokenlinks: Get "http://127.0.0.1:14594": dial tcp 127.0.0.1:14594: connect: connection refused`,
+ opts: brokenlinks.Options{
+ Url: `http://127.0.0.1:14594`,
+ },
+ expError: `Scan: Get "http://127.0.0.1:14594": dial tcp 127.0.0.1:14594: connect: connection refused`,
}, {
- scanUrl: testUrl,
+ opts: brokenlinks.Options{
+ Url: testUrl,
+ IgnoreStatus: `403`,
+ },
exp: map[string][]brokenlinks.Broken{
testUrl: []brokenlinks.Broken{
{
@@ -130,7 +145,9 @@ func TestBrokenlinks(t *testing.T) {
}, {
// Scanning on "/path" should not scan the "/" or other
// pages other than those below "/path" itself.
- scanUrl: testUrl + `/page2`,
+ opts: brokenlinks.Options{
+ Url: testUrl + `/page2`,
+ },
exp: map[string][]brokenlinks.Broken{
testUrl + `/page2`: []brokenlinks.Broken{
{
@@ -152,19 +169,16 @@ func TestBrokenlinks(t *testing.T) {
err error
)
for _, tcase := range listCase {
- t.Logf(`--- brokenlinks: %s`, tcase.scanUrl)
- var opts = brokenlinks.Options{
- Url: tcase.scanUrl,
- }
- result, err = brokenlinks.Scan(opts)
+ t.Logf(`--- brokenlinks: %s`, tcase.opts.Url)
+ result, err = brokenlinks.Scan(tcase.opts)
if err != nil {
- test.Assert(t, tcase.scanUrl+` error`,
+ test.Assert(t, tcase.opts.Url+` error`,
tcase.expError, err.Error())
continue
}
//got, _ := json.MarshalIndent(result.BrokenLinks, ``, ` `)
//t.Logf(`got=%s`, got)
- test.Assert(t, tcase.scanUrl, tcase.exp, result.BrokenLinks)
+ test.Assert(t, tcase.opts.Url, tcase.exp, result.BrokenLinks)
}
}
@@ -185,12 +199,13 @@ func TestBrokenlinks_pastResult(t *testing.T) {
Url: testUrl,
PastResultFile: `testdata/invalid`,
},
- expError: `brokenlinks: open testdata/invalid: no such file or directory`,
+ expError: `Scan: open testdata/invalid: no such file or directory`,
}, {
// With valid file.
opts: brokenlinks.Options{
Url: testUrl,
PastResultFile: `testdata/past_result.json`,
+ IgnoreStatus: `403`,
},
exp: map[string][]brokenlinks.Broken{
testUrl + `/page2`: []brokenlinks.Broken{
@@ -220,8 +235,8 @@ func TestBrokenlinks_pastResult(t *testing.T) {
tcase.expError, err.Error())
continue
}
- got, _ := json.MarshalIndent(result.BrokenLinks, ``, ` `)
- t.Logf(`got=%s`, got)
+ //got, _ := json.MarshalIndent(result.BrokenLinks, ``, ` `)
+ //t.Logf(`got=%s`, got)
test.Assert(t, tcase.opts.Url, tcase.exp, result.BrokenLinks)
}
}
diff --git a/brokenlinks/options.go b/brokenlinks/options.go
new file mode 100644
index 0000000..9c4022b
--- /dev/null
+++ b/brokenlinks/options.go
@@ -0,0 +1,53 @@
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+// SPDX-License-Identifier: GPL-3.0-only
+
+package brokenlinks
+
+import (
+ "fmt"
+ "net/http"
+ "strconv"
+ "strings"
+)
+
+// Options define the options for scanning broken links.
+type Options struct {
+ Url string
+ PastResultFile string
+
+ // IgnoreStatus is a comma separated list of HTTP status codes that
+ // will be ignored on scan.
+ // A page that returns one of the IgnoreStatus codes is assumed to
+ // have passed and is not processed further.
+ // Each status code must be between 100 and 511, inclusive.
+ IgnoreStatus string
+ ignoreStatus []int
+
+ IsVerbose bool
+}
+
+func (opts *Options) init() (err error) {
+ var (
+ logp = `Options`
+ listCode = strings.Split(opts.IgnoreStatus, ",")
+ val string
+ )
+ for _, val = range listCode {
+ val = strings.TrimSpace(val)
+ if val == "" {
+ continue
+ }
+ var code int64
+ code, err = strconv.ParseInt(val, 10, 64)
+ if err != nil {
+ return fmt.Errorf(`%s: invalid status code %q`, logp, val)
+ }
+ if code < http.StatusContinue ||
+ code > http.StatusNetworkAuthenticationRequired {
+ return fmt.Errorf(`%s: status code %s out of range`, logp, val)
+ }
+
+ opts.ignoreStatus = append(opts.ignoreStatus, int(code))
+ }
+ return nil
+}
diff --git a/brokenlinks/testdata/web/index.html b/brokenlinks/testdata/web/index.html
index 61a1f39..cc11a2f 100644
--- a/brokenlinks/testdata/web/index.html
+++ b/brokenlinks/testdata/web/index.html
@@ -18,5 +18,8 @@ SPDX-License-Identifier: GPL-3.0-only
<!-- Fragment should be skipped and cleaned up -->
<a href="#goto_a">Same with href to "/"</a>
<a href="/page2#goto_a">Same with href to "/page2"</a>
+
+ <!-- Pages that return custom HTTP status code -->
+ <a href="/page403">Page 403</a>
</body>
</html>
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index 4ed56d2..e3e0c45 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -12,6 +12,7 @@ import (
"net/http"
"net/url"
"os"
+ "slices"
"strings"
"sync"
"time"
@@ -311,6 +312,10 @@ func (wrk *worker) scan(linkq linkQueue) {
linkq.status = httpResp.StatusCode
resultq[linkq.url] = linkq
+ if slices.Contains(wrk.opts.ignoreStatus, httpResp.StatusCode) {
+ return
+ }
+
if httpResp.StatusCode >= http.StatusBadRequest {
go wrk.pushResult(resultq)
return
diff --git a/cmd/jarink/main.go b/cmd/jarink/main.go
index b384032..9d6d1e8 100644
--- a/cmd/jarink/main.go
+++ b/cmd/jarink/main.go
@@ -18,8 +18,14 @@ import (
func main() {
log.SetFlags(0)
- var optIsVerbose bool
- var optPastResult string
+ var (
+ optIgnoreStatus string
+ optIsVerbose bool
+ optPastResult string
+ )
+
+ flag.StringVar(&optIgnoreStatus, `ignore-status`, ``,
+ `Comma separated HTTP response status code to be ignored.`)
flag.BoolVar(&optIsVerbose, `verbose`, false,
`Print additional information while running.`)
@@ -34,6 +40,7 @@ func main() {
switch cmd {
case `brokenlinks`:
var opts = brokenlinks.Options{
+ IgnoreStatus: optIgnoreStatus,
IsVerbose: optIsVerbose,
PastResultFile: optPastResult,
}