summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README81
-rw-r--r--brokenlinks.go5
-rw-r--r--brokenlinks_test.go59
-rw-r--r--brokenlinks_worker.go206
-rw-r--r--cmd/jarink/main.go14
-rw-r--r--link_queue.go20
-rw-r--r--testdata/past_result.json8
-rw-r--r--testdata/past_result.json.license2
8 files changed, 268 insertions, 127 deletions
diff --git a/README b/README
index 3335049..796cbd7 100644
--- a/README
+++ b/README
@@ -7,40 +7,22 @@ and maintains their website.
jarink [OPTIONS] <COMMAND> <args...>
-Available commands,
+Available command,
brokenlinks - scan the website for broken links (page and images).
- help - print the usage of the command.
+ help - print the usage of the command.
== Usage
-[OPTIONS] brokenlinks URL
+[OPTIONS] brokenlinks <URL>
- Start scanning for broken links on the web server pointed by URL.
+ Scan for broken links on the web server pointed by URL.
Links will be scanned on anchor href attribute ("<a href=...>") or
on the image src attribute ("<img src=...").
The URL can start from the base or from a sub path.
Scanning from a path only reports broken links on that path and its
sub paths.
- For example, given a website that have the following pages,
-
- - web.tld (base)
- - web.tld/page1
- - web.tld/page1/sub1
- - web.tld/page2
- - web.tld/page2/sub1
-
- Invoking brokenlinks with
-
- $ jarink brokenlinks https://web.tld
-
- will scan all of the pages, but invoking brokenlinks on path
- "/page2"
-
- $ jarink brokenlinks https://web.tld/page2
-
- only scan "/page2" and "/page2/sub1".
Once finished it will print the page and list of broken links in
JSON format to standard output,
@@ -58,36 +40,35 @@ Available commands,
This command accept the following options,
- -verbose : print the page that being scanned to standard
- error.
+ -verbose
+
+ Print the page that is being scanned to standard error.
+
+ -past-result=<path to JSON file>
- Example,
+ Scan only the pages reported by the result from a past scan, based
+ on the content of the JSON file. This minimizes the time to
+ re-scan the pages once we have fixed the URLs.
+
+ For example, given a website that has the following pages,
+
+ - web.tld (base)
+ - web.tld/page1
+ - web.tld/page1/sub1
+ - web.tld/page2
+ - web.tld/page2/sub1
+
+ Invoking brokenlinks with
+
+ $ jarink brokenlinks https://web.tld
+
+ will scan all of the pages, but invoking brokenlinks on path
+ "/page2"
+
+ $ jarink brokenlinks https://web.tld/page2
+
+ only scan "/page2" and "/page2/sub1".
- $ jarink brokenlinks https://kilabit.info
- {
- "https://kilabit.info/some/page": [
- {
- "link": "https://kilabit.info/some/page/image.png",
- "code": 404
- },
- {
- "link": "https://external.com/link",
- "error": "Internal server error",
- "code": 500
- }
- ],
- "https://kilabit.info/another/page": [
- {
- "link": "https://kilabit.info/another/page/image.png",
- "code": 404
- },
- {
- "link": "https://external.org/link",
- "error": "Internal server error",
- "code": 500
- }
- ]
- }
== Notes
diff --git a/brokenlinks.go b/brokenlinks.go
index 69a14ad..833e2d2 100644
--- a/brokenlinks.go
+++ b/brokenlinks.go
@@ -25,8 +25,9 @@ type Broken struct {
// BrokenlinksOptions define the options for scanning broken links.
type BrokenlinksOptions struct {
- Url string
- IsVerbose bool
+ Url string
+ PastResultFile string
+ IsVerbose bool
}
// BrokenlinksResult store the result of scanning for broken links.
diff --git a/brokenlinks_test.go b/brokenlinks_test.go
index 2c8ad15..b2d5c80 100644
--- a/brokenlinks_test.go
+++ b/brokenlinks_test.go
@@ -4,6 +4,7 @@
package jarink_test
import (
+ "encoding/json"
"net/http"
"testing"
@@ -105,3 +106,61 @@ func TestBrokenlinks(t *testing.T) {
test.Assert(t, tcase.scanUrl, tcase.exp, result.PageLinks)
}
}
+
+// Test running Brokenlinks with the PastResultFile option set.
+// The PastResultFile is modified to only report errors on "/page2".
+func TestBrokenlinks_pastResult(t *testing.T) {
+ var testUrl = `http://` + testAddress
+
+ type testCase struct {
+ exp map[string][]jarink.Broken
+ expError string
+ opts jarink.BrokenlinksOptions
+ }
+
+ listCase := []testCase{{
+ // With invalid file.
+ opts: jarink.BrokenlinksOptions{
+ Url: testUrl,
+ PastResultFile: `testdata/invalid`,
+ },
+ expError: `brokenlinks: open testdata/invalid: no such file or directory`,
+ }, {
+ // With valid file.
+ opts: jarink.BrokenlinksOptions{
+ Url: testUrl,
+ PastResultFile: `testdata/past_result.json`,
+ },
+ exp: map[string][]jarink.Broken{
+ testUrl + `/page2`: []jarink.Broken{
+ {
+ Link: testUrl + `/broken.png`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken/relative`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken2.png`,
+ Code: http.StatusNotFound,
+ },
+ },
+ },
+ }}
+
+ var (
+ result *jarink.BrokenlinksResult
+ err error
+ )
+ for _, tcase := range listCase {
+ t.Logf(`--- brokenlinks: %s`, tcase.opts.Url)
+ result, err = jarink.Brokenlinks(tcase.opts)
+ if err != nil {
+ test.Assert(t, tcase.opts.Url+` error`,
+ tcase.expError, err.Error())
+ continue
+ }
+ got, _ := json.MarshalIndent(result.PageLinks, ``, ` `)
+ t.Logf(`got=%s`, got)
+ test.Assert(t, tcase.opts.Url, tcase.exp, result.PageLinks)
+ }
+}
diff --git a/brokenlinks_worker.go b/brokenlinks_worker.go
index 5cc8c25..dbb3453 100644
--- a/brokenlinks_worker.go
+++ b/brokenlinks_worker.go
@@ -4,6 +4,7 @@
package jarink
import (
+ "encoding/json"
"fmt"
"log"
"net/http"
@@ -29,6 +30,10 @@ type brokenlinksWorker struct {
// scanned.
result *BrokenlinksResult
+ // pastResult contains the past scan result, loaded from the file
+ // [BrokenlinksOptions.PastResultFile].
+ pastResult *BrokenlinksResult
+
// The base URL that will be joined to relative or absolute
// links or image.
baseUrl *url.URL
@@ -57,7 +62,6 @@ func newWorker(opts BrokenlinksOptions) (wrk *brokenlinksWorker, err error) {
if err != nil {
return nil, fmt.Errorf(`invalid URL %q`, opts.Url)
}
-
wrk.scanUrl.Path = strings.TrimSuffix(wrk.scanUrl.Path, `/`)
wrk.scanUrl.Fragment = ""
wrk.scanUrl.RawFragment = ""
@@ -67,10 +71,36 @@ func newWorker(opts BrokenlinksOptions) (wrk *brokenlinksWorker, err error) {
Host: wrk.scanUrl.Host,
}
+ if opts.PastResultFile == "" {
+ // Run with normal scan.
+ return wrk, nil
+ }
+
+ pastresult, err := os.ReadFile(opts.PastResultFile)
+ if err != nil {
+ return nil, err
+ }
+
+ wrk.pastResult = newBrokenlinksResult()
+ err = json.Unmarshal(pastresult, &wrk.pastResult.PageLinks)
+ if err != nil {
+ return nil, err
+ }
+
return wrk, nil
}
func (wrk *brokenlinksWorker) run() (result *BrokenlinksResult, err error) {
+ if wrk.pastResult == nil {
+ result, err = wrk.scanAll()
+ } else {
+ result, err = wrk.scanPastResult()
+ }
+ return result, err
+}
+
+// scanAll scans all pages starting from [BrokenlinksOptions.Url].
+func (wrk *brokenlinksWorker) scanAll() (result *BrokenlinksResult, err error) {
// Scan the first URL to make sure that the server is reachable.
var firstLinkq = linkQueue{
parentUrl: nil,
@@ -108,73 +138,50 @@ func (wrk *brokenlinksWorker) run() (result *BrokenlinksResult, err error) {
for isScanning {
select {
case resultq := <-wrk.resultq:
+ listWaitStatus = wrk.processResult(resultq, listWaitStatus)
- // The resultq contains the original URL being scanned
- // and its child links.
- // For example, scanning "http://example.tld" result
- // in
- //
- // "http://example.tld": {status=200}
- // "http://example.tld/page": {status=0}
- // "http://example.tld/image.png": {status=0}
- // "http://bad:domain/image.png": {status=700}
-
- var newList []linkQueue
- for _, linkq := range resultq {
- if linkq.status >= http.StatusBadRequest {
- wrk.markBroken(linkq)
- continue
- }
- if linkq.status != 0 {
- // linkq is the result of scan with
- // non error status.
- wrk.seenLink[linkq.url] = linkq.status
- continue
- }
-
- seenStatus, seen := wrk.seenLink[linkq.url]
- if !seen {
- wrk.seenLink[linkq.url] = http.StatusProcessing
- wrk.wg.Add(1)
- go wrk.scan(linkq)
- continue
- }
- if seenStatus >= http.StatusBadRequest {
- linkq.status = seenStatus
- wrk.markBroken(linkq)
- continue
- }
- if seenStatus >= http.StatusOK {
- // The link has been processed and its
- // not an error.
- continue
- }
- if seenStatus == http.StatusProcessing {
- // The link being processed by other
- // goroutine.
- linkq.status = seenStatus
- newList = append(newList, linkq)
- continue
- }
- log.Fatalf("link=%s status=%d", linkq.url, linkq.status)
+ case <-tick.C:
+ wrk.wg.Wait()
+ if len(wrk.resultq) != 0 {
+ continue
}
- for _, linkq := range listWaitStatus {
- seenStatus := wrk.seenLink[linkq.url]
- if seenStatus >= http.StatusBadRequest {
- linkq.status = seenStatus
- wrk.markBroken(linkq)
- continue
- }
- if seenStatus >= http.StatusOK {
- continue
- }
- if seenStatus == http.StatusProcessing {
- // Scanning still in progress.
- newList = append(newList, linkq)
- continue
- }
+ if len(listWaitStatus) != 0 {
+ // There are links that still waiting for
+ // scanning to be completed.
+ continue
+ }
+ isScanning = false
+ }
+ }
+ wrk.result.sort()
+ return wrk.result, nil
+}
+
+// scanPastResult scans only the pages reported inside
+// [BrokenlinksResult.PageLinks].
+func (wrk *brokenlinksWorker) scanPastResult() (
+ result *BrokenlinksResult, err error,
+) {
+ go func() {
+ for page := range wrk.pastResult.PageLinks {
+ var linkq = linkQueue{
+ parentUrl: nil,
+ url: page,
+ status: http.StatusProcessing,
}
- listWaitStatus = newList
+ wrk.seenLink[linkq.url] = http.StatusProcessing
+ wrk.wg.Add(1)
+ go wrk.scan(linkq)
+ }
+ }()
+
+ var tick = time.NewTicker(500 * time.Millisecond)
+ var listWaitStatus []linkQueue
+ var isScanning = true
+ for isScanning {
+ select {
+ case resultq := <-wrk.resultq:
+ listWaitStatus = wrk.processResult(resultq, listWaitStatus)
case <-tick.C:
wrk.wg.Wait()
@@ -193,6 +200,71 @@ func (wrk *brokenlinksWorker) run() (result *BrokenlinksResult, err error) {
return wrk.result, nil
}
+// processResult processes the resultq, which contains the original URL
+// being scanned and its child links.
+// For example, scanning "http://example.tld" result in
+//
+// "http://example.tld": {status=200}
+// "http://example.tld/page": {status=0}
+// "http://example.tld/image.png": {status=0}
+// "http://bad:domain/image.png": {status=700}
+func (wrk *brokenlinksWorker) processResult(
+ resultq map[string]linkQueue, listWaitStatus []linkQueue,
+) (
+ newList []linkQueue,
+) {
+ for _, linkq := range resultq {
+ if linkq.status >= http.StatusBadRequest {
+ wrk.markBroken(linkq)
+ continue
+ }
+ if linkq.status != 0 {
+ // linkq is the result of scan with
+ // non error status.
+ wrk.seenLink[linkq.url] = linkq.status
+ continue
+ }
+
+ seenStatus, seen := wrk.seenLink[linkq.url]
+ if !seen {
+ wrk.seenLink[linkq.url] = http.StatusProcessing
+ wrk.wg.Add(1)
+ go wrk.scan(linkq)
+ continue
+ }
+ if seenStatus >= http.StatusBadRequest {
+ linkq.status = seenStatus
+ wrk.markBroken(linkq)
+ continue
+ }
+ if seenStatus >= http.StatusOK {
+ // The link has been processed and its
+ // not an error.
+ continue
+ }
+ // The link being processed by other goroutine.
+ linkq.status = seenStatus
+ newList = append(newList, linkq)
+ }
+ for _, linkq := range listWaitStatus {
+ seenStatus := wrk.seenLink[linkq.url]
+ if seenStatus >= http.StatusBadRequest {
+ linkq.status = seenStatus
+ wrk.markBroken(linkq)
+ continue
+ }
+ if seenStatus >= http.StatusOK {
+ continue
+ }
+ if seenStatus == http.StatusProcessing {
+ // Scanning still in progress.
+ newList = append(newList, linkq)
+ continue
+ }
+ }
+ return newList
+}
+
func (wrk *brokenlinksWorker) markBroken(linkq linkQueue) {
var parentUrl = linkq.parentUrl.String()
var listBroken = wrk.result.PageLinks[parentUrl]
@@ -303,9 +375,7 @@ func (wrk *brokenlinksWorker) scan(linkq linkQueue) {
}
_, seen := resultq[nodeLink.url]
if !seen {
- if !strings.HasPrefix(nodeLink.url, wrk.scanUrl.String()) {
- nodeLink.isExternal = true
- }
+ nodeLink.checkExternal(wrk)
resultq[nodeLink.url] = *nodeLink
}
}
diff --git a/cmd/jarink/main.go b/cmd/jarink/main.go
index 4f4d206..c8ba2e2 100644
--- a/cmd/jarink/main.go
+++ b/cmd/jarink/main.go
@@ -17,10 +17,13 @@ import (
func main() {
log.SetFlags(0)
- var optVerbose bool
+ var brokenlinksOpts = jarink.BrokenlinksOptions{}
- flag.BoolVar(&optVerbose, `verbose`, false,
- `print additional information while running`)
+ flag.BoolVar(&brokenlinksOpts.IsVerbose, `verbose`, false,
+ `Print additional information while running.`)
+
+ flag.StringVar(&brokenlinksOpts.PastResultFile, `past-result`, ``,
+ `Scan only pages with broken links from the past JSON result.`)
flag.Parse()
@@ -28,10 +31,7 @@ func main() {
cmd = strings.ToLower(cmd)
switch cmd {
case `brokenlinks`:
- var brokenlinksOpts = jarink.BrokenlinksOptions{
- Url: flag.Arg(1),
- IsVerbose: optVerbose,
- }
+ brokenlinksOpts.Url = flag.Arg(1)
if brokenlinksOpts.Url == "" {
log.Printf(`Missing argument URL to be scanned.`)
goto invalid_command
diff --git a/link_queue.go b/link_queue.go
index 0b419b8..1470115 100644
--- a/link_queue.go
+++ b/link_queue.go
@@ -5,6 +5,7 @@ package jarink
import (
"net/url"
+ "strings"
"golang.org/x/net/html/atom"
)
@@ -33,3 +34,22 @@ type linkQueue struct {
// 400 - 511: Error.
status int
}
+
+// checkExternal set the isExternal field to be true if
+//
+// (1) [linkQueue.url] does not start with [brokenlinksWorker.scanUrl]
+//
+// (2) linkQueue is from scanPastResult, indicated by non-nil
+// [brokenlinksWorker.pastResult].
+// In this case, we do not want to scan the other pages from the same scanUrl
+// domain.
+func (linkq *linkQueue) checkExternal(wrk *brokenlinksWorker) {
+ if !strings.HasPrefix(linkq.url, wrk.scanUrl.String()) {
+ linkq.isExternal = true
+ return
+ }
+ if wrk.pastResult != nil {
+ linkq.isExternal = true
+ return
+ }
+}
diff --git a/testdata/past_result.json b/testdata/past_result.json
new file mode 100644
index 0000000..3ba37c1
--- /dev/null
+++ b/testdata/past_result.json
@@ -0,0 +1,8 @@
+{
+ "http://127.0.0.1:11836/page2": [
+ {
+ "link": "http://127.0.0.1:11836/",
+ "code": 404
+ }
+ ]
+}
diff --git a/testdata/past_result.json.license b/testdata/past_result.json.license
new file mode 100644
index 0000000..22616a9
--- /dev/null
+++ b/testdata/past_result.json.license
@@ -0,0 +1,2 @@
+SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+SPDX-License-Identifier: GPL-3.0-only