diff options
| -rw-r--r-- | brokenlinks.go | 69 | ||||
| -rw-r--r-- | brokenlinks/brokenlinks.go | 39 | ||||
| -rw-r--r-- | brokenlinks/brokenlinks_test.go (renamed from brokenlinks_test.go) | 105 | ||||
| -rw-r--r-- | brokenlinks/link_queue.go (renamed from link_queue.go) | 8 | ||||
| -rw-r--r-- | brokenlinks/result.go | 37 | ||||
| -rw-r--r-- | brokenlinks/testdata/past_result.json (renamed from testdata/past_result.json) | 0 | ||||
| -rw-r--r-- | brokenlinks/testdata/past_result.json.license (renamed from testdata/past_result.json.license) | 0 | ||||
| -rw-r--r-- | brokenlinks/testdata/web/broken.html (renamed from testdata/web/broken.html) | 0 | ||||
| -rw-r--r-- | brokenlinks/testdata/web/gopher.png (renamed from testdata/web/gopher.png) | bin | 32775 -> 32775 bytes | |||
| -rw-r--r-- | brokenlinks/testdata/web/index.html (renamed from testdata/web/index.html) | 0 | ||||
| -rw-r--r-- | brokenlinks/testdata/web/page2/index.html (renamed from testdata/web/page2/index.html) | 0 | ||||
| -rw-r--r-- | brokenlinks/worker.go (renamed from brokenlinks_worker.go) | 44 | ||||
| -rw-r--r-- | cmd/jarink/main.go | 25 | ||||
| -rw-r--r-- | jarink_test.go | 70 |
14 files changed, 202 insertions, 195 deletions
diff --git a/brokenlinks.go b/brokenlinks.go deleted file mode 100644 index 96580e5..0000000 --- a/brokenlinks.go +++ /dev/null @@ -1,69 +0,0 @@ -// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> -// SPDX-License-Identifier: GPL-3.0-only - -package jarink - -import ( - "fmt" - "slices" - "strings" -) - -const Version = `0.1.0` - -// StatusBadLink status for link that is not parseable by [url.Parse] or not -// reachable during GET or HEAD, either timeout or IP or domain not exist. -const StatusBadLink = 700 - -// Broken store the broken link, HTTP status code, and the error message that -// cause it. -type Broken struct { - Link string `json:"link"` - Error string `json:"error,omitempty"` - Code int `json:"code"` -} - -// BrokenlinksOptions define the options for scanning broken links. -type BrokenlinksOptions struct { - Url string - PastResultFile string - IsVerbose bool -} - -// BrokenlinksResult store the result of scanning for broken links. -type BrokenlinksResult struct { - // BrokenLinks store the page and its broken links. - BrokenLinks map[string][]Broken `json:"broken_links"` -} - -func newBrokenlinksResult() *BrokenlinksResult { - return &BrokenlinksResult{ - BrokenLinks: map[string][]Broken{}, - } -} - -func (result *BrokenlinksResult) sort() { - for _, listBroken := range result.BrokenLinks { - slices.SortFunc(listBroken, func(a, b Broken) int { - return strings.Compare(a.Link, b.Link) - }) - } -} - -// Brokenlinks scan the URL for broken links. -func Brokenlinks(opts BrokenlinksOptions) (result *BrokenlinksResult, err error) { - var logp = `brokenlinks` - var wrk *brokenlinksWorker - - wrk, err = newWorker(opts) - if err != nil { - return nil, fmt.Errorf(`%s: %s`, logp, err) - } - - result, err = wrk.run() - if err != nil { - return nil, fmt.Errorf(`%s: %s`, logp, err) - } - - return result, nil -} diff --git a/brokenlinks/brokenlinks.go b/brokenlinks/brokenlinks.go new file mode 100644 index 0000000..8ac458f --- /dev/null +++ b/brokenlinks/brokenlinks.go @@ -0,0 +1,39 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> +// SPDX-License-Identifier: GPL-3.0-only + +package brokenlinks + +import ( + "fmt" +) + +const Version = `0.1.0` + +// StatusBadLink status for link that is not parseable by [url.Parse] or not +// reachable during GET or HEAD, either timeout or IP or domain not exist. +const StatusBadLink = 700 + +// Options define the options for scanning broken links. +type Options struct { + Url string + PastResultFile string + IsVerbose bool +} + +// Scan the URL for broken links. +func Scan(opts Options) (result *Result, err error) { + var logp = `brokenlinks` + var wrk *worker + + wrk, err = newWorker(opts) + if err != nil { + return nil, fmt.Errorf(`%s: %s`, logp, err) + } + + result, err = wrk.run() + if err != nil { + return nil, fmt.Errorf(`%s: %s`, logp, err) + } + + return result, nil +} diff --git a/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go index 3818fbc..367ae6c 100644 --- a/brokenlinks_test.go +++ b/brokenlinks/brokenlinks_test.go @@ -1,22 +1,83 @@ // SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> // SPDX-License-Identifier: GPL-3.0-only -package jarink_test +package brokenlinks_test import ( "encoding/json" + "log" "net/http" + "os" "testing" + "time" - "git.sr.ht/~shulhan/jarink" + libnet "git.sr.ht/~shulhan/pakakeh.go/lib/net" "git.sr.ht/~shulhan/pakakeh.go/lib/test" + + "git.sr.ht/~shulhan/jarink/brokenlinks" ) +// The test run two web servers that serve content on "testdata/web/". +// The first web server is the one that we want to scan. +// The second web server is external web server, where HTML pages should not +// be parsed. + +const testAddress = `127.0.0.1:11836` +const testExternalAddress = `127.0.0.1:11900` + +func TestMain(m *testing.M) { + log.SetFlags(0) + var httpDirWeb = http.Dir(`testdata/web`) + var fshandle = http.FileServer(httpDirWeb) + + go func() { + var mux = http.NewServeMux() + mux.Handle(`/`, fshandle) + var testServer = &http.Server{ + Addr: testAddress, + Handler: mux, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + MaxHeaderBytes: 1 << 20, + } + var err = testServer.ListenAndServe() + if err != nil { + log.Fatal(err) + } + }() + go func() { + var mux = http.NewServeMux() + mux.Handle(`/`, fshandle) + var testServer = &http.Server{ + Addr: testExternalAddress, + Handler: mux, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + MaxHeaderBytes: 1 << 20, + } + var err = testServer.ListenAndServe() + if err != nil { + log.Fatal(err) + } + }() + + var err = libnet.WaitAlive(`tcp`, testAddress, 5*time.Second) + if err != nil { + log.Fatal(err) + } + err = libnet.WaitAlive(`tcp`, testExternalAddress, 5*time.Second) + if err != nil { + log.Fatal(err) + } + + os.Exit(m.Run()) +} + func TestBrokenlinks(t *testing.T) { var testUrl = `http://` + testAddress type testCase struct { - exp map[string][]jarink.Broken + exp map[string][]brokenlinks.Broken scanUrl string expError string } @@ -29,8 +90,8 @@ func TestBrokenlinks(t *testing.T) { expError: `brokenlinks: Get "http://127.0.0.1:14594": dial tcp 127.0.0.1:14594: connect: connection refused`, }, { scanUrl: testUrl, - exp: map[string][]jarink.Broken{ - testUrl: []jarink.Broken{ + exp: map[string][]brokenlinks.Broken{ + testUrl: []brokenlinks.Broken{ { Link: testUrl + `/broken.png`, Code: http.StatusNotFound, @@ -40,20 +101,20 @@ func TestBrokenlinks(t *testing.T) { }, { Link: `http://127.0.0.1:abc`, Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`, - Code: jarink.StatusBadLink, + Code: brokenlinks.StatusBadLink, }, { Link: `http:/127.0.0.1:11836`, Error: `Get "http:/127.0.0.1:11836": http: no Host in request URL`, - Code: jarink.StatusBadLink, + Code: brokenlinks.StatusBadLink, }, }, - testUrl + `/broken.html`: []jarink.Broken{ + testUrl + `/broken.html`: []brokenlinks.Broken{ { Link: testUrl + `/brokenPage`, Code: http.StatusNotFound, }, }, - testUrl + `/page2`: []jarink.Broken{ + testUrl + `/page2`: []brokenlinks.Broken{ { Link: testUrl + `/broken.png`, Code: http.StatusNotFound, @@ -70,8 +131,8 @@ func TestBrokenlinks(t *testing.T) { // Scanning on "/path" should not scan the the "/" or other // pages other than below of "/path" itself. scanUrl: testUrl + `/page2`, - exp: map[string][]jarink.Broken{ - testUrl + `/page2`: []jarink.Broken{ + exp: map[string][]brokenlinks.Broken{ + testUrl + `/page2`: []brokenlinks.Broken{ { Link: testUrl + `/broken.png`, Code: http.StatusNotFound, @@ -87,15 +148,15 @@ func TestBrokenlinks(t *testing.T) { }} var ( - result *jarink.BrokenlinksResult + result *brokenlinks.Result err error ) for _, tcase := range listCase { t.Logf(`--- brokenlinks: %s`, tcase.scanUrl) - var brokenlinksOpts = jarink.BrokenlinksOptions{ + var opts = brokenlinks.Options{ Url: tcase.scanUrl, } - result, err = jarink.Brokenlinks(brokenlinksOpts) + result, err = brokenlinks.Scan(opts) if err != nil { test.Assert(t, tcase.scanUrl+` error`, tcase.expError, err.Error()) @@ -113,26 +174,26 @@ func TestBrokenlinks_pastResult(t *testing.T) { var testUrl = `http://` + testAddress type testCase struct { - exp map[string][]jarink.Broken + exp map[string][]brokenlinks.Broken expError string - opts jarink.BrokenlinksOptions + opts brokenlinks.Options } listCase := []testCase{{ // With invalid file. - opts: jarink.BrokenlinksOptions{ + opts: brokenlinks.Options{ Url: testUrl, PastResultFile: `testdata/invalid`, }, expError: `brokenlinks: open testdata/invalid: no such file or directory`, }, { // With valid file. - opts: jarink.BrokenlinksOptions{ + opts: brokenlinks.Options{ Url: testUrl, PastResultFile: `testdata/past_result.json`, }, - exp: map[string][]jarink.Broken{ - testUrl + `/page2`: []jarink.Broken{ + exp: map[string][]brokenlinks.Broken{ + testUrl + `/page2`: []brokenlinks.Broken{ { Link: testUrl + `/broken.png`, Code: http.StatusNotFound, @@ -148,12 +209,12 @@ func TestBrokenlinks_pastResult(t *testing.T) { }} var ( - result *jarink.BrokenlinksResult + result *brokenlinks.Result err error ) for _, tcase := range listCase { t.Logf(`--- brokenlinks: %s`, tcase.opts.Url) - result, err = jarink.Brokenlinks(tcase.opts) + result, err = brokenlinks.Scan(tcase.opts) if err != nil { test.Assert(t, tcase.opts.Url+` error`, tcase.expError, err.Error()) diff --git a/link_queue.go b/brokenlinks/link_queue.go index 1470115..164a902 100644 --- a/link_queue.go +++ b/brokenlinks/link_queue.go @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> // SPDX-License-Identifier: GPL-3.0-only -package jarink +package brokenlinks import ( "net/url" @@ -37,13 +37,13 @@ type linkQueue struct { // checkExternal set the isExternal field to be true if // -// (1) [linkQueue.url] does not start with [brokenlinksWorker.scanUrl] +// (1) [linkQueue.url] does not start with [worker.scanUrl] // // (2) linkQueue is from scanPastResult, indicated by non-nil -// [brokenlinksWorker.pastResult]. +// [worker.pastResult]. // In this case, we did not want to scan the other pages from the same scanUrl // domain. -func (linkq *linkQueue) checkExternal(wrk *brokenlinksWorker) { +func (linkq *linkQueue) checkExternal(wrk *worker) { if !strings.HasPrefix(linkq.url, wrk.scanUrl.String()) { linkq.isExternal = true return diff --git a/brokenlinks/result.go b/brokenlinks/result.go new file mode 100644 index 0000000..676859b --- /dev/null +++ b/brokenlinks/result.go @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> +// SPDX-License-Identifier: GPL-3.0-only + +package brokenlinks + +import ( + "slices" + "strings" +) + +// Broken store the broken link, HTTP status code, and the error message that +// cause it. +type Broken struct { + Link string `json:"link"` + Error string `json:"error,omitempty"` + Code int `json:"code"` +} + +// Result store the result of scanning for broken links. +type Result struct { + // BrokenLinks store the page and its broken links. + BrokenLinks map[string][]Broken `json:"broken_links"` +} + +func newResult() *Result { + return &Result{ + BrokenLinks: map[string][]Broken{}, + } +} + +func (result *Result) sort() { + for _, listBroken := range result.BrokenLinks { + slices.SortFunc(listBroken, func(a, b Broken) int { + return strings.Compare(a.Link, b.Link) + }) + } +} diff --git a/testdata/past_result.json b/brokenlinks/testdata/past_result.json index ca29d35..ca29d35 100644 --- a/testdata/past_result.json +++ b/brokenlinks/testdata/past_result.json diff --git a/testdata/past_result.json.license b/brokenlinks/testdata/past_result.json.license index 22616a9..22616a9 100644 --- a/testdata/past_result.json.license +++ b/brokenlinks/testdata/past_result.json.license diff --git a/testdata/web/broken.html b/brokenlinks/testdata/web/broken.html index 533e542..533e542 100644 --- a/testdata/web/broken.html +++ b/brokenlinks/testdata/web/broken.html diff --git a/testdata/web/gopher.png b/brokenlinks/testdata/web/gopher.png Binary files differindex 79352be..79352be 100644 --- a/testdata/web/gopher.png +++ b/brokenlinks/testdata/web/gopher.png diff --git a/testdata/web/index.html b/brokenlinks/testdata/web/index.html index 61a1f39..61a1f39 100644 --- a/testdata/web/index.html +++ b/brokenlinks/testdata/web/index.html diff --git a/testdata/web/page2/index.html b/brokenlinks/testdata/web/page2/index.html index ae6b4ea..ae6b4ea 100644 --- a/testdata/web/page2/index.html +++ b/brokenlinks/testdata/web/page2/index.html diff --git a/brokenlinks_worker.go b/brokenlinks/worker.go index a4e854d..4ed56d2 100644 --- a/brokenlinks_worker.go +++ b/brokenlinks/worker.go @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> // SPDX-License-Identifier: GPL-3.0-only -package jarink +package brokenlinks import ( "encoding/json" @@ -20,7 +20,7 @@ import ( "golang.org/x/net/html/atom" ) -type brokenlinksWorker struct { +type worker struct { // seenLink store the URL being or has been scanned and its HTTP // status code. seenLink map[string]int @@ -30,11 +30,11 @@ type brokenlinksWorker struct { // result contains the final result after all of the pages has been // scanned. - result *BrokenlinksResult + result *Result // pastResult containts the past scan result, loaded from file - // [BrokenlinksOptions.PastResultFile]. - pastResult *BrokenlinksResult + // [Options.PastResultFile]. + pastResult *Result // The base URL that will be joined to relative or absolute // links or image. @@ -45,18 +45,18 @@ type brokenlinksWorker struct { log *log.Logger - opts BrokenlinksOptions + opts Options // wg sync the goroutine scanner. wg sync.WaitGroup } -func newWorker(opts BrokenlinksOptions) (wrk *brokenlinksWorker, err error) { - wrk = &brokenlinksWorker{ +func newWorker(opts Options) (wrk *worker, err error) { + wrk = &worker{ opts: opts, seenLink: map[string]int{}, resultq: make(chan map[string]linkQueue, 100), - result: newBrokenlinksResult(), + result: newResult(), log: log.New(os.Stderr, ``, log.LstdFlags), } @@ -83,7 +83,7 @@ func newWorker(opts BrokenlinksOptions) (wrk *brokenlinksWorker, err error) { return nil, err } - wrk.pastResult = newBrokenlinksResult() + wrk.pastResult = newResult() err = json.Unmarshal(pastresult, &wrk.pastResult) if err != nil { return nil, err @@ -92,7 +92,7 @@ func newWorker(opts BrokenlinksOptions) (wrk *brokenlinksWorker, err error) { return wrk, nil } -func (wrk *brokenlinksWorker) run() (result *BrokenlinksResult, err error) { +func (wrk *worker) run() (result *Result, err error) { if wrk.pastResult == nil { result, err = wrk.scanAll() } else { @@ -101,8 +101,8 @@ func (wrk *brokenlinksWorker) run() (result *BrokenlinksResult, err error) { return result, err } -// scanAll scan all pages start from [BrokenlinksOptions.Url]. -func (wrk *brokenlinksWorker) scanAll() (result *BrokenlinksResult, err error) { +// scanAll scan all pages start from [Options.Url]. +func (wrk *worker) scanAll() (result *Result, err error) { // Scan the first URL to make sure that the server is reachable. var firstLinkq = linkQueue{ parentUrl: nil, @@ -160,9 +160,9 @@ func (wrk *brokenlinksWorker) scanAll() (result *BrokenlinksResult, err error) { } // scanPastResult scan only pages reported inside -// [BrokenlinksResult.BrokenLinks]. -func (wrk *brokenlinksWorker) scanPastResult() ( - result *BrokenlinksResult, err error, +// [Result.BrokenLinks]. +func (wrk *worker) scanPastResult() ( + result *Result, err error, ) { go func() { for page := range wrk.pastResult.BrokenLinks { @@ -210,7 +210,7 @@ func (wrk *brokenlinksWorker) scanPastResult() ( // "http://example.tld/page": {status=0} // "http://example.tld/image.png": {status=0} // "http://bad:domain/image.png": {status=700} -func (wrk *brokenlinksWorker) processResult( +func (wrk *worker) processResult( resultq map[string]linkQueue, listWaitStatus []linkQueue, ) ( newList []linkQueue, @@ -267,7 +267,7 @@ func (wrk *brokenlinksWorker) processResult( return newList } -func (wrk *brokenlinksWorker) markBroken(linkq linkQueue) { +func (wrk *worker) markBroken(linkq linkQueue) { var parentUrl = linkq.parentUrl.String() var listBroken = wrk.result.BrokenLinks[parentUrl] var brokenLink = Broken{ @@ -284,7 +284,7 @@ func (wrk *brokenlinksWorker) markBroken(linkq linkQueue) { } // scan fetch the HTML page or image to check if its valid. -func (wrk *brokenlinksWorker) scan(linkq linkQueue) { +func (wrk *worker) scan(linkq linkQueue) { defer func() { if wrk.opts.IsVerbose && linkq.errScan != nil { wrk.log.Printf("error: %d %s error=%v\n", linkq.status, @@ -374,7 +374,7 @@ func (wrk *brokenlinksWorker) scan(linkq linkQueue) { go wrk.pushResult(resultq) } -func (wrk *brokenlinksWorker) fetch(linkq linkQueue) ( +func (wrk *worker) fetch(linkq linkQueue) ( httpResp *http.Response, err error, ) { @@ -406,7 +406,7 @@ func (wrk *brokenlinksWorker) fetch(linkq linkQueue) ( return nil, err } -func (wrk *brokenlinksWorker) processLink(parentUrl *url.URL, val string, kind atom.Atom) ( +func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) ( linkq *linkQueue, ) { if len(val) == 0 { @@ -454,7 +454,7 @@ func (wrk *brokenlinksWorker) processLink(parentUrl *url.URL, val string, kind a return linkq } -func (wrk *brokenlinksWorker) pushResult(resultq map[string]linkQueue) { +func (wrk *worker) pushResult(resultq map[string]linkQueue) { var tick = time.NewTicker(100 * time.Millisecond) for { select { diff --git a/cmd/jarink/main.go b/cmd/jarink/main.go index cba254f..b384032 100644 --- a/cmd/jarink/main.go +++ b/cmd/jarink/main.go @@ -12,17 +12,19 @@ import ( "strings" "git.sr.ht/~shulhan/jarink" + "git.sr.ht/~shulhan/jarink/brokenlinks" ) func main() { log.SetFlags(0) - var brokenlinksOpts = jarink.BrokenlinksOptions{} + var optIsVerbose bool + var optPastResult string - flag.BoolVar(&brokenlinksOpts.IsVerbose, `verbose`, false, + flag.BoolVar(&optIsVerbose, `verbose`, false, `Print additional information while running.`) - flag.StringVar(&brokenlinksOpts.PastResultFile, `past-result`, ``, + flag.StringVar(&optPastResult, `past-result`, ``, `Scan only pages with broken links from the past JSON result.`) flag.Parse() @@ -31,15 +33,22 @@ func main() { cmd = strings.ToLower(cmd) switch cmd { case `brokenlinks`: - brokenlinksOpts.Url = flag.Arg(1) - if brokenlinksOpts.Url == "" { + var opts = brokenlinks.Options{ + IsVerbose: optIsVerbose, + PastResultFile: optPastResult, + } + + opts.Url = flag.Arg(1) + if opts.Url == "" { log.Printf(`Missing argument URL to be scanned.`) goto invalid_command } - var result *jarink.BrokenlinksResult - var err error - result, err = jarink.Brokenlinks(brokenlinksOpts) + var ( + result *brokenlinks.Result + err error + ) + result, err = brokenlinks.Scan(opts) if err != nil { log.Fatal(err.Error()) } diff --git a/jarink_test.go b/jarink_test.go deleted file mode 100644 index 91d38a0..0000000 --- a/jarink_test.go +++ /dev/null @@ -1,70 +0,0 @@ -// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> -// SPDX-License-Identifier: GPL-3.0-only - -package jarink_test - -import ( - "log" - "net/http" - "os" - "testing" - "time" - - libnet "git.sr.ht/~shulhan/pakakeh.go/lib/net" -) - -// The test run two web servers that serve content on "testdata/web/". -// The first web server is the one that we want to scan. -// The second web server is external web server, where HTML pages should not -// be parsed. - -const testAddress = `127.0.0.1:11836` -const testExternalAddress = `127.0.0.1:11900` - -func TestMain(m *testing.M) { - log.SetFlags(0) - var httpDirWeb = http.Dir(`testdata/web`) - var fshandle = http.FileServer(httpDirWeb) - - go func() { - var mux = http.NewServeMux() - mux.Handle(`/`, fshandle) - var testServer = &http.Server{ - Addr: testAddress, - Handler: mux, - ReadTimeout: 10 * time.Second, - WriteTimeout: 10 * time.Second, - MaxHeaderBytes: 1 << 20, - } - var err = testServer.ListenAndServe() - if err != nil { - log.Fatal(err) - } - }() - go func() { - var mux = http.NewServeMux() - mux.Handle(`/`, fshandle) - var testServer = &http.Server{ - Addr: testExternalAddress, - Handler: mux, - ReadTimeout: 10 * time.Second, - WriteTimeout: 10 * time.Second, - MaxHeaderBytes: 1 << 20, - } - var err = testServer.ListenAndServe() - if err != nil { - log.Fatal(err) - } - }() - - var err = libnet.WaitAlive(`tcp`, testAddress, 5*time.Second) - if err != nil { - log.Fatal(err) - } - err = libnet.WaitAlive(`tcp`, testExternalAddress, 5*time.Second) - if err != nil { - log.Fatal(err) - } - - os.Exit(m.Run()) -} |
