From 3ed69f535b6081bfa88b83f6fbf8c94527afe350 Mon Sep 17 00:00:00 2001 From: Shulhan Date: Sun, 1 Jun 2025 01:05:21 +0700 Subject: all: rename the program and repository into jarink Jarink is a program to help web administrators maintain their websites. Currently it provides a command to scan for broken links. --- brokenlinks.go | 68 ++++++++++ brokenlinks_test.go | 188 +++++++++++++++++++++++++ brokenlinks_worker.go | 369 ++++++++++++++++++++++++++++++++++++++++++++++++++ cmd/deadlinks/main.go | 111 --------------- cmd/jarink/main.go | 118 ++++++++++++++++ deadlinks.go | 32 ----- deadlinks_test.go | 188 ------------------------- go.mod | 4 +- link_queue.go | 2 +- result.go | 36 ----- scan_options.go | 10 -- url_test.go | 2 +- worker.go | 369 -------------------------------------------------- 13 files changed, 747 insertions(+), 750 deletions(-) create mode 100644 brokenlinks.go create mode 100644 brokenlinks_test.go create mode 100644 brokenlinks_worker.go delete mode 100644 cmd/deadlinks/main.go create mode 100644 cmd/jarink/main.go delete mode 100644 deadlinks.go delete mode 100644 deadlinks_test.go delete mode 100644 result.go delete mode 100644 scan_options.go delete mode 100644 worker.go diff --git a/brokenlinks.go b/brokenlinks.go new file mode 100644 index 0000000..768216d --- /dev/null +++ b/brokenlinks.go @@ -0,0 +1,68 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan +// SPDX-License-Identifier: GPL-3.0-only + +package jarink + +import ( + "fmt" + "slices" + "strings" +) + +const Version = `0.1.0` + +// StatusBadLink status for link that is not parseable by [url.Parse] or not +// reachable during GET or HEAD, either timeout or IP or domain not exist. +const StatusBadLink = 700 + +// BrokenlinksOptions define the options for scanning broken links. +type BrokenlinksOptions struct { + Url string + IsVerbose bool +} + +// Broken store the broken link, HTTP status code, and the error message that +// cause it. 
+type Broken struct { + Link string + Error string `json:"omitempty"` + Code int +} + +// BrokenlinksResult store the result of scanning for broken links. +type BrokenlinksResult struct { + // PageLinks store the page and its broken links. + PageLinks map[string][]Broken +} + +func newBrokenlinksResult() *BrokenlinksResult { + return &BrokenlinksResult{ + PageLinks: map[string][]Broken{}, + } +} + +func (result *BrokenlinksResult) sort() { + for _, listBroken := range result.PageLinks { + slices.SortFunc(listBroken, func(a, b Broken) int { + return strings.Compare(a.Link, b.Link) + }) + } +} + +// Brokenlinks scan the URL for broken links. +func Brokenlinks(opts BrokenlinksOptions) (result *BrokenlinksResult, err error) { + var logp = `brokenlinks` + var wrk *brokenlinksWorker + + wrk, err = newWorker(opts) + if err != nil { + return nil, fmt.Errorf(`%s: %s`, logp, err) + } + + result, err = wrk.run() + if err != nil { + return nil, fmt.Errorf(`%s: %s`, logp, err) + } + + return result, nil +} diff --git a/brokenlinks_test.go b/brokenlinks_test.go new file mode 100644 index 0000000..c1a607f --- /dev/null +++ b/brokenlinks_test.go @@ -0,0 +1,188 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan +// SPDX-License-Identifier: GPL-3.0-only + +package jarink_test + +import ( + "log" + "net/http" + "os" + "testing" + "time" + + "git.sr.ht/~shulhan/jarink" + libnet "git.sr.ht/~shulhan/pakakeh.go/lib/net" + "git.sr.ht/~shulhan/pakakeh.go/lib/test" +) + +// The test run two web servers that serve content on "testdata/web/". +// The first web server is the one that we want to scan. +// The second web server is external web server, where HTML pages should not +// be parsed. 
+ +const testAddress = `127.0.0.1:11836` +const testExternalAddress = `127.0.0.1:11900` + +func TestMain(m *testing.M) { + log.SetFlags(0) + var httpDirWeb = http.Dir(`testdata/web`) + var fshandle = http.FileServer(httpDirWeb) + + go func() { + var mux = http.NewServeMux() + mux.Handle(`/`, fshandle) + var testServer = &http.Server{ + Addr: testAddress, + Handler: mux, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + MaxHeaderBytes: 1 << 20, + } + var err = testServer.ListenAndServe() + if err != nil { + log.Fatal(err) + } + }() + go func() { + var mux = http.NewServeMux() + mux.Handle(`/`, fshandle) + var testServer = &http.Server{ + Addr: testExternalAddress, + Handler: mux, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + MaxHeaderBytes: 1 << 20, + } + var err = testServer.ListenAndServe() + if err != nil { + log.Fatal(err) + } + }() + + var err = libnet.WaitAlive(`tcp`, testAddress, 5*time.Second) + if err != nil { + log.Fatal(err) + } + err = libnet.WaitAlive(`tcp`, testExternalAddress, 5*time.Second) + if err != nil { + log.Fatal(err) + } + + os.Exit(m.Run()) +} + +func TestBrokenlinks(t *testing.T) { + var testUrl = `http://` + testAddress + + type testCase struct { + exp map[string][]jarink.Broken + scanUrl string + expError string + } + + listCase := []testCase{{ + scanUrl: `127.0.0.1:14594`, + expError: `brokenlinks: invalid URL "127.0.0.1:14594"`, + }, { + scanUrl: `http://127.0.0.1:14594`, + expError: `brokenlinks: Get "http://127.0.0.1:14594": dial tcp 127.0.0.1:14594: connect: connection refused`, + }, { + scanUrl: testUrl, + exp: map[string][]jarink.Broken{ + testUrl: []jarink.Broken{ + { + Link: testUrl + `/broken.png`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/brokenPage`, + Code: http.StatusNotFound, + }, { + Link: `http://127.0.0.1:abc`, + Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`, + Code: jarink.StatusBadLink, + }, { + Link: `http:/127.0.0.1:11836`, + Error: `Head 
"http:/127.0.0.1:11836": http: no Host in request URL`, + Code: jarink.StatusBadLink, + }, + }, + testUrl + `/broken.html`: []jarink.Broken{ + { + Link: testUrl + `/brokenPage`, + Code: http.StatusNotFound, + }, + }, + testUrl + `/page2`: []jarink.Broken{ + { + Link: testUrl + `/broken.png`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/page2/broken/relative`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/page2/broken2.png`, + Code: http.StatusNotFound, + }, + }, + }, + }, { + scanUrl: testUrl + `/page2`, + exp: map[string][]jarink.Broken{ + testUrl: []jarink.Broken{ + { + Link: testUrl + `/broken.png`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/brokenPage`, + Code: http.StatusNotFound, + }, { + Link: `http://127.0.0.1:abc`, + Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`, + Code: jarink.StatusBadLink, + }, { + Link: `http:/127.0.0.1:11836`, + Error: `Head "http:/127.0.0.1:11836": http: no Host in request URL`, + Code: jarink.StatusBadLink, + }, + }, + testUrl + `/broken.html`: []jarink.Broken{ + { + Link: testUrl + `/brokenPage`, + Code: http.StatusNotFound, + }, + }, + testUrl + `/page2`: []jarink.Broken{ + { + Link: testUrl + `/broken.png`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/page2/broken/relative`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/page2/broken2.png`, + Code: http.StatusNotFound, + }, + }, + }, + }} + + var ( + result *jarink.BrokenlinksResult + err error + ) + for _, tcase := range listCase { + t.Logf(`--- brokenlinks: %s`, tcase.scanUrl) + var brokenlinksOpts = jarink.BrokenlinksOptions{ + Url: tcase.scanUrl, + } + result, err = jarink.Brokenlinks(brokenlinksOpts) + if err != nil { + test.Assert(t, tcase.scanUrl+` error`, + tcase.expError, err.Error()) + continue + } + //got, _ := json.MarshalIndent(result.PageLinks, ``, ` `) + //t.Logf(`got=%s`, got) + test.Assert(t, tcase.scanUrl, tcase.exp, result.PageLinks) + } +} diff --git a/brokenlinks_worker.go 
b/brokenlinks_worker.go new file mode 100644 index 0000000..03359b7 --- /dev/null +++ b/brokenlinks_worker.go @@ -0,0 +1,369 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan +// SPDX-License-Identifier: GPL-3.0-only + +package jarink + +import ( + "fmt" + "log" + "net/http" + "net/url" + "strings" + "sync" + "time" + + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +type brokenlinksWorker struct { + // seenLink store the URL being or has been scanned and its HTTP + // status code. + seenLink map[string]int + + // resultq channel that collect result from scanning. + resultq chan map[string]linkQueue + + // result contains the final result after all of the pages has been + // scanned. + result *BrokenlinksResult + + // The base URL that will be joined to relative or absolute + // links or image. + baseUrl *url.URL + + // The URL to scan. + scanUrl *url.URL + + opts BrokenlinksOptions + + // wg sync the goroutine scanner. + wg sync.WaitGroup +} + +func newWorker(opts BrokenlinksOptions) (wrk *brokenlinksWorker, err error) { + wrk = &brokenlinksWorker{ + opts: opts, + seenLink: map[string]int{}, + resultq: make(chan map[string]linkQueue, 100), + result: newBrokenlinksResult(), + } + + wrk.scanUrl, err = url.Parse(opts.Url) + if err != nil { + return nil, fmt.Errorf(`invalid URL %q`, opts.Url) + } + + wrk.scanUrl.Path = strings.TrimSuffix(wrk.scanUrl.Path, `/`) + wrk.scanUrl.Fragment = "" + wrk.scanUrl.RawFragment = "" + + wrk.baseUrl = &url.URL{ + Scheme: wrk.scanUrl.Scheme, + Host: wrk.scanUrl.Host, + } + + return wrk, nil +} + +func (wrk *brokenlinksWorker) run() (result *BrokenlinksResult, err error) { + // Scan the first URL to make sure that the server is reachable. 
+ var firstLinkq = linkQueue{ + parentUrl: nil, + url: wrk.scanUrl.String(), + status: http.StatusProcessing, + } + wrk.seenLink[firstLinkq.url] = http.StatusProcessing + + wrk.wg.Add(1) + go wrk.scan(firstLinkq) + wrk.wg.Wait() + + var resultq = <-wrk.resultq + for _, linkq := range resultq { + if linkq.url == firstLinkq.url { + if linkq.errScan != nil { + return nil, linkq.errScan + } + wrk.seenLink[linkq.url] = linkq.status + continue + } + if linkq.status >= http.StatusBadRequest { + wrk.markBroken(linkq) + continue + } + + wrk.seenLink[linkq.url] = http.StatusProcessing + wrk.wg.Add(1) + go wrk.scan(linkq) + } + + var tick = time.NewTicker(500 * time.Millisecond) + var listWaitStatus []linkQueue + var isScanning = true + for isScanning { + select { + case resultq := <-wrk.resultq: + + // The resultq contains the original URL being scanned + // and its child links. + // For example, scanning "http://example.tld" result + // in + // + // "http://example.tld": {status=200} + // "http://example.tld/page": {status=0} + // "http://example.tld/image.png": {status=0} + // "http://bad:domain/image.png": {status=700} + + var newList []linkQueue + for _, linkq := range resultq { + if linkq.status >= http.StatusBadRequest { + wrk.markBroken(linkq) + continue + } + if linkq.status != 0 { + // linkq is the result of scan with + // non error status. + wrk.seenLink[linkq.url] = linkq.status + continue + } + + seenStatus, seen := wrk.seenLink[linkq.url] + if !seen { + wrk.seenLink[linkq.url] = http.StatusProcessing + wrk.wg.Add(1) + go wrk.scan(linkq) + continue + } + if seenStatus >= http.StatusBadRequest { + linkq.status = seenStatus + wrk.markBroken(linkq) + continue + } + if seenStatus >= http.StatusOK { + // The link has been processed and its + // not an error. + continue + } + if seenStatus == http.StatusProcessing { + // The link being processed by other + // goroutine. 
+ linkq.status = seenStatus + newList = append(newList, linkq) + continue + } + log.Fatalf("link=%s status=%d", linkq.url, linkq.status) + } + for _, linkq := range listWaitStatus { + seenStatus := wrk.seenLink[linkq.url] + if seenStatus >= http.StatusBadRequest { + linkq.status = seenStatus + wrk.markBroken(linkq) + continue + } + if seenStatus >= http.StatusOK { + continue + } + if seenStatus == http.StatusProcessing { + // Scanning still in progress. + newList = append(newList, linkq) + continue + } + } + listWaitStatus = newList + + case <-tick.C: + wrk.wg.Wait() + if len(wrk.resultq) != 0 { + continue + } + if len(listWaitStatus) != 0 { + // There are links that still waiting for + // scanning to be completed. + continue + } + isScanning = false + } + } + wrk.result.sort() + return wrk.result, nil +} + +func (wrk *brokenlinksWorker) markBroken(linkq linkQueue) { + var parentUrl = linkq.parentUrl.String() + var listBroken = wrk.result.PageLinks[parentUrl] + var brokenLink = Broken{ + Link: linkq.url, + Code: linkq.status, + } + if linkq.errScan != nil { + brokenLink.Error = linkq.errScan.Error() + } + listBroken = append(listBroken, brokenLink) + wrk.result.PageLinks[parentUrl] = listBroken + + wrk.seenLink[linkq.url] = linkq.status +} + +// scan fetch the HTML page or image to check if its valid. 
+func (wrk *brokenlinksWorker) scan(linkq linkQueue) { + defer func() { + if wrk.opts.IsVerbose && linkq.errScan != nil { + fmt.Printf("error: %d %s error=%v\n", linkq.status, + linkq.url, linkq.errScan) + } + wrk.wg.Done() + }() + + var ( + resultq = map[string]linkQueue{} + httpResp *http.Response + err error + ) + if linkq.kind == atom.Img || linkq.isExternal { + if wrk.opts.IsVerbose { + fmt.Printf("scan: HEAD %s\n", linkq.url) + } + httpResp, err = http.Head(linkq.url) + } else { + if wrk.opts.IsVerbose { + fmt.Printf("scan: GET %s\n", linkq.url) + } + httpResp, err = http.Get(linkq.url) + } + if err != nil { + linkq.status = StatusBadLink + linkq.errScan = err + resultq[linkq.url] = linkq + go wrk.pushResult(resultq) + return + } + defer httpResp.Body.Close() + + linkq.status = httpResp.StatusCode + resultq[linkq.url] = linkq + + if httpResp.StatusCode >= http.StatusBadRequest { + go wrk.pushResult(resultq) + return + } + if linkq.kind == atom.Img || linkq.isExternal { + go wrk.pushResult(resultq) + return + } + + var doc *html.Node + doc, _ = html.Parse(httpResp.Body) + + // After we check the code and test for [html.Parse] there are + // no case actual cases where HTML content will return an error. + // The only possible error is when reading from body (io.Reader), and + // that is also almost impossible. 
+ // + // [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347 + + var scanUrl *url.URL + + scanUrl, err = url.Parse(linkq.url) + if err != nil { + log.Fatal(err) + } + + var node *html.Node + for node = range doc.Descendants() { + if node.Type != html.ElementNode { + continue + } + var nodeLink *linkQueue + if node.DataAtom == atom.A { + for _, attr := range node.Attr { + if attr.Key != `href` { + continue + } + nodeLink = wrk.processLink(scanUrl, attr.Val, atom.A) + break + } + } else if node.DataAtom == atom.Img { + for _, attr := range node.Attr { + if attr.Key != `src` { + continue + } + nodeLink = wrk.processLink(scanUrl, attr.Val, atom.Img) + break + } + } else { + continue + } + if nodeLink == nil { + continue + } + _, seen := resultq[nodeLink.url] + if !seen { + if !strings.HasPrefix(nodeLink.url, wrk.baseUrl.String()) { + nodeLink.isExternal = true + } + resultq[nodeLink.url] = *nodeLink + } + } + go wrk.pushResult(resultq) +} + +func (wrk *brokenlinksWorker) processLink(parentUrl *url.URL, val string, kind atom.Atom) ( + linkq *linkQueue, +) { + if len(val) == 0 { + return nil + } + + var newUrl *url.URL + var err error + newUrl, err = url.Parse(val) + if err != nil { + return &linkQueue{ + parentUrl: parentUrl, + errScan: err, + url: val, + kind: kind, + status: StatusBadLink, + } + } + newUrl.Fragment = "" + newUrl.RawFragment = "" + + if kind == atom.A && val[0] == '#' { + // Ignore link to ID, like `href="#element_id"`. + return nil + } + if strings.HasPrefix(val, `http`) { + return &linkQueue{ + parentUrl: parentUrl, + url: strings.TrimSuffix(newUrl.String(), `/`), + kind: kind, + } + } + if val[0] == '/' { + // val is absolute to parent URL. + newUrl = wrk.baseUrl.JoinPath(newUrl.Path) + } else { + // val is relative to parent URL. 
+ newUrl = parentUrl.JoinPath(`/`, newUrl.Path) + } + linkq = &linkQueue{ + parentUrl: parentUrl, + url: strings.TrimSuffix(newUrl.String(), `/`), + kind: kind, + } + return linkq +} + +func (wrk *brokenlinksWorker) pushResult(resultq map[string]linkQueue) { + var tick = time.NewTicker(100 * time.Millisecond) + for { + select { + case wrk.resultq <- resultq: + tick.Stop() + return + case <-tick.C: + } + } +} diff --git a/cmd/deadlinks/main.go b/cmd/deadlinks/main.go deleted file mode 100644 index 16057ee..0000000 --- a/cmd/deadlinks/main.go +++ /dev/null @@ -1,111 +0,0 @@ -// SPDX-FileCopyrightText: 2025 M. Shulhan -// SPDX-License-Identifier: GPL-3.0-only - -package main - -import ( - "encoding/json" - "flag" - "fmt" - "log" - "os" - "strings" - - "git.sr.ht/~shulhan/deadlinks" -) - -func main() { - var optVerbose bool - - flag.BoolVar(&optVerbose, `verbose`, false, - `print additional information while running`) - - flag.Parse() - - var cmd = flag.Arg(0) - if cmd == "" { - goto invalid_command - } - - cmd = strings.ToLower(cmd) - if cmd == "scan" { - var scanOpts = deadlinks.ScanOptions{ - Url: flag.Arg(1), - IsVerbose: optVerbose, - } - if scanOpts.Url == "" { - goto invalid_command - } - - var result *deadlinks.Result - var err error - result, err = deadlinks.Scan(scanOpts) - if err != nil { - log.Fatal(err.Error()) - } - - var resultJson []byte - resultJson, err = json.MarshalIndent(result.PageLinks, ``, ` `) - if err != nil { - log.Fatal(err.Error()) - } - fmt.Printf("%s\n", resultJson) - return - } - -invalid_command: - usage() - os.Exit(1) -} - -func usage() { - log.Println(` -deadlinks - -Deadlinks is a program to scan for invalid links inside HTML page on the live -web server. 
-Invalid links will be scanned on anchor href attribute ("") or on -the image src attribute (" +// SPDX-License-Identifier: GPL-3.0-only + +package main + +import ( + "encoding/json" + "flag" + "fmt" + "log" + "os" + "strings" + + "git.sr.ht/~shulhan/jarink" +) + +func main() { + var optVerbose bool + + flag.BoolVar(&optVerbose, `verbose`, false, + `print additional information while running`) + + flag.Parse() + + var cmd = flag.Arg(0) + if cmd == "" { + goto invalid_command + } + + cmd = strings.ToLower(cmd) + if cmd == "brokenlinks" { + var brokenlinksOpts = jarink.BrokenlinksOptions{ + Url: flag.Arg(1), + IsVerbose: optVerbose, + } + if brokenlinksOpts.Url == "" { + goto invalid_command + } + + var result *jarink.BrokenlinksResult + var err error + result, err = jarink.Brokenlinks(brokenlinksOpts) + if err != nil { + log.Fatal(err.Error()) + } + + var resultJson []byte + resultJson, err = json.MarshalIndent(result.PageLinks, ``, ` `) + if err != nil { + log.Fatal(err.Error()) + } + fmt.Printf("%s\n", resultJson) + return + } + +invalid_command: + usage() + os.Exit(1) +} + +func usage() { + log.Println(` +Jarink is a program to help web administrator to maintains their website. + +== Synopsis + + jarink [OPTIONS] + +Available commands, + + brokenlinks - scan the website for broken links (page and images). + +== Usage + +[OPTIONS] brokenlinks URL + + Start scanning for broken links on the web server pointed by URL. + Invalid links will be scanned on anchor href attribute + ("") or on the image src attribute (" -// SPDX-License-Identifier: GPL-3.0-only - -package deadlinks - -import ( - "fmt" -) - -const Version = `0.1.0` - -// StatusBadLink status for link that is not parseable by [url.Parse] or not -// reachable during GET or HEAD, either timeout or IP or domain not exist. -const StatusBadLink = 700 - -// Scan the baseUrl for dead links. 
-func Scan(opts ScanOptions) (result *Result, err error) { - var logp = `Scan` - var wrk *worker - - wrk, err = newWorker(opts) - if err != nil { - return nil, fmt.Errorf(`%s: %s`, logp, err) - } - - result, err = wrk.run() - if err != nil { - return nil, fmt.Errorf(`%s: %s`, logp, err) - } - - return result, nil -} diff --git a/deadlinks_test.go b/deadlinks_test.go deleted file mode 100644 index c219aa0..0000000 --- a/deadlinks_test.go +++ /dev/null @@ -1,188 +0,0 @@ -// SPDX-FileCopyrightText: 2025 M. Shulhan -// SPDX-License-Identifier: GPL-3.0-only - -package deadlinks_test - -import ( - "log" - "net/http" - "os" - "testing" - "time" - - "git.sr.ht/~shulhan/deadlinks" - libnet "git.sr.ht/~shulhan/pakakeh.go/lib/net" - "git.sr.ht/~shulhan/pakakeh.go/lib/test" -) - -// The test run two web servers that serve content on "testdata/web/". -// The first web server is the one that we want to scan. -// The second web server is external web server, where HTML pages should not -// be parsed. - -const testAddress = `127.0.0.1:11836` -const testExternalAddress = `127.0.0.1:11900` - -func TestMain(m *testing.M) { - log.SetFlags(0) - var httpDirWeb = http.Dir(`testdata/web`) - var fshandle = http.FileServer(httpDirWeb) - - go func() { - var mux = http.NewServeMux() - mux.Handle(`/`, fshandle) - var testServer = &http.Server{ - Addr: testAddress, - Handler: mux, - ReadTimeout: 10 * time.Second, - WriteTimeout: 10 * time.Second, - MaxHeaderBytes: 1 << 20, - } - var err = testServer.ListenAndServe() - if err != nil { - log.Fatal(err) - } - }() - go func() { - var mux = http.NewServeMux() - mux.Handle(`/`, fshandle) - var testServer = &http.Server{ - Addr: testExternalAddress, - Handler: mux, - ReadTimeout: 10 * time.Second, - WriteTimeout: 10 * time.Second, - MaxHeaderBytes: 1 << 20, - } - var err = testServer.ListenAndServe() - if err != nil { - log.Fatal(err) - } - }() - - var err = libnet.WaitAlive(`tcp`, testAddress, 5*time.Second) - if err != nil { - log.Fatal(err) - } - 
err = libnet.WaitAlive(`tcp`, testExternalAddress, 5*time.Second) - if err != nil { - log.Fatal(err) - } - - os.Exit(m.Run()) -} - -func TestDeadLinks_Scan(t *testing.T) { - var testUrl = `http://` + testAddress - - type testCase struct { - exp map[string][]deadlinks.Broken - scanUrl string - expError string - } - - listCase := []testCase{{ - scanUrl: `127.0.0.1:14594`, - expError: `Scan: invalid URL "127.0.0.1:14594"`, - }, { - scanUrl: `http://127.0.0.1:14594`, - expError: `Scan: Get "http://127.0.0.1:14594": dial tcp 127.0.0.1:14594: connect: connection refused`, - }, { - scanUrl: testUrl, - exp: map[string][]deadlinks.Broken{ - testUrl: []deadlinks.Broken{ - { - Link: testUrl + `/broken.png`, - Code: http.StatusNotFound, - }, { - Link: testUrl + `/brokenPage`, - Code: http.StatusNotFound, - }, { - Link: `http://127.0.0.1:abc`, - Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`, - Code: deadlinks.StatusBadLink, - }, { - Link: `http:/127.0.0.1:11836`, - Error: `Head "http:/127.0.0.1:11836": http: no Host in request URL`, - Code: deadlinks.StatusBadLink, - }, - }, - testUrl + `/broken.html`: []deadlinks.Broken{ - { - Link: testUrl + `/brokenPage`, - Code: http.StatusNotFound, - }, - }, - testUrl + `/page2`: []deadlinks.Broken{ - { - Link: testUrl + `/broken.png`, - Code: http.StatusNotFound, - }, { - Link: testUrl + `/page2/broken/relative`, - Code: http.StatusNotFound, - }, { - Link: testUrl + `/page2/broken2.png`, - Code: http.StatusNotFound, - }, - }, - }, - }, { - scanUrl: testUrl + `/page2`, - exp: map[string][]deadlinks.Broken{ - testUrl: []deadlinks.Broken{ - { - Link: testUrl + `/broken.png`, - Code: http.StatusNotFound, - }, { - Link: testUrl + `/brokenPage`, - Code: http.StatusNotFound, - }, { - Link: `http://127.0.0.1:abc`, - Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`, - Code: deadlinks.StatusBadLink, - }, { - Link: `http:/127.0.0.1:11836`, - Error: `Head "http:/127.0.0.1:11836": http: no Host in request 
URL`, - Code: deadlinks.StatusBadLink, - }, - }, - testUrl + `/broken.html`: []deadlinks.Broken{ - { - Link: testUrl + `/brokenPage`, - Code: http.StatusNotFound, - }, - }, - testUrl + `/page2`: []deadlinks.Broken{ - { - Link: testUrl + `/broken.png`, - Code: http.StatusNotFound, - }, { - Link: testUrl + `/page2/broken/relative`, - Code: http.StatusNotFound, - }, { - Link: testUrl + `/page2/broken2.png`, - Code: http.StatusNotFound, - }, - }, - }, - }} - - var ( - result *deadlinks.Result - err error - ) - for _, tcase := range listCase { - t.Logf(`--- Scan: %s`, tcase.scanUrl) - var scanOpts = deadlinks.ScanOptions{ - Url: tcase.scanUrl, - } - result, err = deadlinks.Scan(scanOpts) - if err != nil { - test.Assert(t, tcase.scanUrl+` error`, - tcase.expError, err.Error()) - continue - } - //got, _ := json.MarshalIndent(result.PageLinks, ``, ` `) - //t.Logf(`got=%s`, got) - test.Assert(t, tcase.scanUrl, tcase.exp, result.PageLinks) - } -} diff --git a/go.mod b/go.mod index a57a7c9..2063444 100644 --- a/go.mod +++ b/go.mod @@ -1,9 +1,9 @@ // SPDX-FileCopyrightText: 2025 M. Shulhan // SPDX-License-Identifier: GPL-3.0-only -module git.sr.ht/~shulhan/deadlinks +module git.sr.ht/~shulhan/jarink -go 1.25 +go 1.24 require ( git.sr.ht/~shulhan/pakakeh.go v0.60.1 diff --git a/link_queue.go b/link_queue.go index 63940cc..0b419b8 100644 --- a/link_queue.go +++ b/link_queue.go @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: 2025 M. Shulhan // SPDX-License-Identifier: GPL-3.0-only -package deadlinks +package jarink import ( "net/url" diff --git a/result.go b/result.go deleted file mode 100644 index 6fdc817..0000000 --- a/result.go +++ /dev/null @@ -1,36 +0,0 @@ -// SPDX-FileCopyrightText: 2025 M. Shulhan -// SPDX-License-Identifier: GPL-3.0-only - -package deadlinks - -import ( - "slices" - "strings" -) - -// Broken store the link with its HTTP status. -type Broken struct { - Link string - Error string `json:"omitempty"` - Code int -} - -// Result store the result of Scan. 
-type Result struct { - // PageLinks store the page and its broken links. - PageLinks map[string][]Broken -} - -func newResult() *Result { - return &Result{ - PageLinks: map[string][]Broken{}, - } -} - -func (result *Result) sort() { - for _, listBroken := range result.PageLinks { - slices.SortFunc(listBroken, func(a, b Broken) int { - return strings.Compare(a.Link, b.Link) - }) - } -} diff --git a/scan_options.go b/scan_options.go deleted file mode 100644 index bc5484e..0000000 --- a/scan_options.go +++ /dev/null @@ -1,10 +0,0 @@ -// SPDX-FileCopyrightText: 2025 M. Shulhan -// SPDX-License-Identifier: GPL-3.0-only - -package deadlinks - -// ScanOptions define the options for scan command or Scan function. -type ScanOptions struct { - Url string - IsVerbose bool -} diff --git a/url_test.go b/url_test.go index 506090d..0b0bf03 100644 --- a/url_test.go +++ b/url_test.go @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: 2025 M. Shulhan // SPDX-License-Identifier: GPL-3.0-only -package deadlinks +package jarink import ( "net/url" diff --git a/worker.go b/worker.go deleted file mode 100644 index 817ff3b..0000000 --- a/worker.go +++ /dev/null @@ -1,369 +0,0 @@ -// SPDX-FileCopyrightText: 2025 M. Shulhan -// SPDX-License-Identifier: GPL-3.0-only - -package deadlinks - -import ( - "fmt" - "log" - "net/http" - "net/url" - "strings" - "sync" - "time" - - "golang.org/x/net/html" - "golang.org/x/net/html/atom" -) - -type worker struct { - // seenLink store the URL being or has been scanned and its HTTP - // status code. - seenLink map[string]int - - // resultq channel that collect result from scanning. - resultq chan map[string]linkQueue - - // result contains the final result after all of the pages has been - // scanned. - result *Result - - // The base URL that will be joined to relative or absolute - // links or image. - baseUrl *url.URL - - // The URL to scan. - scanUrl *url.URL - - opts ScanOptions - - // wg sync the goroutine scanner. 
- wg sync.WaitGroup -} - -func newWorker(opts ScanOptions) (wrk *worker, err error) { - wrk = &worker{ - opts: opts, - seenLink: map[string]int{}, - resultq: make(chan map[string]linkQueue, 100), - result: newResult(), - } - - wrk.scanUrl, err = url.Parse(opts.Url) - if err != nil { - return nil, fmt.Errorf(`invalid URL %q`, opts.Url) - } - - wrk.scanUrl.Path = strings.TrimSuffix(wrk.scanUrl.Path, `/`) - wrk.scanUrl.Fragment = "" - wrk.scanUrl.RawFragment = "" - - wrk.baseUrl = &url.URL{ - Scheme: wrk.scanUrl.Scheme, - Host: wrk.scanUrl.Host, - } - - return wrk, nil -} - -func (wrk *worker) run() (result *Result, err error) { - // Scan the first URL to make sure that the server is reachable. - var firstLinkq = linkQueue{ - parentUrl: nil, - url: wrk.scanUrl.String(), - status: http.StatusProcessing, - } - wrk.seenLink[firstLinkq.url] = http.StatusProcessing - - wrk.wg.Add(1) - go wrk.scan(firstLinkq) - wrk.wg.Wait() - - var resultq = <-wrk.resultq - for _, linkq := range resultq { - if linkq.url == firstLinkq.url { - if linkq.errScan != nil { - return nil, linkq.errScan - } - wrk.seenLink[linkq.url] = linkq.status - continue - } - if linkq.status >= http.StatusBadRequest { - wrk.markDead(linkq) - continue - } - - wrk.seenLink[linkq.url] = http.StatusProcessing - wrk.wg.Add(1) - go wrk.scan(linkq) - } - - var tick = time.NewTicker(500 * time.Millisecond) - var listWaitStatus []linkQueue - var isScanning = true - for isScanning { - select { - case resultq := <-wrk.resultq: - - // The resultq contains the original URL being scanned - // and its child links. 
- // For example, scanning "http://example.tld" result - // in - // - // "http://example.tld": {status=200} - // "http://example.tld/page": {status=0} - // "http://example.tld/image.png": {status=0} - // "http://bad:domain/image.png": {status=700} - - var newList []linkQueue - for _, linkq := range resultq { - if linkq.status >= http.StatusBadRequest { - wrk.markDead(linkq) - continue - } - if linkq.status != 0 { - // linkq is the result of scan with - // non error status. - wrk.seenLink[linkq.url] = linkq.status - continue - } - - seenStatus, seen := wrk.seenLink[linkq.url] - if !seen { - wrk.seenLink[linkq.url] = http.StatusProcessing - wrk.wg.Add(1) - go wrk.scan(linkq) - continue - } - if seenStatus >= http.StatusBadRequest { - linkq.status = seenStatus - wrk.markDead(linkq) - continue - } - if seenStatus >= http.StatusOK { - // The link has been processed and its - // not an error. - continue - } - if seenStatus == http.StatusProcessing { - // The link being processed by other - // goroutine. - linkq.status = seenStatus - newList = append(newList, linkq) - continue - } - log.Fatalf("link=%s status=%d", linkq.url, linkq.status) - } - for _, linkq := range listWaitStatus { - seenStatus := wrk.seenLink[linkq.url] - if seenStatus >= http.StatusBadRequest { - linkq.status = seenStatus - wrk.markDead(linkq) - continue - } - if seenStatus >= http.StatusOK { - continue - } - if seenStatus == http.StatusProcessing { - // Scanning still in progress. - newList = append(newList, linkq) - continue - } - } - listWaitStatus = newList - - case <-tick.C: - wrk.wg.Wait() - if len(wrk.resultq) != 0 { - continue - } - if len(listWaitStatus) != 0 { - // There are links that still waiting for - // scanning to be completed. 
- continue - } - isScanning = false - } - } - wrk.result.sort() - return wrk.result, nil -} - -func (wrk *worker) markDead(linkq linkQueue) { - var parentUrl = linkq.parentUrl.String() - var listBroken = wrk.result.PageLinks[parentUrl] - var brokenLink = Broken{ - Link: linkq.url, - Code: linkq.status, - } - if linkq.errScan != nil { - brokenLink.Error = linkq.errScan.Error() - } - listBroken = append(listBroken, brokenLink) - wrk.result.PageLinks[parentUrl] = listBroken - - wrk.seenLink[linkq.url] = linkq.status -} - -// scan fetch the HTML page or image to check if its valid. -func (wrk *worker) scan(linkq linkQueue) { - defer func() { - if wrk.opts.IsVerbose && linkq.errScan != nil { - fmt.Printf("error: %d %s error=%v\n", linkq.status, - linkq.url, linkq.errScan) - } - wrk.wg.Done() - }() - - var ( - resultq = map[string]linkQueue{} - httpResp *http.Response - err error - ) - if linkq.kind == atom.Img || linkq.isExternal { - if wrk.opts.IsVerbose { - fmt.Printf("scan: HEAD %s\n", linkq.url) - } - httpResp, err = http.Head(linkq.url) - } else { - if wrk.opts.IsVerbose { - fmt.Printf("scan: GET %s\n", linkq.url) - } - httpResp, err = http.Get(linkq.url) - } - if err != nil { - linkq.status = StatusBadLink - linkq.errScan = err - resultq[linkq.url] = linkq - go wrk.pushResult(resultq) - return - } - defer httpResp.Body.Close() - - linkq.status = httpResp.StatusCode - resultq[linkq.url] = linkq - - if httpResp.StatusCode >= http.StatusBadRequest { - go wrk.pushResult(resultq) - return - } - if linkq.kind == atom.Img || linkq.isExternal { - go wrk.pushResult(resultq) - return - } - - var doc *html.Node - doc, _ = html.Parse(httpResp.Body) - - // After we check the code and test for [html.Parse] there are - // no case actual cases where HTML content will return an error. - // The only possible error is when reading from body (io.Reader), and - // that is also almost impossible. 
- // - // [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347 - - var scanUrl *url.URL - - scanUrl, err = url.Parse(linkq.url) - if err != nil { - log.Fatal(err) - } - - var node *html.Node - for node = range doc.Descendants() { - if node.Type != html.ElementNode { - continue - } - var nodeLink *linkQueue - if node.DataAtom == atom.A { - for _, attr := range node.Attr { - if attr.Key != `href` { - continue - } - nodeLink = wrk.processLink(scanUrl, attr.Val, atom.A) - break - } - } else if node.DataAtom == atom.Img { - for _, attr := range node.Attr { - if attr.Key != `src` { - continue - } - nodeLink = wrk.processLink(scanUrl, attr.Val, atom.Img) - break - } - } else { - continue - } - if nodeLink == nil { - continue - } - _, seen := resultq[nodeLink.url] - if !seen { - if !strings.HasPrefix(nodeLink.url, wrk.baseUrl.String()) { - nodeLink.isExternal = true - } - resultq[nodeLink.url] = *nodeLink - } - } - go wrk.pushResult(resultq) -} - -func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) ( - linkq *linkQueue, -) { - if len(val) == 0 { - return nil - } - - var newUrl *url.URL - var err error - newUrl, err = url.Parse(val) - if err != nil { - return &linkQueue{ - parentUrl: parentUrl, - errScan: err, - url: val, - kind: kind, - status: StatusBadLink, - } - } - newUrl.Fragment = "" - newUrl.RawFragment = "" - - if kind == atom.A && val[0] == '#' { - // Ignore link to ID, like `href="#element_id"`. - return nil - } - if strings.HasPrefix(val, `http`) { - return &linkQueue{ - parentUrl: parentUrl, - url: strings.TrimSuffix(newUrl.String(), `/`), - kind: kind, - } - } - if val[0] == '/' { - // val is absolute to parent URL. - newUrl = wrk.baseUrl.JoinPath(newUrl.Path) - } else { - // val is relative to parent URL. 
- newUrl = parentUrl.JoinPath(`/`, newUrl.Path) - } - linkq = &linkQueue{ - parentUrl: parentUrl, - url: strings.TrimSuffix(newUrl.String(), `/`), - kind: kind, - } - return linkq -} - -func (wrk *worker) pushResult(resultq map[string]linkQueue) { - var tick = time.NewTicker(100 * time.Millisecond) - for { - select { - case wrk.resultq <- resultq: - tick.Stop() - return - case <-tick.C: - } - } -} -- cgit v1.3