aboutsummaryrefslogtreecommitdiff
path: root/brokenlinks
diff options
context:
space:
mode:
Diffstat (limited to 'brokenlinks')
-rw-r--r--brokenlinks/brokenlinks.go39
-rw-r--r--brokenlinks/brokenlinks_test.go227
-rw-r--r--brokenlinks/link_queue.go55
-rw-r--r--brokenlinks/result.go37
-rw-r--r--brokenlinks/testdata/past_result.json10
-rw-r--r--brokenlinks/testdata/past_result.json.license2
-rw-r--r--brokenlinks/testdata/web/broken.html7
-rw-r--r--brokenlinks/testdata/web/gopher.pngbin0 -> 32775 bytes
-rw-r--r--brokenlinks/testdata/web/index.html22
-rw-r--r--brokenlinks/testdata/web/page2/index.html14
-rw-r--r--brokenlinks/worker.go467
11 files changed, 880 insertions, 0 deletions
diff --git a/brokenlinks/brokenlinks.go b/brokenlinks/brokenlinks.go
new file mode 100644
index 0000000..8ac458f
--- /dev/null
+++ b/brokenlinks/brokenlinks.go
@@ -0,0 +1,39 @@
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+// SPDX-License-Identifier: GPL-3.0-only
+
+package brokenlinks
+
+import (
+ "fmt"
+)
+
+const Version = `0.1.0`
+
+// StatusBadLink is the status for a link that is not parseable by [url.Parse]
+// or not reachable during GET or HEAD, either due to a timeout or because the
+// IP or domain does not exist.
+const StatusBadLink = 700
+
+// Options define the options for scanning broken links.
+type Options struct {
+ Url string
+ PastResultFile string
+ IsVerbose bool
+}
+
+// Scan the URL for broken links.
+func Scan(opts Options) (result *Result, err error) {
+ var logp = `brokenlinks`
+ var wrk *worker
+
+ wrk, err = newWorker(opts)
+ if err != nil {
+ return nil, fmt.Errorf(`%s: %s`, logp, err)
+ }
+
+ result, err = wrk.run()
+ if err != nil {
+ return nil, fmt.Errorf(`%s: %s`, logp, err)
+ }
+
+ return result, nil
+}
diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go
new file mode 100644
index 0000000..367ae6c
--- /dev/null
+++ b/brokenlinks/brokenlinks_test.go
@@ -0,0 +1,227 @@
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+// SPDX-License-Identifier: GPL-3.0-only
+
+package brokenlinks_test
+
+import (
+ "encoding/json"
+ "log"
+ "net/http"
+ "os"
+ "testing"
+ "time"
+
+ libnet "git.sr.ht/~shulhan/pakakeh.go/lib/net"
+ "git.sr.ht/~shulhan/pakakeh.go/lib/test"
+
+ "git.sr.ht/~shulhan/jarink/brokenlinks"
+)
+
+// The tests run two web servers that serve content from "testdata/web/".
+// The first web server is the one that we want to scan.
+// The second web server is an external web server, whose HTML pages should
+// not be parsed.
+
+const testAddress = `127.0.0.1:11836`
+const testExternalAddress = `127.0.0.1:11900`
+
+func TestMain(m *testing.M) {
+ log.SetFlags(0)
+ var httpDirWeb = http.Dir(`testdata/web`)
+ var fshandle = http.FileServer(httpDirWeb)
+
+ go func() {
+ var mux = http.NewServeMux()
+ mux.Handle(`/`, fshandle)
+ var testServer = &http.Server{
+ Addr: testAddress,
+ Handler: mux,
+ ReadTimeout: 10 * time.Second,
+ WriteTimeout: 10 * time.Second,
+ MaxHeaderBytes: 1 << 20,
+ }
+ var err = testServer.ListenAndServe()
+ if err != nil {
+ log.Fatal(err)
+ }
+ }()
+ go func() {
+ var mux = http.NewServeMux()
+ mux.Handle(`/`, fshandle)
+ var testServer = &http.Server{
+ Addr: testExternalAddress,
+ Handler: mux,
+ ReadTimeout: 10 * time.Second,
+ WriteTimeout: 10 * time.Second,
+ MaxHeaderBytes: 1 << 20,
+ }
+ var err = testServer.ListenAndServe()
+ if err != nil {
+ log.Fatal(err)
+ }
+ }()
+
+ var err = libnet.WaitAlive(`tcp`, testAddress, 5*time.Second)
+ if err != nil {
+ log.Fatal(err)
+ }
+ err = libnet.WaitAlive(`tcp`, testExternalAddress, 5*time.Second)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ os.Exit(m.Run())
+}
+
+func TestBrokenlinks(t *testing.T) {
+ var testUrl = `http://` + testAddress
+
+ type testCase struct {
+ exp map[string][]brokenlinks.Broken
+ scanUrl string
+ expError string
+ }
+
+ listCase := []testCase{{
+ scanUrl: `127.0.0.1:14594`,
+ expError: `brokenlinks: invalid URL "127.0.0.1:14594"`,
+ }, {
+ scanUrl: `http://127.0.0.1:14594`,
+ expError: `brokenlinks: Get "http://127.0.0.1:14594": dial tcp 127.0.0.1:14594: connect: connection refused`,
+ }, {
+ scanUrl: testUrl,
+ exp: map[string][]brokenlinks.Broken{
+ testUrl: []brokenlinks.Broken{
+ {
+ Link: testUrl + `/broken.png`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/brokenPage`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: `http://127.0.0.1:abc`,
+ Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`,
+ Code: brokenlinks.StatusBadLink,
+ }, {
+ Link: `http:/127.0.0.1:11836`,
+ Error: `Get "http:/127.0.0.1:11836": http: no Host in request URL`,
+ Code: brokenlinks.StatusBadLink,
+ },
+ },
+ testUrl + `/broken.html`: []brokenlinks.Broken{
+ {
+ Link: testUrl + `/brokenPage`,
+ Code: http.StatusNotFound,
+ },
+ },
+ testUrl + `/page2`: []brokenlinks.Broken{
+ {
+ Link: testUrl + `/broken.png`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken/relative`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken2.png`,
+ Code: http.StatusNotFound,
+ },
+ },
+ },
+ }, {
+ // Scanning on "/path" should not scan the "/" or any
+ // pages other than those below "/path" itself.
+ scanUrl: testUrl + `/page2`,
+ exp: map[string][]brokenlinks.Broken{
+ testUrl + `/page2`: []brokenlinks.Broken{
+ {
+ Link: testUrl + `/broken.png`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken/relative`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken2.png`,
+ Code: http.StatusNotFound,
+ },
+ },
+ },
+ }}
+
+ var (
+ result *brokenlinks.Result
+ err error
+ )
+ for _, tcase := range listCase {
+ t.Logf(`--- brokenlinks: %s`, tcase.scanUrl)
+ var opts = brokenlinks.Options{
+ Url: tcase.scanUrl,
+ }
+ result, err = brokenlinks.Scan(opts)
+ if err != nil {
+ test.Assert(t, tcase.scanUrl+` error`,
+ tcase.expError, err.Error())
+ continue
+ }
+ //got, _ := json.MarshalIndent(result.BrokenLinks, ``, ` `)
+ //t.Logf(`got=%s`, got)
+ test.Assert(t, tcase.scanUrl, tcase.exp, result.BrokenLinks)
+ }
+}
+
+// Test running Brokenlinks with file PastResultFile is set.
+// The PastResultFile is modified to only report errors on "/page2".
+func TestBrokenlinks_pastResult(t *testing.T) {
+ var testUrl = `http://` + testAddress
+
+ type testCase struct {
+ exp map[string][]brokenlinks.Broken
+ expError string
+ opts brokenlinks.Options
+ }
+
+ listCase := []testCase{{
+ // With invalid file.
+ opts: brokenlinks.Options{
+ Url: testUrl,
+ PastResultFile: `testdata/invalid`,
+ },
+ expError: `brokenlinks: open testdata/invalid: no such file or directory`,
+ }, {
+ // With valid file.
+ opts: brokenlinks.Options{
+ Url: testUrl,
+ PastResultFile: `testdata/past_result.json`,
+ },
+ exp: map[string][]brokenlinks.Broken{
+ testUrl + `/page2`: []brokenlinks.Broken{
+ {
+ Link: testUrl + `/broken.png`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken/relative`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken2.png`,
+ Code: http.StatusNotFound,
+ },
+ },
+ },
+ }}
+
+ var (
+ result *brokenlinks.Result
+ err error
+ )
+ for _, tcase := range listCase {
+ t.Logf(`--- brokenlinks: %s`, tcase.opts.Url)
+ result, err = brokenlinks.Scan(tcase.opts)
+ if err != nil {
+ test.Assert(t, tcase.opts.Url+` error`,
+ tcase.expError, err.Error())
+ continue
+ }
+ got, _ := json.MarshalIndent(result.BrokenLinks, ``, ` `)
+ t.Logf(`got=%s`, got)
+ test.Assert(t, tcase.opts.Url, tcase.exp, result.BrokenLinks)
+ }
+}
diff --git a/brokenlinks/link_queue.go b/brokenlinks/link_queue.go
new file mode 100644
index 0000000..164a902
--- /dev/null
+++ b/brokenlinks/link_queue.go
@@ -0,0 +1,55 @@
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+// SPDX-License-Identifier: GPL-3.0-only
+
+package brokenlinks
+
+import (
+ "net/url"
+ "strings"
+
+ "golang.org/x/net/html/atom"
+)
+
+type linkQueue struct {
+ parentUrl *url.URL
+
+ // The error from scan.
+ errScan error
+
+ // url being scanned.
+ url string
+
+ // kind of url; it is either an anchor or an image.
+ // It is set to 0 if url is the first URL being scanned.
+ kind atom.Atom
+
+ // isExternal if true the scan will issue HTTP method HEAD instead of
+ // GET.
+ isExternal bool
+
+ // Status of link after scan; it mostly uses the HTTP status code.
+ // 0: link is the result of scan, not processed yet.
+ // StatusBadLink: link is invalid, not parseable or unreachable.
+ // 200 - 211: OK.
+ // 400 - 511: Error.
+ status int
+}
+
+// checkExternal set the isExternal field to be true if
+//
+// (1) [linkQueue.url] does not start with [worker.scanUrl]
+//
+// (2) linkQueue is from scanPastResult, indicated by non-nil
+// [worker.pastResult].
+// In this case, we do not want to scan other pages from the same scanUrl
+// domain.
+func (linkq *linkQueue) checkExternal(wrk *worker) {
+ if !strings.HasPrefix(linkq.url, wrk.scanUrl.String()) {
+ linkq.isExternal = true
+ return
+ }
+ if wrk.pastResult != nil {
+ linkq.isExternal = true
+ return
+ }
+}
diff --git a/brokenlinks/result.go b/brokenlinks/result.go
new file mode 100644
index 0000000..676859b
--- /dev/null
+++ b/brokenlinks/result.go
@@ -0,0 +1,37 @@
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+// SPDX-License-Identifier: GPL-3.0-only
+
+package brokenlinks
+
+import (
+ "slices"
+ "strings"
+)
+
+// Broken store the broken link, HTTP status code, and the error message that
+// cause it.
+type Broken struct {
+ Link string `json:"link"`
+ Error string `json:"error,omitempty"`
+ Code int `json:"code"`
+}
+
+// Result store the result of scanning for broken links.
+type Result struct {
+ // BrokenLinks store the page and its broken links.
+ BrokenLinks map[string][]Broken `json:"broken_links"`
+}
+
+func newResult() *Result {
+ return &Result{
+ BrokenLinks: map[string][]Broken{},
+ }
+}
+
+func (result *Result) sort() {
+ for _, listBroken := range result.BrokenLinks {
+ slices.SortFunc(listBroken, func(a, b Broken) int {
+ return strings.Compare(a.Link, b.Link)
+ })
+ }
+}
diff --git a/brokenlinks/testdata/past_result.json b/brokenlinks/testdata/past_result.json
new file mode 100644
index 0000000..ca29d35
--- /dev/null
+++ b/brokenlinks/testdata/past_result.json
@@ -0,0 +1,10 @@
+{
+ "broken_links": {
+ "http://127.0.0.1:11836/page2": [
+ {
+ "link": "http://127.0.0.1:11836/",
+ "code": 404
+ }
+ ]
+ }
+}
diff --git a/brokenlinks/testdata/past_result.json.license b/brokenlinks/testdata/past_result.json.license
new file mode 100644
index 0000000..22616a9
--- /dev/null
+++ b/brokenlinks/testdata/past_result.json.license
@@ -0,0 +1,2 @@
+SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+SPDX-License-Identifier: GPL-3.0-only
diff --git a/brokenlinks/testdata/web/broken.html b/brokenlinks/testdata/web/broken.html
new file mode 100644
index 0000000..533e542
--- /dev/null
+++ b/brokenlinks/testdata/web/broken.html
@@ -0,0 +1,7 @@
+<html>
+ <head></head>
+ <body>
+ <a href="/brokenPage"
+ <p>
+ </body>
+</html>
diff --git a/brokenlinks/testdata/web/gopher.png b/brokenlinks/testdata/web/gopher.png
new file mode 100644
index 0000000..79352be
--- /dev/null
+++ b/brokenlinks/testdata/web/gopher.png
Binary files differ
diff --git a/brokenlinks/testdata/web/index.html b/brokenlinks/testdata/web/index.html
new file mode 100644
index 0000000..61a1f39
--- /dev/null
+++ b/brokenlinks/testdata/web/index.html
@@ -0,0 +1,22 @@
+<!--
+SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+SPDX-License-Identifier: GPL-3.0-only
+-->
+<html>
+ <body>
+ <img src="/broken.png" />
+ <a href="/brokenPage">Broken page</a>
+ <img src="/gopher.png" />
+ <img width="200" src="" />
+ <a href="/page2">Page 2</a>
+ <a href="/broken.html">Broken HTML</a>
+ <a href="http://127.0.0.1:11900">External URL</a>
+ <!-- Error when fetching with GET -->
+ <a href="http:/127.0.0.1:11836">Invalid external URL</a>
+ <!-- Error when parsing URL -->
+ <a href="http://127.0.0.1:abc">Invalid URL port</a>
+ <!-- Fragment should be skipped and cleaned up -->
+ <a href="#goto_a">Same with href to "/"</a>
+ <a href="/page2#goto_a">Same with href to "/page2"</a>
+ </body>
+</html>
diff --git a/brokenlinks/testdata/web/page2/index.html b/brokenlinks/testdata/web/page2/index.html
new file mode 100644
index 0000000..ae6b4ea
--- /dev/null
+++ b/brokenlinks/testdata/web/page2/index.html
@@ -0,0 +1,14 @@
+<!--
+SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+SPDX-License-Identifier: GPL-3.0-only
+-->
+<html>
+ <body>
+ <img src="/broken.png" />
+ <img src="broken2.png" />
+ <a href="broken/relative">broken relative link</a>
+ <a href="/">Back with absolute path</a>
+ <a href="../">Back with relative path</a>
+ <a href="http://127.0.0.1:11900/page2">External URL page2</a>
+ </body>
+</html>
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
new file mode 100644
index 0000000..4ed56d2
--- /dev/null
+++ b/brokenlinks/worker.go
@@ -0,0 +1,467 @@
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+// SPDX-License-Identifier: GPL-3.0-only
+
+package brokenlinks
+
+import (
+ "encoding/json"
+ "errors"
+ "fmt"
+ "log"
+ "net"
+ "net/http"
+ "net/url"
+ "os"
+ "strings"
+ "sync"
+ "time"
+
+ "golang.org/x/net/html"
+ "golang.org/x/net/html/atom"
+)
+
+type worker struct {
+ // seenLink stores the URL that is being or has been scanned and its
+ // HTTP status code.
+ seenLink map[string]int
+
+ // resultq channel that collect result from scanning.
+ resultq chan map[string]linkQueue
+
+ // result contains the final result after all of the pages has been
+ // scanned.
+ result *Result
+
+ // pastResult contains the past scan result, loaded from file
+ // [Options.PastResultFile].
+ pastResult *Result
+
+ // The base URL that will be joined to relative or absolute
+ // links or image.
+ baseUrl *url.URL
+
+ // The URL to scan.
+ scanUrl *url.URL
+
+ log *log.Logger
+
+ opts Options
+
+ // wg sync the goroutine scanner.
+ wg sync.WaitGroup
+}
+
+func newWorker(opts Options) (wrk *worker, err error) {
+ wrk = &worker{
+ opts: opts,
+ seenLink: map[string]int{},
+ resultq: make(chan map[string]linkQueue, 100),
+ result: newResult(),
+ log: log.New(os.Stderr, ``, log.LstdFlags),
+ }
+
+ wrk.scanUrl, err = url.Parse(opts.Url)
+ if err != nil {
+ return nil, fmt.Errorf(`invalid URL %q`, opts.Url)
+ }
+ wrk.scanUrl.Path = strings.TrimSuffix(wrk.scanUrl.Path, `/`)
+ wrk.scanUrl.Fragment = ""
+ wrk.scanUrl.RawFragment = ""
+
+ wrk.baseUrl = &url.URL{
+ Scheme: wrk.scanUrl.Scheme,
+ Host: wrk.scanUrl.Host,
+ }
+
+ if opts.PastResultFile == "" {
+ // Run with normal scan.
+ return wrk, nil
+ }
+
+ pastresult, err := os.ReadFile(opts.PastResultFile)
+ if err != nil {
+ return nil, err
+ }
+
+ wrk.pastResult = newResult()
+ err = json.Unmarshal(pastresult, &wrk.pastResult)
+ if err != nil {
+ return nil, err
+ }
+
+ return wrk, nil
+}
+
+func (wrk *worker) run() (result *Result, err error) {
+ if wrk.pastResult == nil {
+ result, err = wrk.scanAll()
+ } else {
+ result, err = wrk.scanPastResult()
+ }
+ return result, err
+}
+
+// scanAll scan all pages start from [Options.Url].
+func (wrk *worker) scanAll() (result *Result, err error) {
+ // Scan the first URL to make sure that the server is reachable.
+ var firstLinkq = linkQueue{
+ parentUrl: nil,
+ url: wrk.scanUrl.String(),
+ status: http.StatusProcessing,
+ }
+ wrk.seenLink[firstLinkq.url] = http.StatusProcessing
+
+ wrk.wg.Add(1)
+ go wrk.scan(firstLinkq)
+ wrk.wg.Wait()
+
+ var resultq = <-wrk.resultq
+ for _, linkq := range resultq {
+ if linkq.url == firstLinkq.url {
+ if linkq.errScan != nil {
+ return nil, linkq.errScan
+ }
+ wrk.seenLink[linkq.url] = linkq.status
+ continue
+ }
+ if linkq.status >= http.StatusBadRequest {
+ wrk.markBroken(linkq)
+ continue
+ }
+
+ wrk.seenLink[linkq.url] = http.StatusProcessing
+ wrk.wg.Add(1)
+ go wrk.scan(linkq)
+ }
+
+ var tick = time.NewTicker(500 * time.Millisecond)
+ var listWaitStatus []linkQueue
+ var isScanning = true
+ for isScanning {
+ select {
+ case resultq := <-wrk.resultq:
+ listWaitStatus = wrk.processResult(resultq, listWaitStatus)
+
+ case <-tick.C:
+ wrk.wg.Wait()
+ if len(wrk.resultq) != 0 {
+ continue
+ }
+ if len(listWaitStatus) != 0 {
+ // There are links that are still waiting for
+ // scanning to complete.
+ continue
+ }
+ isScanning = false
+ }
+ }
+ wrk.result.sort()
+ return wrk.result, nil
+}
+
+// scanPastResult scan only pages reported inside
+// [Result.BrokenLinks].
+func (wrk *worker) scanPastResult() (
+ result *Result, err error,
+) {
+ go func() {
+ for page := range wrk.pastResult.BrokenLinks {
+ var linkq = linkQueue{
+ parentUrl: nil,
+ url: page,
+ status: http.StatusProcessing,
+ }
+ wrk.seenLink[linkq.url] = http.StatusProcessing
+ wrk.wg.Add(1)
+ go wrk.scan(linkq)
+ }
+ }()
+
+ var tick = time.NewTicker(500 * time.Millisecond)
+ var listWaitStatus []linkQueue
+ var isScanning = true
+ for isScanning {
+ select {
+ case resultq := <-wrk.resultq:
+ listWaitStatus = wrk.processResult(resultq, listWaitStatus)
+
+ case <-tick.C:
+ wrk.wg.Wait()
+ if len(wrk.resultq) != 0 {
+ continue
+ }
+ if len(listWaitStatus) != 0 {
+ // There are links that are still waiting for
+ // scanning to complete.
+ continue
+ }
+ isScanning = false
+ }
+ }
+ wrk.result.sort()
+ return wrk.result, nil
+}
+
+// processResult the resultq contains the original URL being scanned
+// and its child links.
+// For example, scanning "http://example.tld" result in
+//
+// "http://example.tld": {status=200}
+// "http://example.tld/page": {status=0}
+// "http://example.tld/image.png": {status=0}
+// "http://bad:domain/image.png": {status=700}
+func (wrk *worker) processResult(
+ resultq map[string]linkQueue, listWaitStatus []linkQueue,
+) (
+ newList []linkQueue,
+) {
+ for _, linkq := range resultq {
+ if linkq.status >= http.StatusBadRequest {
+ wrk.markBroken(linkq)
+ continue
+ }
+ if linkq.status != 0 {
+ // linkq is the result of scan with
+ // non error status.
+ wrk.seenLink[linkq.url] = linkq.status
+ continue
+ }
+
+ seenStatus, seen := wrk.seenLink[linkq.url]
+ if !seen {
+ wrk.seenLink[linkq.url] = http.StatusProcessing
+ wrk.wg.Add(1)
+ go wrk.scan(linkq)
+ continue
+ }
+ if seenStatus >= http.StatusBadRequest {
+ linkq.status = seenStatus
+ wrk.markBroken(linkq)
+ continue
+ }
+ if seenStatus >= http.StatusOK {
+ // The link has been processed and its
+ // not an error.
+ continue
+ }
+ // The link being processed by other goroutine.
+ linkq.status = seenStatus
+ newList = append(newList, linkq)
+ }
+ for _, linkq := range listWaitStatus {
+ seenStatus := wrk.seenLink[linkq.url]
+ if seenStatus >= http.StatusBadRequest {
+ linkq.status = seenStatus
+ wrk.markBroken(linkq)
+ continue
+ }
+ if seenStatus >= http.StatusOK {
+ continue
+ }
+ if seenStatus == http.StatusProcessing {
+ // Scanning still in progress.
+ newList = append(newList, linkq)
+ continue
+ }
+ }
+ return newList
+}
+
+func (wrk *worker) markBroken(linkq linkQueue) {
+ var parentUrl = linkq.parentUrl.String()
+ var listBroken = wrk.result.BrokenLinks[parentUrl]
+ var brokenLink = Broken{
+ Link: linkq.url,
+ Code: linkq.status,
+ }
+ if linkq.errScan != nil {
+ brokenLink.Error = linkq.errScan.Error()
+ }
+ listBroken = append(listBroken, brokenLink)
+ wrk.result.BrokenLinks[parentUrl] = listBroken
+
+ wrk.seenLink[linkq.url] = linkq.status
+}
+
+// scan fetches the HTML page or image to check if it is valid.
+func (wrk *worker) scan(linkq linkQueue) {
+ defer func() {
+ if wrk.opts.IsVerbose && linkq.errScan != nil {
+ wrk.log.Printf("error: %d %s error=%v\n", linkq.status,
+ linkq.url, linkq.errScan)
+ }
+ wrk.wg.Done()
+ }()
+
+ var (
+ resultq = map[string]linkQueue{}
+ httpResp *http.Response
+ err error
+ )
+ httpResp, err = wrk.fetch(linkq)
+ if err != nil {
+ linkq.status = StatusBadLink
+ linkq.errScan = err
+ resultq[linkq.url] = linkq
+ go wrk.pushResult(resultq)
+ return
+ }
+ defer httpResp.Body.Close()
+
+ linkq.status = httpResp.StatusCode
+ resultq[linkq.url] = linkq
+
+ if httpResp.StatusCode >= http.StatusBadRequest {
+ go wrk.pushResult(resultq)
+ return
+ }
+ if linkq.kind == atom.Img || linkq.isExternal {
+ go wrk.pushResult(resultq)
+ return
+ }
+
+ var doc *html.Node
+ doc, _ = html.Parse(httpResp.Body)
+
+ // After we check the code and test for [html.Parse] there are
+ // no actual cases where HTML content will return an error.
+ // The only possible error is when reading from body (io.Reader), and
+ // that is also almost impossible.
+ //
+ // [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347
+
+ var scanUrl *url.URL
+
+ scanUrl, err = url.Parse(linkq.url)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ var node *html.Node
+ for node = range doc.Descendants() {
+ if node.Type != html.ElementNode {
+ continue
+ }
+ var nodeLink *linkQueue
+ if node.DataAtom == atom.A {
+ for _, attr := range node.Attr {
+ if attr.Key != `href` {
+ continue
+ }
+ nodeLink = wrk.processLink(scanUrl, attr.Val, atom.A)
+ break
+ }
+ } else if node.DataAtom == atom.Img {
+ for _, attr := range node.Attr {
+ if attr.Key != `src` {
+ continue
+ }
+ nodeLink = wrk.processLink(scanUrl, attr.Val, atom.Img)
+ break
+ }
+ } else {
+ continue
+ }
+ if nodeLink == nil {
+ continue
+ }
+ _, seen := resultq[nodeLink.url]
+ if !seen {
+ nodeLink.checkExternal(wrk)
+ resultq[nodeLink.url] = *nodeLink
+ }
+ }
+ go wrk.pushResult(resultq)
+}
+
+func (wrk *worker) fetch(linkq linkQueue) (
+ httpResp *http.Response,
+ err error,
+) {
+ const maxRetry = 5
+ var retry int
+ for retry < 5 {
+ if linkq.kind == atom.Img {
+ if wrk.opts.IsVerbose {
+ wrk.log.Printf("scan: HEAD %s\n", linkq.url)
+ }
+ httpResp, err = http.Head(linkq.url)
+ } else {
+ if wrk.opts.IsVerbose {
+ wrk.log.Printf("scan: GET %s\n", linkq.url)
+ }
+ httpResp, err = http.Get(linkq.url)
+ }
+ if err == nil {
+ return httpResp, nil
+ }
+ var errDNS *net.DNSError
+ if !errors.As(err, &errDNS) {
+ return nil, err
+ }
+ if errDNS.Timeout() {
+ retry++
+ }
+ }
+ return nil, err
+}
+
+func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
+ linkq *linkQueue,
+) {
+ if len(val) == 0 {
+ return nil
+ }
+
+ var newUrl *url.URL
+ var err error
+ newUrl, err = url.Parse(val)
+ if err != nil {
+ return &linkQueue{
+ parentUrl: parentUrl,
+ errScan: err,
+ url: val,
+ kind: kind,
+ status: StatusBadLink,
+ }
+ }
+ newUrl.Fragment = ""
+ newUrl.RawFragment = ""
+
+ if kind == atom.A && val[0] == '#' {
+ // Ignore link to ID, like `href="#element_id"`.
+ return nil
+ }
+ if strings.HasPrefix(val, `http`) {
+ return &linkQueue{
+ parentUrl: parentUrl,
+ url: strings.TrimSuffix(newUrl.String(), `/`),
+ kind: kind,
+ }
+ }
+ if val[0] == '/' {
+ // val is absolute to parent URL.
+ newUrl = wrk.baseUrl.JoinPath(newUrl.Path)
+ } else {
+ // val is relative to parent URL.
+ newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
+ }
+ linkq = &linkQueue{
+ parentUrl: parentUrl,
+ url: strings.TrimSuffix(newUrl.String(), `/`),
+ kind: kind,
+ }
+ return linkq
+}
+
+func (wrk *worker) pushResult(resultq map[string]linkQueue) {
+ var tick = time.NewTicker(100 * time.Millisecond)
+ for {
+ select {
+ case wrk.resultq <- resultq:
+ tick.Stop()
+ return
+ case <-tick.C:
+ }
+ }
+}