diff options
Diffstat (limited to 'brokenlinks')
| -rw-r--r-- | brokenlinks/brokenlinks.go | 39 | ||||
| -rw-r--r-- | brokenlinks/brokenlinks_test.go | 227 | ||||
| -rw-r--r-- | brokenlinks/link_queue.go | 55 | ||||
| -rw-r--r-- | brokenlinks/result.go | 37 | ||||
| -rw-r--r-- | brokenlinks/testdata/past_result.json | 10 | ||||
| -rw-r--r-- | brokenlinks/testdata/past_result.json.license | 2 | ||||
| -rw-r--r-- | brokenlinks/testdata/web/broken.html | 7 | ||||
| -rw-r--r-- | brokenlinks/testdata/web/gopher.png | bin | 0 -> 32775 bytes | |||
| -rw-r--r-- | brokenlinks/testdata/web/index.html | 22 | ||||
| -rw-r--r-- | brokenlinks/testdata/web/page2/index.html | 14 | ||||
| -rw-r--r-- | brokenlinks/worker.go | 467 |
11 files changed, 880 insertions, 0 deletions
diff --git a/brokenlinks/brokenlinks.go b/brokenlinks/brokenlinks.go new file mode 100644 index 0000000..8ac458f --- /dev/null +++ b/brokenlinks/brokenlinks.go @@ -0,0 +1,39 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> +// SPDX-License-Identifier: GPL-3.0-only + +package brokenlinks + +import ( + "fmt" +) + +const Version = `0.1.0` + +// StatusBadLink status for link that is not parseable by [url.Parse] or not +// reachable during GET or HEAD, either timeout or IP or domain not exist. +const StatusBadLink = 700 + +// Options define the options for scanning broken links. +type Options struct { + Url string + PastResultFile string + IsVerbose bool +} + +// Scan the URL for broken links. +func Scan(opts Options) (result *Result, err error) { + var logp = `brokenlinks` + var wrk *worker + + wrk, err = newWorker(opts) + if err != nil { + return nil, fmt.Errorf(`%s: %s`, logp, err) + } + + result, err = wrk.run() + if err != nil { + return nil, fmt.Errorf(`%s: %s`, logp, err) + } + + return result, nil +} diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go new file mode 100644 index 0000000..367ae6c --- /dev/null +++ b/brokenlinks/brokenlinks_test.go @@ -0,0 +1,227 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> +// SPDX-License-Identifier: GPL-3.0-only + +package brokenlinks_test + +import ( + "encoding/json" + "log" + "net/http" + "os" + "testing" + "time" + + libnet "git.sr.ht/~shulhan/pakakeh.go/lib/net" + "git.sr.ht/~shulhan/pakakeh.go/lib/test" + + "git.sr.ht/~shulhan/jarink/brokenlinks" +) + +// The test run two web servers that serve content on "testdata/web/". +// The first web server is the one that we want to scan. +// The second web server is external web server, where HTML pages should not +// be parsed. + +const testAddress = `127.0.0.1:11836` +const testExternalAddress = `127.0.0.1:11900` + +func TestMain(m *testing.M) { + log.SetFlags(0) + var httpDirWeb = http.Dir(`testdata/web`) + var fshandle = http.FileServer(httpDirWeb) + + go func() { + var mux = http.NewServeMux() + mux.Handle(`/`, fshandle) + var testServer = &http.Server{ + Addr: testAddress, + Handler: mux, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + MaxHeaderBytes: 1 << 20, + } + var err = testServer.ListenAndServe() + if err != nil { + log.Fatal(err) + } + }() + go func() { + var mux = http.NewServeMux() + mux.Handle(`/`, fshandle) + var testServer = &http.Server{ + Addr: testExternalAddress, + Handler: mux, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + MaxHeaderBytes: 1 << 20, + } + var err = testServer.ListenAndServe() + if err != nil { + log.Fatal(err) + } + }() + + var err = libnet.WaitAlive(`tcp`, testAddress, 5*time.Second) + if err != nil { + log.Fatal(err) + } + err = libnet.WaitAlive(`tcp`, testExternalAddress, 5*time.Second) + if err != nil { + log.Fatal(err) + } + + os.Exit(m.Run()) +} + +func TestBrokenlinks(t *testing.T) { + var testUrl = `http://` + testAddress + + type testCase struct { + exp map[string][]brokenlinks.Broken + scanUrl string + expError string + } + + listCase := []testCase{{ + scanUrl: `127.0.0.1:14594`, + expError: `brokenlinks: invalid URL "127.0.0.1:14594"`, + }, { + scanUrl: `http://127.0.0.1:14594`, + expError: `brokenlinks: Get "http://127.0.0.1:14594": dial tcp 127.0.0.1:14594: connect: connection refused`, + }, { + scanUrl: testUrl, + exp: map[string][]brokenlinks.Broken{ + testUrl: []brokenlinks.Broken{ + { + Link: testUrl + `/broken.png`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/brokenPage`, + Code: http.StatusNotFound, + }, { + Link: `http://127.0.0.1:abc`, + Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`, + Code: brokenlinks.StatusBadLink, + }, { + Link: `http:/127.0.0.1:11836`, + Error: `Get "http:/127.0.0.1:11836": http: no Host in request URL`, + Code: brokenlinks.StatusBadLink, + }, + }, + testUrl + `/broken.html`: []brokenlinks.Broken{ + { + Link: testUrl + `/brokenPage`, + Code: http.StatusNotFound, + }, + }, + testUrl + `/page2`: []brokenlinks.Broken{ + { + Link: testUrl + `/broken.png`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/page2/broken/relative`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/page2/broken2.png`, + Code: http.StatusNotFound, + }, + }, + }, + }, { + // Scanning on "/path" should not scan the the "/" or other + // pages other than below of "/path" itself. + scanUrl: testUrl + `/page2`, + exp: map[string][]brokenlinks.Broken{ + testUrl + `/page2`: []brokenlinks.Broken{ + { + Link: testUrl + `/broken.png`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/page2/broken/relative`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/page2/broken2.png`, + Code: http.StatusNotFound, + }, + }, + }, + }} + + var ( + result *brokenlinks.Result + err error + ) + for _, tcase := range listCase { + t.Logf(`--- brokenlinks: %s`, tcase.scanUrl) + var opts = brokenlinks.Options{ + Url: tcase.scanUrl, + } + result, err = brokenlinks.Scan(opts) + if err != nil { + test.Assert(t, tcase.scanUrl+` error`, + tcase.expError, err.Error()) + continue + } + //got, _ := json.MarshalIndent(result.BrokenLinks, ``, ` `) + //t.Logf(`got=%s`, got) + test.Assert(t, tcase.scanUrl, tcase.exp, result.BrokenLinks) + } +} + +// Test running Brokenlinks with file PastResultFile is set. +// The PastResultFile is modified to only report errors on "/page2". +func TestBrokenlinks_pastResult(t *testing.T) { + var testUrl = `http://` + testAddress + + type testCase struct { + exp map[string][]brokenlinks.Broken + expError string + opts brokenlinks.Options + } + + listCase := []testCase{{ + // With invalid file. + opts: brokenlinks.Options{ + Url: testUrl, + PastResultFile: `testdata/invalid`, + }, + expError: `brokenlinks: open testdata/invalid: no such file or directory`, + }, { + // With valid file. + opts: brokenlinks.Options{ + Url: testUrl, + PastResultFile: `testdata/past_result.json`, + }, + exp: map[string][]brokenlinks.Broken{ + testUrl + `/page2`: []brokenlinks.Broken{ + { + Link: testUrl + `/broken.png`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/page2/broken/relative`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/page2/broken2.png`, + Code: http.StatusNotFound, + }, + }, + }, + }} + + var ( + result *brokenlinks.Result + err error + ) + for _, tcase := range listCase { + t.Logf(`--- brokenlinks: %s`, tcase.opts.Url) + result, err = brokenlinks.Scan(tcase.opts) + if err != nil { + test.Assert(t, tcase.opts.Url+` error`, + tcase.expError, err.Error()) + continue + } + got, _ := json.MarshalIndent(result.BrokenLinks, ``, ` `) + t.Logf(`got=%s`, got) + test.Assert(t, tcase.opts.Url, tcase.exp, result.BrokenLinks) + } +} diff --git a/brokenlinks/link_queue.go b/brokenlinks/link_queue.go new file mode 100644 index 0000000..164a902 --- /dev/null +++ b/brokenlinks/link_queue.go @@ -0,0 +1,55 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> +// SPDX-License-Identifier: GPL-3.0-only + +package brokenlinks + +import ( + "net/url" + "strings" + + "golang.org/x/net/html/atom" +) + +type linkQueue struct { + parentUrl *url.URL + + // The error from scan. + errScan error + + // url being scanned. + url string + + // kind of url, its either an anchor or image. + // It set to 0 if url is the first URL being scanned. + kind atom.Atom + + // isExternal if true the scan will issue HTTP method HEAD instead of + // GET. + isExternal bool + + // Status of link after scan, its mostly used the HTTP status code. + // 0: link is the result of scan, not processed yet. + // StatusBadLink: link is invalid, not parseable or unreachable. + // 200 - 211: OK. + // 400 - 511: Error. + status int +} + +// checkExternal set the isExternal field to be true if +// +// (1) [linkQueue.url] does not start with [worker.scanUrl] +// +// (2) linkQueue is from scanPastResult, indicated by non-nil +// [worker.pastResult]. +// In this case, we did not want to scan the other pages from the same scanUrl +// domain. +func (linkq *linkQueue) checkExternal(wrk *worker) { + if !strings.HasPrefix(linkq.url, wrk.scanUrl.String()) { + linkq.isExternal = true + return + } + if wrk.pastResult != nil { + linkq.isExternal = true + return + } +} diff --git a/brokenlinks/result.go b/brokenlinks/result.go new file mode 100644 index 0000000..676859b --- /dev/null +++ b/brokenlinks/result.go @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> +// SPDX-License-Identifier: GPL-3.0-only + +package brokenlinks + +import ( + "slices" + "strings" +) + +// Broken store the broken link, HTTP status code, and the error message that +// cause it. +type Broken struct { + Link string `json:"link"` + Error string `json:"error,omitempty"` + Code int `json:"code"` +} + +// Result store the result of scanning for broken links. +type Result struct { + // BrokenLinks store the page and its broken links. + BrokenLinks map[string][]Broken `json:"broken_links"` +} + +func newResult() *Result { + return &Result{ + BrokenLinks: map[string][]Broken{}, + } +} + +func (result *Result) sort() { + for _, listBroken := range result.BrokenLinks { + slices.SortFunc(listBroken, func(a, b Broken) int { + return strings.Compare(a.Link, b.Link) + }) + } +} diff --git a/brokenlinks/testdata/past_result.json b/brokenlinks/testdata/past_result.json new file mode 100644 index 0000000..ca29d35 --- /dev/null +++ b/brokenlinks/testdata/past_result.json @@ -0,0 +1,10 @@ +{ + "broken_links": { + "http://127.0.0.1:11836/page2": [ + { + "link": "http://127.0.0.1:11836/", + "code": 404 + } + ] + } +} diff --git a/brokenlinks/testdata/past_result.json.license b/brokenlinks/testdata/past_result.json.license new file mode 100644 index 0000000..22616a9 --- /dev/null +++ b/brokenlinks/testdata/past_result.json.license @@ -0,0 +1,2 @@ +SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> +SPDX-License-Identifier: GPL-3.0-only diff --git a/brokenlinks/testdata/web/broken.html b/brokenlinks/testdata/web/broken.html new file mode 100644 index 0000000..533e542 --- /dev/null +++ b/brokenlinks/testdata/web/broken.html @@ -0,0 +1,7 @@ +<html> + <head></head> + <body> + <a href="/brokenPage" + <p> + </body> +</html> diff --git a/brokenlinks/testdata/web/gopher.png b/brokenlinks/testdata/web/gopher.png Binary files differnew file mode 100644 index 0000000..79352be --- /dev/null +++ b/brokenlinks/testdata/web/gopher.png diff --git a/brokenlinks/testdata/web/index.html b/brokenlinks/testdata/web/index.html new file mode 100644 index 0000000..61a1f39 --- /dev/null +++ b/brokenlinks/testdata/web/index.html @@ -0,0 +1,22 @@ +<!-- +SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> +SPDX-License-Identifier: GPL-3.0-only +--> +<html> + <body> + <img src="/broken.png" /> + <a href="/brokenPage">Broken page</a> + <img src="/gopher.png" /> + <img width="200" src="" /> + <a href="/page2">Page 2</a> + <a href="/broken.html">Broken HTML</a> + <a href="http://127.0.0.1:11900">External URL</a> + <!-- Error when fetching with GET --> + <a href="http:/127.0.0.1:11836">Invalid external URL</a> + <!-- Error when parsing URL --> + <a href="http://127.0.0.1:abc">Invalid URL port</a> + <!-- Fragment should be skipped and cleaned up --> + <a href="#goto_a">Same with href to "/"</a> + <a href="/page2#goto_a">Same with href to "/page2"</a> + </body> +</html> diff --git a/brokenlinks/testdata/web/page2/index.html b/brokenlinks/testdata/web/page2/index.html new file mode 100644 index 0000000..ae6b4ea --- /dev/null +++ b/brokenlinks/testdata/web/page2/index.html @@ -0,0 +1,14 @@ +<!-- +SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> +SPDX-License-Identifier: GPL-3.0-only +--> +<html> + <body> + <img src="/broken.png" /> + <img src="broken2.png" /> + <a href="broken/relative">broken relative link</a> + <a href="/">Back with absolute path</a> + <a href="../">Back with relative path</a> + <a href="http://127.0.0.1:11900/page2">External URL page2</a> + </body> +</html> diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go new file mode 100644 index 0000000..4ed56d2 --- /dev/null +++ b/brokenlinks/worker.go @@ -0,0 +1,467 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> +// SPDX-License-Identifier: GPL-3.0-only + +package brokenlinks + +import ( + "encoding/json" + "errors" + "fmt" + "log" + "net" + "net/http" + "net/url" + "os" + "strings" + "sync" + "time" + + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +type worker struct { + // seenLink store the URL being or has been scanned and its HTTP + // status code. + seenLink map[string]int + + // resultq channel that collect result from scanning. + resultq chan map[string]linkQueue + + // result contains the final result after all of the pages has been + // scanned. + result *Result + + // pastResult containts the past scan result, loaded from file + // [Options.PastResultFile]. + pastResult *Result + + // The base URL that will be joined to relative or absolute + // links or image. + baseUrl *url.URL + + // The URL to scan. + scanUrl *url.URL + + log *log.Logger + + opts Options + + // wg sync the goroutine scanner. + wg sync.WaitGroup +} + +func newWorker(opts Options) (wrk *worker, err error) { + wrk = &worker{ + opts: opts, + seenLink: map[string]int{}, + resultq: make(chan map[string]linkQueue, 100), + result: newResult(), + log: log.New(os.Stderr, ``, log.LstdFlags), + } + + wrk.scanUrl, err = url.Parse(opts.Url) + if err != nil { + return nil, fmt.Errorf(`invalid URL %q`, opts.Url) + } + wrk.scanUrl.Path = strings.TrimSuffix(wrk.scanUrl.Path, `/`) + wrk.scanUrl.Fragment = "" + wrk.scanUrl.RawFragment = "" + + wrk.baseUrl = &url.URL{ + Scheme: wrk.scanUrl.Scheme, + Host: wrk.scanUrl.Host, + } + + if opts.PastResultFile == "" { + // Run with normal scan. + return wrk, nil + } + + pastresult, err := os.ReadFile(opts.PastResultFile) + if err != nil { + return nil, err + } + + wrk.pastResult = newResult() + err = json.Unmarshal(pastresult, &wrk.pastResult) + if err != nil { + return nil, err + } + + return wrk, nil +} + +func (wrk *worker) run() (result *Result, err error) { + if wrk.pastResult == nil { + result, err = wrk.scanAll() + } else { + result, err = wrk.scanPastResult() + } + return result, err +} + +// scanAll scan all pages start from [Options.Url]. +func (wrk *worker) scanAll() (result *Result, err error) { + // Scan the first URL to make sure that the server is reachable. + var firstLinkq = linkQueue{ + parentUrl: nil, + url: wrk.scanUrl.String(), + status: http.StatusProcessing, + } + wrk.seenLink[firstLinkq.url] = http.StatusProcessing + + wrk.wg.Add(1) + go wrk.scan(firstLinkq) + wrk.wg.Wait() + + var resultq = <-wrk.resultq + for _, linkq := range resultq { + if linkq.url == firstLinkq.url { + if linkq.errScan != nil { + return nil, linkq.errScan + } + wrk.seenLink[linkq.url] = linkq.status + continue + } + if linkq.status >= http.StatusBadRequest { + wrk.markBroken(linkq) + continue + } + + wrk.seenLink[linkq.url] = http.StatusProcessing + wrk.wg.Add(1) + go wrk.scan(linkq) + } + + var tick = time.NewTicker(500 * time.Millisecond) + var listWaitStatus []linkQueue + var isScanning = true + for isScanning { + select { + case resultq := <-wrk.resultq: + listWaitStatus = wrk.processResult(resultq, listWaitStatus) + + case <-tick.C: + wrk.wg.Wait() + if len(wrk.resultq) != 0 { + continue + } + if len(listWaitStatus) != 0 { + // There are links that still waiting for + // scanning to be completed. + continue + } + isScanning = false + } + } + wrk.result.sort() + return wrk.result, nil +} + +// scanPastResult scan only pages reported inside +// [Result.BrokenLinks]. +func (wrk *worker) scanPastResult() ( + result *Result, err error, +) { + go func() { + for page := range wrk.pastResult.BrokenLinks { + var linkq = linkQueue{ + parentUrl: nil, + url: page, + status: http.StatusProcessing, + } + wrk.seenLink[linkq.url] = http.StatusProcessing + wrk.wg.Add(1) + go wrk.scan(linkq) + } + }() + + var tick = time.NewTicker(500 * time.Millisecond) + var listWaitStatus []linkQueue + var isScanning = true + for isScanning { + select { + case resultq := <-wrk.resultq: + listWaitStatus = wrk.processResult(resultq, listWaitStatus) + + case <-tick.C: + wrk.wg.Wait() + if len(wrk.resultq) != 0 { + continue + } + if len(listWaitStatus) != 0 { + // There are links that still waiting for + // scanning to be completed. + continue + } + isScanning = false + } + } + wrk.result.sort() + return wrk.result, nil +} + +// processResult the resultq contains the original URL being scanned +// and its child links. +// For example, scanning "http://example.tld" result in +// +// "http://example.tld": {status=200} +// "http://example.tld/page": {status=0} +// "http://example.tld/image.png": {status=0} +// "http://bad:domain/image.png": {status=700} +func (wrk *worker) processResult( + resultq map[string]linkQueue, listWaitStatus []linkQueue, +) ( + newList []linkQueue, +) { + for _, linkq := range resultq { + if linkq.status >= http.StatusBadRequest { + wrk.markBroken(linkq) + continue + } + if linkq.status != 0 { + // linkq is the result of scan with + // non error status. + wrk.seenLink[linkq.url] = linkq.status + continue + } + + seenStatus, seen := wrk.seenLink[linkq.url] + if !seen { + wrk.seenLink[linkq.url] = http.StatusProcessing + wrk.wg.Add(1) + go wrk.scan(linkq) + continue + } + if seenStatus >= http.StatusBadRequest { + linkq.status = seenStatus + wrk.markBroken(linkq) + continue + } + if seenStatus >= http.StatusOK { + // The link has been processed and its + // not an error. + continue + } + // The link being processed by other goroutine. + linkq.status = seenStatus + newList = append(newList, linkq) + } + for _, linkq := range listWaitStatus { + seenStatus := wrk.seenLink[linkq.url] + if seenStatus >= http.StatusBadRequest { + linkq.status = seenStatus + wrk.markBroken(linkq) + continue + } + if seenStatus >= http.StatusOK { + continue + } + if seenStatus == http.StatusProcessing { + // Scanning still in progress. + newList = append(newList, linkq) + continue + } + } + return newList +} + +func (wrk *worker) markBroken(linkq linkQueue) { + var parentUrl = linkq.parentUrl.String() + var listBroken = wrk.result.BrokenLinks[parentUrl] + var brokenLink = Broken{ + Link: linkq.url, + Code: linkq.status, + } + if linkq.errScan != nil { + brokenLink.Error = linkq.errScan.Error() + } + listBroken = append(listBroken, brokenLink) + wrk.result.BrokenLinks[parentUrl] = listBroken + + wrk.seenLink[linkq.url] = linkq.status +} + +// scan fetch the HTML page or image to check if its valid. +func (wrk *worker) scan(linkq linkQueue) { + defer func() { + if wrk.opts.IsVerbose && linkq.errScan != nil { + wrk.log.Printf("error: %d %s error=%v\n", linkq.status, + linkq.url, linkq.errScan) + } + wrk.wg.Done() + }() + + var ( + resultq = map[string]linkQueue{} + httpResp *http.Response + err error + ) + httpResp, err = wrk.fetch(linkq) + if err != nil { + linkq.status = StatusBadLink + linkq.errScan = err + resultq[linkq.url] = linkq + go wrk.pushResult(resultq) + return + } + defer httpResp.Body.Close() + + linkq.status = httpResp.StatusCode + resultq[linkq.url] = linkq + + if httpResp.StatusCode >= http.StatusBadRequest { + go wrk.pushResult(resultq) + return + } + if linkq.kind == atom.Img || linkq.isExternal { + go wrk.pushResult(resultq) + return + } + + var doc *html.Node + doc, _ = html.Parse(httpResp.Body) + + // After we check the code and test for [html.Parse] there are + // no case actual cases where HTML content will return an error. + // The only possible error is when reading from body (io.Reader), and + // that is also almost impossible. + // + // [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347 + + var scanUrl *url.URL + + scanUrl, err = url.Parse(linkq.url) + if err != nil { + log.Fatal(err) + } + + var node *html.Node + for node = range doc.Descendants() { + if node.Type != html.ElementNode { + continue + } + var nodeLink *linkQueue + if node.DataAtom == atom.A { + for _, attr := range node.Attr { + if attr.Key != `href` { + continue + } + nodeLink = wrk.processLink(scanUrl, attr.Val, atom.A) + break + } + } else if node.DataAtom == atom.Img { + for _, attr := range node.Attr { + if attr.Key != `src` { + continue + } + nodeLink = wrk.processLink(scanUrl, attr.Val, atom.Img) + break + } + } else { + continue + } + if nodeLink == nil { + continue + } + _, seen := resultq[nodeLink.url] + if !seen { + nodeLink.checkExternal(wrk) + resultq[nodeLink.url] = *nodeLink + } + } + go wrk.pushResult(resultq) +} + +func (wrk *worker) fetch(linkq linkQueue) ( + httpResp *http.Response, + err error, +) { + const maxRetry = 5 + var retry int + for retry < 5 { + if linkq.kind == atom.Img { + if wrk.opts.IsVerbose { + wrk.log.Printf("scan: HEAD %s\n", linkq.url) + } + httpResp, err = http.Head(linkq.url) + } else { + if wrk.opts.IsVerbose { + wrk.log.Printf("scan: GET %s\n", linkq.url) + } + httpResp, err = http.Get(linkq.url) + } + if err == nil { + return httpResp, nil + } + var errDNS *net.DNSError + if !errors.As(err, &errDNS) { + return nil, err + } + if errDNS.Timeout() { + retry++ + } + } + return nil, err +} + +func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) ( + linkq *linkQueue, +) { + if len(val) == 0 { + return nil + } + + var newUrl *url.URL + var err error + newUrl, err = url.Parse(val) + if err != nil { + return &linkQueue{ + parentUrl: parentUrl, + errScan: err, + url: val, + kind: kind, + status: StatusBadLink, + } + } + newUrl.Fragment = "" + newUrl.RawFragment = "" + + if kind == atom.A && val[0] == '#' { + // Ignore link to ID, like `href="#element_id"`. + return nil + } + if strings.HasPrefix(val, `http`) { + return &linkQueue{ + parentUrl: parentUrl, + url: strings.TrimSuffix(newUrl.String(), `/`), + kind: kind, + } + } + if val[0] == '/' { + // val is absolute to parent URL. + newUrl = wrk.baseUrl.JoinPath(newUrl.Path) + } else { + // val is relative to parent URL. + newUrl = parentUrl.JoinPath(`/`, newUrl.Path) + } + linkq = &linkQueue{ + parentUrl: parentUrl, + url: strings.TrimSuffix(newUrl.String(), `/`), + kind: kind, + } + return linkq +} + +func (wrk *worker) pushResult(resultq map[string]linkQueue) { + var tick = time.NewTicker(100 * time.Millisecond) + for { + select { + case wrk.resultq <- resultq: + tick.Stop() + return + case <-tick.C: + } + } +} |
