aboutsummaryrefslogtreecommitdiff
path: root/brokenlinks_worker.go
diff options
context:
space:
mode:
Diffstat (limited to 'brokenlinks_worker.go')
-rw-r--r--brokenlinks_worker.go369
1 files changed, 369 insertions, 0 deletions
diff --git a/brokenlinks_worker.go b/brokenlinks_worker.go
new file mode 100644
index 0000000..03359b7
--- /dev/null
+++ b/brokenlinks_worker.go
@@ -0,0 +1,369 @@
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+// SPDX-License-Identifier: GPL-3.0-only
+
+package jarink
+
+import (
+ "fmt"
+ "log"
+ "net/http"
+ "net/url"
+ "strings"
+ "sync"
+ "time"
+
+ "golang.org/x/net/html"
+ "golang.org/x/net/html/atom"
+)
+
// brokenlinksWorker crawls a site starting from a single URL and
// collects links (anchors and images) that respond with an error
// status.  One worker handles one scan; goroutines started by run
// are synchronized through wg and communicate through resultq.
type brokenlinksWorker struct {
	// seenLink store the URL being or has been scanned and its HTTP
	// status code.  A value of http.StatusProcessing marks a URL
	// whose scan is still in flight.
	seenLink map[string]int

	// resultq channel that collect result from scanning.  Each
	// message maps a URL to its linkQueue state: the scanned page
	// itself plus every link found on it.
	resultq chan map[string]linkQueue

	// result contains the final result after all of the pages has been
	// scanned.
	result *BrokenlinksResult

	// The base URL (scheme and host only) that will be joined to
	// relative or absolute links or image sources.
	baseUrl *url.URL

	// The URL to scan, normalized: no trailing slash, no fragment.
	scanUrl *url.URL

	// opts carries the user-supplied options (URL, verbosity).
	opts BrokenlinksOptions

	// wg sync the goroutine scanner.
	wg sync.WaitGroup
}
+
+func newWorker(opts BrokenlinksOptions) (wrk *brokenlinksWorker, err error) {
+ wrk = &brokenlinksWorker{
+ opts: opts,
+ seenLink: map[string]int{},
+ resultq: make(chan map[string]linkQueue, 100),
+ result: newBrokenlinksResult(),
+ }
+
+ wrk.scanUrl, err = url.Parse(opts.Url)
+ if err != nil {
+ return nil, fmt.Errorf(`invalid URL %q`, opts.Url)
+ }
+
+ wrk.scanUrl.Path = strings.TrimSuffix(wrk.scanUrl.Path, `/`)
+ wrk.scanUrl.Fragment = ""
+ wrk.scanUrl.RawFragment = ""
+
+ wrk.baseUrl = &url.URL{
+ Scheme: wrk.scanUrl.Scheme,
+ Host: wrk.scanUrl.Host,
+ }
+
+ return wrk, nil
+}
+
+func (wrk *brokenlinksWorker) run() (result *BrokenlinksResult, err error) {
+ // Scan the first URL to make sure that the server is reachable.
+ var firstLinkq = linkQueue{
+ parentUrl: nil,
+ url: wrk.scanUrl.String(),
+ status: http.StatusProcessing,
+ }
+ wrk.seenLink[firstLinkq.url] = http.StatusProcessing
+
+ wrk.wg.Add(1)
+ go wrk.scan(firstLinkq)
+ wrk.wg.Wait()
+
+ var resultq = <-wrk.resultq
+ for _, linkq := range resultq {
+ if linkq.url == firstLinkq.url {
+ if linkq.errScan != nil {
+ return nil, linkq.errScan
+ }
+ wrk.seenLink[linkq.url] = linkq.status
+ continue
+ }
+ if linkq.status >= http.StatusBadRequest {
+ wrk.markBroken(linkq)
+ continue
+ }
+
+ wrk.seenLink[linkq.url] = http.StatusProcessing
+ wrk.wg.Add(1)
+ go wrk.scan(linkq)
+ }
+
+ var tick = time.NewTicker(500 * time.Millisecond)
+ var listWaitStatus []linkQueue
+ var isScanning = true
+ for isScanning {
+ select {
+ case resultq := <-wrk.resultq:
+
+ // The resultq contains the original URL being scanned
+ // and its child links.
+ // For example, scanning "http://example.tld" result
+ // in
+ //
+ // "http://example.tld": {status=200}
+ // "http://example.tld/page": {status=0}
+ // "http://example.tld/image.png": {status=0}
+ // "http://bad:domain/image.png": {status=700}
+
+ var newList []linkQueue
+ for _, linkq := range resultq {
+ if linkq.status >= http.StatusBadRequest {
+ wrk.markBroken(linkq)
+ continue
+ }
+ if linkq.status != 0 {
+ // linkq is the result of scan with
+ // non error status.
+ wrk.seenLink[linkq.url] = linkq.status
+ continue
+ }
+
+ seenStatus, seen := wrk.seenLink[linkq.url]
+ if !seen {
+ wrk.seenLink[linkq.url] = http.StatusProcessing
+ wrk.wg.Add(1)
+ go wrk.scan(linkq)
+ continue
+ }
+ if seenStatus >= http.StatusBadRequest {
+ linkq.status = seenStatus
+ wrk.markBroken(linkq)
+ continue
+ }
+ if seenStatus >= http.StatusOK {
+ // The link has been processed and its
+ // not an error.
+ continue
+ }
+ if seenStatus == http.StatusProcessing {
+ // The link being processed by other
+ // goroutine.
+ linkq.status = seenStatus
+ newList = append(newList, linkq)
+ continue
+ }
+ log.Fatalf("link=%s status=%d", linkq.url, linkq.status)
+ }
+ for _, linkq := range listWaitStatus {
+ seenStatus := wrk.seenLink[linkq.url]
+ if seenStatus >= http.StatusBadRequest {
+ linkq.status = seenStatus
+ wrk.markBroken(linkq)
+ continue
+ }
+ if seenStatus >= http.StatusOK {
+ continue
+ }
+ if seenStatus == http.StatusProcessing {
+ // Scanning still in progress.
+ newList = append(newList, linkq)
+ continue
+ }
+ }
+ listWaitStatus = newList
+
+ case <-tick.C:
+ wrk.wg.Wait()
+ if len(wrk.resultq) != 0 {
+ continue
+ }
+ if len(listWaitStatus) != 0 {
+ // There are links that still waiting for
+ // scanning to be completed.
+ continue
+ }
+ isScanning = false
+ }
+ }
+ wrk.result.sort()
+ return wrk.result, nil
+}
+
+func (wrk *brokenlinksWorker) markBroken(linkq linkQueue) {
+ var parentUrl = linkq.parentUrl.String()
+ var listBroken = wrk.result.PageLinks[parentUrl]
+ var brokenLink = Broken{
+ Link: linkq.url,
+ Code: linkq.status,
+ }
+ if linkq.errScan != nil {
+ brokenLink.Error = linkq.errScan.Error()
+ }
+ listBroken = append(listBroken, brokenLink)
+ wrk.result.PageLinks[parentUrl] = listBroken
+
+ wrk.seenLink[linkq.url] = linkq.status
+}
+
+// scan fetch the HTML page or image to check if its valid.
+func (wrk *brokenlinksWorker) scan(linkq linkQueue) {
+ defer func() {
+ if wrk.opts.IsVerbose && linkq.errScan != nil {
+ fmt.Printf("error: %d %s error=%v\n", linkq.status,
+ linkq.url, linkq.errScan)
+ }
+ wrk.wg.Done()
+ }()
+
+ var (
+ resultq = map[string]linkQueue{}
+ httpResp *http.Response
+ err error
+ )
+ if linkq.kind == atom.Img || linkq.isExternal {
+ if wrk.opts.IsVerbose {
+ fmt.Printf("scan: HEAD %s\n", linkq.url)
+ }
+ httpResp, err = http.Head(linkq.url)
+ } else {
+ if wrk.opts.IsVerbose {
+ fmt.Printf("scan: GET %s\n", linkq.url)
+ }
+ httpResp, err = http.Get(linkq.url)
+ }
+ if err != nil {
+ linkq.status = StatusBadLink
+ linkq.errScan = err
+ resultq[linkq.url] = linkq
+ go wrk.pushResult(resultq)
+ return
+ }
+ defer httpResp.Body.Close()
+
+ linkq.status = httpResp.StatusCode
+ resultq[linkq.url] = linkq
+
+ if httpResp.StatusCode >= http.StatusBadRequest {
+ go wrk.pushResult(resultq)
+ return
+ }
+ if linkq.kind == atom.Img || linkq.isExternal {
+ go wrk.pushResult(resultq)
+ return
+ }
+
+ var doc *html.Node
+ doc, _ = html.Parse(httpResp.Body)
+
+ // After we check the code and test for [html.Parse] there are
+ // no case actual cases where HTML content will return an error.
+ // The only possible error is when reading from body (io.Reader), and
+ // that is also almost impossible.
+ //
+ // [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347
+
+ var scanUrl *url.URL
+
+ scanUrl, err = url.Parse(linkq.url)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ var node *html.Node
+ for node = range doc.Descendants() {
+ if node.Type != html.ElementNode {
+ continue
+ }
+ var nodeLink *linkQueue
+ if node.DataAtom == atom.A {
+ for _, attr := range node.Attr {
+ if attr.Key != `href` {
+ continue
+ }
+ nodeLink = wrk.processLink(scanUrl, attr.Val, atom.A)
+ break
+ }
+ } else if node.DataAtom == atom.Img {
+ for _, attr := range node.Attr {
+ if attr.Key != `src` {
+ continue
+ }
+ nodeLink = wrk.processLink(scanUrl, attr.Val, atom.Img)
+ break
+ }
+ } else {
+ continue
+ }
+ if nodeLink == nil {
+ continue
+ }
+ _, seen := resultq[nodeLink.url]
+ if !seen {
+ if !strings.HasPrefix(nodeLink.url, wrk.baseUrl.String()) {
+ nodeLink.isExternal = true
+ }
+ resultq[nodeLink.url] = *nodeLink
+ }
+ }
+ go wrk.pushResult(resultq)
+}
+
+func (wrk *brokenlinksWorker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
+ linkq *linkQueue,
+) {
+ if len(val) == 0 {
+ return nil
+ }
+
+ var newUrl *url.URL
+ var err error
+ newUrl, err = url.Parse(val)
+ if err != nil {
+ return &linkQueue{
+ parentUrl: parentUrl,
+ errScan: err,
+ url: val,
+ kind: kind,
+ status: StatusBadLink,
+ }
+ }
+ newUrl.Fragment = ""
+ newUrl.RawFragment = ""
+
+ if kind == atom.A && val[0] == '#' {
+ // Ignore link to ID, like `href="#element_id"`.
+ return nil
+ }
+ if strings.HasPrefix(val, `http`) {
+ return &linkQueue{
+ parentUrl: parentUrl,
+ url: strings.TrimSuffix(newUrl.String(), `/`),
+ kind: kind,
+ }
+ }
+ if val[0] == '/' {
+ // val is absolute to parent URL.
+ newUrl = wrk.baseUrl.JoinPath(newUrl.Path)
+ } else {
+ // val is relative to parent URL.
+ newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
+ }
+ linkq = &linkQueue{
+ parentUrl: parentUrl,
+ url: strings.TrimSuffix(newUrl.String(), `/`),
+ kind: kind,
+ }
+ return linkq
+}
+
+func (wrk *brokenlinksWorker) pushResult(resultq map[string]linkQueue) {
+ var tick = time.NewTicker(100 * time.Millisecond)
+ for {
+ select {
+ case wrk.resultq <- resultq:
+ tick.Stop()
+ return
+ case <-tick.C:
+ }
+ }
+}