diff options
Diffstat (limited to 'brokenlinks_worker.go')
| -rw-r--r-- | brokenlinks_worker.go | 369 |
1 files changed, 369 insertions, 0 deletions
diff --git a/brokenlinks_worker.go b/brokenlinks_worker.go new file mode 100644 index 0000000..03359b7 --- /dev/null +++ b/brokenlinks_worker.go @@ -0,0 +1,369 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> +// SPDX-License-Identifier: GPL-3.0-only + +package jarink + +import ( + "fmt" + "log" + "net/http" + "net/url" + "strings" + "sync" + "time" + + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +type brokenlinksWorker struct { + // seenLink store the URL being or has been scanned and its HTTP + // status code. + seenLink map[string]int + + // resultq channel that collect result from scanning. + resultq chan map[string]linkQueue + + // result contains the final result after all of the pages has been + // scanned. + result *BrokenlinksResult + + // The base URL that will be joined to relative or absolute + // links or image. + baseUrl *url.URL + + // The URL to scan. + scanUrl *url.URL + + opts BrokenlinksOptions + + // wg sync the goroutine scanner. + wg sync.WaitGroup +} + +func newWorker(opts BrokenlinksOptions) (wrk *brokenlinksWorker, err error) { + wrk = &brokenlinksWorker{ + opts: opts, + seenLink: map[string]int{}, + resultq: make(chan map[string]linkQueue, 100), + result: newBrokenlinksResult(), + } + + wrk.scanUrl, err = url.Parse(opts.Url) + if err != nil { + return nil, fmt.Errorf(`invalid URL %q`, opts.Url) + } + + wrk.scanUrl.Path = strings.TrimSuffix(wrk.scanUrl.Path, `/`) + wrk.scanUrl.Fragment = "" + wrk.scanUrl.RawFragment = "" + + wrk.baseUrl = &url.URL{ + Scheme: wrk.scanUrl.Scheme, + Host: wrk.scanUrl.Host, + } + + return wrk, nil +} + +func (wrk *brokenlinksWorker) run() (result *BrokenlinksResult, err error) { + // Scan the first URL to make sure that the server is reachable. + var firstLinkq = linkQueue{ + parentUrl: nil, + url: wrk.scanUrl.String(), + status: http.StatusProcessing, + } + wrk.seenLink[firstLinkq.url] = http.StatusProcessing + + wrk.wg.Add(1) + go wrk.scan(firstLinkq) + wrk.wg.Wait() + + var resultq = <-wrk.resultq + for _, linkq := range resultq { + if linkq.url == firstLinkq.url { + if linkq.errScan != nil { + return nil, linkq.errScan + } + wrk.seenLink[linkq.url] = linkq.status + continue + } + if linkq.status >= http.StatusBadRequest { + wrk.markBroken(linkq) + continue + } + + wrk.seenLink[linkq.url] = http.StatusProcessing + wrk.wg.Add(1) + go wrk.scan(linkq) + } + + var tick = time.NewTicker(500 * time.Millisecond) + var listWaitStatus []linkQueue + var isScanning = true + for isScanning { + select { + case resultq := <-wrk.resultq: + + // The resultq contains the original URL being scanned + // and its child links. + // For example, scanning "http://example.tld" result + // in + // + // "http://example.tld": {status=200} + // "http://example.tld/page": {status=0} + // "http://example.tld/image.png": {status=0} + // "http://bad:domain/image.png": {status=700} + + var newList []linkQueue + for _, linkq := range resultq { + if linkq.status >= http.StatusBadRequest { + wrk.markBroken(linkq) + continue + } + if linkq.status != 0 { + // linkq is the result of scan with + // non error status. + wrk.seenLink[linkq.url] = linkq.status + continue + } + + seenStatus, seen := wrk.seenLink[linkq.url] + if !seen { + wrk.seenLink[linkq.url] = http.StatusProcessing + wrk.wg.Add(1) + go wrk.scan(linkq) + continue + } + if seenStatus >= http.StatusBadRequest { + linkq.status = seenStatus + wrk.markBroken(linkq) + continue + } + if seenStatus >= http.StatusOK { + // The link has been processed and its + // not an error. + continue + } + if seenStatus == http.StatusProcessing { + // The link being processed by other + // goroutine. + linkq.status = seenStatus + newList = append(newList, linkq) + continue + } + log.Fatalf("link=%s status=%d", linkq.url, linkq.status) + } + for _, linkq := range listWaitStatus { + seenStatus := wrk.seenLink[linkq.url] + if seenStatus >= http.StatusBadRequest { + linkq.status = seenStatus + wrk.markBroken(linkq) + continue + } + if seenStatus >= http.StatusOK { + continue + } + if seenStatus == http.StatusProcessing { + // Scanning still in progress. + newList = append(newList, linkq) + continue + } + } + listWaitStatus = newList + + case <-tick.C: + wrk.wg.Wait() + if len(wrk.resultq) != 0 { + continue + } + if len(listWaitStatus) != 0 { + // There are links that still waiting for + // scanning to be completed. + continue + } + isScanning = false + } + } + wrk.result.sort() + return wrk.result, nil +} + +func (wrk *brokenlinksWorker) markBroken(linkq linkQueue) { + var parentUrl = linkq.parentUrl.String() + var listBroken = wrk.result.PageLinks[parentUrl] + var brokenLink = Broken{ + Link: linkq.url, + Code: linkq.status, + } + if linkq.errScan != nil { + brokenLink.Error = linkq.errScan.Error() + } + listBroken = append(listBroken, brokenLink) + wrk.result.PageLinks[parentUrl] = listBroken + + wrk.seenLink[linkq.url] = linkq.status +} + +// scan fetch the HTML page or image to check if its valid. +func (wrk *brokenlinksWorker) scan(linkq linkQueue) { + defer func() { + if wrk.opts.IsVerbose && linkq.errScan != nil { + fmt.Printf("error: %d %s error=%v\n", linkq.status, + linkq.url, linkq.errScan) + } + wrk.wg.Done() + }() + + var ( + resultq = map[string]linkQueue{} + httpResp *http.Response + err error + ) + if linkq.kind == atom.Img || linkq.isExternal { + if wrk.opts.IsVerbose { + fmt.Printf("scan: HEAD %s\n", linkq.url) + } + httpResp, err = http.Head(linkq.url) + } else { + if wrk.opts.IsVerbose { + fmt.Printf("scan: GET %s\n", linkq.url) + } + httpResp, err = http.Get(linkq.url) + } + if err != nil { + linkq.status = StatusBadLink + linkq.errScan = err + resultq[linkq.url] = linkq + go wrk.pushResult(resultq) + return + } + defer httpResp.Body.Close() + + linkq.status = httpResp.StatusCode + resultq[linkq.url] = linkq + + if httpResp.StatusCode >= http.StatusBadRequest { + go wrk.pushResult(resultq) + return + } + if linkq.kind == atom.Img || linkq.isExternal { + go wrk.pushResult(resultq) + return + } + + var doc *html.Node + doc, _ = html.Parse(httpResp.Body) + + // After we check the code and test for [html.Parse] there are + // no case actual cases where HTML content will return an error. + // The only possible error is when reading from body (io.Reader), and + // that is also almost impossible. + // + // [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347 + + var scanUrl *url.URL + + scanUrl, err = url.Parse(linkq.url) + if err != nil { + log.Fatal(err) + } + + var node *html.Node + for node = range doc.Descendants() { + if node.Type != html.ElementNode { + continue + } + var nodeLink *linkQueue + if node.DataAtom == atom.A { + for _, attr := range node.Attr { + if attr.Key != `href` { + continue + } + nodeLink = wrk.processLink(scanUrl, attr.Val, atom.A) + break + } + } else if node.DataAtom == atom.Img { + for _, attr := range node.Attr { + if attr.Key != `src` { + continue + } + nodeLink = wrk.processLink(scanUrl, attr.Val, atom.Img) + break + } + } else { + continue + } + if nodeLink == nil { + continue + } + _, seen := resultq[nodeLink.url] + if !seen { + if !strings.HasPrefix(nodeLink.url, wrk.baseUrl.String()) { + nodeLink.isExternal = true + } + resultq[nodeLink.url] = *nodeLink + } + } + go wrk.pushResult(resultq) +} + +func (wrk *brokenlinksWorker) processLink(parentUrl *url.URL, val string, kind atom.Atom) ( + linkq *linkQueue, +) { + if len(val) == 0 { + return nil + } + + var newUrl *url.URL + var err error + newUrl, err = url.Parse(val) + if err != nil { + return &linkQueue{ + parentUrl: parentUrl, + errScan: err, + url: val, + kind: kind, + status: StatusBadLink, + } + } + newUrl.Fragment = "" + newUrl.RawFragment = "" + + if kind == atom.A && val[0] == '#' { + // Ignore link to ID, like `href="#element_id"`. + return nil + } + if strings.HasPrefix(val, `http`) { + return &linkQueue{ + parentUrl: parentUrl, + url: strings.TrimSuffix(newUrl.String(), `/`), + kind: kind, + } + } + if val[0] == '/' { + // val is absolute to parent URL. + newUrl = wrk.baseUrl.JoinPath(newUrl.Path) + } else { + // val is relative to parent URL. + newUrl = parentUrl.JoinPath(`/`, newUrl.Path) + } + linkq = &linkQueue{ + parentUrl: parentUrl, + url: strings.TrimSuffix(newUrl.String(), `/`), + kind: kind, + } + return linkq +} + +func (wrk *brokenlinksWorker) pushResult(resultq map[string]linkQueue) { + var tick = time.NewTicker(100 * time.Millisecond) + for { + select { + case wrk.resultq <- resultq: + tick.Stop() + return + case <-tick.C: + } + } +} |
