diff options
Diffstat (limited to 'brokenlinks/worker.go')
| -rw-r--r-- | brokenlinks/worker.go | 345 |
1 files changed, 111 insertions, 234 deletions
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go index 3c6f97e..c0a33dd 100644 --- a/brokenlinks/worker.go +++ b/brokenlinks/worker.go @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> // SPDX-License-Identifier: GPL-3.0-only +// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> package brokenlinks @@ -14,7 +14,6 @@ import ( "os" "slices" "strings" - "sync" "time" "golang.org/x/net/html" @@ -26,10 +25,11 @@ import ( type worker struct { // seenLink store the URL being or has been scanned and its HTTP // status code. + // It store all links, including the broken one. seenLink map[string]int - // resultq channel that collect result from scanning. - resultq chan map[string]linkQueue + // queue contains list of link to be scanned. + queue []linkQueue // result contains the final result after all of the pages has been // scanned. @@ -41,6 +41,7 @@ type worker struct { // The base URL that will be joined to relative or absolute // links or image. + // baseURL is set to "scheme://host" only. baseUrl *url.URL // cache of scanned links. @@ -51,12 +52,10 @@ type worker struct { httpc *http.Client opts Options - - // wg sync the goroutine scanner. - wg sync.WaitGroup } func newWorker(opts Options) (wrk *worker, err error) { + var logp = `newWorker` var netDial = &net.Dialer{ Timeout: 30 * time.Second, KeepAlive: 30 * time.Second, @@ -68,7 +67,6 @@ func newWorker(opts Options) (wrk *worker, err error) { wrk = &worker{ opts: opts, seenLink: map[string]int{}, - resultq: make(chan map[string]linkQueue, 100), result: newResult(), log: log.New(os.Stderr, ``, log.LstdFlags), httpc: &http.Client{ @@ -107,7 +105,8 @@ func newWorker(opts Options) (wrk *worker, err error) { wrk.pastResult = newResult() err = json.Unmarshal(pastresult, &wrk.pastResult) if err != nil { - return nil, err + log.Printf(`%s: invalid past result file format: %s`, logp, err) + log.Printf(`%s: ignoring past result`, logp) } return wrk, nil @@ -125,184 +124,107 @@ func (wrk *worker) run() (result *Result, err error) { // scanAll scan all pages start from [Options.Url]. func (wrk *worker) scanAll() (result *Result, err error) { // Scan the first URL to make sure that the server is reachable. - var firstLinkq = linkQueue{ + var linkq = linkQueue{ parentUrl: nil, url: wrk.opts.scanUrl.String(), - status: http.StatusProcessing, } - wrk.seenLink[firstLinkq.url] = http.StatusProcessing - - wrk.wg.Add(1) - go func() { - var resultq = wrk.scan(firstLinkq) - wrk.pushResult(resultq) - }() - wrk.wg.Wait() - - var resultq = <-wrk.resultq - for _, linkq := range resultq { - if linkq.url == firstLinkq.url { - if linkq.errScan != nil { - return nil, linkq.errScan - } - wrk.seenLink[linkq.url] = linkq.status - continue - } - if linkq.isExternal { - var scannedLink = wrk.cache.Get(linkq.url) - if scannedLink != nil { - linkq.status = scannedLink.ResponseCode - wrk.seen(linkq) - continue - } - } + var resultq = wrk.scan(linkq) + linkq = resultq[linkq.url] + if linkq.errScan != nil { + return nil, linkq.errScan + } + wrk.processResult(resultq) - wrk.seenLink[linkq.url] = http.StatusProcessing - wrk.wg.Add(1) - go func() { - var resultq = wrk.scan(linkq) - wrk.pushResult(resultq) - }() + var x int + for x < len(wrk.queue) { + linkq = wrk.queue[x] + x++ + resultq = wrk.scan(linkq) + wrk.processResult(resultq) } - wrk.processAndWait() + wrk.result.sort() return wrk.result, nil } -// scanPastResult scan only pages reported inside -// [Result.BrokenLinks]. +// scanPastResult scan only pages reported inside [Result.BrokenLinks]. func (wrk *worker) scanPastResult() (result *Result, err error) { for page := range wrk.pastResult.BrokenLinks { - var linkq = linkQueue{ - parentUrl: nil, - url: page, - status: http.StatusProcessing, + wrk.opts.scanUrl, err = url.Parse(page) + if err != nil { + log.Printf(`scanPastResult: invalid URL %q: %s`, page, err) + continue } - wrk.seenLink[linkq.url] = http.StatusProcessing - wrk.wg.Add(1) - go func() { - var resultq = wrk.scan(linkq) - wrk.pushResult(resultq) - }() - } - - wrk.processAndWait() - return wrk.result, nil -} - -func (wrk *worker) processAndWait() { - var tick = time.NewTicker(500 * time.Millisecond) - var listWaitStatus []linkQueue - var isScanning = true - for isScanning { - select { - case resultq := <-wrk.resultq: - listWaitStatus = wrk.processResult(resultq, listWaitStatus) - - case <-tick.C: - wrk.wg.Wait() - if len(wrk.resultq) != 0 { - continue - } - if len(listWaitStatus) != 0 { - // There are links that still waiting for - // scanning to be completed. - continue - } - isScanning = false + wrk.queue = nil + _, err = wrk.scanAll() + if err != nil { + log.Printf(`scanPastResult: %q: %s`, page, err) } } + wrk.result.sort() + return wrk.result, nil } -// processResult the resultq contains the original URL being scanned -// and its child links. -// For example, scanning "http://example.tld" result in +// processResult process the scan result and push it to queue. +// The resultq contains the original URL being scanned and its child links. +// For each link item in resultq, // -// "http://example.tld": {status=200} -// "http://example.tld/page": {status=0} -// "http://example.tld/image.png": {status=0} -// "http://bad:domain/image.png": {status=700} -func (wrk *worker) processResult( - resultq map[string]linkQueue, listWaitStatus []linkQueue, -) ( - newList []linkQueue, -) { - for _, linkq := range resultq { - // Process the scanned page first. - +// - For non-zero status code, mark it as seen and store to cache if +// it external. +// - Skip external link that has been checked before. +// - Skip link that has been seen. +// - Otherwise push it to queue. +func (wrk *worker) processResult(resultq map[string]linkQueue) { + var linkq linkQueue + var seen bool + for _, linkq = range resultq { if linkq.status != 0 { wrk.seen(linkq) - if linkq.isExternal && linkq.status != StatusBadLink { - wrk.cache.Set(linkq.url, linkq.status, linkq.size) - } continue } - // Now process the links inside the page. - if linkq.isExternal { var scannedLink = wrk.cache.Get(linkq.url) if scannedLink != nil { - linkq.status = scannedLink.ResponseCode - wrk.seen(linkq) + // The external link has been scanned + // previously. continue } } - seenStatus, seen := wrk.seenLink[linkq.url] - if !seen { - wrk.seenLink[linkq.url] = http.StatusProcessing - wrk.wg.Add(1) - go func() { - var resultq = wrk.scan(linkq) - wrk.pushResult(resultq) - }() - continue - } - if seenStatus >= http.StatusBadRequest { - linkq.status = seenStatus - wrk.markBroken(linkq) - continue - } - if seenStatus >= http.StatusOK { - // The link has been processed and its - // not an error. - continue - } - // The link being processed by other goroutine. - linkq.status = seenStatus - newList = append(newList, linkq) - } - for _, linkq := range listWaitStatus { - seenStatus := wrk.seenLink[linkq.url] - if seenStatus >= http.StatusBadRequest { - linkq.status = seenStatus - wrk.markBroken(linkq) - continue - } - if seenStatus >= http.StatusOK { - continue - } - if seenStatus == http.StatusProcessing { - // Scanning still in progress. - newList = append(newList, linkq) + linkq.status, seen = wrk.seenLink[linkq.url] + if seen { + if linkq.status >= http.StatusBadRequest { + // Different pages may have the same broken + // link. + wrk.markAsBroken(linkq) + } continue } + wrk.queue = append(wrk.queue, linkq) } - return newList } func (wrk *worker) seen(linkq linkQueue) { + wrk.seenLink[linkq.url] = linkq.status + + if linkq.isExternal { + if linkq.status != StatusBadLink { + wrk.cache.Set(linkq.url, linkq.status, linkq.size) + } + } + if linkq.status >= http.StatusBadRequest { - wrk.markBroken(linkq) - return + wrk.markAsBroken(linkq) } - wrk.seenLink[linkq.url] = linkq.status } -func (wrk *worker) markBroken(linkq linkQueue) { +func (wrk *worker) markAsBroken(linkq linkQueue) { + if slices.Contains(wrk.opts.ignoreStatus, linkq.status) { + return + } var parentUrl = linkq.parentUrl.String() var listBroken = wrk.result.BrokenLinks[parentUrl] var brokenLink = Broken{ @@ -314,20 +236,10 @@ func (wrk *worker) markBroken(linkq linkQueue) { } listBroken = append(listBroken, brokenLink) wrk.result.BrokenLinks[parentUrl] = listBroken - - wrk.seenLink[linkq.url] = linkq.status } -// scan fetch the HTML page or image to check if its valid. +// scan the link to HTML page or image. func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) { - defer func() { - if wrk.opts.IsVerbose && linkq.errScan != nil { - wrk.log.Printf("error: %d %s error=%v\n", linkq.status, - linkq.url, linkq.errScan) - } - wrk.wg.Done() - }() - resultq = make(map[string]linkQueue) var ( httpResp *http.Response @@ -346,10 +258,6 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) { linkq.size = httpResp.ContentLength resultq[linkq.url] = linkq - if slices.Contains(wrk.opts.ignoreStatus, httpResp.StatusCode) { - return nil - } - if httpResp.StatusCode >= http.StatusBadRequest { return resultq } @@ -357,21 +265,23 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) { return resultq } - var doc *html.Node - doc, _ = html.Parse(httpResp.Body) - - // After we check the code and test for [html.Parse] there are - // no case actual cases where HTML content will return an error. + // After we check the code for [html.Parse] there are no cases where + // it will return an error. // The only possible error is when reading from body (io.Reader), and // that is also almost impossible. // // [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347 + var doc *html.Node + doc, _ = html.Parse(httpResp.Body) - var scanUrl *url.URL + var parentUrl *url.URL - scanUrl, err = url.Parse(linkq.url) + parentUrl, err = url.Parse(linkq.url) if err != nil { - log.Fatal(err) + linkq.status = StatusBadLink + linkq.errScan = err + resultq[linkq.url] = linkq + return resultq } var node *html.Node @@ -379,42 +289,42 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) { if node.Type != html.ElementNode { continue } + if node.DataAtom != atom.A && node.DataAtom != atom.Img { + continue + } var nodeLink *linkQueue if node.DataAtom == atom.A { for _, attr := range node.Attr { if attr.Key != `href` { continue } - nodeLink = wrk.processLink(scanUrl, attr.Val, atom.A) + nodeLink = wrk.processLink(parentUrl, attr.Val, atom.A) break } - } else if node.DataAtom == atom.Img { + } else { // atom.Img for _, attr := range node.Attr { if attr.Key != `src` { continue } - nodeLink = wrk.processLink(scanUrl, attr.Val, atom.Img) + nodeLink = wrk.processLink(parentUrl, attr.Val, atom.Img) break } - } else { - continue } if nodeLink == nil { + // Link is invalid. continue } _, seen := resultq[nodeLink.url] - if !seen { - wrk.checkExternal(nodeLink) - resultq[nodeLink.url] = *nodeLink + if seen { + // The same link already exist previously. + continue } + resultq[nodeLink.url] = *nodeLink } return resultq } -func (wrk *worker) fetch(linkq linkQueue) ( - httpResp *http.Response, - err error, -) { +func (wrk *worker) fetch(linkq linkQueue) (httpResp *http.Response, err error) { const maxRetry = 5 var retry int for retry < 5 { @@ -446,6 +356,8 @@ func (wrk *worker) fetch(linkq linkQueue) ( return nil, err } +// processLink given a parentURL and link value `val` +// check if link `val` is valid and return it as linkQueue. func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) ( linkq *linkQueue, ) { @@ -453,17 +365,19 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) ( return nil } + linkq = &linkQueue{ + parentUrl: parentUrl, + kind: kind, + } + var newUrl *url.URL var err error newUrl, err = url.Parse(val) if err != nil { - return &linkQueue{ - parentUrl: parentUrl, - errScan: err, - url: val, - kind: kind, - status: StatusBadLink, - } + linkq.errScan = err + linkq.url = val + linkq.status = StatusBadLink + return linkq } newUrl.Fragment = "" newUrl.RawFragment = "" @@ -472,55 +386,18 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) ( // Ignore link to ID, like `href="#element_id"`. return nil } - if strings.HasPrefix(val, `http`) { - return &linkQueue{ - parentUrl: parentUrl, - url: strings.TrimSuffix(newUrl.String(), `/`), - kind: kind, - } - } - if val[0] == '/' { - // val is absolute to parent URL. - newUrl = wrk.baseUrl.JoinPath(newUrl.Path) - } else { - // val is relative to parent URL. - newUrl = parentUrl.JoinPath(`/`, newUrl.Path) - } - linkq = &linkQueue{ - parentUrl: parentUrl, - url: strings.TrimSuffix(newUrl.String(), `/`), - kind: kind, - } - return linkq -} - -func (wrk *worker) pushResult(resultq map[string]linkQueue) { - if len(resultq) == 0 { - return - } - var tick = time.NewTicker(100 * time.Millisecond) - for { - select { - case wrk.resultq <- resultq: - tick.Stop() - return - case <-tick.C: + if !strings.HasPrefix(val, `http`) { + if val[0] == '/' { + // val is absolute link. + newUrl = wrk.baseUrl.JoinPath(newUrl.Path) + } else { + // val is relative to parent URL. + newUrl = parentUrl.JoinPath(`/`, newUrl.Path) } } -} - -// checkExternal set the [linkQueue.isExternal] field to true if -// -// (1) [linkQueue.url] does not start with [Options.Url] -// (2) linkQueue is not from scanPastResult, indicated by non-nil -// [worker.pastResult]. -func (wrk *worker) checkExternal(linkq *linkQueue) { + linkq.url = strings.TrimSuffix(newUrl.String(), `/`) if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) { linkq.isExternal = true - return - } - if wrk.pastResult != nil { - linkq.isExternal = true - return } + return linkq } |
