From 95c32b6f6b0dbe0a026a5c0650cdb74bd45d1175 Mon Sep 17 00:00:00 2001 From: Shulhan Date: Wed, 25 Jun 2025 20:59:06 +0700 Subject: brokenlinks: reduce the number of goroutines on scan Previously, each scan ran in one goroutine and its result was pushed using pushResult in another goroutine. This made one link consume two goroutines. This changes the scan function to return the result, so that the caller can push it in the same goroutine. --- brokenlinks/worker.go | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go index 8d7918f..0493a77 100644 --- a/brokenlinks/worker.go +++ b/brokenlinks/worker.go @@ -133,7 +133,10 @@ func (wrk *worker) scanAll() (result *Result, err error) { wrk.seenLink[firstLinkq.url] = http.StatusProcessing wrk.wg.Add(1) - go wrk.scan(firstLinkq) + go func() { + var resultq = wrk.scan(firstLinkq) + wrk.pushResult(resultq) + }() wrk.wg.Wait() var resultq = <-wrk.resultq @@ -157,7 +160,10 @@ func (wrk *worker) scanAll() (result *Result, err error) { wrk.seenLink[linkq.url] = http.StatusProcessing wrk.wg.Add(1) - go wrk.scan(linkq) + go func() { + var resultq = wrk.scan(linkq) + wrk.pushResult(resultq) + }() } wrk.processAndWait() @@ -175,7 +181,10 @@ func (wrk *worker) scanPastResult() (result *Result, err error) { } wrk.seenLink[linkq.url] = http.StatusProcessing wrk.wg.Add(1) - go wrk.scan(linkq) + go func() { + var resultq = wrk.scan(linkq) + wrk.pushResult(resultq) + }() } wrk.processAndWait() @@ -246,7 +255,10 @@ func (wrk *worker) processResult( if !seen { wrk.seenLink[linkq.url] = http.StatusProcessing wrk.wg.Add(1) - go wrk.scan(linkq) + go func() { + var resultq = wrk.scan(linkq) + wrk.pushResult(resultq) + }() continue } if seenStatus >= http.StatusBadRequest { @@ -307,7 +319,7 @@ func (wrk *worker) markBroken(linkq linkQueue) { } // scan fetch the HTML page or image to check if its valid. 
-func (wrk *worker) scan(linkq linkQueue) { +func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) { defer func() { if wrk.opts.IsVerbose && linkq.errScan != nil { wrk.log.Printf("error: %d %s error=%v\n", linkq.status, @@ -316,8 +328,8 @@ func (wrk *worker) scan(linkq linkQueue) { wrk.wg.Done() }() + resultq = make(map[string]linkQueue) var ( - resultq = map[string]linkQueue{} httpResp *http.Response err error ) @@ -326,8 +338,7 @@ func (wrk *worker) scan(linkq linkQueue) { linkq.status = StatusBadLink linkq.errScan = err resultq[linkq.url] = linkq - go wrk.pushResult(resultq) - return + return resultq } defer httpResp.Body.Close() @@ -336,16 +347,14 @@ func (wrk *worker) scan(linkq linkQueue) { resultq[linkq.url] = linkq if slices.Contains(wrk.opts.ignoreStatus, httpResp.StatusCode) { - return + return nil } if httpResp.StatusCode >= http.StatusBadRequest { - go wrk.pushResult(resultq) - return + return resultq } if linkq.kind == atom.Img || linkq.isExternal { - go wrk.pushResult(resultq) - return + return resultq } var doc *html.Node @@ -399,7 +408,7 @@ func (wrk *worker) scan(linkq linkQueue) { resultq[nodeLink.url] = *nodeLink } } - go wrk.pushResult(resultq) + return resultq } func (wrk *worker) fetch(linkq linkQueue) ( @@ -483,6 +492,9 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) ( } func (wrk *worker) pushResult(resultq map[string]linkQueue) { + if len(resultq) == 0 { + return + } var tick = time.NewTicker(100 * time.Millisecond) for { select { -- cgit v1.3