aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--deadlinks.go4
-rw-r--r--deadlinks_test.go121
-rw-r--r--link_queue.go19
-rw-r--r--worker.go321
4 files changed, 284 insertions, 181 deletions
diff --git a/deadlinks.go b/deadlinks.go
index 1d31b8f..365a6fb 100644
--- a/deadlinks.go
+++ b/deadlinks.go
@@ -7,6 +7,10 @@ import (
"fmt"
)
+// StatusBadLink is the status for a link that is not parseable by
+// [url.Parse] or not reachable during GET or HEAD, for example due to a
+// timeout or a non-existent IP or domain.
+const StatusBadLink = 700
+
// Scan the baseUrl for dead links.
func Scan(opts ScanOptions) (result *Result, err error) {
var logp = `Scan`
diff --git a/deadlinks_test.go b/deadlinks_test.go
index 2950db4..8b7d83e 100644
--- a/deadlinks_test.go
+++ b/deadlinks_test.go
@@ -89,64 +89,76 @@ func TestDeadLinks_Scan(t *testing.T) {
}, {
scanUrl: testUrl,
exp: map[string][]deadlinks.Broken{
- testUrl: []deadlinks.Broken{{
- Link: testUrl + `/broken.png`,
- Code: http.StatusNotFound,
- }, {
- Link: testUrl + `/brokenPage`,
- Code: http.StatusNotFound,
- }, {
- Link: `http://127.0.0.1:abc`,
- Code: 700,
- }, {
- Link: `http:/127.0.0.1:11836`,
- Code: http.StatusNotFound,
- }},
- testUrl + `/broken.html`: []deadlinks.Broken{{
- Link: testUrl + `/brokenPage`,
- Code: http.StatusNotFound,
- }},
- testUrl + `/page2`: []deadlinks.Broken{{
- Link: testUrl + `/broken.png`,
- Code: http.StatusNotFound,
- }, {
- Link: testUrl + `/page2/broken/relative`,
- Code: http.StatusNotFound,
- }, {
- Link: testUrl + `/page2/broken2.png`,
- Code: http.StatusNotFound,
- }},
+ testUrl: []deadlinks.Broken{
+ {
+ Link: testUrl + `/broken.png`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/brokenPage`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: `http://127.0.0.1:abc`,
+ Code: deadlinks.StatusBadLink,
+ }, {
+ Link: `http:/127.0.0.1:11836`,
+ Code: deadlinks.StatusBadLink,
+ },
+ },
+ testUrl + `/broken.html`: []deadlinks.Broken{
+ {
+ Link: testUrl + `/brokenPage`,
+ Code: http.StatusNotFound,
+ },
+ },
+ testUrl + `/page2`: []deadlinks.Broken{
+ {
+ Link: testUrl + `/broken.png`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken/relative`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken2.png`,
+ Code: http.StatusNotFound,
+ },
+ },
},
}, {
scanUrl: testUrl + `/page2`,
exp: map[string][]deadlinks.Broken{
- testUrl: []deadlinks.Broken{{
- Link: testUrl + `/broken.png`,
- Code: http.StatusNotFound,
- }, {
- Link: testUrl + `/brokenPage`,
- Code: http.StatusNotFound,
- }, {
- Link: `http://127.0.0.1:abc`,
- Code: 700,
- }, {
- Link: `http:/127.0.0.1:11836`,
- Code: http.StatusNotFound,
- }},
- testUrl + `/broken.html`: []deadlinks.Broken{{
- Link: testUrl + `/brokenPage`,
- Code: http.StatusNotFound,
- }},
- testUrl + `/page2`: []deadlinks.Broken{{
- Link: testUrl + `/broken.png`,
- Code: http.StatusNotFound,
- }, {
- Link: testUrl + `/page2/broken/relative`,
- Code: http.StatusNotFound,
- }, {
- Link: testUrl + `/page2/broken2.png`,
- Code: http.StatusNotFound,
- }},
+ testUrl: []deadlinks.Broken{
+ {
+ Link: testUrl + `/broken.png`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/brokenPage`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: `http://127.0.0.1:abc`,
+ Code: deadlinks.StatusBadLink,
+ }, {
+ Link: `http:/127.0.0.1:11836`,
+ Code: deadlinks.StatusBadLink,
+ },
+ },
+ testUrl + `/broken.html`: []deadlinks.Broken{
+ {
+ Link: testUrl + `/brokenPage`,
+ Code: http.StatusNotFound,
+ },
+ },
+ testUrl + `/page2`: []deadlinks.Broken{
+ {
+ Link: testUrl + `/broken.png`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken/relative`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken2.png`,
+ Code: http.StatusNotFound,
+ },
+ },
},
}}
@@ -155,6 +167,7 @@ func TestDeadLinks_Scan(t *testing.T) {
err error
)
for _, tcase := range listCase {
+ t.Logf(`--- Scan: %s`, tcase.scanUrl)
var scanOpts = deadlinks.ScanOptions{
Url: tcase.scanUrl,
}
diff --git a/link_queue.go b/link_queue.go
index 9987586..10dbfff 100644
--- a/link_queue.go
+++ b/link_queue.go
@@ -11,6 +11,21 @@ import (
type linkQueue struct {
parentUrl *url.URL
- url string
- kind atom.Atom
+
+ // The error from scan.
+ errScan error
+
+ // url being scanned.
+ url string
+
+	// kind of url, it is either an anchor or an image.
+	// It is set to 0 if url is the first URL being scanned.
+ kind atom.Atom
+
+	// Status of the link after scan; it is mostly the HTTP status code.
+ // 0: link is the result of scan, not processed yet.
+ // StatusBadLink: link is invalid, not parseable or unreachable.
+ // 200 - 211: OK.
+ // 400 - 511: Error.
+ status int
}
diff --git a/worker.go b/worker.go
index 99bef6f..6bc5c13 100644
--- a/worker.go
+++ b/worker.go
@@ -5,29 +5,27 @@ package deadlinks
import (
"fmt"
- "io"
"log"
"net/http"
"net/url"
"strings"
"sync"
+ "time"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
type worker struct {
- // seenLink store the page URL that has been scanned and its HTTP status
- // code.
+	// seenLink stores each URL that is being or has been scanned,
+	// mapped to its HTTP status code.
seenLink map[string]int
- // linkq contains queue of page URL to be scanned.
- linkq chan linkQueue
+ // resultq channel that collect result from scanning.
+ resultq chan map[string]linkQueue
- // errq contains error from scanning a page URL.
- errq chan error
-
- // result contains map of page URL and its list of broken link.
+	// result contains the final result after all of the pages have
+	// been scanned.
result *Result
// The base URL that will be joined to relative or absolute
@@ -41,17 +39,13 @@ type worker struct {
// wg sync the goroutine scanner.
wg sync.WaitGroup
-
- // seenLinkMtx guard the seenLink field from concurrent read/write.
- seenLinkMtx sync.Mutex
}
func newWorker(opts ScanOptions) (wrk *worker, err error) {
wrk = &worker{
opts: opts,
seenLink: map[string]int{},
- linkq: make(chan linkQueue, 10000),
- errq: make(chan error, 1),
+ resultq: make(chan map[string]linkQueue, 100),
result: newResult(),
}
@@ -69,63 +63,151 @@ func newWorker(opts ScanOptions) (wrk *worker, err error) {
Host: wrk.scanUrl.Host,
}
- wrk.linkq <- linkQueue{
- parentUrl: nil,
- url: wrk.scanUrl.String(),
- }
return wrk, nil
}
func (wrk *worker) run() (result *Result, err error) {
- var ever bool = true
- for ever {
+ // Scan the first URL to make sure that the server is reachable.
+ var firstLinkq = linkQueue{
+ parentUrl: nil,
+ url: wrk.scanUrl.String(),
+ status: http.StatusProcessing,
+ }
+ wrk.seenLink[firstLinkq.url] = http.StatusProcessing
+
+ wrk.wg.Add(1)
+ go wrk.scan(firstLinkq)
+ wrk.wg.Wait()
+
+ var resultq = <-wrk.resultq
+ for _, linkq := range resultq {
+ if linkq.url == firstLinkq.url {
+ if linkq.errScan != nil {
+ return nil, linkq.errScan
+ }
+ wrk.seenLink[linkq.url] = linkq.status
+ continue
+ }
+ if linkq.status >= http.StatusBadRequest {
+ wrk.markDead(linkq)
+ continue
+ }
+
+ wrk.seenLink[linkq.url] = http.StatusProcessing
+ wrk.wg.Add(1)
+ go wrk.scan(linkq)
+ }
+
+ var listWaitStatus []linkQueue
+ var isScanning = true
+ for isScanning {
select {
- case linkq := <-wrk.linkq:
- wrk.wg.Add(1)
- go wrk.scan(linkq)
+ case resultq := <-wrk.resultq:
+
+ // The resultq contains the original URL being scanned
+ // and its child links.
+ // For example, scanning "http://example.tld" result
+ // in
+ //
+ // "http://example.tld": {status=200}
+ // "http://example.tld/page": {status=0}
+ // "http://example.tld/image.png": {status=0}
+ // "http://bad:domain/image.png": {status=700}
+
+ for _, linkq := range resultq {
+ if linkq.status >= http.StatusBadRequest {
+ wrk.markDead(linkq)
+ continue
+ }
+
+ seenStatus, seen := wrk.seenLink[linkq.url]
+ if !seen {
+ wrk.seenLink[linkq.url] = http.StatusProcessing
+ wrk.wg.Add(1)
+ go wrk.scan(linkq)
+ continue
+ }
+ if seenStatus >= http.StatusBadRequest {
+ linkq.status = seenStatus
+ wrk.markDead(linkq)
+ continue
+ }
+ if seenStatus >= http.StatusOK {
+	// The link has been processed and it is
+	// not an error.
+ continue
+ }
+ if linkq.status != 0 {
+ // linkq is the result of scan with
+ // non error status.
+ wrk.seenLink[linkq.url] = linkq.status
+ continue
+ }
+
+	// The link is being processed by another
+	// goroutine.
+ listWaitStatus = append(listWaitStatus, linkq)
+ }
default:
wrk.wg.Wait()
-
- select {
- case err = <-wrk.errq:
- return nil, err
- default:
- if len(wrk.linkq) == 0 {
- ever = false
+ if len(wrk.resultq) != 0 {
+ continue
+ }
+ var newList []linkQueue
+ for _, linkq := range listWaitStatus {
+ seenStatus := wrk.seenLink[linkq.url]
+ if seenStatus == http.StatusProcessing {
+ // Scanning still in progress.
+ newList = append(newList, linkq)
+ continue
+ }
+ if seenStatus >= http.StatusBadRequest {
+ linkq.status = seenStatus
+ wrk.markDead(linkq)
+ continue
}
}
+ if len(newList) != 0 {
+	// There are links that are still waiting for
+	// scanning to be completed.
+ listWaitStatus = newList
+ continue
+ }
+ isScanning = false
}
}
wrk.result.sort()
return wrk.result, nil
}
+func (wrk *worker) markDead(linkq linkQueue) {
+ var parentUrl = linkq.parentUrl.String()
+ var listBroken = wrk.result.PageLinks[parentUrl]
+ var brokenLink = Broken{
+ Link: linkq.url,
+ Code: linkq.status,
+ }
+ listBroken = append(listBroken, brokenLink)
+ wrk.result.PageLinks[parentUrl] = listBroken
+ wrk.seenLink[linkq.url] = linkq.status
+}
+
// scan fetch the HTML page or image to check if its valid.
func (wrk *worker) scan(linkq linkQueue) {
- defer wrk.wg.Done()
-
- wrk.seenLinkMtx.Lock()
- statusCode, seen := wrk.seenLink[linkq.url]
- wrk.seenLinkMtx.Unlock()
- if seen {
- if statusCode >= http.StatusBadRequest {
- wrk.markDead(linkq, statusCode)
- }
+ defer func() {
if wrk.opts.IsVerbose {
- fmt.Printf("scan: %s %d\n", linkq.url, statusCode)
+ fmt.Printf(" done: %d %s\n", linkq.status, linkq.url)
}
- return
- }
- wrk.seenLinkMtx.Lock()
- wrk.seenLink[linkq.url] = http.StatusProcessing
- wrk.seenLinkMtx.Unlock()
+ wrk.wg.Done()
+ }()
if wrk.opts.IsVerbose {
- fmt.Printf("scan: %s %d\n", linkq.url, http.StatusProcessing)
+ fmt.Printf("scan: %d %s\n", linkq.status, linkq.url)
}
var (
+ resultq = map[string]linkQueue{}
httpResp *http.Response
err error
)
@@ -135,51 +217,34 @@ func (wrk *worker) scan(linkq linkQueue) {
httpResp, err = http.Get(linkq.url)
}
if err != nil {
- if linkq.parentUrl == nil {
- wrk.errq <- err
- } else {
- wrk.markDead(linkq, http.StatusNotFound)
- }
+ linkq.status = StatusBadLink
+ linkq.errScan = err
+ resultq[linkq.url] = linkq
+ go wrk.pushResult(resultq)
return
}
defer httpResp.Body.Close()
- if httpResp.StatusCode != http.StatusOK {
- wrk.markDead(linkq, httpResp.StatusCode)
+ linkq.status = httpResp.StatusCode
+ resultq[linkq.url] = linkq
+
+ if httpResp.StatusCode >= http.StatusBadRequest {
+ go wrk.pushResult(resultq)
return
}
- wrk.seenLinkMtx.Lock()
- wrk.seenLink[linkq.url] = http.StatusOK
- wrk.seenLinkMtx.Unlock()
-
if linkq.kind == atom.Img {
+ go wrk.pushResult(resultq)
return
}
if !strings.HasPrefix(linkq.url, wrk.baseUrl.String()) {
- // Do not parse the page from external domain.
+	// Do not parse the HTML page from an external domain; we only
+	// need its HTTP status code.
+ go wrk.pushResult(resultq)
return
}
- wrk.parseHTML(linkq.url, httpResp.Body)
-}
-
-func (wrk *worker) markDead(linkq linkQueue, httpStatusCode int) {
- var parentUrl = linkq.parentUrl.String()
-
- wrk.seenLinkMtx.Lock()
- var listBroken = wrk.result.PageLinks[parentUrl]
- listBroken = append(listBroken, Broken{
- Link: linkq.url,
- Code: httpStatusCode,
- })
- wrk.result.PageLinks[parentUrl] = listBroken
- wrk.seenLink[linkq.url] = httpStatusCode
- wrk.seenLinkMtx.Unlock()
-}
-func (wrk *worker) parseHTML(linkUrl string, body io.Reader) {
var doc *html.Node
-
- doc, _ = html.Parse(body)
+ doc, _ = html.Parse(httpResp.Body)
// After we check the code and test for [html.Parse] there are
// no case actual cases where HTML content will return an error.
@@ -188,90 +253,96 @@ func (wrk *worker) parseHTML(linkUrl string, body io.Reader) {
//
// [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347
+ var scanUrl *url.URL
+
+ scanUrl, err = url.Parse(linkq.url)
+ if err != nil {
+ log.Fatal(err)
+ }
+
var node *html.Node
+ var link string
+ var status int
for node = range doc.Descendants() {
if node.Type != html.ElementNode {
continue
}
+ link = ""
if node.DataAtom == atom.A {
for _, attr := range node.Attr {
if attr.Key != `href` {
continue
}
- wrk.processLink(linkUrl, attr.Val, atom.A)
+ link, status = wrk.processLink(scanUrl, attr.Val, atom.A)
+ break
}
- }
- if node.DataAtom == atom.Img {
+ } else if node.DataAtom == atom.Img {
for _, attr := range node.Attr {
if attr.Key != `src` {
continue
}
- wrk.processLink(linkUrl, attr.Val, atom.Img)
+ link, status = wrk.processLink(scanUrl, attr.Val, atom.Img)
+ break
}
+ } else {
+ continue
+ }
+ if link == "" {
+ continue
+ }
+ resultq[link] = linkQueue{
+ parentUrl: scanUrl,
+ url: link,
+ kind: node.DataAtom,
+ status: status,
}
}
+ go wrk.pushResult(resultq)
}
-func (wrk *worker) processLink(rawParentUrl, val string, kind atom.Atom) {
+func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
+ link string, status int,
+) {
if len(val) == 0 {
- return
- }
-
- var parentUrl *url.URL
- var err error
-
- parentUrl, err = url.Parse(rawParentUrl)
- if err != nil {
- log.Fatal(err)
+ return "", 0
}
var newUrl *url.URL
+ var err error
newUrl, err = url.Parse(val)
if err != nil {
- var linkq = linkQueue{
- parentUrl: parentUrl,
- url: val,
- kind: kind,
- }
- wrk.markDead(linkq, 700)
- return
+ return val, StatusBadLink
}
newUrl.Fragment = ""
newUrl.RawFragment = ""
- var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
-
if kind == atom.A && val[0] == '#' {
// Ignore link to ID, like `href="#element_id"`.
- return
+ return "", 0
+ }
+ if strings.HasPrefix(val, `http`) {
+ link = strings.TrimSuffix(newUrl.String(), `/`)
+ return link, 0
}
-
- // val is absolute to parent URL.
if val[0] == '/' {
- // Link to the same domain will queued for scanning.
+ // val is absolute to parent URL.
newUrl = wrk.baseUrl.JoinPath(newUrl.Path)
- newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
- wrk.linkq <- linkQueue{
- parentUrl: parentUrl,
- url: newUrlStr,
- kind: kind,
- }
- return
+ } else {
+ // val is relative to parent URL.
+ newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
}
- if strings.HasPrefix(val, `http`) {
- wrk.linkq <- linkQueue{
- parentUrl: parentUrl,
- url: newUrlStr,
- kind: kind,
+ link = strings.TrimSuffix(newUrl.String(), `/`)
+ return link, 0
+}
+
+func (wrk *worker) pushResult(resultq map[string]linkQueue) {
+ var tick = time.NewTicker(100 * time.Millisecond)
+ for {
+ select {
+ case wrk.resultq <- resultq:
+ tick.Stop()
+ return
+ case <-tick.C:
}
- return
- }
- // val is relative to parent URL.
- newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
- newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
- wrk.linkq <- linkQueue{
- parentUrl: parentUrl,
- url: newUrlStr,
- kind: kind,
}
}