Diffstat (limited to 'worker.go')
-rw-r--r--	worker.go	161
1 file changed, 125 insertions(+), 36 deletions(-)
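The diff below relies on a linkQueue type that is defined elsewhere in the package and does not appear in this file. A minimal sketch consistent with how it is used here (field names are read off this diff, not confirmed against the source) could be:

type linkQueue struct {
	// parentUrl is the URL of the page on which the link was found.
	// It is nil for the initial base URL.
	parentUrl *url.URL

	// url is the absolute URL to be fetched and scanned.
	url string
}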
diff --git a/worker.go b/worker.go
index 8463c79..0b2ffc0 100644
--- a/worker.go
+++ b/worker.go
@@ -6,19 +6,23 @@ package deadlinks
import (
"fmt"
"io"
+ "log"
"net/http"
+ "net/url"
+ "strings"
"sync"
"golang.org/x/net/html"
+ "golang.org/x/net/html/atom"
)
type worker struct {
- // seenPage store the page URL that has been scanned and its HTTP status
+ // seenLink stores each page URL that has been scanned and its HTTP status
// code.
- seenPage map[string]int
+ seenLink map[string]int
- // pageq contains queue of page URL to be scanned.
- pageq chan string
+ // linkq contains the queue of page URLs to be scanned.
+ linkq chan linkQueue
// errq contains error from scanning a page URL.
errq chan error
@@ -26,82 +30,121 @@ type worker struct {
// result contains a map of each page URL to its list of broken links.
result *Result
+ // baseUrl is the base URL to scan; relative and absolute
+ // links or image sources are resolved against it.
+ baseUrl *url.URL
+
// wg syncs the scanner goroutines.
wg sync.WaitGroup
- // seenPageMtx guard the seenPage field from concurrent read/write.
- seenPageMtx sync.Mutex
+ // seenLinkMtx guards the seenLink field from concurrent read/write.
+ seenLinkMtx sync.Mutex
}
-func newWorker(baseUrl string) (wrk *worker) {
+func newWorker(baseUrl string) (wrk *worker, err error) {
wrk = &worker{
- pageq: make(chan string, 1),
+ seenLink: map[string]int{},
+ linkq: make(chan linkQueue, 1000),
errq: make(chan error, 1),
- seenPage: map[string]int{},
result: newResult(),
}
- wrk.pageq <- baseUrl
- return wrk
+ wrk.baseUrl, err = url.Parse(baseUrl)
+ if err != nil {
+ return nil, err
+ }
+
+ wrk.baseUrl = wrk.baseUrl.JoinPath(`/`)
+ wrk.linkq <- linkQueue{
+ parentUrl: nil,
+ url: wrk.baseUrl.String(),
+ }
+ return wrk, nil
}
func (wrk *worker) run() (result *Result, err error) {
- for len(wrk.pageq) > 0 {
+ var ever = true
+ for ever {
select {
- case page := <-wrk.pageq:
+ case linkq := <-wrk.linkq:
wrk.wg.Add(1)
- go wrk.scan(page)
+ go wrk.scan(linkq)
case err = <-wrk.errq:
- close(wrk.pageq)
return nil, err
default:
wrk.wg.Wait()
+ if len(wrk.linkq) == 0 {
+ ever = false
+ }
}
}
+ wrk.result.sort()
return wrk.result, nil
}
// scan fetches the HTML page and scans it for broken links.
-func (wrk *worker) scan(pageUrl string) {
+func (wrk *worker) scan(linkq linkQueue) {
var logp = `scan`
defer wrk.wg.Done()
- if wrk.hasSeen(pageUrl) {
+ wrk.seenLinkMtx.Lock()
+ statusCode, seen := wrk.seenLink[linkq.url]
+ wrk.seenLinkMtx.Unlock()
+ if seen {
+ if statusCode >= http.StatusBadRequest {
+ wrk.markDead(linkq, statusCode)
+ }
return
}
- wrk.markSeen(pageUrl, http.StatusProcessing)
+ wrk.seenLinkMtx.Lock()
+ wrk.seenLink[linkq.url] = http.StatusProcessing
+ wrk.seenLinkMtx.Unlock()
var (
httpResp *http.Response
err error
)
- httpResp, err = http.Get(pageUrl)
+ httpResp, err = http.Get(linkq.url)
if err != nil {
wrk.errq <- err
return
}
if httpResp.StatusCode != http.StatusOK {
- wrk.errq <- fmt.Errorf(`%s %s: return HTTP status code %d`,
- logp, pageUrl, httpResp.StatusCode)
+ // Close the response body on this early return too.
+ httpResp.Body.Close()
+ wrk.markDead(linkq, httpResp.StatusCode)
return
}
defer httpResp.Body.Close()
- err = wrk.parseHTML(pageUrl, httpResp.Body)
+ err = wrk.parseHTML(linkq.url, httpResp.Body)
if err != nil {
- wrk.errq <- fmt.Errorf(`%s %s: %w`, logp, pageUrl, err)
+ wrk.errq <- fmt.Errorf(`%s %s: %w`, logp, linkq.url, err)
return
}
}
-func (wrk *worker) parseHTML(pageUrl string, body io.Reader) (err error) {
+// markDead records linkq.url as a broken link of its parent page and
+// caches its HTTP status code in seenLink.
+func (wrk *worker) markDead(linkq linkQueue, httpStatusCode int) {
+ // linkq.parentUrl is nil for the base URL itself, so guard the
+ // dereference.
+ var parentUrl string
+ if linkq.parentUrl != nil {
+ parentUrl = linkq.parentUrl.String()
+ }
+
+ wrk.seenLinkMtx.Lock()
+ var listBroken = wrk.result.PageLinks[parentUrl]
+ listBroken = append(listBroken, Broken{
+ Link: linkq.url,
+ Code: httpStatusCode,
+ })
+ wrk.result.PageLinks[parentUrl] = listBroken
+ wrk.seenLink[linkq.url] = httpStatusCode
+ wrk.seenLinkMtx.Unlock()
+}
+
+// parseHTML parses the HTML body of linkUrl and processes each link
+// (`a href`) and image (`img src`) element in it.
+func (wrk *worker) parseHTML(linkUrl string, body io.Reader) (err error) {
var (
logp = `parseHTML`
doc *html.Node
)
+
doc, err = html.Parse(body)
if err != nil {
return fmt.Errorf(`%s: %w`, logp, err)
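markDead above appends to wrk.result.PageLinks, but the Result and Broken types are also defined outside this file. A sketch that matches their use in this diff (an assumption, since the real definitions may differ) is:

import "sort"

type Broken struct {
	Link string
	Code int
}

type Result struct {
	// PageLinks maps a page URL to the broken links found on that page.
	PageLinks map[string][]Broken
}

func newResult() *Result {
	return &Result{PageLinks: map[string][]Broken{}}
}

// sort orders each page's broken links for deterministic output.
func (result *Result) sort() {
	for _, list := range result.PageLinks {
		sort.Slice(list, func(x, y int) bool {
			return list[x].Link < list[y].Link
		})
	}
}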
@@ -112,20 +155,66 @@ func (wrk *worker) parseHTML(pageUrl string, body io.Reader) (err error) {
if node.Type != html.ElementNode {
continue
}
+ if node.DataAtom == atom.A {
+ for _, attr := range node.Attr {
+ if attr.Key != `href` {
+ continue
+ }
+ wrk.processLink(linkUrl, attr.Val)
+ }
+ }
+ if node.DataAtom == atom.Img {
+ for _, attr := range node.Attr {
+ if attr.Key != `src` {
+ continue
+ }
+ wrk.processLink(linkUrl, attr.Val)
+ }
+ }
}
return nil
}
-// hasSeen return true if the pageUrl has been scanned.
-func (wrk *worker) hasSeen(pageUrl string) (ok bool) {
- wrk.seenPageMtx.Lock()
- _, ok = wrk.seenPage[pageUrl]
- wrk.seenPageMtx.Unlock()
- return ok
-}
-
-func (wrk *worker) markSeen(pageUrl string, httpStatusCode int) {
- wrk.seenPageMtx.Lock()
- wrk.seenPage[pageUrl] = httpStatusCode
- wrk.seenPageMtx.Unlock()
+// processLink resolves the link val found on page rawParentUrl and
+// queues it for scanning.
+func (wrk *worker) processLink(rawParentUrl string, val string) {
+ if len(val) == 0 {
+ return
+ }
+ var parentUrl *url.URL
+ var err error
+ parentUrl, err = url.Parse(rawParentUrl)
+ if err != nil {
+ log.Fatal(err) // Unreachable in practice: rawParentUrl was already fetched and parsed.
+ }
+ if val[0] == '/' {
+ // A link to the same domain will be queued for scanning.
+ var newUrl = wrk.baseUrl.JoinPath(val)
+ wrk.linkq <- linkQueue{
+ parentUrl: parentUrl,
+ url: newUrl.String(),
+ }
+ return
+ }
+ if strings.HasPrefix(val, `http`) {
+ var newUrl *url.URL
+ newUrl, err = url.Parse(val)
+ if err != nil {
+ var linkq = linkQueue{
+ parentUrl: parentUrl,
+ url: val,
+ }
+ wrk.markDead(linkq, 700) // 700: non-standard code marking an unparseable URL.
+ return
+ }
+ wrk.linkq <- linkQueue{
+ parentUrl: parentUrl,
+ url: newUrl.String(),
+ }
+ return
+ }
+ // val is relative to the parent URL; resolve it as a URL reference
+ // so links like `../page.html` or `?q=1` are handled correctly.
+ var newUrl *url.URL
+ newUrl, err = parentUrl.Parse(val)
+ if err != nil {
+ wrk.markDead(linkQueue{parentUrl: parentUrl, url: val}, 700)
+ return
+ }
+ wrk.linkq <- linkQueue{
+ parentUrl: parentUrl,
+ url: newUrl.String(),
+ }
}
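Putting it together, a caller could drive the worker roughly like this (a sketch only; the package's exported entry point is not part of this diff, and example.com is a placeholder):

func main() {
	wrk, err := newWorker(`https://example.com/`)
	if err != nil {
		log.Fatal(err)
	}

	var result *Result
	result, err = wrk.run()
	if err != nil {
		log.Fatal(err)
	}

	for pageUrl, brokenLinks := range result.PageLinks {
		for _, broken := range brokenLinks {
			fmt.Printf("%s: %s (%d)\n", pageUrl, broken.Link, broken.Code)
		}
	}
}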