aboutsummaryrefslogtreecommitdiff
path: root/brokenlinks/worker.go
diff options
context:
space:
mode:
Diffstat (limited to 'brokenlinks/worker.go')
-rw-r--r--brokenlinks/worker.go76
1 files changed, 44 insertions, 32 deletions
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index 082dc11..387cc05 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -288,6 +288,14 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) {
return resultq
}
+ ext := path.Ext(linkq.Url)
+ if ext == `` {
+ if linkq.Url[len(linkq.Url)-1] != '/' {
+ linkq.Url += `/`
+ resultq[linkq.Url] = linkq
+ }
+ }
+
var doc *html.Node
doc, err = html.Parse(httpResp.Body)
if err != nil {
@@ -302,6 +310,8 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) {
location := httpResp.Header.Get(`Location`)
if location == `` {
location = linkq.Url
+ } else {
+ linkq.Url = location
}
parentUrl, err = url.Parse(location)
@@ -311,6 +321,9 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) {
resultq[location] = linkq
return resultq
}
+ if parentUrl.Path == `` {
+ parentUrl.Path = `/`
+ }
var node *html.Node
for node = range doc.Descendants() {
@@ -388,18 +401,24 @@ func (wrk *worker) fetch(linkq jarink.Link) (httpResp *http.Response, err error)
func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) (
linkq *jarink.Link,
) {
+ val = strings.TrimSpace(val)
if len(val) == 0 {
return nil
}
+ if kind == int(atom.A) && val[0] == '#' {
+ // Ignore link to ID, like `href="#element_id"`.
+ return nil
+ }
+ if strings.HasPrefix(val, `mailto:`) {
+ return nil
+ }
linkq = &jarink.Link{
ParentUrl: parentUrl,
Kind: kind,
}
- var newUrl *url.URL
- var err error
- newUrl, err = url.Parse(val)
+ newUrl, err := url.Parse(val)
if err != nil {
linkq.ErrScan = err
linkq.Url = val
@@ -409,38 +428,31 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) (
newUrl.Fragment = ""
newUrl.RawFragment = ""
- if kind == int(atom.A) && val[0] == '#' {
- // Ignore link to ID, like `href="#element_id"`.
- return nil
- }
- if strings.HasPrefix(val, `mailto:`) {
- return nil
- }
- if !strings.HasPrefix(val, `http`) {
- if val[0] == '/' {
- // val is absolute link.
- newUrl = wrk.baseUrl.JoinPath(newUrl.Path)
- } else {
- // val is relative to parent URL.
- newUrl = genURLRelative(parentUrl, newUrl.Path)
+ if strings.HasPrefix(val, `http`) {
+ linkq.Url = newUrl.String()
+ if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) {
+ linkq.IsExternal = true
}
+ return linkq
}
- linkq.Url = strings.TrimSuffix(newUrl.String(), `/`)
- if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) {
- linkq.IsExternal = true
+ if val[0] == '/' {
+ // val is absolute link.
+ linkq.Url = wrk.baseUrl.JoinPath(newUrl.Path).String()
+ if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) {
+ linkq.IsExternal = true
+ }
+ return linkq
}
- return linkq
-}
-// genURLRelative generate new URL from parent URL and relative path
-// `relPath`.
-func genURLRelative(parentUrl *url.URL, relPath string) (newUrl *url.URL) {
- var parentPath = parentUrl.Path
- var ext = strings.ToLower(path.Ext(parentPath))
- if ext == `.html` || ext == `.htm` {
- parentPath = path.Dir(parentPath)
+ // val is relative to parent URL.
+ ext := path.Ext(parentUrl.Path)
+ if ext == `` {
+ linkq.Url = parentUrl.JoinPath(newUrl.Path).String()
+ } else {
+ // "parent/page.ext" + "val" => "parent/val"
+ tmp, _ := url.Parse(parentUrl.String())
+ tmp.Path = path.Dir(tmp.Path)
+ linkq.Url = parentUrl.JoinPath(tmp.Path).String()
}
- newUrl, _ = url.Parse(parentUrl.String())
- newUrl.Path = path.Join(parentPath, relPath)
- return newUrl
+ return linkq
}