diff options
Diffstat (limited to 'brokenlinks/worker.go')
| -rw-r--r-- | brokenlinks/worker.go | 76 |
1 files changed, 44 insertions, 32 deletions
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go index 082dc11..387cc05 100644 --- a/brokenlinks/worker.go +++ b/brokenlinks/worker.go @@ -288,6 +288,14 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) { return resultq } + ext := path.Ext(linkq.Url) + if ext == `` { + if linkq.Url[len(linkq.Url)-1] != '/' { + linkq.Url += `/` + resultq[linkq.Url] = linkq + } + } + var doc *html.Node doc, err = html.Parse(httpResp.Body) if err != nil { @@ -302,6 +310,8 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) { location := httpResp.Header.Get(`Location`) if location == `` { location = linkq.Url + } else { + linkq.Url = location } parentUrl, err = url.Parse(location) @@ -311,6 +321,9 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) { resultq[location] = linkq return resultq } + if parentUrl.Path == `` { + parentUrl.Path = `/` + } var node *html.Node for node = range doc.Descendants() { @@ -388,18 +401,24 @@ func (wrk *worker) fetch(linkq jarink.Link) (httpResp *http.Response, err error) func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) ( linkq *jarink.Link, ) { + val = strings.TrimSpace(val) if len(val) == 0 { return nil } + if kind == int(atom.A) && val[0] == '#' { + // Ignore link to ID, like `href="#element_id"`. + return nil + } + if strings.HasPrefix(val, `mailto:`) { + return nil + } linkq = &jarink.Link{ ParentUrl: parentUrl, Kind: kind, } - var newUrl *url.URL - var err error - newUrl, err = url.Parse(val) + newUrl, err := url.Parse(val) if err != nil { linkq.ErrScan = err linkq.Url = val @@ -409,38 +428,31 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) ( newUrl.Fragment = "" newUrl.RawFragment = "" - if kind == int(atom.A) && val[0] == '#' { - // Ignore link to ID, like `href="#element_id"`. - return nil - } - if strings.HasPrefix(val, `mailto:`) { - return nil - } - if !strings.HasPrefix(val, `http`) { - if val[0] == '/' { - // val is absolute link. - newUrl = wrk.baseUrl.JoinPath(newUrl.Path) - } else { - // val is relative to parent URL. - newUrl = genURLRelative(parentUrl, newUrl.Path) + if strings.HasPrefix(val, `http`) { + linkq.Url = newUrl.String() + if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) { + linkq.IsExternal = true } + return linkq } - linkq.Url = strings.TrimSuffix(newUrl.String(), `/`) - if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) { - linkq.IsExternal = true + if val[0] == '/' { + // val is absolute link. + linkq.Url = wrk.baseUrl.JoinPath(newUrl.Path).String() + if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) { + linkq.IsExternal = true + } + return linkq } - return linkq -} -// genURLRelative generate new URL from parent URL and relative path -// `relPath`. -func genURLRelative(parentUrl *url.URL, relPath string) (newUrl *url.URL) { - var parentPath = parentUrl.Path - var ext = strings.ToLower(path.Ext(parentPath)) - if ext == `.html` || ext == `.htm` { - parentPath = path.Dir(parentPath) + // val is relative to parent URL. + ext := path.Ext(parentUrl.Path) + if ext == `` { + linkq.Url = parentUrl.JoinPath(newUrl.Path).String() + } else { + // "parent/page.ext" + "val" => "parent/val" + tmp, _ := url.Parse(parentUrl.String()) + tmp.Path = path.Dir(tmp.Path) + linkq.Url = parentUrl.JoinPath(tmp.Path).String() } - newUrl, _ = url.Parse(parentUrl.String()) - newUrl.Path = path.Join(parentPath, relPath) - return newUrl + return linkq } |
