diff options
| author | Shulhan <ms@kilabit.info> | 2026-02-11 10:47:42 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2026-02-11 21:45:06 +0700 |
| commit | 8100b3be0730173a77f1a64f9ac6bc8862a159ac (patch) | |
| tree | 46d0bdb4a6f3e6c5e709826d61d34259e209d3a4 /brokenlinks/worker.go | |
| parent | e59724f4d701f8889167219b2ccc18f4e8954034 (diff) | |
| download | jarink-8100b3be0730173a77f1a64f9ac6bc8862a159ac.tar.xz | |
brokenlinks: make links that return HTML always end with a slash
If a parent URL like "/page" returns an HTML page as its body, the URL
should end with a slash so that the relative links inside it work when
joined with the parent URL.
Diffstat (limited to 'brokenlinks/worker.go')
| -rw-r--r-- | brokenlinks/worker.go | 76 |
1 files changed, 44 insertions, 32 deletions
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go index 082dc11..387cc05 100644 --- a/brokenlinks/worker.go +++ b/brokenlinks/worker.go @@ -288,6 +288,14 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) { return resultq } + ext := path.Ext(linkq.Url) + if ext == `` { + if linkq.Url[len(linkq.Url)-1] != '/' { + linkq.Url += `/` + resultq[linkq.Url] = linkq + } + } + var doc *html.Node doc, err = html.Parse(httpResp.Body) if err != nil { @@ -302,6 +310,8 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) { location := httpResp.Header.Get(`Location`) if location == `` { location = linkq.Url + } else { + linkq.Url = location } parentUrl, err = url.Parse(location) @@ -311,6 +321,9 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) { resultq[location] = linkq return resultq } + if parentUrl.Path == `` { + parentUrl.Path = `/` + } var node *html.Node for node = range doc.Descendants() { @@ -388,18 +401,24 @@ func (wrk *worker) fetch(linkq jarink.Link) (httpResp *http.Response, err error) func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) ( linkq *jarink.Link, ) { + val = strings.TrimSpace(val) if len(val) == 0 { return nil } + if kind == int(atom.A) && val[0] == '#' { + // Ignore link to ID, like `href="#element_id"`. + return nil + } + if strings.HasPrefix(val, `mailto:`) { + return nil + } linkq = &jarink.Link{ ParentUrl: parentUrl, Kind: kind, } - var newUrl *url.URL - var err error - newUrl, err = url.Parse(val) + newUrl, err := url.Parse(val) if err != nil { linkq.ErrScan = err linkq.Url = val @@ -409,38 +428,31 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) ( newUrl.Fragment = "" newUrl.RawFragment = "" - if kind == int(atom.A) && val[0] == '#' { - // Ignore link to ID, like `href="#element_id"`. 
- return nil - } - if strings.HasPrefix(val, `mailto:`) { - return nil - } - if !strings.HasPrefix(val, `http`) { - if val[0] == '/' { - // val is absolute link. - newUrl = wrk.baseUrl.JoinPath(newUrl.Path) - } else { - // val is relative to parent URL. - newUrl = genURLRelative(parentUrl, newUrl.Path) + if strings.HasPrefix(val, `http`) { + linkq.Url = newUrl.String() + if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) { + linkq.IsExternal = true } + return linkq } - linkq.Url = strings.TrimSuffix(newUrl.String(), `/`) - if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) { - linkq.IsExternal = true + if val[0] == '/' { + // val is absolute link. + linkq.Url = wrk.baseUrl.JoinPath(newUrl.Path).String() + if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) { + linkq.IsExternal = true + } + return linkq } - return linkq -} -// genURLRelative generate new URL from parent URL and relative path -// `relPath`. -func genURLRelative(parentUrl *url.URL, relPath string) (newUrl *url.URL) { - var parentPath = parentUrl.Path - var ext = strings.ToLower(path.Ext(parentPath)) - if ext == `.html` || ext == `.htm` { - parentPath = path.Dir(parentPath) + // val is relative to parent URL. + ext := path.Ext(parentUrl.Path) + if ext == `` { + linkq.Url = parentUrl.JoinPath(newUrl.Path).String() + } else { + // "parent/page.ext" + "val" => "parent/val" + tmp, _ := url.Parse(parentUrl.String()) + tmp.Path = path.Dir(tmp.Path) + linkq.Url = parentUrl.JoinPath(tmp.Path).String() } - newUrl, _ = url.Parse(parentUrl.String()) - newUrl.Path = path.Join(parentPath, relPath) - return newUrl + return linkq } |
