aboutsummaryrefslogtreecommitdiff
path: root/brokenlinks/worker.go
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2026-02-11 10:47:42 +0700
committerShulhan <ms@kilabit.info>2026-02-11 21:45:06 +0700
commit8100b3be0730173a77f1a64f9ac6bc8862a159ac (patch)
tree46d0bdb4a6f3e6c5e709826d61d34259e209d3a4 /brokenlinks/worker.go
parente59724f4d701f8889167219b2ccc18f4e8954034 (diff)
downloadjarink-8100b3be0730173a77f1a64f9ac6bc8862a159ac.tar.xz
brokenlinks: make links that return HTML always end with a slash
If a parent URL like "/page" returns its body as an HTML page, the URL should end with a slash so that the relative links inside it work when joined with the parent URL.
Diffstat (limited to 'brokenlinks/worker.go')
-rw-r--r--brokenlinks/worker.go76
1 files changed, 44 insertions, 32 deletions
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index 082dc11..387cc05 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -288,6 +288,14 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) {
return resultq
}
+ ext := path.Ext(linkq.Url)
+ if ext == `` {
+ if linkq.Url[len(linkq.Url)-1] != '/' {
+ linkq.Url += `/`
+ resultq[linkq.Url] = linkq
+ }
+ }
+
var doc *html.Node
doc, err = html.Parse(httpResp.Body)
if err != nil {
@@ -302,6 +310,8 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) {
location := httpResp.Header.Get(`Location`)
if location == `` {
location = linkq.Url
+ } else {
+ linkq.Url = location
}
parentUrl, err = url.Parse(location)
@@ -311,6 +321,9 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) {
resultq[location] = linkq
return resultq
}
+ if parentUrl.Path == `` {
+ parentUrl.Path = `/`
+ }
var node *html.Node
for node = range doc.Descendants() {
@@ -388,18 +401,24 @@ func (wrk *worker) fetch(linkq jarink.Link) (httpResp *http.Response, err error)
func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) (
linkq *jarink.Link,
) {
+ val = strings.TrimSpace(val)
if len(val) == 0 {
return nil
}
+ if kind == int(atom.A) && val[0] == '#' {
+ // Ignore link to ID, like `href="#element_id"`.
+ return nil
+ }
+ if strings.HasPrefix(val, `mailto:`) {
+ return nil
+ }
linkq = &jarink.Link{
ParentUrl: parentUrl,
Kind: kind,
}
- var newUrl *url.URL
- var err error
- newUrl, err = url.Parse(val)
+ newUrl, err := url.Parse(val)
if err != nil {
linkq.ErrScan = err
linkq.Url = val
@@ -409,38 +428,31 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) (
newUrl.Fragment = ""
newUrl.RawFragment = ""
- if kind == int(atom.A) && val[0] == '#' {
- // Ignore link to ID, like `href="#element_id"`.
- return nil
- }
- if strings.HasPrefix(val, `mailto:`) {
- return nil
- }
- if !strings.HasPrefix(val, `http`) {
- if val[0] == '/' {
- // val is absolute link.
- newUrl = wrk.baseUrl.JoinPath(newUrl.Path)
- } else {
- // val is relative to parent URL.
- newUrl = genURLRelative(parentUrl, newUrl.Path)
+ if strings.HasPrefix(val, `http`) {
+ linkq.Url = newUrl.String()
+ if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) {
+ linkq.IsExternal = true
}
+ return linkq
}
- linkq.Url = strings.TrimSuffix(newUrl.String(), `/`)
- if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) {
- linkq.IsExternal = true
+ if val[0] == '/' {
+ // val is absolute link.
+ linkq.Url = wrk.baseUrl.JoinPath(newUrl.Path).String()
+ if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) {
+ linkq.IsExternal = true
+ }
+ return linkq
}
- return linkq
-}
-// genURLRelative generate new URL from parent URL and relative path
-// `relPath`.
-func genURLRelative(parentUrl *url.URL, relPath string) (newUrl *url.URL) {
- var parentPath = parentUrl.Path
- var ext = strings.ToLower(path.Ext(parentPath))
- if ext == `.html` || ext == `.htm` {
- parentPath = path.Dir(parentPath)
+ // val is relative to parent URL.
+ ext := path.Ext(parentUrl.Path)
+ if ext == `` {
+ linkq.Url = parentUrl.JoinPath(newUrl.Path).String()
+ } else {
+ // "parent/page.ext" + "val" => "parent/val"
+ tmp, _ := url.Parse(parentUrl.String())
+ tmp.Path = path.Dir(tmp.Path)
+ linkq.Url = parentUrl.JoinPath(tmp.Path).String()
}
- newUrl, _ = url.Parse(parentUrl.String())
- newUrl.Path = path.Join(parentPath, relPath)
- return newUrl
+ return linkq
}