aboutsummaryrefslogtreecommitdiff
path: root/brokenlinks
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2026-02-11 10:47:42 +0700
committerShulhan <ms@kilabit.info>2026-02-11 21:45:06 +0700
commit8100b3be0730173a77f1a64f9ac6bc8862a159ac (patch)
tree46d0bdb4a6f3e6c5e709826d61d34259e209d3a4 /brokenlinks
parente59724f4d701f8889167219b2ccc18f4e8954034 (diff)
downloadjarink-8100b3be0730173a77f1a64f9ac6bc8862a159ac.tar.xz
brokenlinks: make links that return HTML always end with a slash
If a parent URL like "/page" returns its body as an HTML page, the URL should end with a slash so that the relative links inside it work when joined with the parent URL.
Diffstat (limited to 'brokenlinks')
-rw-r--r--brokenlinks/brokenlinks_test.go19
-rw-r--r--brokenlinks/options.go4
-rw-r--r--brokenlinks/worker.go76
-rw-r--r--brokenlinks/worker_test.go44
4 files changed, 57 insertions, 86 deletions
diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go
index f251165..ab857cd 100644
--- a/brokenlinks/brokenlinks_test.go
+++ b/brokenlinks/brokenlinks_test.go
@@ -223,11 +223,12 @@ func TestScan(t *testing.T) {
if err != nil {
t.Fatal(err)
}
+ parsedTestUrl.Path = `/`
parsedUrlBrokenHtml, err := url.Parse(testUrl + `/broken.html`)
if err != nil {
t.Fatal(err)
}
- parsedUrlPage2, err := url.Parse(testUrl + `/page2`)
+ parsedUrlPage2, err := url.Parse(testUrl + `/page2/`)
if err != nil {
t.Fatal(err)
}
@@ -254,7 +255,7 @@ func TestScan(t *testing.T) {
opts: brokenlinks.Options{
Url: `http://127.0.0.1:14594`,
},
- expError: `Scan: Get "http://127.0.0.1:14594": dial tcp 127.0.0.1:14594: connect: connection refused`,
+ expError: `Scan: Get "http://127.0.0.1:14594/": dial tcp 127.0.0.1:14594: connect: connection refused`,
}, {
desc: `With invalid IgnoreStatus`,
opts: brokenlinks.Options{
@@ -270,14 +271,14 @@ func TestScan(t *testing.T) {
},
expError: `Scan: Options: unknown status code "50"`,
}, {
- desc: `With Url=testUrl`,
+ desc: `With Url=` + testUrl,
opts: brokenlinks.Options{
Url: testUrl,
IgnoreStatus: `403`,
Insecure: true,
},
exp: map[string][]jarink.Link{
- testUrl: []jarink.Link{
+ testUrl + `/`: []jarink.Link{
{
ParentUrl: parsedTestUrl,
Url: testUrl + `/broken.png`,
@@ -337,7 +338,7 @@ func TestScan(t *testing.T) {
Kind: int(atom.A),
},
},
- testUrl + `/page2`: []jarink.Link{
+ testUrl + `/page2/`: []jarink.Link{
{
ParentUrl: parsedUrlPage2,
Url: testUrl + `/broken.png`,
@@ -359,12 +360,12 @@ func TestScan(t *testing.T) {
}, {
// Scanning on "/page2" should not scan the the "/" or other
// pages other than below of "/page2" itself.
- desc: `With Url=/page2`,
+ desc: `With Url=` + testUrl + `/page2`,
opts: brokenlinks.Options{
Url: testUrl + `/page2`,
},
exp: map[string][]jarink.Link{
- testUrl + `/page2`: []jarink.Link{
+ testUrl + `/page2/`: []jarink.Link{
{
ParentUrl: parsedUrlPage2,
Url: testUrl + `/broken.png`,
@@ -411,7 +412,7 @@ func TestScan(t *testing.T) {
func TestScan_pastResult(t *testing.T) {
var testUrl = `http://` + testAddress
- parsedUrlPage2, err := url.Parse(testUrl + `/page2`)
+ parsedUrlPage2, err := url.Parse(testUrl + `/page2/`)
if err != nil {
t.Fatal(err)
}
@@ -438,7 +439,7 @@ func TestScan_pastResult(t *testing.T) {
IgnoreStatus: `403`,
},
exp: map[string][]jarink.Link{
- testUrl + `/page2`: []jarink.Link{
+ testUrl + `/page2/`: []jarink.Link{
{
ParentUrl: parsedUrlPage2,
Url: testUrl + `/broken.png`,
diff --git a/brokenlinks/options.go b/brokenlinks/options.go
index e5f9fcf..1063f20 100644
--- a/brokenlinks/options.go
+++ b/brokenlinks/options.go
@@ -38,7 +38,9 @@ func (opts *Options) init() (err error) {
if err != nil {
return fmt.Errorf(`%s: invalid URL %q`, logp, opts.Url)
}
- opts.scanUrl.Path = strings.TrimSuffix(opts.scanUrl.Path, `/`)
+ if opts.scanUrl.Path == `` {
+ opts.scanUrl.Path = `/`
+ }
opts.scanUrl.Fragment = ""
opts.scanUrl.RawFragment = ""
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index 082dc11..387cc05 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -288,6 +288,14 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) {
return resultq
}
+ ext := path.Ext(linkq.Url)
+ if ext == `` {
+ if linkq.Url[len(linkq.Url)-1] != '/' {
+ linkq.Url += `/`
+ resultq[linkq.Url] = linkq
+ }
+ }
+
var doc *html.Node
doc, err = html.Parse(httpResp.Body)
if err != nil {
@@ -302,6 +310,8 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) {
location := httpResp.Header.Get(`Location`)
if location == `` {
location = linkq.Url
+ } else {
+ linkq.Url = location
}
parentUrl, err = url.Parse(location)
@@ -311,6 +321,9 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) {
resultq[location] = linkq
return resultq
}
+ if parentUrl.Path == `` {
+ parentUrl.Path = `/`
+ }
var node *html.Node
for node = range doc.Descendants() {
@@ -388,18 +401,24 @@ func (wrk *worker) fetch(linkq jarink.Link) (httpResp *http.Response, err error)
func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) (
linkq *jarink.Link,
) {
+ val = strings.TrimSpace(val)
if len(val) == 0 {
return nil
}
+ if kind == int(atom.A) && val[0] == '#' {
+ // Ignore link to ID, like `href="#element_id"`.
+ return nil
+ }
+ if strings.HasPrefix(val, `mailto:`) {
+ return nil
+ }
linkq = &jarink.Link{
ParentUrl: parentUrl,
Kind: kind,
}
- var newUrl *url.URL
- var err error
- newUrl, err = url.Parse(val)
+ newUrl, err := url.Parse(val)
if err != nil {
linkq.ErrScan = err
linkq.Url = val
@@ -409,38 +428,31 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) (
newUrl.Fragment = ""
newUrl.RawFragment = ""
- if kind == int(atom.A) && val[0] == '#' {
- // Ignore link to ID, like `href="#element_id"`.
- return nil
- }
- if strings.HasPrefix(val, `mailto:`) {
- return nil
- }
- if !strings.HasPrefix(val, `http`) {
- if val[0] == '/' {
- // val is absolute link.
- newUrl = wrk.baseUrl.JoinPath(newUrl.Path)
- } else {
- // val is relative to parent URL.
- newUrl = genURLRelative(parentUrl, newUrl.Path)
+ if strings.HasPrefix(val, `http`) {
+ linkq.Url = newUrl.String()
+ if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) {
+ linkq.IsExternal = true
}
+ return linkq
}
- linkq.Url = strings.TrimSuffix(newUrl.String(), `/`)
- if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) {
- linkq.IsExternal = true
+ if val[0] == '/' {
+ // val is absolute link.
+ linkq.Url = wrk.baseUrl.JoinPath(newUrl.Path).String()
+ if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) {
+ linkq.IsExternal = true
+ }
+ return linkq
}
- return linkq
-}
-// genURLRelative generate new URL from parent URL and relative path
-// `relPath`.
-func genURLRelative(parentUrl *url.URL, relPath string) (newUrl *url.URL) {
- var parentPath = parentUrl.Path
- var ext = strings.ToLower(path.Ext(parentPath))
- if ext == `.html` || ext == `.htm` {
- parentPath = path.Dir(parentPath)
+ // val is relative to parent URL.
+ ext := path.Ext(parentUrl.Path)
+ if ext == `` {
+ linkq.Url = parentUrl.JoinPath(newUrl.Path).String()
+ } else {
+ // "parent/page.ext" + "val" => "parent/val"
+ tmp, _ := url.Parse(parentUrl.String())
+ tmp.Path = path.Dir(tmp.Path)
+ linkq.Url = parentUrl.JoinPath(tmp.Path).String()
}
- newUrl, _ = url.Parse(parentUrl.String())
- newUrl.Path = path.Join(parentPath, relPath)
- return newUrl
+ return linkq
}
diff --git a/brokenlinks/worker_test.go b/brokenlinks/worker_test.go
index 122221e..a125092 100644
--- a/brokenlinks/worker_test.go
+++ b/brokenlinks/worker_test.go
@@ -2,47 +2,3 @@
// SPDX-FileCopyrightText: 2026 M. Shulhan <ms@kilabit.info>
package brokenlinks
-
-import (
- "net/url"
- "testing"
-
- "git.sr.ht/~shulhan/pakakeh.go/lib/test"
-)
-
-func TestGenURLRelative(t *testing.T) {
- listCase := []struct {
- parentURL string
- relPath string
- expURL string
- }{{
- parentURL: `https://domain/a/b/`,
- relPath: `c`,
- expURL: `https://domain/a/b/c`,
- }, {
- parentURL: `https://domain/a/b`,
- relPath: `c`,
- expURL: `https://domain/a/b/c`,
- }, {
- parentURL: `https://domain/a/b/page.html`,
- relPath: `c`,
- expURL: `https://domain/a/b/c`,
- }, {
- parentURL: `https://domain/a/b/page.htm`,
- relPath: `c`,
- expURL: `https://domain/a/b/c`,
- }, {
- parentURL: `https://domain/a/b/page.HTML`,
- relPath: `c`,
- expURL: `https://domain/a/b/c`,
- }, {
- parentURL: `https://domain/a/b/page.HTML`,
- relPath: `../c.html`,
- expURL: `https://domain/a/c.html`,
- }}
- for _, tc := range listCase {
- parentURL, _ := url.Parse(tc.parentURL)
- got := genURLRelative(parentURL, tc.relPath)
- test.Assert(t, ``, tc.expURL, got.String())
- }
-}