aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2026-02-11 10:47:42 +0700
committerShulhan <ms@kilabit.info>2026-02-11 21:45:06 +0700
commit8100b3be0730173a77f1a64f9ac6bc8862a159ac (patch)
tree46d0bdb4a6f3e6c5e709826d61d34259e209d3a4
parente59724f4d701f8889167219b2ccc18f4e8954034 (diff)
downloadjarink-8100b3be0730173a77f1a64f9ac6bc8862a159ac.tar.xz
brokenlinks: make links that return HTML always end with a slash
If a parent URL like "/page" returns its body as an HTML page, the URL should end with a slash so that the relative links inside it work when joined with the parent URL.
-rw-r--r--brokenlinks/brokenlinks_test.go19
-rw-r--r--brokenlinks/options.go4
-rw-r--r--brokenlinks/worker.go76
-rw-r--r--brokenlinks/worker_test.go44
4 files changed, 57 insertions, 86 deletions
diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go
index f251165..ab857cd 100644
--- a/brokenlinks/brokenlinks_test.go
+++ b/brokenlinks/brokenlinks_test.go
@@ -223,11 +223,12 @@ func TestScan(t *testing.T) {
if err != nil {
t.Fatal(err)
}
+ parsedTestUrl.Path = `/`
parsedUrlBrokenHtml, err := url.Parse(testUrl + `/broken.html`)
if err != nil {
t.Fatal(err)
}
- parsedUrlPage2, err := url.Parse(testUrl + `/page2`)
+ parsedUrlPage2, err := url.Parse(testUrl + `/page2/`)
if err != nil {
t.Fatal(err)
}
@@ -254,7 +255,7 @@ func TestScan(t *testing.T) {
opts: brokenlinks.Options{
Url: `http://127.0.0.1:14594`,
},
- expError: `Scan: Get "http://127.0.0.1:14594": dial tcp 127.0.0.1:14594: connect: connection refused`,
+ expError: `Scan: Get "http://127.0.0.1:14594/": dial tcp 127.0.0.1:14594: connect: connection refused`,
}, {
desc: `With invalid IgnoreStatus`,
opts: brokenlinks.Options{
@@ -270,14 +271,14 @@ func TestScan(t *testing.T) {
},
expError: `Scan: Options: unknown status code "50"`,
}, {
- desc: `With Url=testUrl`,
+ desc: `With Url=` + testUrl,
opts: brokenlinks.Options{
Url: testUrl,
IgnoreStatus: `403`,
Insecure: true,
},
exp: map[string][]jarink.Link{
- testUrl: []jarink.Link{
+ testUrl + `/`: []jarink.Link{
{
ParentUrl: parsedTestUrl,
Url: testUrl + `/broken.png`,
@@ -337,7 +338,7 @@ func TestScan(t *testing.T) {
Kind: int(atom.A),
},
},
- testUrl + `/page2`: []jarink.Link{
+ testUrl + `/page2/`: []jarink.Link{
{
ParentUrl: parsedUrlPage2,
Url: testUrl + `/broken.png`,
@@ -359,12 +360,12 @@ func TestScan(t *testing.T) {
}, {
// Scanning on "/page2" should not scan the "/" or other
// pages other than below of "/page2" itself.
- desc: `With Url=/page2`,
+ desc: `With Url=` + testUrl + `/page2`,
opts: brokenlinks.Options{
Url: testUrl + `/page2`,
},
exp: map[string][]jarink.Link{
- testUrl + `/page2`: []jarink.Link{
+ testUrl + `/page2/`: []jarink.Link{
{
ParentUrl: parsedUrlPage2,
Url: testUrl + `/broken.png`,
@@ -411,7 +412,7 @@ func TestScan(t *testing.T) {
func TestScan_pastResult(t *testing.T) {
var testUrl = `http://` + testAddress
- parsedUrlPage2, err := url.Parse(testUrl + `/page2`)
+ parsedUrlPage2, err := url.Parse(testUrl + `/page2/`)
if err != nil {
t.Fatal(err)
}
@@ -438,7 +439,7 @@ func TestScan_pastResult(t *testing.T) {
IgnoreStatus: `403`,
},
exp: map[string][]jarink.Link{
- testUrl + `/page2`: []jarink.Link{
+ testUrl + `/page2/`: []jarink.Link{
{
ParentUrl: parsedUrlPage2,
Url: testUrl + `/broken.png`,
diff --git a/brokenlinks/options.go b/brokenlinks/options.go
index e5f9fcf..1063f20 100644
--- a/brokenlinks/options.go
+++ b/brokenlinks/options.go
@@ -38,7 +38,9 @@ func (opts *Options) init() (err error) {
if err != nil {
return fmt.Errorf(`%s: invalid URL %q`, logp, opts.Url)
}
- opts.scanUrl.Path = strings.TrimSuffix(opts.scanUrl.Path, `/`)
+ if opts.scanUrl.Path == `` {
+ opts.scanUrl.Path = `/`
+ }
opts.scanUrl.Fragment = ""
opts.scanUrl.RawFragment = ""
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index 082dc11..387cc05 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -288,6 +288,14 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) {
return resultq
}
+ ext := path.Ext(linkq.Url)
+ if ext == `` {
+ if linkq.Url[len(linkq.Url)-1] != '/' {
+ linkq.Url += `/`
+ resultq[linkq.Url] = linkq
+ }
+ }
+
var doc *html.Node
doc, err = html.Parse(httpResp.Body)
if err != nil {
@@ -302,6 +310,8 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) {
location := httpResp.Header.Get(`Location`)
if location == `` {
location = linkq.Url
+ } else {
+ linkq.Url = location
}
parentUrl, err = url.Parse(location)
@@ -311,6 +321,9 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) {
resultq[location] = linkq
return resultq
}
+ if parentUrl.Path == `` {
+ parentUrl.Path = `/`
+ }
var node *html.Node
for node = range doc.Descendants() {
@@ -388,18 +401,24 @@ func (wrk *worker) fetch(linkq jarink.Link) (httpResp *http.Response, err error)
func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) (
linkq *jarink.Link,
) {
+ val = strings.TrimSpace(val)
if len(val) == 0 {
return nil
}
+ if kind == int(atom.A) && val[0] == '#' {
+ // Ignore link to ID, like `href="#element_id"`.
+ return nil
+ }
+ if strings.HasPrefix(val, `mailto:`) {
+ return nil
+ }
linkq = &jarink.Link{
ParentUrl: parentUrl,
Kind: kind,
}
- var newUrl *url.URL
- var err error
- newUrl, err = url.Parse(val)
+ newUrl, err := url.Parse(val)
if err != nil {
linkq.ErrScan = err
linkq.Url = val
@@ -409,38 +428,31 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) (
newUrl.Fragment = ""
newUrl.RawFragment = ""
- if kind == int(atom.A) && val[0] == '#' {
- // Ignore link to ID, like `href="#element_id"`.
- return nil
- }
- if strings.HasPrefix(val, `mailto:`) {
- return nil
- }
- if !strings.HasPrefix(val, `http`) {
- if val[0] == '/' {
- // val is absolute link.
- newUrl = wrk.baseUrl.JoinPath(newUrl.Path)
- } else {
- // val is relative to parent URL.
- newUrl = genURLRelative(parentUrl, newUrl.Path)
+ if strings.HasPrefix(val, `http`) {
+ linkq.Url = newUrl.String()
+ if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) {
+ linkq.IsExternal = true
}
+ return linkq
}
- linkq.Url = strings.TrimSuffix(newUrl.String(), `/`)
- if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) {
- linkq.IsExternal = true
+ if val[0] == '/' {
+ // val is absolute link.
+ linkq.Url = wrk.baseUrl.JoinPath(newUrl.Path).String()
+ if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) {
+ linkq.IsExternal = true
+ }
+ return linkq
}
- return linkq
-}
-// genURLRelative generate new URL from parent URL and relative path
-// `relPath`.
-func genURLRelative(parentUrl *url.URL, relPath string) (newUrl *url.URL) {
- var parentPath = parentUrl.Path
- var ext = strings.ToLower(path.Ext(parentPath))
- if ext == `.html` || ext == `.htm` {
- parentPath = path.Dir(parentPath)
+ // val is relative to parent URL.
+ ext := path.Ext(parentUrl.Path)
+ if ext == `` {
+ linkq.Url = parentUrl.JoinPath(newUrl.Path).String()
+ } else {
+ // "parent/page.ext" + "val" => "parent/val"
+ tmp, _ := url.Parse(parentUrl.String())
+ tmp.Path = path.Dir(tmp.Path)
+ linkq.Url = parentUrl.JoinPath(tmp.Path).String()
}
- newUrl, _ = url.Parse(parentUrl.String())
- newUrl.Path = path.Join(parentPath, relPath)
- return newUrl
+ return linkq
}
diff --git a/brokenlinks/worker_test.go b/brokenlinks/worker_test.go
index 122221e..a125092 100644
--- a/brokenlinks/worker_test.go
+++ b/brokenlinks/worker_test.go
@@ -2,47 +2,3 @@
// SPDX-FileCopyrightText: 2026 M. Shulhan <ms@kilabit.info>
package brokenlinks
-
-import (
- "net/url"
- "testing"
-
- "git.sr.ht/~shulhan/pakakeh.go/lib/test"
-)
-
-func TestGenURLRelative(t *testing.T) {
- listCase := []struct {
- parentURL string
- relPath string
- expURL string
- }{{
- parentURL: `https://domain/a/b/`,
- relPath: `c`,
- expURL: `https://domain/a/b/c`,
- }, {
- parentURL: `https://domain/a/b`,
- relPath: `c`,
- expURL: `https://domain/a/b/c`,
- }, {
- parentURL: `https://domain/a/b/page.html`,
- relPath: `c`,
- expURL: `https://domain/a/b/c`,
- }, {
- parentURL: `https://domain/a/b/page.htm`,
- relPath: `c`,
- expURL: `https://domain/a/b/c`,
- }, {
- parentURL: `https://domain/a/b/page.HTML`,
- relPath: `c`,
- expURL: `https://domain/a/b/c`,
- }, {
- parentURL: `https://domain/a/b/page.HTML`,
- relPath: `../c.html`,
- expURL: `https://domain/a/c.html`,
- }}
- for _, tc := range listCase {
- parentURL, _ := url.Parse(tc.parentURL)
- got := genURLRelative(parentURL, tc.relPath)
- test.Assert(t, ``, tc.expURL, got.String())
- }
-}