diff options
| author | Shulhan <ms@kilabit.info> | 2026-02-11 10:47:42 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2026-02-11 21:45:06 +0700 |
| commit | 8100b3be0730173a77f1a64f9ac6bc8862a159ac (patch) | |
| tree | 46d0bdb4a6f3e6c5e709826d61d34259e209d3a4 | |
| parent | e59724f4d701f8889167219b2ccc18f4e8954034 (diff) | |
| download | jarink-8100b3be0730173a77f1a64f9ac6bc8862a159ac.tar.xz | |
brokenlinks: make links that return HTML always end with a slash
If a parent URL like "/page" returns an HTML page as its body, the URL
should end with a slash so that the relative links inside it work when
joined with the parent URL.
| -rw-r--r-- | brokenlinks/brokenlinks_test.go | 19 | ||||
| -rw-r--r-- | brokenlinks/options.go | 4 | ||||
| -rw-r--r-- | brokenlinks/worker.go | 76 | ||||
| -rw-r--r-- | brokenlinks/worker_test.go | 44 |
4 files changed, 57 insertions, 86 deletions
diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go index f251165..ab857cd 100644 --- a/brokenlinks/brokenlinks_test.go +++ b/brokenlinks/brokenlinks_test.go @@ -223,11 +223,12 @@ func TestScan(t *testing.T) { if err != nil { t.Fatal(err) } + parsedTestUrl.Path = `/` parsedUrlBrokenHtml, err := url.Parse(testUrl + `/broken.html`) if err != nil { t.Fatal(err) } - parsedUrlPage2, err := url.Parse(testUrl + `/page2`) + parsedUrlPage2, err := url.Parse(testUrl + `/page2/`) if err != nil { t.Fatal(err) } @@ -254,7 +255,7 @@ func TestScan(t *testing.T) { opts: brokenlinks.Options{ Url: `http://127.0.0.1:14594`, }, - expError: `Scan: Get "http://127.0.0.1:14594": dial tcp 127.0.0.1:14594: connect: connection refused`, + expError: `Scan: Get "http://127.0.0.1:14594/": dial tcp 127.0.0.1:14594: connect: connection refused`, }, { desc: `With invalid IgnoreStatus`, opts: brokenlinks.Options{ @@ -270,14 +271,14 @@ func TestScan(t *testing.T) { }, expError: `Scan: Options: unknown status code "50"`, }, { - desc: `With Url=testUrl`, + desc: `With Url=` + testUrl, opts: brokenlinks.Options{ Url: testUrl, IgnoreStatus: `403`, Insecure: true, }, exp: map[string][]jarink.Link{ - testUrl: []jarink.Link{ + testUrl + `/`: []jarink.Link{ { ParentUrl: parsedTestUrl, Url: testUrl + `/broken.png`, @@ -337,7 +338,7 @@ func TestScan(t *testing.T) { Kind: int(atom.A), }, }, - testUrl + `/page2`: []jarink.Link{ + testUrl + `/page2/`: []jarink.Link{ { ParentUrl: parsedUrlPage2, Url: testUrl + `/broken.png`, @@ -359,12 +360,12 @@ func TestScan(t *testing.T) { }, { // Scanning on "/page2" should not scan the the "/" or other // pages other than below of "/page2" itself. 
- desc: `With Url=/page2`, + desc: `With Url=` + testUrl + `/page2`, opts: brokenlinks.Options{ Url: testUrl + `/page2`, }, exp: map[string][]jarink.Link{ - testUrl + `/page2`: []jarink.Link{ + testUrl + `/page2/`: []jarink.Link{ { ParentUrl: parsedUrlPage2, Url: testUrl + `/broken.png`, @@ -411,7 +412,7 @@ func TestScan(t *testing.T) { func TestScan_pastResult(t *testing.T) { var testUrl = `http://` + testAddress - parsedUrlPage2, err := url.Parse(testUrl + `/page2`) + parsedUrlPage2, err := url.Parse(testUrl + `/page2/`) if err != nil { t.Fatal(err) } @@ -438,7 +439,7 @@ func TestScan_pastResult(t *testing.T) { IgnoreStatus: `403`, }, exp: map[string][]jarink.Link{ - testUrl + `/page2`: []jarink.Link{ + testUrl + `/page2/`: []jarink.Link{ { ParentUrl: parsedUrlPage2, Url: testUrl + `/broken.png`, diff --git a/brokenlinks/options.go b/brokenlinks/options.go index e5f9fcf..1063f20 100644 --- a/brokenlinks/options.go +++ b/brokenlinks/options.go @@ -38,7 +38,9 @@ func (opts *Options) init() (err error) { if err != nil { return fmt.Errorf(`%s: invalid URL %q`, logp, opts.Url) } - opts.scanUrl.Path = strings.TrimSuffix(opts.scanUrl.Path, `/`) + if opts.scanUrl.Path == `` { + opts.scanUrl.Path = `/` + } opts.scanUrl.Fragment = "" opts.scanUrl.RawFragment = "" diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go index 082dc11..387cc05 100644 --- a/brokenlinks/worker.go +++ b/brokenlinks/worker.go @@ -288,6 +288,14 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) { return resultq } + ext := path.Ext(linkq.Url) + if ext == `` { + if linkq.Url[len(linkq.Url)-1] != '/' { + linkq.Url += `/` + resultq[linkq.Url] = linkq + } + } + var doc *html.Node doc, err = html.Parse(httpResp.Body) if err != nil { @@ -302,6 +310,8 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) { location := httpResp.Header.Get(`Location`) if location == `` { location = linkq.Url + } else { + linkq.Url = location } parentUrl, err = 
url.Parse(location) @@ -311,6 +321,9 @@ func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) { resultq[location] = linkq return resultq } + if parentUrl.Path == `` { + parentUrl.Path = `/` + } var node *html.Node for node = range doc.Descendants() { @@ -388,18 +401,24 @@ func (wrk *worker) fetch(linkq jarink.Link) (httpResp *http.Response, err error) func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) ( linkq *jarink.Link, ) { + val = strings.TrimSpace(val) if len(val) == 0 { return nil } + if kind == int(atom.A) && val[0] == '#' { + // Ignore link to ID, like `href="#element_id"`. + return nil + } + if strings.HasPrefix(val, `mailto:`) { + return nil + } linkq = &jarink.Link{ ParentUrl: parentUrl, Kind: kind, } - var newUrl *url.URL - var err error - newUrl, err = url.Parse(val) + newUrl, err := url.Parse(val) if err != nil { linkq.ErrScan = err linkq.Url = val @@ -409,38 +428,31 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) ( newUrl.Fragment = "" newUrl.RawFragment = "" - if kind == int(atom.A) && val[0] == '#' { - // Ignore link to ID, like `href="#element_id"`. - return nil - } - if strings.HasPrefix(val, `mailto:`) { - return nil - } - if !strings.HasPrefix(val, `http`) { - if val[0] == '/' { - // val is absolute link. - newUrl = wrk.baseUrl.JoinPath(newUrl.Path) - } else { - // val is relative to parent URL. - newUrl = genURLRelative(parentUrl, newUrl.Path) + if strings.HasPrefix(val, `http`) { + linkq.Url = newUrl.String() + if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) { + linkq.IsExternal = true } + return linkq } - linkq.Url = strings.TrimSuffix(newUrl.String(), `/`) - if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) { - linkq.IsExternal = true + if val[0] == '/' { + // val is absolute link. 
+ linkq.Url = wrk.baseUrl.JoinPath(newUrl.Path).String() + if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) { + linkq.IsExternal = true + } + return linkq } - return linkq -} -// genURLRelative generate new URL from parent URL and relative path -// `relPath`. -func genURLRelative(parentUrl *url.URL, relPath string) (newUrl *url.URL) { - var parentPath = parentUrl.Path - var ext = strings.ToLower(path.Ext(parentPath)) - if ext == `.html` || ext == `.htm` { - parentPath = path.Dir(parentPath) + // val is relative to parent URL. + ext := path.Ext(parentUrl.Path) + if ext == `` { + linkq.Url = parentUrl.JoinPath(newUrl.Path).String() + } else { + // "parent/page.ext" + "val" => "parent/val" + tmp, _ := url.Parse(parentUrl.String()) + tmp.Path = path.Dir(tmp.Path) + linkq.Url = parentUrl.JoinPath(tmp.Path).String() } - newUrl, _ = url.Parse(parentUrl.String()) - newUrl.Path = path.Join(parentPath, relPath) - return newUrl + return linkq } diff --git a/brokenlinks/worker_test.go b/brokenlinks/worker_test.go index 122221e..a125092 100644 --- a/brokenlinks/worker_test.go +++ b/brokenlinks/worker_test.go @@ -2,47 +2,3 @@ // SPDX-FileCopyrightText: 2026 M. 
Shulhan <ms@kilabit.info> package brokenlinks - -import ( - "net/url" - "testing" - - "git.sr.ht/~shulhan/pakakeh.go/lib/test" -) - -func TestGenURLRelative(t *testing.T) { - listCase := []struct { - parentURL string - relPath string - expURL string - }{{ - parentURL: `https://domain/a/b/`, - relPath: `c`, - expURL: `https://domain/a/b/c`, - }, { - parentURL: `https://domain/a/b`, - relPath: `c`, - expURL: `https://domain/a/b/c`, - }, { - parentURL: `https://domain/a/b/page.html`, - relPath: `c`, - expURL: `https://domain/a/b/c`, - }, { - parentURL: `https://domain/a/b/page.htm`, - relPath: `c`, - expURL: `https://domain/a/b/c`, - }, { - parentURL: `https://domain/a/b/page.HTML`, - relPath: `c`, - expURL: `https://domain/a/b/c`, - }, { - parentURL: `https://domain/a/b/page.HTML`, - relPath: `../c.html`, - expURL: `https://domain/a/c.html`, - }} - for _, tc := range listCase { - parentURL, _ := url.Parse(tc.parentURL) - got := genURLRelative(parentURL, tc.relPath) - test.Assert(t, ``, tc.expURL, got.String()) - } -} |
