aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2025-05-30 00:25:33 +0700
committerShulhan <ms@kilabit.info>2025-05-30 00:25:33 +0700
commit37c5a44f1330c968332a0a00d925ed32add80084 (patch)
treef6d27e92ad538ff7c6cecb727a3871e28a6eff1d
parent22b2556a8116eb93f9194efe9e5e6ffb35c2a542 (diff)
downloadjarink-37c5a44f1330c968332a0a00d925ed32add80084.tar.xz
all: cleaning up fragment on links
The fragment part of a URL, for example "/page#fragment", should be removed; otherwise it will be indexed as a different URL.
-rw-r--r--testdata/web/index.html3
-rw-r--r--url_test.go11
-rw-r--r--worker.go43
3 files changed, 41 insertions, 16 deletions
diff --git a/testdata/web/index.html b/testdata/web/index.html
index a245d07..61a1f39 100644
--- a/testdata/web/index.html
+++ b/testdata/web/index.html
@@ -15,5 +15,8 @@ SPDX-License-Identifier: GPL-3.0-only
<a href="http:/127.0.0.1:11836">Invalid external URL</a>
<!-- Error when parsing URL -->
<a href="http://127.0.0.1:abc">Invalid URL port</a>
+ <!-- Fragment should be skipped and cleaned up -->
+ <a href="#goto_a">Same with href to "/"</a>
+ <a href="/page2#goto_a">Same with href to "/page2"</a>
</body>
</html>
diff --git a/url_test.go b/url_test.go
index 698c49c..506090d 100644
--- a/url_test.go
+++ b/url_test.go
@@ -16,6 +16,9 @@ func TestUrlString(t *testing.T) {
exp string
}
var listCase = []testCase{{
+ rawUrl: `/page#goto`,
+ exp: `/page`,
+ }, {
rawUrl: `http://127.0.0.1`,
exp: `http://127.0.0.1`,
}, {
@@ -27,12 +30,20 @@ func TestUrlString(t *testing.T) {
}, {
rawUrl: `http://127.0.0.1/page/`,
exp: `http://127.0.0.1/page/`,
+ }, {
+ rawUrl: `http://127.0.0.1/page/#gotoa`,
+ exp: `http://127.0.0.1/page/`,
+ }, {
+ rawUrl: `http://127.0.0.1/page#gotoa`,
+ exp: `http://127.0.0.1/page`,
}}
for _, tcase := range listCase {
gotUrl, err := url.Parse(tcase.rawUrl)
if err != nil {
t.Fatal(err)
}
+ gotUrl.Fragment = ""
+ gotUrl.RawFragment = ""
var got = gotUrl.String()
test.Assert(t, tcase.rawUrl, tcase.exp, got)
}
diff --git a/worker.go b/worker.go
index c6e344d..f2be07f 100644
--- a/worker.go
+++ b/worker.go
@@ -61,6 +61,8 @@ func newWorker(opts ScanOptions) (wrk *worker, err error) {
}
wrk.scanUrl.Path = strings.TrimSuffix(wrk.scanUrl.Path, `/`)
+ wrk.scanUrl.Fragment = ""
+ wrk.scanUrl.RawFragment = ""
wrk.baseUrl = &url.URL{
Scheme: wrk.scanUrl.Scheme,
@@ -219,11 +221,32 @@ func (wrk *worker) processLink(rawParentUrl, val string, kind atom.Atom) {
log.Fatal(err)
}
+ var newUrl *url.URL
+ newUrl, err = url.Parse(val)
+ if err != nil {
+ var linkq = linkQueue{
+ parentUrl: parentUrl,
+ url: val,
+ kind: kind,
+ }
+ wrk.markDead(linkq, 700)
+ return
+ }
+ newUrl.Fragment = ""
+ newUrl.RawFragment = ""
+
+ var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
+
+ if kind == atom.A && val[0] == '#' {
+ // Ignore link to ID, like `href="#element_id"`.
+ return
+ }
+
// val is absolute to parent URL.
if val[0] == '/' {
// Link to the same domain will queued for scanning.
- var newUrl = wrk.baseUrl.JoinPath(val)
- var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
+ newUrl = wrk.baseUrl.JoinPath(newUrl.Path)
+ newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
wrk.linkq <- linkQueue{
parentUrl: parentUrl,
url: newUrlStr,
@@ -232,18 +255,6 @@ func (wrk *worker) processLink(rawParentUrl, val string, kind atom.Atom) {
return
}
if strings.HasPrefix(val, `http`) {
- var newUrl *url.URL
- newUrl, err = url.Parse(val)
- if err != nil {
- var linkq = linkQueue{
- parentUrl: parentUrl,
- url: val,
- kind: kind,
- }
- wrk.markDead(linkq, 700)
- return
- }
- var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
wrk.linkq <- linkQueue{
parentUrl: parentUrl,
url: newUrlStr,
@@ -252,8 +263,8 @@ func (wrk *worker) processLink(rawParentUrl, val string, kind atom.Atom) {
return
}
// val is relative to parent URL.
- var newUrl = parentUrl.JoinPath(`/`, val)
- var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
+ newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
+ newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
wrk.linkq <- linkQueue{
parentUrl: parentUrl,
url: newUrlStr,