aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--testdata/web/index.html3
-rw-r--r--url_test.go11
-rw-r--r--worker.go43
3 files changed, 41 insertions, 16 deletions
diff --git a/testdata/web/index.html b/testdata/web/index.html
index a245d07..61a1f39 100644
--- a/testdata/web/index.html
+++ b/testdata/web/index.html
@@ -15,5 +15,8 @@ SPDX-License-Identifier: GPL-3.0-only
<a href="http:/127.0.0.1:11836">Invalid external URL</a>
<!-- Error when parsing URL -->
<a href="http://127.0.0.1:abc">Invalid URL port</a>
+ <!-- Fragment should be skipped and cleaned up -->
+ <a href="#goto_a">Same with href to "/"</a>
+ <a href="/page2#goto_a">Same with href to "/page2"</a>
</body>
</html>
diff --git a/url_test.go b/url_test.go
index 698c49c..506090d 100644
--- a/url_test.go
+++ b/url_test.go
@@ -16,6 +16,9 @@ func TestUrlString(t *testing.T) {
exp string
}
var listCase = []testCase{{
+ rawUrl: `/page#goto`,
+ exp: `/page`,
+ }, {
rawUrl: `http://127.0.0.1`,
exp: `http://127.0.0.1`,
}, {
@@ -27,12 +30,20 @@ func TestUrlString(t *testing.T) {
}, {
rawUrl: `http://127.0.0.1/page/`,
exp: `http://127.0.0.1/page/`,
+ }, {
+ rawUrl: `http://127.0.0.1/page/#gotoa`,
+ exp: `http://127.0.0.1/page/`,
+ }, {
+ rawUrl: `http://127.0.0.1/page#gotoa`,
+ exp: `http://127.0.0.1/page`,
}}
for _, tcase := range listCase {
gotUrl, err := url.Parse(tcase.rawUrl)
if err != nil {
t.Fatal(err)
}
+ gotUrl.Fragment = ""
+ gotUrl.RawFragment = ""
var got = gotUrl.String()
test.Assert(t, tcase.rawUrl, tcase.exp, got)
}
diff --git a/worker.go b/worker.go
index c6e344d..f2be07f 100644
--- a/worker.go
+++ b/worker.go
@@ -61,6 +61,8 @@ func newWorker(opts ScanOptions) (wrk *worker, err error) {
}
wrk.scanUrl.Path = strings.TrimSuffix(wrk.scanUrl.Path, `/`)
+ wrk.scanUrl.Fragment = ""
+ wrk.scanUrl.RawFragment = ""
wrk.baseUrl = &url.URL{
Scheme: wrk.scanUrl.Scheme,
@@ -219,11 +221,32 @@ func (wrk *worker) processLink(rawParentUrl, val string, kind atom.Atom) {
log.Fatal(err)
}
+ var newUrl *url.URL
+ newUrl, err = url.Parse(val)
+ if err != nil {
+ var linkq = linkQueue{
+ parentUrl: parentUrl,
+ url: val,
+ kind: kind,
+ }
+ wrk.markDead(linkq, 700)
+ return
+ }
+ newUrl.Fragment = ""
+ newUrl.RawFragment = ""
+
+ var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
+
+ if kind == atom.A && val[0] == '#' {
+ // Ignore link to ID, like `href="#element_id"`.
+ return
+ }
+
// val is absolute to parent URL.
if val[0] == '/' {
// Link to the same domain will queued for scanning.
- var newUrl = wrk.baseUrl.JoinPath(val)
- var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
+ newUrl = wrk.baseUrl.JoinPath(newUrl.Path)
+ newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
wrk.linkq <- linkQueue{
parentUrl: parentUrl,
url: newUrlStr,
@@ -232,18 +255,6 @@ func (wrk *worker) processLink(rawParentUrl, val string, kind atom.Atom) {
return
}
if strings.HasPrefix(val, `http`) {
- var newUrl *url.URL
- newUrl, err = url.Parse(val)
- if err != nil {
- var linkq = linkQueue{
- parentUrl: parentUrl,
- url: val,
- kind: kind,
- }
- wrk.markDead(linkq, 700)
- return
- }
- var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
wrk.linkq <- linkQueue{
parentUrl: parentUrl,
url: newUrlStr,
@@ -252,8 +263,8 @@ func (wrk *worker) processLink(rawParentUrl, val string, kind atom.Atom) {
return
}
// val is relative to parent URL.
- var newUrl = parentUrl.JoinPath(`/`, val)
- var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
+ newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
+ newUrlStr = strings.TrimSuffix(newUrl.String(), `/`)
wrk.linkq <- linkQueue{
parentUrl: parentUrl,
url: newUrlStr,