diff options
| -rw-r--r-- | testdata/web/index.html | 3 | ||||
| -rw-r--r-- | url_test.go | 11 | ||||
| -rw-r--r-- | worker.go | 43 |
3 files changed, 41 insertions, 16 deletions
diff --git a/testdata/web/index.html b/testdata/web/index.html index a245d07..61a1f39 100644 --- a/testdata/web/index.html +++ b/testdata/web/index.html @@ -15,5 +15,8 @@ SPDX-License-Identifier: GPL-3.0-only <a href="http:/127.0.0.1:11836">Invalid external URL</a> <!-- Error when parsing URL --> <a href="http://127.0.0.1:abc">Invalid URL port</a> + <!-- Fragment should be skipped and cleaned up --> + <a href="#goto_a">Same with href to "/"</a> + <a href="/page2#goto_a">Same with href to "/page2"</a> </body> </html> diff --git a/url_test.go b/url_test.go index 698c49c..506090d 100644 --- a/url_test.go +++ b/url_test.go @@ -16,6 +16,9 @@ func TestUrlString(t *testing.T) { exp string } var listCase = []testCase{{ + rawUrl: `/page#goto`, + exp: `/page`, + }, { rawUrl: `http://127.0.0.1`, exp: `http://127.0.0.1`, }, { @@ -27,12 +30,20 @@ func TestUrlString(t *testing.T) { }, { rawUrl: `http://127.0.0.1/page/`, exp: `http://127.0.0.1/page/`, + }, { + rawUrl: `http://127.0.0.1/page/#gotoa`, + exp: `http://127.0.0.1/page/`, + }, { + rawUrl: `http://127.0.0.1/page#gotoa`, + exp: `http://127.0.0.1/page`, }} for _, tcase := range listCase { gotUrl, err := url.Parse(tcase.rawUrl) if err != nil { t.Fatal(err) } + gotUrl.Fragment = "" + gotUrl.RawFragment = "" var got = gotUrl.String() test.Assert(t, tcase.rawUrl, tcase.exp, got) } @@ -61,6 +61,8 @@ func newWorker(opts ScanOptions) (wrk *worker, err error) { } wrk.scanUrl.Path = strings.TrimSuffix(wrk.scanUrl.Path, `/`) + wrk.scanUrl.Fragment = "" + wrk.scanUrl.RawFragment = "" wrk.baseUrl = &url.URL{ Scheme: wrk.scanUrl.Scheme, @@ -219,11 +221,32 @@ func (wrk *worker) processLink(rawParentUrl, val string, kind atom.Atom) { log.Fatal(err) } + var newUrl *url.URL + newUrl, err = url.Parse(val) + if err != nil { + var linkq = linkQueue{ + parentUrl: parentUrl, + url: val, + kind: kind, + } + wrk.markDead(linkq, 700) + return + } + newUrl.Fragment = "" + newUrl.RawFragment = "" + + var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`) + + if kind == atom.A && val[0] == '#' { + // Ignore link to ID, like `href="#element_id"`. + return + } + // val is absolute to parent URL. if val[0] == '/' { // Link to the same domain will queued for scanning. - var newUrl = wrk.baseUrl.JoinPath(val) - var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`) + newUrl = wrk.baseUrl.JoinPath(newUrl.Path) + newUrlStr = strings.TrimSuffix(newUrl.String(), `/`) wrk.linkq <- linkQueue{ parentUrl: parentUrl, url: newUrlStr, @@ -232,18 +255,6 @@ func (wrk *worker) processLink(rawParentUrl, val string, kind atom.Atom) { return } if strings.HasPrefix(val, `http`) { - var newUrl *url.URL - newUrl, err = url.Parse(val) - if err != nil { - var linkq = linkQueue{ - parentUrl: parentUrl, - url: val, - kind: kind, - } - wrk.markDead(linkq, 700) - return - } - var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`) wrk.linkq <- linkQueue{ parentUrl: parentUrl, url: newUrlStr, @@ -252,8 +263,8 @@ func (wrk *worker) processLink(rawParentUrl, val string, kind atom.Atom) { return } // val is relative to parent URL. - var newUrl = parentUrl.JoinPath(`/`, val) - var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`) + newUrl = parentUrl.JoinPath(`/`, newUrl.Path) + newUrlStr = strings.TrimSuffix(newUrl.String(), `/`) wrk.linkq <- linkQueue{ parentUrl: parentUrl, url: newUrlStr, |
