From 37c5a44f1330c968332a0a00d925ed32add80084 Mon Sep 17 00:00:00 2001 From: Shulhan Date: Fri, 30 May 2025 00:25:33 +0700 Subject: all: cleaning up fragment on links The fragment part of a URL, for example "/page#fragment", should be removed, otherwise it will be indexed as a different URL. --- testdata/web/index.html | 3 +++ url_test.go | 11 +++++++++++ worker.go | 43 +++++++++++++++++++++++++++---------------- 3 files changed, 41 insertions(+), 16 deletions(-) diff --git a/testdata/web/index.html index a245d07..61a1f39 100644 --- a/testdata/web/index.html +++ b/testdata/web/index.html @@ -15,5 +15,8 @@ SPDX-License-Identifier: GPL-3.0-only Invalid external URL Invalid URL port + + Same with href to "/" + Same with href to "/page2" diff --git a/url_test.go b/url_test.go index 698c49c..506090d 100644 --- a/url_test.go +++ b/url_test.go @@ -16,6 +16,9 @@ func TestUrlString(t *testing.T) { exp string } var listCase = []testCase{{ + rawUrl: `/page#goto`, + exp: `/page`, + }, { rawUrl: `http://127.0.0.1`, exp: `http://127.0.0.1`, }, { @@ -27,12 +30,20 @@ func TestUrlString(t *testing.T) { }, { rawUrl: `http://127.0.0.1/page/`, exp: `http://127.0.0.1/page/`, + }, { + rawUrl: `http://127.0.0.1/page/#gotoa`, + exp: `http://127.0.0.1/page/`, + }, { + rawUrl: `http://127.0.0.1/page#gotoa`, + exp: `http://127.0.0.1/page`, }} for _, tcase := range listCase { gotUrl, err := url.Parse(tcase.rawUrl) if err != nil { t.Fatal(err) } + gotUrl.Fragment = "" + gotUrl.RawFragment = "" var got = gotUrl.String() test.Assert(t, tcase.rawUrl, tcase.exp, got) } diff --git a/worker.go b/worker.go index c6e344d..f2be07f 100644 --- a/worker.go +++ b/worker.go @@ -61,6 +61,8 @@ func newWorker(opts ScanOptions) (wrk *worker, err error) { } wrk.scanUrl.Path = strings.TrimSuffix(wrk.scanUrl.Path, `/`) + wrk.scanUrl.Fragment = "" + wrk.scanUrl.RawFragment = "" wrk.baseUrl = &url.URL{ Scheme: wrk.scanUrl.Scheme, @@ -219,11 +221,32 @@ func (wrk *worker) processLink(rawParentUrl,
val string, kind atom.Atom) { log.Fatal(err) } + var newUrl *url.URL + newUrl, err = url.Parse(val) + if err != nil { + var linkq = linkQueue{ + parentUrl: parentUrl, + url: val, + kind: kind, + } + wrk.markDead(linkq, 700) + return + } + newUrl.Fragment = "" + newUrl.RawFragment = "" + + var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`) + + if kind == atom.A && val[0] == '#' { + // Ignore link to ID, like `href="#element_id"`. + return + } + // val is absolute to parent URL. if val[0] == '/' { // Link to the same domain will queued for scanning. - var newUrl = wrk.baseUrl.JoinPath(val) - var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`) + newUrl = wrk.baseUrl.JoinPath(newUrl.Path) + newUrlStr = strings.TrimSuffix(newUrl.String(), `/`) wrk.linkq <- linkQueue{ parentUrl: parentUrl, url: newUrlStr, @@ -232,18 +255,6 @@ func (wrk *worker) processLink(rawParentUrl, val string, kind atom.Atom) { return } if strings.HasPrefix(val, `http`) { - var newUrl *url.URL - newUrl, err = url.Parse(val) - if err != nil { - var linkq = linkQueue{ - parentUrl: parentUrl, - url: val, - kind: kind, - } - wrk.markDead(linkq, 700) - return - } - var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`) wrk.linkq <- linkQueue{ parentUrl: parentUrl, url: newUrlStr, @@ -252,8 +263,8 @@ func (wrk *worker) processLink(rawParentUrl, val string, kind atom.Atom) { return } // val is relative to parent URL. - var newUrl = parentUrl.JoinPath(`/`, val) - var newUrlStr = strings.TrimSuffix(newUrl.String(), `/`) + newUrl = parentUrl.JoinPath(`/`, newUrl.Path) + newUrlStr = strings.TrimSuffix(newUrl.String(), `/`) wrk.linkq <- linkQueue{ parentUrl: parentUrl, url: newUrlStr, -- cgit v1.3