diff options
| author | Shulhan <ms@kilabit.info> | 2025-05-29 14:04:51 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2025-05-29 14:04:51 +0700 |
| commit | b0c320e436ff5cdc70ad38a980a2af2a7f3e5dfd (patch) | |
| tree | d7fb2016f426d51b72d506f22345634300528fdf | |
| parent | 3d39941514395137610fb1c58768814a390b7c35 (diff) | |
| download | jarink-b0c320e436ff5cdc70ad38a980a2af2a7f3e5dfd.tar.xz | |
all: ignore HTML page from external domain
Any HTML link that is from a domain other than the scanned domain should
not get parsed.
Such a link is only checked for whether it is valid or not.
| -rw-r--r-- | deadlinks_test.go | 43 | ||||
| -rw-r--r-- | testdata/web/index.html | 1 | ||||
| -rw-r--r-- | testdata/web/page2/index.html | 1 | ||||
| -rw-r--r-- | worker.go | 4 |
4 files changed, 44 insertions, 5 deletions
diff --git a/deadlinks_test.go b/deadlinks_test.go index 519647d..4269e5d 100644 --- a/deadlinks_test.go +++ b/deadlinks_test.go @@ -15,21 +15,54 @@ import ( "git.sr.ht/~shulhan/pakakeh.go/lib/test" ) -const testListenAddress = `127.0.0.1:11836` +// The test run two web servers that serve content on "testdata/web/". +// The first web server is the one that we want to scan. +// The second web server is external web server, where HTML pages should not +// be parsed. + +const testAddress = `127.0.0.1:11836` +const testExternalAddress = `127.0.0.1:11900` func TestMain(m *testing.M) { var httpDirWeb = http.Dir(`testdata/web`) var fshandle = http.FileServer(httpDirWeb) - http.Handle(`/`, fshandle) go func() { - var err = http.ListenAndServe(testListenAddress, nil) + var mux = http.NewServeMux() + mux.Handle(`/`, fshandle) + var testServer = &http.Server{ + Addr: testAddress, + Handler: mux, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + MaxHeaderBytes: 1 << 20, + } + var err = testServer.ListenAndServe() + if err != nil { + log.Fatal(err) + } + }() + go func() { + var mux = http.NewServeMux() + mux.Handle(`/`, fshandle) + var testServer = &http.Server{ + Addr: testExternalAddress, + Handler: mux, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + MaxHeaderBytes: 1 << 20, + } + var err = testServer.ListenAndServe() if err != nil { log.Fatal(err) } }() - var err = libnet.WaitAlive(`tcp`, testListenAddress, 5*time.Second) + var err = libnet.WaitAlive(`tcp`, testAddress, 5*time.Second) + if err != nil { + log.Fatal(err) + } + err = libnet.WaitAlive(`tcp`, testExternalAddress, 5*time.Second) if err != nil { log.Fatal(err) } @@ -38,7 +71,7 @@ func TestMain(m *testing.M) { } func TestDeadLinks_Scan(t *testing.T) { - var testUrl = `http://` + testListenAddress + var testUrl = `http://` + testAddress type testCase struct { exp map[string][]deadlinks.Broken diff --git a/testdata/web/index.html b/testdata/web/index.html index e4d8bd0..f4f86d8 
100644 --- a/testdata/web/index.html +++ b/testdata/web/index.html @@ -9,5 +9,6 @@ SPDX-License-Identifier: GPL-3.0-only <img src="/gopher.png" /> <a href="/page2">Page 2</a> <a href="/broken.html">Broken HTML</a> + <a href="http://127.0.0.1:11900">External URL</a> </body> </html> diff --git a/testdata/web/page2/index.html b/testdata/web/page2/index.html index 0fc7601..ae6b4ea 100644 --- a/testdata/web/page2/index.html +++ b/testdata/web/page2/index.html @@ -9,5 +9,6 @@ SPDX-License-Identifier: GPL-3.0-only <a href="broken/relative">broken relative link</a> <a href="/">Back with absolute path</a> <a href="../">Back with relative path</a> + <a href="http://127.0.0.1:11900/page2">External URL page2</a> </body> </html> @@ -137,6 +137,10 @@ func (wrk *worker) scan(linkq linkQueue) { if linkq.kind == atom.Img { return } + if !strings.HasPrefix(linkq.url, wrk.baseUrl.String()) { + // Do not parse the page from external domain. + return + } err = wrk.parseHTML(linkq.url, httpResp.Body) if err != nil { wrk.errq <- fmt.Errorf(`%s %s: %w`, logp, linkq.url, err) |
