From b0c320e436ff5cdc70ad38a980a2af2a7f3e5dfd Mon Sep 17 00:00:00 2001 From: Shulhan Date: Thu, 29 May 2025 14:04:51 +0700 Subject: all: ignore HTML page from external domain Any HTML link that is from a domain other than the scanned domain should not get parsed. It is only checked to see whether the link is valid or not. --- deadlinks_test.go | 43 ++++++++++++++++++++++++++++++++++++++----- testdata/web/index.html | 1 + testdata/web/page2/index.html | 1 + worker.go | 4 ++++ 4 files changed, 44 insertions(+), 5 deletions(-) diff --git a/deadlinks_test.go b/deadlinks_test.go index 519647d..4269e5d 100644 --- a/deadlinks_test.go +++ b/deadlinks_test.go @@ -15,21 +15,54 @@ import ( "git.sr.ht/~shulhan/pakakeh.go/lib/test" ) -const testListenAddress = `127.0.0.1:11836` +// The test runs two web servers that serve content on "testdata/web/". +// The first web server is the one that we want to scan. +// The second web server is an external web server, where HTML pages should not +// be parsed. + +const testAddress = `127.0.0.1:11836` +const testExternalAddress = `127.0.0.1:11900` func TestMain(m *testing.M) { var httpDirWeb = http.Dir(`testdata/web`) var fshandle = http.FileServer(httpDirWeb) - http.Handle(`/`, fshandle) go func() { - var err = http.ListenAndServe(testListenAddress, nil) + var mux = http.NewServeMux() + mux.Handle(`/`, fshandle) + var testServer = &http.Server{ + Addr: testAddress, + Handler: mux, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + MaxHeaderBytes: 1 << 20, + } + var err = testServer.ListenAndServe() + if err != nil { + log.Fatal(err) + } + }() + go func() { + var mux = http.NewServeMux() + mux.Handle(`/`, fshandle) + var testServer = &http.Server{ + Addr: testExternalAddress, + Handler: mux, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + MaxHeaderBytes: 1 << 20, + } + var err = testServer.ListenAndServe() if err != nil { log.Fatal(err) } }() - var err = libnet.WaitAlive(`tcp`, testListenAddress, 5*time.Second) + var err = 
libnet.WaitAlive(`tcp`, testAddress, 5*time.Second) + if err != nil { + log.Fatal(err) + } + err = libnet.WaitAlive(`tcp`, testExternalAddress, 5*time.Second) if err != nil { log.Fatal(err) } @@ -38,7 +71,7 @@ func TestMain(m *testing.M) { } func TestDeadLinks_Scan(t *testing.T) { - var testUrl = `http://` + testListenAddress + var testUrl = `http://` + testAddress type testCase struct { exp map[string][]deadlinks.Broken diff --git a/testdata/web/index.html b/testdata/web/index.html index e4d8bd0..f4f86d8 100644 --- a/testdata/web/index.html +++ b/testdata/web/index.html @@ -9,5 +9,6 @@ SPDX-License-Identifier: GPL-3.0-only Page 2 Broken HTML + External URL diff --git a/testdata/web/page2/index.html b/testdata/web/page2/index.html index 0fc7601..ae6b4ea 100644 --- a/testdata/web/page2/index.html +++ b/testdata/web/page2/index.html @@ -9,5 +9,6 @@ SPDX-License-Identifier: GPL-3.0-only broken relative link Back with absolute path Back with relative path + External URL page2 diff --git a/worker.go b/worker.go index ac25bf4..700c9a5 100644 --- a/worker.go +++ b/worker.go @@ -137,6 +137,10 @@ func (wrk *worker) scan(linkq linkQueue) { if linkq.kind == atom.Img { return } + if !strings.HasPrefix(linkq.url, wrk.baseUrl.String()) { + // Do not parse the page from external domain. + return + } err = wrk.parseHTML(linkq.url, httpResp.Body) if err != nil { wrk.errq <- fmt.Errorf(`%s %s: %w`, logp, linkq.url, err) -- cgit v1.3