aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2025-05-29 14:04:51 +0700
committerShulhan <ms@kilabit.info>2025-05-29 14:04:51 +0700
commitb0c320e436ff5cdc70ad38a980a2af2a7f3e5dfd (patch)
treed7fb2016f426d51b72d506f22345634300528fdf
parent3d39941514395137610fb1c58768814a390b7c35 (diff)
downloadjarink-b0c320e436ff5cdc70ad38a980a2af2a7f3e5dfd.tar.xz
all: ignore HTML page from external domain
Any HTML link that points to a domain other than the scanned domain should not get parsed. It is only checked whether the link is valid or not.
-rw-r--r--deadlinks_test.go43
-rw-r--r--testdata/web/index.html1
-rw-r--r--testdata/web/page2/index.html1
-rw-r--r--worker.go4
4 files changed, 44 insertions, 5 deletions
diff --git a/deadlinks_test.go b/deadlinks_test.go
index 519647d..4269e5d 100644
--- a/deadlinks_test.go
+++ b/deadlinks_test.go
@@ -15,21 +15,54 @@ import (
"git.sr.ht/~shulhan/pakakeh.go/lib/test"
)
-const testListenAddress = `127.0.0.1:11836`
+// The tests run two web servers that serve content from "testdata/web/".
+// The first web server is the one that we want to scan.
+// The second web server is an external web server, whose HTML pages should
+// not be parsed.
+
+const testAddress = `127.0.0.1:11836`
+const testExternalAddress = `127.0.0.1:11900`
func TestMain(m *testing.M) {
var httpDirWeb = http.Dir(`testdata/web`)
var fshandle = http.FileServer(httpDirWeb)
- http.Handle(`/`, fshandle)
go func() {
- var err = http.ListenAndServe(testListenAddress, nil)
+ var mux = http.NewServeMux()
+ mux.Handle(`/`, fshandle)
+ var testServer = &http.Server{
+ Addr: testAddress,
+ Handler: mux,
+ ReadTimeout: 10 * time.Second,
+ WriteTimeout: 10 * time.Second,
+ MaxHeaderBytes: 1 << 20,
+ }
+ var err = testServer.ListenAndServe()
+ if err != nil {
+ log.Fatal(err)
+ }
+ }()
+ go func() {
+ var mux = http.NewServeMux()
+ mux.Handle(`/`, fshandle)
+ var testServer = &http.Server{
+ Addr: testExternalAddress,
+ Handler: mux,
+ ReadTimeout: 10 * time.Second,
+ WriteTimeout: 10 * time.Second,
+ MaxHeaderBytes: 1 << 20,
+ }
+ var err = testServer.ListenAndServe()
if err != nil {
log.Fatal(err)
}
}()
- var err = libnet.WaitAlive(`tcp`, testListenAddress, 5*time.Second)
+ var err = libnet.WaitAlive(`tcp`, testAddress, 5*time.Second)
+ if err != nil {
+ log.Fatal(err)
+ }
+ err = libnet.WaitAlive(`tcp`, testExternalAddress, 5*time.Second)
if err != nil {
log.Fatal(err)
}
@@ -38,7 +71,7 @@ func TestMain(m *testing.M) {
}
func TestDeadLinks_Scan(t *testing.T) {
- var testUrl = `http://` + testListenAddress
+ var testUrl = `http://` + testAddress
type testCase struct {
exp map[string][]deadlinks.Broken
diff --git a/testdata/web/index.html b/testdata/web/index.html
index e4d8bd0..f4f86d8 100644
--- a/testdata/web/index.html
+++ b/testdata/web/index.html
@@ -9,5 +9,6 @@ SPDX-License-Identifier: GPL-3.0-only
<img src="/gopher.png" />
<a href="/page2">Page 2</a>
<a href="/broken.html">Broken HTML</a>
+ <a href="http://127.0.0.1:11900">External URL</a>
</body>
</html>
diff --git a/testdata/web/page2/index.html b/testdata/web/page2/index.html
index 0fc7601..ae6b4ea 100644
--- a/testdata/web/page2/index.html
+++ b/testdata/web/page2/index.html
@@ -9,5 +9,6 @@ SPDX-License-Identifier: GPL-3.0-only
<a href="broken/relative">broken relative link</a>
<a href="/">Back with absolute path</a>
<a href="../">Back with relative path</a>
+ <a href="http://127.0.0.1:11900/page2">External URL page2</a>
</body>
</html>
diff --git a/worker.go b/worker.go
index ac25bf4..700c9a5 100644
--- a/worker.go
+++ b/worker.go
@@ -137,6 +137,10 @@ func (wrk *worker) scan(linkq linkQueue) {
if linkq.kind == atom.Img {
return
}
+ if !strings.HasPrefix(linkq.url, wrk.baseUrl.String()) {
+ // Do not parse the page from external domain.
+ return
+ }
err = wrk.parseHTML(linkq.url, httpResp.Body)
if err != nil {
wrk.errq <- fmt.Errorf(`%s %s: %w`, logp, linkq.url, err)