aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--deadlinks_test.go43
-rw-r--r--testdata/web/index.html1
-rw-r--r--testdata/web/page2/index.html1
-rw-r--r--worker.go4
4 files changed, 44 insertions, 5 deletions
diff --git a/deadlinks_test.go b/deadlinks_test.go
index 519647d..4269e5d 100644
--- a/deadlinks_test.go
+++ b/deadlinks_test.go
@@ -15,21 +15,54 @@ import (
"git.sr.ht/~shulhan/pakakeh.go/lib/test"
)
-const testListenAddress = `127.0.0.1:11836`
+// The test runs two web servers that serve content from "testdata/web/".
+// The first web server is the one that we want to scan.
+// The second web server acts as an external web server, whose HTML pages
+// should not be parsed.
+
+const testAddress = `127.0.0.1:11836`
+const testExternalAddress = `127.0.0.1:11900`
func TestMain(m *testing.M) {
var httpDirWeb = http.Dir(`testdata/web`)
var fshandle = http.FileServer(httpDirWeb)
- http.Handle(`/`, fshandle)
go func() {
- var err = http.ListenAndServe(testListenAddress, nil)
+ var mux = http.NewServeMux()
+ mux.Handle(`/`, fshandle)
+ var testServer = &http.Server{
+ Addr: testAddress,
+ Handler: mux,
+ ReadTimeout: 10 * time.Second,
+ WriteTimeout: 10 * time.Second,
+ MaxHeaderBytes: 1 << 20,
+ }
+ var err = testServer.ListenAndServe()
+ if err != nil {
+ log.Fatal(err)
+ }
+ }()
+ go func() {
+ var mux = http.NewServeMux()
+ mux.Handle(`/`, fshandle)
+ var testServer = &http.Server{
+ Addr: testExternalAddress,
+ Handler: mux,
+ ReadTimeout: 10 * time.Second,
+ WriteTimeout: 10 * time.Second,
+ MaxHeaderBytes: 1 << 20,
+ }
+ var err = testServer.ListenAndServe()
if err != nil {
log.Fatal(err)
}
}()
- var err = libnet.WaitAlive(`tcp`, testListenAddress, 5*time.Second)
+ var err = libnet.WaitAlive(`tcp`, testAddress, 5*time.Second)
+ if err != nil {
+ log.Fatal(err)
+ }
+ err = libnet.WaitAlive(`tcp`, testExternalAddress, 5*time.Second)
if err != nil {
log.Fatal(err)
}
@@ -38,7 +71,7 @@ func TestMain(m *testing.M) {
}
func TestDeadLinks_Scan(t *testing.T) {
- var testUrl = `http://` + testListenAddress
+ var testUrl = `http://` + testAddress
type testCase struct {
exp map[string][]deadlinks.Broken
diff --git a/testdata/web/index.html b/testdata/web/index.html
index e4d8bd0..f4f86d8 100644
--- a/testdata/web/index.html
+++ b/testdata/web/index.html
@@ -9,5 +9,6 @@ SPDX-License-Identifier: GPL-3.0-only
<img src="/gopher.png" />
<a href="/page2">Page 2</a>
<a href="/broken.html">Broken HTML</a>
+ <a href="http://127.0.0.1:11900">External URL</a>
</body>
</html>
diff --git a/testdata/web/page2/index.html b/testdata/web/page2/index.html
index 0fc7601..ae6b4ea 100644
--- a/testdata/web/page2/index.html
+++ b/testdata/web/page2/index.html
@@ -9,5 +9,6 @@ SPDX-License-Identifier: GPL-3.0-only
<a href="broken/relative">broken relative link</a>
<a href="/">Back with absolute path</a>
<a href="../">Back with relative path</a>
+ <a href="http://127.0.0.1:11900/page2">External URL page2</a>
</body>
</html>
diff --git a/worker.go b/worker.go
index ac25bf4..700c9a5 100644
--- a/worker.go
+++ b/worker.go
@@ -137,6 +137,10 @@ func (wrk *worker) scan(linkq linkQueue) {
if linkq.kind == atom.Img {
return
}
+ if !strings.HasPrefix(linkq.url, wrk.baseUrl.String()) {
+ // Do not parse pages from an external domain.
+ return
+ }
err = wrk.parseHTML(linkq.url, httpResp.Body)
if err != nil {
wrk.errq <- fmt.Errorf(`%s %s: %w`, logp, linkq.url, err)