all: ignore HTML page from external domain

Any HTML link that is from a domain other than the scanned domain should not get parsed. It is only checked for whether the link is valid or not.
author: Shulhan <ms@kilabit.info> 2025-05-29 14:04:51 +0700
committer: Shulhan <ms@kilabit.info> 2025-05-29 14:04:51 +0700
commit: b0c320e436ff5cdc70ad38a980a2af2a7f3e5dfd (patch)
tree: d7fb2016f426d51b72d506f22345634300528fdf /worker.go
parent: 3d39941514395137610fb1c58768814a390b7c35 (diff)
download: jarink-b0c320e436ff5cdc70ad38a980a2af2a7f3e5dfd.tar.xz
1 files changed, 4 insertions, 0 deletions
diff --git a/worker.go b/worker.go
index ac25bf4..700c9a5 100644
--- a/worker.go
+++ b/worker.go
@@ -137,6 +137,10 @@ func (wrk *worker) scan(linkq linkQueue) {
 	if linkq.kind == atom.Img {
 		return
 	}
+	if !strings.HasPrefix(linkq.url, wrk.baseUrl.String()) {
+		// Do not parse the page from external domain.
+		return
+	}
 	err = wrk.parseHTML(linkq.url, httpResp.Body)
 	if err != nil {
 		wrk.errq <- fmt.Errorf(`%s %s: %w`, logp, linkq.url, err)
author	Shulhan <ms@kilabit.info>	2025-05-29 14:04:51 +0700
committer	Shulhan <ms@kilabit.info>	2025-05-29 14:04:51 +0700
commit	b0c320e436ff5cdc70ad38a980a2af2a7f3e5dfd (patch)
tree	d7fb2016f426d51b72d506f22345634300528fdf /worker.go
parent	3d39941514395137610fb1c58768814a390b7c35 (diff)
download	jarink-b0c320e436ff5cdc70ad38a980a2af2a7f3e5dfd.tar.xz