aboutsummaryrefslogtreecommitdiff
path: root/brokenlinks/link_queue.go
diff options
context:
space:
mode:
Diffstat (limited to 'brokenlinks/link_queue.go')
-rw-r--r--brokenlinks/link_queue.go55
1 files changed, 55 insertions, 0 deletions
diff --git a/brokenlinks/link_queue.go b/brokenlinks/link_queue.go
new file mode 100644
index 0000000..164a902
--- /dev/null
+++ b/brokenlinks/link_queue.go
@@ -0,0 +1,55 @@
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+// SPDX-License-Identifier: GPL-3.0-only
+
+package brokenlinks
+
+import (
+ "net/url"
+ "strings"
+
+ "golang.org/x/net/html/atom"
+)
+
+type linkQueue struct {
+ parentUrl *url.URL
+
+ // The error from scan.
+ errScan error
+
+ // url being scanned.
+ url string
+
+ // kind of url, its either an anchor or image.
+ // It set to 0 if url is the first URL being scanned.
+ kind atom.Atom
+
+ // isExternal if true the scan will issue HTTP method HEAD instead of
+ // GET.
+ isExternal bool
+
+ // Status of link after scan, its mostly used the HTTP status code.
+ // 0: link is the result of scan, not processed yet.
+ // StatusBadLink: link is invalid, not parseable or unreachable.
+ // 200 - 211: OK.
+ // 400 - 511: Error.
+ status int
+}
+
+// checkExternal set the isExternal field to be true if
+//
+// (1) [linkQueue.url] does not start with [worker.scanUrl]
+//
+// (2) linkQueue is from scanPastResult, indicated by non-nil
+// [worker.pastResult].
+// In this case, we did not want to scan the other pages from the same scanUrl
+// domain.
+func (linkq *linkQueue) checkExternal(wrk *worker) {
+ if !strings.HasPrefix(linkq.url, wrk.scanUrl.String()) {
+ linkq.isExternal = true
+ return
+ }
+ if wrk.pastResult != nil {
+ linkq.isExternal = true
+ return
+ }
+}