aboutsummaryrefslogtreecommitdiff
path: root/brokenlinks
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2025-06-12 21:13:58 +0700
committerShulhan <ms@kilabit.info>2025-06-12 21:13:58 +0700
commita02e915388723a5d8cc3b555fb3dfec477fc2a55 (patch)
treeaa35678b263646e1edd730a16cb35a66e7b933d8 /brokenlinks
parentf408c77795a9dd6d4551fadd2e8352ba08915feb (diff)
downloadjarink-a02e915388723a5d8cc3b555fb3dfec477fc2a55.tar.xz
all: refactoring, move brokenlinks code to its own package
When two or more structs have the same prefix, that means it is time to move them into their own group. Also, we will group one command into one package in the future.
Diffstat (limited to 'brokenlinks')
-rw-r--r--brokenlinks/brokenlinks.go39
-rw-r--r--brokenlinks/brokenlinks_test.go227
-rw-r--r--brokenlinks/link_queue.go55
-rw-r--r--brokenlinks/result.go37
-rw-r--r--brokenlinks/testdata/past_result.json10
-rw-r--r--brokenlinks/testdata/past_result.json.license2
-rw-r--r--brokenlinks/testdata/web/broken.html7
-rw-r--r--brokenlinks/testdata/web/gopher.pngbin0 -> 32775 bytes
-rw-r--r--brokenlinks/testdata/web/index.html22
-rw-r--r--brokenlinks/testdata/web/page2/index.html14
-rw-r--r--brokenlinks/worker.go467
11 files changed, 880 insertions, 0 deletions
diff --git a/brokenlinks/brokenlinks.go b/brokenlinks/brokenlinks.go
new file mode 100644
index 0000000..8ac458f
--- /dev/null
+++ b/brokenlinks/brokenlinks.go
@@ -0,0 +1,39 @@
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+// SPDX-License-Identifier: GPL-3.0-only
+
+package brokenlinks
+
+import (
+ "fmt"
+)
+
+// Version of the brokenlinks package.
+const Version = `0.1.0`
+
+// StatusBadLink status for link that is not parseable by [url.Parse] or not
+// reachable during GET or HEAD, either timeout or IP or domain not exist.
+const StatusBadLink = 700
+
+// Options define the options for scanning broken links.
+type Options struct {
+ // Url is the root URL where scanning starts.
+ Url string
+ // PastResultFile, when set, is a path to a JSON file storing a
+ // previous [Result]; only the pages listed there will be
+ // re-scanned.
+ PastResultFile string
+ // IsVerbose, when true, logs each HTTP request and scan error.
+ IsVerbose bool
+}
+
+// Scan the URL for broken links.
+// It returns an error if [Options.Url] cannot be parsed, the file in
+// [Options.PastResultFile] cannot be read or decoded, or the first fetch
+// of the URL fails.
+func Scan(opts Options) (result *Result, err error) {
+ var logp = `brokenlinks`
+ var wrk *worker
+
+ wrk, err = newWorker(opts)
+ if err != nil {
+ return nil, fmt.Errorf(`%s: %s`, logp, err)
+ }
+
+ result, err = wrk.run()
+ if err != nil {
+ return nil, fmt.Errorf(`%s: %s`, logp, err)
+ }
+
+ return result, nil
+}
diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go
new file mode 100644
index 0000000..367ae6c
--- /dev/null
+++ b/brokenlinks/brokenlinks_test.go
@@ -0,0 +1,227 @@
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+// SPDX-License-Identifier: GPL-3.0-only
+
+package brokenlinks_test
+
+import (
+ "encoding/json"
+ "log"
+ "net/http"
+ "os"
+ "testing"
+ "time"
+
+ libnet "git.sr.ht/~shulhan/pakakeh.go/lib/net"
+ "git.sr.ht/~shulhan/pakakeh.go/lib/test"
+
+ "git.sr.ht/~shulhan/jarink/brokenlinks"
+)
+
+// The tests run two web servers that serve the content of "testdata/web/".
+// The first web server is the one that we want to scan.
+// The second web server is an external web server, whose HTML pages should
+// not be parsed.
+
+// testAddress is the address of the server whose pages are crawled.
+const testAddress = `127.0.0.1:11836`
+
+// testExternalAddress serves the same content but is treated as external.
+const testExternalAddress = `127.0.0.1:11900`
+
+// TestMain starts the two static file servers backing all tests and waits
+// until both are reachable before running them.
+func TestMain(m *testing.M) {
+ log.SetFlags(0)
+ var httpDirWeb = http.Dir(`testdata/web`)
+ var fshandle = http.FileServer(httpDirWeb)
+
+ // The server being scanned by the tests.
+ // NOTE(review): this goroutine and the next are identical except for
+ // the address -- consider a small helper taking the address.
+ go func() {
+ var mux = http.NewServeMux()
+ mux.Handle(`/`, fshandle)
+ var testServer = &http.Server{
+ Addr: testAddress,
+ Handler: mux,
+ ReadTimeout: 10 * time.Second,
+ WriteTimeout: 10 * time.Second,
+ MaxHeaderBytes: 1 << 20,
+ }
+ var err = testServer.ListenAndServe()
+ if err != nil {
+ log.Fatal(err)
+ }
+ }()
+ // The "external" server; the scanner must fetch but not crawl it.
+ go func() {
+ var mux = http.NewServeMux()
+ mux.Handle(`/`, fshandle)
+ var testServer = &http.Server{
+ Addr: testExternalAddress,
+ Handler: mux,
+ ReadTimeout: 10 * time.Second,
+ WriteTimeout: 10 * time.Second,
+ MaxHeaderBytes: 1 << 20,
+ }
+ var err = testServer.ListenAndServe()
+ if err != nil {
+ log.Fatal(err)
+ }
+ }()
+
+ // Block until both servers accept connections, or abort the whole
+ // test binary after five seconds.
+ var err = libnet.WaitAlive(`tcp`, testAddress, 5*time.Second)
+ if err != nil {
+ log.Fatal(err)
+ }
+ err = libnet.WaitAlive(`tcp`, testExternalAddress, 5*time.Second)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ os.Exit(m.Run())
+}
+
+// TestBrokenlinks runs scans against the local test server and verifies
+// both the error cases and the reported broken links per page.
+func TestBrokenlinks(t *testing.T) {
+ var testUrl = `http://` + testAddress
+
+ type testCase struct {
+ exp map[string][]brokenlinks.Broken
+ scanUrl string
+ expError string
+ }
+
+ listCase := []testCase{{
+ scanUrl: `127.0.0.1:14594`,
+ expError: `brokenlinks: invalid URL "127.0.0.1:14594"`,
+ }, {
+ scanUrl: `http://127.0.0.1:14594`,
+ expError: `brokenlinks: Get "http://127.0.0.1:14594": dial tcp 127.0.0.1:14594: connect: connection refused`,
+ }, {
+ scanUrl: testUrl,
+ exp: map[string][]brokenlinks.Broken{
+ testUrl: []brokenlinks.Broken{
+ {
+ Link: testUrl + `/broken.png`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/brokenPage`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: `http://127.0.0.1:abc`,
+ Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`,
+ Code: brokenlinks.StatusBadLink,
+ }, {
+ Link: `http:/127.0.0.1:11836`,
+ Error: `Get "http:/127.0.0.1:11836": http: no Host in request URL`,
+ Code: brokenlinks.StatusBadLink,
+ },
+ },
+ testUrl + `/broken.html`: []brokenlinks.Broken{
+ {
+ Link: testUrl + `/brokenPage`,
+ Code: http.StatusNotFound,
+ },
+ },
+ testUrl + `/page2`: []brokenlinks.Broken{
+ {
+ Link: testUrl + `/broken.png`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken/relative`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken2.png`,
+ Code: http.StatusNotFound,
+ },
+ },
+ },
+ }, {
+ // Scanning on "/path" should not scan the "/" or other
+ // pages other than below of "/path" itself.
+ scanUrl: testUrl + `/page2`,
+ exp: map[string][]brokenlinks.Broken{
+ testUrl + `/page2`: []brokenlinks.Broken{
+ {
+ Link: testUrl + `/broken.png`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken/relative`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken2.png`,
+ Code: http.StatusNotFound,
+ },
+ },
+ },
+ }}
+
+ var (
+ result *brokenlinks.Result
+ err error
+ )
+ for _, tcase := range listCase {
+ t.Logf(`--- brokenlinks: %s`, tcase.scanUrl)
+ var opts = brokenlinks.Options{
+ Url: tcase.scanUrl,
+ }
+ result, err = brokenlinks.Scan(opts)
+ if err != nil {
+ test.Assert(t, tcase.scanUrl+` error`,
+ tcase.expError, err.Error())
+ continue
+ }
+ // Debug output, kept for updating the expectations.
+ //got, _ := json.MarshalIndent(result.BrokenLinks, ``, ` `)
+ //t.Logf(`got=%s`, got)
+ test.Assert(t, tcase.scanUrl, tcase.exp, result.BrokenLinks)
+ }
+}
+
+// Test running Brokenlinks with file PastResultFile is set.
+// The PastResultFile is modified to only report errors on "/page2".
+func TestBrokenlinks_pastResult(t *testing.T) {
+ var testUrl = `http://` + testAddress
+
+ type testCase struct {
+ exp map[string][]brokenlinks.Broken
+ expError string
+ opts brokenlinks.Options
+ }
+
+ listCase := []testCase{{
+ // With invalid file.
+ opts: brokenlinks.Options{
+ Url: testUrl,
+ PastResultFile: `testdata/invalid`,
+ },
+ expError: `brokenlinks: open testdata/invalid: no such file or directory`,
+ }, {
+ // With valid file.
+ opts: brokenlinks.Options{
+ Url: testUrl,
+ PastResultFile: `testdata/past_result.json`,
+ },
+ exp: map[string][]brokenlinks.Broken{
+ testUrl + `/page2`: []brokenlinks.Broken{
+ {
+ Link: testUrl + `/broken.png`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken/relative`,
+ Code: http.StatusNotFound,
+ }, {
+ Link: testUrl + `/page2/broken2.png`,
+ Code: http.StatusNotFound,
+ },
+ },
+ },
+ }}
+
+ var (
+ result *brokenlinks.Result
+ err error
+ )
+ for _, tcase := range listCase {
+ t.Logf(`--- brokenlinks: %s`, tcase.opts.Url)
+ result, err = brokenlinks.Scan(tcase.opts)
+ if err != nil {
+ test.Assert(t, tcase.opts.Url+` error`,
+ tcase.expError, err.Error())
+ continue
+ }
+ // Debug output of the actual result, useful when the
+ // expectation needs updating.
+ got, _ := json.MarshalIndent(result.BrokenLinks, ``, ` `)
+ t.Logf(`got=%s`, got)
+ test.Assert(t, tcase.opts.Url, tcase.exp, result.BrokenLinks)
+ }
+}
diff --git a/brokenlinks/link_queue.go b/brokenlinks/link_queue.go
new file mode 100644
index 0000000..164a902
--- /dev/null
+++ b/brokenlinks/link_queue.go
@@ -0,0 +1,55 @@
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+// SPDX-License-Identifier: GPL-3.0-only
+
+package brokenlinks
+
+import (
+ "net/url"
+ "strings"
+
+ "golang.org/x/net/html/atom"
+)
+
+// linkQueue carry one link through the scan pipeline: where it was found,
+// what kind of element referenced it, and its scan status.
+type linkQueue struct {
+ // parentUrl is the URL of the page where this link was found.
+ // It is nil for the first URL being scanned and for pages queued
+ // from a past result.
+ parentUrl *url.URL
+
+ // The error from scan.
+ errScan error
+
+ // url being scanned.
+ url string
+
+ // kind of url, its either an anchor or image.
+ // It set to 0 if url is the first URL being scanned.
+ kind atom.Atom
+
+ // isExternal if true the scan will issue HTTP method HEAD instead of
+ // GET.
+ isExternal bool
+
+ // Status of link after scan, its mostly used the HTTP status code.
+ // 0: link is the result of scan, not processed yet.
+ // StatusBadLink: link is invalid, not parseable or unreachable.
+ // 200 - 211: OK.
+ // 400 - 511: Error.
+ status int
+}
+
+// checkExternal set the isExternal field to be true if
+//
+// (1) [linkQueue.url] does not start with [worker.scanUrl]
+//
+// (2) linkQueue is from scanPastResult, indicated by non-nil
+// [worker.pastResult].
+// In this case, we did not want to scan the other pages from the same scanUrl
+// domain.
+//
+// NOTE(review): the raw HasPrefix in (1) also matches sibling paths that
+// merely share the prefix, for example "/page2x" when scanning "/page2"
+// -- confirm this is intended.
+func (linkq *linkQueue) checkExternal(wrk *worker) {
+ if !strings.HasPrefix(linkq.url, wrk.scanUrl.String()) {
+ linkq.isExternal = true
+ return
+ }
+ if wrk.pastResult != nil {
+ linkq.isExternal = true
+ return
+ }
+}
diff --git a/brokenlinks/result.go b/brokenlinks/result.go
new file mode 100644
index 0000000..676859b
--- /dev/null
+++ b/brokenlinks/result.go
@@ -0,0 +1,37 @@
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+// SPDX-License-Identifier: GPL-3.0-only
+
+package brokenlinks
+
+import (
+ "slices"
+ "strings"
+)
+
+// Broken store the broken link, HTTP status code, and the error message that
+// cause it.
+type Broken struct {
+ // Link is the full URL that failed.
+ Link string `json:"link"`
+ // Error is the scan error message; omitted from JSON when empty.
+ Error string `json:"error,omitempty"`
+ // Code is the HTTP status code, or StatusBadLink (700) when the
+ // link could not be parsed or fetched at all.
+ Code int `json:"code"`
+}
+
+// Result store the result of scanning for broken links.
+type Result struct {
+ // BrokenLinks store the page and its broken links.
+ // The key is the URL of the page where the broken links were found.
+ BrokenLinks map[string][]Broken `json:"broken_links"`
+}
+
+// newResult create a Result with an initialized, empty BrokenLinks map,
+// so callers can write into it without a nil-map check.
+func newResult() *Result {
+ return &Result{
+ BrokenLinks: map[string][]Broken{},
+ }
+}
+
+// sort order each page's list of broken links ascending by Link so the
+// output is deterministic.
+func (result *Result) sort() {
+ for _, listBroken := range result.BrokenLinks {
+ slices.SortFunc(listBroken, func(a, b Broken) int {
+ return strings.Compare(a.Link, b.Link)
+ })
+ }
+}
diff --git a/brokenlinks/testdata/past_result.json b/brokenlinks/testdata/past_result.json
new file mode 100644
index 0000000..ca29d35
--- /dev/null
+++ b/brokenlinks/testdata/past_result.json
@@ -0,0 +1,10 @@
+{
+ "broken_links": {
+ "http://127.0.0.1:11836/page2": [
+ {
+ "link": "http://127.0.0.1:11836/",
+ "code": 404
+ }
+ ]
+ }
+}
diff --git a/brokenlinks/testdata/past_result.json.license b/brokenlinks/testdata/past_result.json.license
new file mode 100644
index 0000000..22616a9
--- /dev/null
+++ b/brokenlinks/testdata/past_result.json.license
@@ -0,0 +1,2 @@
+SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+SPDX-License-Identifier: GPL-3.0-only
diff --git a/brokenlinks/testdata/web/broken.html b/brokenlinks/testdata/web/broken.html
new file mode 100644
index 0000000..533e542
--- /dev/null
+++ b/brokenlinks/testdata/web/broken.html
@@ -0,0 +1,7 @@
+<html>
+ <head></head>
+ <body>
+ <a href="/brokenPage"
+ <p>
+ </body>
+</html>
diff --git a/brokenlinks/testdata/web/gopher.png b/brokenlinks/testdata/web/gopher.png
new file mode 100644
index 0000000..79352be
--- /dev/null
+++ b/brokenlinks/testdata/web/gopher.png
Binary files differ
diff --git a/brokenlinks/testdata/web/index.html b/brokenlinks/testdata/web/index.html
new file mode 100644
index 0000000..61a1f39
--- /dev/null
+++ b/brokenlinks/testdata/web/index.html
@@ -0,0 +1,22 @@
+<!--
+SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+SPDX-License-Identifier: GPL-3.0-only
+-->
+<html>
+ <body>
+ <img src="/broken.png" />
+ <a href="/brokenPage">Broken page</a>
+ <img src="/gopher.png" />
+ <img width="200" src="" />
+ <a href="/page2">Page 2</a>
+ <a href="/broken.html">Broken HTML</a>
+ <a href="http://127.0.0.1:11900">External URL</a>
+ <!-- Error when fetching with GET -->
+ <a href="http:/127.0.0.1:11836">Invalid external URL</a>
+ <!-- Error when parsing URL -->
+ <a href="http://127.0.0.1:abc">Invalid URL port</a>
+ <!-- Fragment should be skipped and cleaned up -->
+ <a href="#goto_a">Same with href to "/"</a>
+ <a href="/page2#goto_a">Same with href to "/page2"</a>
+ </body>
+</html>
diff --git a/brokenlinks/testdata/web/page2/index.html b/brokenlinks/testdata/web/page2/index.html
new file mode 100644
index 0000000..ae6b4ea
--- /dev/null
+++ b/brokenlinks/testdata/web/page2/index.html
@@ -0,0 +1,14 @@
+<!--
+SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+SPDX-License-Identifier: GPL-3.0-only
+-->
+<html>
+ <body>
+ <img src="/broken.png" />
+ <img src="broken2.png" />
+ <a href="broken/relative">broken relative link</a>
+ <a href="/">Back with absolute path</a>
+ <a href="../">Back with relative path</a>
+ <a href="http://127.0.0.1:11900/page2">External URL page2</a>
+ </body>
+</html>
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
new file mode 100644
index 0000000..4ed56d2
--- /dev/null
+++ b/brokenlinks/worker.go
@@ -0,0 +1,467 @@
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
+// SPDX-License-Identifier: GPL-3.0-only
+
+package brokenlinks
+
+import (
+ "encoding/json"
+ "errors"
+ "fmt"
+ "log"
+ "net"
+ "net/http"
+ "net/url"
+ "os"
+ "strings"
+ "sync"
+ "time"
+
+ "golang.org/x/net/html"
+ "golang.org/x/net/html/atom"
+)
+
+// worker hold the state shared by all scanning goroutines for a single
+// call to Scan.
+type worker struct {
+ // seenLink store the URL being or has been scanned and its HTTP
+ // status code.
+ seenLink map[string]int
+
+ // resultq channel that collect result from scanning.
+ resultq chan map[string]linkQueue
+
+ // result contains the final result after all of the pages has been
+ // scanned.
+ result *Result
+
+ // pastResult contains the past scan result, loaded from file
+ // [Options.PastResultFile].
+ pastResult *Result
+
+ // The base URL that will be joined to relative or absolute
+ // links or image.
+ baseUrl *url.URL
+
+ // The URL to scan.
+ scanUrl *url.URL
+
+ // log write verbose and error messages to stderr.
+ log *log.Logger
+
+ // opts is the caller-supplied scan options.
+ opts Options
+
+ // wg sync the goroutine scanner.
+ wg sync.WaitGroup
+}
+
+// newWorker create the worker for opts: parse and normalize the scan URL,
+// and, when [Options.PastResultFile] is set, load the past result from it.
+func newWorker(opts Options) (wrk *worker, err error) {
+ wrk = &worker{
+ opts: opts,
+ seenLink: map[string]int{},
+ resultq: make(chan map[string]linkQueue, 100),
+ result: newResult(),
+ log: log.New(os.Stderr, ``, log.LstdFlags),
+ }
+
+ wrk.scanUrl, err = url.Parse(opts.Url)
+ if err != nil {
+ return nil, fmt.Errorf(`invalid URL %q`, opts.Url)
+ }
+ // Normalize: drop the trailing slash and any fragment so URL
+ // comparisons against scanUrl are consistent.
+ wrk.scanUrl.Path = strings.TrimSuffix(wrk.scanUrl.Path, `/`)
+ wrk.scanUrl.Fragment = ""
+ wrk.scanUrl.RawFragment = ""
+
+ // baseUrl (scheme+host only) is used to resolve absolute paths
+ // like "/image.png".
+ wrk.baseUrl = &url.URL{
+ Scheme: wrk.scanUrl.Scheme,
+ Host: wrk.scanUrl.Host,
+ }
+
+ if opts.PastResultFile == "" {
+ // Run with normal scan.
+ return wrk, nil
+ }
+
+ pastresult, err := os.ReadFile(opts.PastResultFile)
+ if err != nil {
+ return nil, err
+ }
+
+ wrk.pastResult = newResult()
+ err = json.Unmarshal(pastresult, &wrk.pastResult)
+ if err != nil {
+ return nil, err
+ }
+
+ return wrk, nil
+}
+
+// run dispatch the scan: a full crawl from [Options.Url], or a re-check of
+// only the pages listed in the past result when one was loaded.
+func (wrk *worker) run() (result *Result, err error) {
+ if wrk.pastResult == nil {
+ result, err = wrk.scanAll()
+ } else {
+ result, err = wrk.scanPastResult()
+ }
+ return result, err
+}
+
+// scanAll scan all pages start from [Options.Url].
+// Any fetch error on the first URL aborts the whole scan; after that,
+// discovered links are scanned concurrently until the queue drains.
+func (wrk *worker) scanAll() (result *Result, err error) {
+ // Scan the first URL to make sure that the server is reachable.
+ var firstLinkq = linkQueue{
+ parentUrl: nil,
+ url: wrk.scanUrl.String(),
+ status: http.StatusProcessing,
+ }
+ wrk.seenLink[firstLinkq.url] = http.StatusProcessing
+
+ wrk.wg.Add(1)
+ go wrk.scan(firstLinkq)
+ wrk.wg.Wait()
+
+ var resultq = <-wrk.resultq
+ for _, linkq := range resultq {
+ if linkq.url == firstLinkq.url {
+ if linkq.errScan != nil {
+ return nil, linkq.errScan
+ }
+ wrk.seenLink[linkq.url] = linkq.status
+ continue
+ }
+ if linkq.status >= http.StatusBadRequest {
+ wrk.markBroken(linkq)
+ continue
+ }
+
+ wrk.seenLink[linkq.url] = http.StatusProcessing
+ wrk.wg.Add(1)
+ go wrk.scan(linkq)
+ }
+
+ // Poll until no goroutine is scanning, the result queue is empty,
+ // and no link is waiting on another goroutine.
+ // NOTE(review): tick.Stop() is never called on exit -- confirm
+ // whether the ticker should be stopped here.
+ var tick = time.NewTicker(500 * time.Millisecond)
+ var listWaitStatus []linkQueue
+ var isScanning = true
+ for isScanning {
+ select {
+ case resultq := <-wrk.resultq:
+ listWaitStatus = wrk.processResult(resultq, listWaitStatus)
+
+ case <-tick.C:
+ wrk.wg.Wait()
+ if len(wrk.resultq) != 0 {
+ continue
+ }
+ if len(listWaitStatus) != 0 {
+ // There are links that still waiting for
+ // scanning to be completed.
+ continue
+ }
+ isScanning = false
+ }
+ }
+ wrk.result.sort()
+ return wrk.result, nil
+}
+
+// scanPastResult scan only pages reported inside
+// [Result.BrokenLinks].
+// The queued pages have no parentUrl and isExternal is forced true for
+// their child links (see checkExternal), so no further crawling happens.
+//
+// NOTE(review): the polling loop below is identical to the one in
+// scanAll -- consider extracting a shared helper.
+func (wrk *worker) scanPastResult() (
+ result *Result, err error,
+) {
+ go func() {
+ for page := range wrk.pastResult.BrokenLinks {
+ var linkq = linkQueue{
+ parentUrl: nil,
+ url: page,
+ status: http.StatusProcessing,
+ }
+ wrk.seenLink[linkq.url] = http.StatusProcessing
+ wrk.wg.Add(1)
+ go wrk.scan(linkq)
+ }
+ }()
+
+ var tick = time.NewTicker(500 * time.Millisecond)
+ var listWaitStatus []linkQueue
+ var isScanning = true
+ for isScanning {
+ select {
+ case resultq := <-wrk.resultq:
+ listWaitStatus = wrk.processResult(resultq, listWaitStatus)
+
+ case <-tick.C:
+ wrk.wg.Wait()
+ if len(wrk.resultq) != 0 {
+ continue
+ }
+ if len(listWaitStatus) != 0 {
+ // There are links that still waiting for
+ // scanning to be completed.
+ continue
+ }
+ isScanning = false
+ }
+ }
+ wrk.result.sort()
+ return wrk.result, nil
+}
+
+// processResult the resultq contains the original URL being scanned
+// and its child links.
+// For example, scanning "http://example.tld" result in
+//
+// "http://example.tld": {status=200}
+// "http://example.tld/page": {status=0}
+// "http://example.tld/image.png": {status=0}
+// "http://bad:domain/image.png": {status=700}
+//
+// Links still being scanned by another goroutine are appended to the
+// returned newList so the caller can re-check them on the next tick.
+func (wrk *worker) processResult(
+ resultq map[string]linkQueue, listWaitStatus []linkQueue,
+) (
+ newList []linkQueue,
+) {
+ for _, linkq := range resultq {
+ if linkq.status >= http.StatusBadRequest {
+ wrk.markBroken(linkq)
+ continue
+ }
+ if linkq.status != 0 {
+ // linkq is the result of scan with
+ // non error status.
+ wrk.seenLink[linkq.url] = linkq.status
+ continue
+ }
+
+ seenStatus, seen := wrk.seenLink[linkq.url]
+ if !seen {
+ // New link: mark it in-progress and scan it.
+ wrk.seenLink[linkq.url] = http.StatusProcessing
+ wrk.wg.Add(1)
+ go wrk.scan(linkq)
+ continue
+ }
+ if seenStatus >= http.StatusBadRequest {
+ linkq.status = seenStatus
+ wrk.markBroken(linkq)
+ continue
+ }
+ if seenStatus >= http.StatusOK {
+ // The link has been processed and its
+ // not an error.
+ continue
+ }
+ // The link being processed by other goroutine.
+ linkq.status = seenStatus
+ newList = append(newList, linkq)
+ }
+ // Re-check links that were waiting for another goroutine.
+ for _, linkq := range listWaitStatus {
+ seenStatus := wrk.seenLink[linkq.url]
+ if seenStatus >= http.StatusBadRequest {
+ linkq.status = seenStatus
+ wrk.markBroken(linkq)
+ continue
+ }
+ if seenStatus >= http.StatusOK {
+ continue
+ }
+ if seenStatus == http.StatusProcessing {
+ // Scanning still in progress.
+ newList = append(newList, linkq)
+ continue
+ }
+ }
+ return newList
+}
+
+// markBroken record linkq as a broken link under its parent page and
+// remember its status in seenLink so it is not reported twice.
+func (wrk *worker) markBroken(linkq linkQueue) {
+ // The first scanned URL and the pages enqueued by scanPastResult
+ // carry a nil parentUrl; group those under their own URL instead
+ // of panicking on a nil dereference when such a page fails.
+ var parentUrl string
+ if linkq.parentUrl == nil {
+ parentUrl = linkq.url
+ } else {
+ parentUrl = linkq.parentUrl.String()
+ }
+ var listBroken = wrk.result.BrokenLinks[parentUrl]
+ var brokenLink = Broken{
+ Link: linkq.url,
+ Code: linkq.status,
+ }
+ if linkq.errScan != nil {
+ brokenLink.Error = linkq.errScan.Error()
+ }
+ listBroken = append(listBroken, brokenLink)
+ wrk.result.BrokenLinks[parentUrl] = listBroken
+
+ wrk.seenLink[linkq.url] = linkq.status
+}
+
+// scan fetch the HTML page or image to check if its valid.
+// For a non-external HTML page it also parses the body and queues every
+// anchor href and image src it finds; the collected map is pushed to the
+// worker result channel from a new goroutine.
+func (wrk *worker) scan(linkq linkQueue) {
+ defer func() {
+ if wrk.opts.IsVerbose && linkq.errScan != nil {
+ wrk.log.Printf("error: %d %s error=%v\n", linkq.status,
+ linkq.url, linkq.errScan)
+ }
+ wrk.wg.Done()
+ }()
+
+ var (
+ resultq = map[string]linkQueue{}
+ httpResp *http.Response
+ err error
+ )
+ httpResp, err = wrk.fetch(linkq)
+ if err != nil {
+ linkq.status = StatusBadLink
+ linkq.errScan = err
+ resultq[linkq.url] = linkq
+ go wrk.pushResult(resultq)
+ return
+ }
+ defer httpResp.Body.Close()
+
+ linkq.status = httpResp.StatusCode
+ resultq[linkq.url] = linkq
+
+ if httpResp.StatusCode >= http.StatusBadRequest {
+ go wrk.pushResult(resultq)
+ return
+ }
+ // Images and external pages are only status-checked, never parsed.
+ if linkq.kind == atom.Img || linkq.isExternal {
+ go wrk.pushResult(resultq)
+ return
+ }
+
+ var doc *html.Node
+ doc, _ = html.Parse(httpResp.Body)
+
+ // After we check the code and test for [html.Parse] there are
+ // no actual cases where HTML content will return an error.
+ // The only possible error is when reading from body (io.Reader), and
+ // that is also almost impossible.
+ //
+ // [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347
+
+ var scanUrl *url.URL
+
+ scanUrl, err = url.Parse(linkq.url)
+ if err != nil {
+ // NOTE(review): linkq.url was just fetched successfully, so
+ // this parse should never fail; still, log.Fatal aborts the
+ // whole process -- consider returning the error instead.
+ log.Fatal(err)
+ }
+
+ var node *html.Node
+ for node = range doc.Descendants() {
+ if node.Type != html.ElementNode {
+ continue
+ }
+ var nodeLink *linkQueue
+ if node.DataAtom == atom.A {
+ for _, attr := range node.Attr {
+ if attr.Key != `href` {
+ continue
+ }
+ nodeLink = wrk.processLink(scanUrl, attr.Val, atom.A)
+ break
+ }
+ } else if node.DataAtom == atom.Img {
+ for _, attr := range node.Attr {
+ if attr.Key != `src` {
+ continue
+ }
+ nodeLink = wrk.processLink(scanUrl, attr.Val, atom.Img)
+ break
+ }
+ } else {
+ continue
+ }
+ if nodeLink == nil {
+ continue
+ }
+ // Deduplicate links within this page.
+ _, seen := resultq[nodeLink.url]
+ if !seen {
+ nodeLink.checkExternal(wrk)
+ resultq[nodeLink.url] = *nodeLink
+ }
+ }
+ go wrk.pushResult(resultq)
+}
+
+// fetch request linkq.url and return the HTTP response.
+// Images are checked with HEAD, everything else with GET.
+// DNS timeout errors are retried up to maxRetry times; any other error
+// is returned immediately.
+func (wrk *worker) fetch(linkq linkQueue) (
+ httpResp *http.Response,
+ err error,
+) {
+ const maxRetry = 5
+ var retry int
+ for retry < maxRetry {
+ if linkq.kind == atom.Img {
+ if wrk.opts.IsVerbose {
+ wrk.log.Printf("scan: HEAD %s\n", linkq.url)
+ }
+ httpResp, err = http.Head(linkq.url)
+ } else {
+ if wrk.opts.IsVerbose {
+ wrk.log.Printf("scan: GET %s\n", linkq.url)
+ }
+ httpResp, err = http.Get(linkq.url)
+ }
+ if err == nil {
+ return httpResp, nil
+ }
+ var errDNS *net.DNSError
+ if !errors.As(err, &errDNS) {
+ return nil, err
+ }
+ if !errDNS.Timeout() {
+ // A non-timeout DNS error, for example "no such
+ // host", will not recover by retrying.
+ // Returning here also prevents an infinite loop,
+ // since retry is only incremented on timeout.
+ return nil, err
+ }
+ retry++
+ }
+ return nil, err
+}
+
+// processLink normalize the link value val found on page parentUrl and
+// return it as a linkQueue ready for scanning.
+// It returns nil if val is empty or is a fragment-only link ("#...");
+// an unparseable val is returned with status StatusBadLink.
+func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
+ linkq *linkQueue,
+) {
+ if len(val) == 0 {
+ return nil
+ }
+
+ var newUrl *url.URL
+ var err error
+ newUrl, err = url.Parse(val)
+ if err != nil {
+ return &linkQueue{
+ parentUrl: parentUrl,
+ errScan: err,
+ url: val,
+ kind: kind,
+ status: StatusBadLink,
+ }
+ }
+ // Strip the fragment so "/page2#a" and "/page2" count as one link.
+ newUrl.Fragment = ""
+ newUrl.RawFragment = ""
+
+ if kind == atom.A && val[0] == '#' {
+ // Ignore link to ID, like `href="#element_id"`.
+ return nil
+ }
+ // NOTE(review): HasPrefix matches "http://" and "https://" but also
+ // any other scheme starting with "http" -- confirm whether checking
+ // newUrl.Scheme would be more precise.
+ if strings.HasPrefix(val, `http`) {
+ return &linkQueue{
+ parentUrl: parentUrl,
+ url: strings.TrimSuffix(newUrl.String(), `/`),
+ kind: kind,
+ }
+ }
+ if val[0] == '/' {
+ // val is absolute to parent URL.
+ newUrl = wrk.baseUrl.JoinPath(newUrl.Path)
+ } else {
+ // val is relative to parent URL.
+ newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
+ }
+ linkq = &linkQueue{
+ parentUrl: parentUrl,
+ url: strings.TrimSuffix(newUrl.String(), `/`),
+ kind: kind,
+ }
+ return linkq
+}
+
+// pushResult deliver resultq to the worker result channel, blocking until
+// the send succeeds.
+// NOTE(review): the select has only the send case and a ticker case, so
+// this is equivalent to a plain blocking send `wrk.resultq <- resultq`;
+// the ticker merely wakes the loop to retry the same send.
+func (wrk *worker) pushResult(resultq map[string]linkQueue) {
+ var tick = time.NewTicker(100 * time.Millisecond)
+ for {
+ select {
+ case wrk.resultq <- resultq:
+ tick.Stop()
+ return
+ case <-tick.C:
+ }
+ }
+}