aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorShulhan <ms@kilabit.info>2026-01-22 01:39:41 +0700
committerShulhan <ms@kilabit.info>2026-01-22 01:39:41 +0700
commit79eaccc81b85eb92dab9cf18d52662f367903652 (patch)
treeaf58482138c2ba9211029174ab579e951cd7fff6
parent26fc8bd3203dae6b4705ada227439c90129bbe36 (diff)
downloadjarink-79eaccc81b85eb92dab9cf18d52662f367903652.tar.xz
all: refactoring, use single struct to represent Link
Previously, we had [jarink.Link], [brokenlinks.Broken], and [brokenlinks.linkQueue] to store the metadata for a link. This change unifies them into a single struct, [jarink.Link].
-rw-r--r--Makefile2
-rw-r--r--README.md4
-rw-r--r--brokenlinks/brokenlinks_test.go231
-rw-r--r--brokenlinks/link_queue.go38
-rw-r--r--brokenlinks/options.go2
-rw-r--r--brokenlinks/result.go20
-rw-r--r--brokenlinks/testdata/exp_cache.json6
-rw-r--r--brokenlinks/worker.go138
-rw-r--r--cache.go11
-rw-r--r--jarink.go2
-rw-r--r--link.go31
-rw-r--r--url_test.go2
12 files changed, 275 insertions, 212 deletions
diff --git a/Makefile b/Makefile
index cc311e5..0226504 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
-## SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
## SPDX-License-Identifier: GPL-3.0-only
+## SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
COVER_OUT:=cover.out
COVER_HTML:=cover.html
diff --git a/README.md b/README.md
index ca63317..cbefa5a 100644
--- a/README.md
+++ b/README.md
@@ -36,9 +36,9 @@ JSON format to standard output,
```
{
"$PAGE": [{
- "link": <string>,
+ "url": <string>,
"error": <string>,
- "code": <integer>
+ "status_code": <integer>
},
...
],
diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go
index db3775a..a04be7f 100644
--- a/brokenlinks/brokenlinks_test.go
+++ b/brokenlinks/brokenlinks_test.go
@@ -4,8 +4,11 @@
package brokenlinks_test
import (
+ "errors"
"log"
+ "net"
"net/http"
+ "net/url"
"os"
"path/filepath"
"testing"
@@ -13,7 +16,9 @@ import (
libnet "git.sr.ht/~shulhan/pakakeh.go/lib/net"
"git.sr.ht/~shulhan/pakakeh.go/lib/test"
+ "golang.org/x/net/html/atom"
+ "git.sr.ht/~shulhan/jarink"
"git.sr.ht/~shulhan/jarink/brokenlinks"
"git.sr.ht/~shulhan/jarink/internal"
)
@@ -212,8 +217,27 @@ func runServerSlow() {
func TestScan(t *testing.T) {
var testUrl = `http://` + testAddress
+ // Generate ParentUrl.
+
+ parsedTestUrl, err := url.Parse(testUrl)
+ if err != nil {
+ t.Fatal(err)
+ }
+ parsedUrlBrokenHtml, err := url.Parse(testUrl + `/broken.html`)
+ if err != nil {
+ t.Fatal(err)
+ }
+ parsedUrlPage2, err := url.Parse(testUrl + `/page2`)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ // Generate error for ErrScan.
+
+ _, errScanPort := url.Parse(`http://127.0.0.1:abc`)
+
type testCase struct {
- exp map[string][]brokenlinks.Broken
+ exp map[string][]jarink.Link
expError string
desc string
opts brokenlinks.Options
@@ -246,92 +270,133 @@ func TestScan(t *testing.T) {
},
expError: `Scan: Options: unknown status code "50"`,
}, {
- desc: `With ` + testUrl,
+ desc: `With Url=testUrl`,
opts: brokenlinks.Options{
Url: testUrl,
IgnoreStatus: `403`,
Insecure: true,
IsVerbose: true,
},
- exp: map[string][]brokenlinks.Broken{
- testUrl: []brokenlinks.Broken{
+ exp: map[string][]jarink.Link{
+ testUrl: []jarink.Link{
{
- Link: testUrl + `/broken.png`,
- Code: http.StatusNotFound,
+ ParentUrl: parsedTestUrl,
+ Url: testUrl + `/broken.png`,
+ StatusCode: http.StatusNotFound,
+ Kind: int(atom.Img),
}, {
- Link: testUrl + `/brokenPage`,
- Code: http.StatusNotFound,
+ ParentUrl: parsedTestUrl,
+ Url: testUrl + `/brokenPage`,
+ StatusCode: http.StatusNotFound,
+ Kind: int(atom.A),
}, {
- Link: `http://127.0.0.1:abc`,
- Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`,
- Code: brokenlinks.StatusBadLink,
+ ParentUrl: parsedTestUrl,
+ Url: `http://127.0.0.1:abc`,
+ ErrScan: errScanPort,
+ Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`,
+ StatusCode: brokenlinks.StatusBadLink,
+ Kind: int(atom.A),
}, {
- Link: `http:/127.0.0.1:11836`,
- Error: `Get "http:/127.0.0.1:11836": http: no Host in request URL`,
- Code: brokenlinks.StatusBadLink,
+ ParentUrl: parsedTestUrl,
+ Url: `http:/127.0.0.1:11836`,
+ ErrScan: &url.Error{
+ Op: `Get`,
+ URL: `http:/127.0.0.1:11836`,
+ Err: errors.New(`http: no Host in request URL`),
+ },
+ Error: `Get "http:/127.0.0.1:11836": http: no Host in request URL`,
+ StatusCode: brokenlinks.StatusBadLink,
+ Kind: int(atom.A),
+ IsExternal: true,
}, {
- Link: `https://domain`,
- Error: `Get "https://domain": dial tcp: lookup domain: no such host`,
- Code: 700,
+ ParentUrl: parsedTestUrl,
+ Url: `https://domain`,
+ ErrScan: &url.Error{
+ Op: `Get`,
+ URL: `https://domain`,
+ Err: &net.OpError{
+ Op: `dial`,
+ Net: `tcp`,
+ Err: &net.DNSError{
+ Err: `no such host`,
+ Name: `domain`,
+ IsNotFound: true,
+ },
+ },
+ },
+ Error: `Get "https://domain": dial tcp: lookup domain: no such host`,
+ StatusCode: 700,
+ Kind: int(atom.A),
+ IsExternal: true,
},
},
- testUrl + `/broken.html`: []brokenlinks.Broken{
+ testUrl + `/broken.html`: []jarink.Link{
{
- Link: testUrl + `/brokenPage`,
- Code: http.StatusNotFound,
+ ParentUrl: parsedUrlBrokenHtml,
+ Url: testUrl + `/brokenPage`,
+ StatusCode: http.StatusNotFound,
+ Kind: int(atom.A),
},
},
- testUrl + `/page2`: []brokenlinks.Broken{
+ testUrl + `/page2`: []jarink.Link{
{
- Link: testUrl + `/broken.png`,
- Code: http.StatusNotFound,
+ ParentUrl: parsedUrlPage2,
+ Url: testUrl + `/broken.png`,
+ StatusCode: http.StatusNotFound,
+ Kind: int(atom.Img),
}, {
- Link: testUrl + `/page2/broken/relative`,
- Code: http.StatusNotFound,
+ ParentUrl: parsedUrlPage2,
+ Url: testUrl + `/page2/broken/relative`,
+ StatusCode: http.StatusNotFound,
+ Kind: int(atom.A),
}, {
- Link: testUrl + `/page2/broken2.png`,
- Code: http.StatusNotFound,
+ ParentUrl: parsedUrlPage2,
+ Url: testUrl + `/page2/broken2.png`,
+ StatusCode: http.StatusNotFound,
+ Kind: int(atom.Img),
},
},
},
}, {
- desc: `With ` + testUrl + `/page2`,
// Scanning on "/page2" should not scan the "/" or other
// pages other than those below "/page2" itself.
+ desc: `With Url=/page2`,
opts: brokenlinks.Options{
Url: testUrl + `/page2`,
IsVerbose: true,
},
- exp: map[string][]brokenlinks.Broken{
- testUrl + `/page2`: []brokenlinks.Broken{
+ exp: map[string][]jarink.Link{
+ testUrl + `/page2`: []jarink.Link{
{
- Link: testUrl + `/broken.png`,
- Code: http.StatusNotFound,
+ ParentUrl: parsedUrlPage2,
+ Url: testUrl + `/broken.png`,
+ StatusCode: http.StatusNotFound,
+ Kind: int(atom.Img),
+ IsExternal: true,
}, {
- Link: testUrl + `/page2/broken/relative`,
- Code: http.StatusNotFound,
+ ParentUrl: parsedUrlPage2,
+ Url: testUrl + `/page2/broken/relative`,
+ StatusCode: http.StatusNotFound,
+ Kind: int(atom.A),
}, {
- Link: testUrl + `/page2/broken2.png`,
- Code: http.StatusNotFound,
+ ParentUrl: parsedUrlPage2,
+ Url: testUrl + `/page2/broken2.png`,
+ StatusCode: http.StatusNotFound,
+ Kind: int(atom.Img),
},
},
},
}}
- var (
- result *brokenlinks.Result
- err error
- )
for _, tcase := range listCase {
t.Run(tcase.desc, func(tt *testing.T) {
internal.CacheFile = func() (string, error) {
return tt.TempDir() + `/cache.json`, nil
}
- result, err = brokenlinks.Scan(tcase.opts)
+ result, err := brokenlinks.Scan(tcase.opts)
if err != nil {
- test.Assert(tt, tcase.opts.Url+` error`,
- tcase.expError, err.Error())
+ test.Assert(tt, tcase.opts.Url+` error`, tcase.expError, err.Error())
return
}
@@ -348,8 +413,13 @@ func TestScan(t *testing.T) {
func TestScan_pastResult(t *testing.T) {
var testUrl = `http://` + testAddress
+ parsedUrlPage2, err := url.Parse(testUrl + `/page2`)
+ if err != nil {
+ t.Fatal(err)
+ }
+
type testCase struct {
- exp map[string][]brokenlinks.Broken
+ exp map[string][]jarink.Link
desc string
expError string
opts brokenlinks.Options
@@ -369,33 +439,36 @@ func TestScan_pastResult(t *testing.T) {
PastResultFile: `testdata/past_result.json`,
IgnoreStatus: `403`,
},
- exp: map[string][]brokenlinks.Broken{
- testUrl + `/page2`: []brokenlinks.Broken{
+ exp: map[string][]jarink.Link{
+ testUrl + `/page2`: []jarink.Link{
{
- Link: testUrl + `/broken.png`,
- Code: http.StatusNotFound,
+ ParentUrl: parsedUrlPage2,
+ Url: testUrl + `/broken.png`,
+ StatusCode: http.StatusNotFound,
+ Kind: int(atom.Img),
+ IsExternal: true,
}, {
- Link: testUrl + `/page2/broken/relative`,
- Code: http.StatusNotFound,
+ ParentUrl: parsedUrlPage2,
+ Url: testUrl + `/page2/broken/relative`,
+ StatusCode: http.StatusNotFound,
+ Kind: int(atom.A),
}, {
- Link: testUrl + `/page2/broken2.png`,
- Code: http.StatusNotFound,
+ ParentUrl: parsedUrlPage2,
+ Url: testUrl + `/page2/broken2.png`,
+ StatusCode: http.StatusNotFound,
+ Kind: int(atom.Img),
},
},
},
}}
- var (
- result *brokenlinks.Result
- err error
- )
for _, tcase := range listCase {
t.Run(tcase.desc, func(tt *testing.T) {
internal.CacheFile = func() (string, error) {
return tt.TempDir() + `/cache.json`, nil
}
- result, err = brokenlinks.Scan(tcase.opts)
+ result, err := brokenlinks.Scan(tcase.opts)
if err != nil {
test.Assert(tt, tcase.opts.Url+` error`, tcase.expError, err.Error())
return
@@ -410,14 +483,24 @@ func TestScan_pastResult(t *testing.T) {
func TestScan_slow(t *testing.T) {
const testUrl = `http://` + testAddressSlow
+ parsedUrlSlow1, err := url.Parse(testUrl + `/slow1`)
+ if err != nil {
+ t.Fatal(err)
+ }
+ parsedUrlSlow2, err := url.Parse(testUrl + `/slow2`)
+ if err != nil {
+ t.Fatal(err)
+ }
+ parsedUrlSlow3, err := url.Parse(testUrl + `/slow3`)
+ if err != nil {
+ t.Fatal(err)
+ }
+
var opts = brokenlinks.Options{
Url: testUrl,
IsVerbose: true,
}
-
- var gotResult *brokenlinks.Result
- var err error
- gotResult, err = brokenlinks.Scan(opts)
+ gotResult, err := brokenlinks.Scan(opts)
if err != nil {
t.Fatal(err)
}
@@ -426,18 +509,24 @@ func TestScan_slow(t *testing.T) {
//t.Logf(`got=%s`, got)
var expResult = &brokenlinks.Result{
- BrokenLinks: map[string][]brokenlinks.Broken{
- testUrl + `/slow1`: []brokenlinks.Broken{{
- Link: testUrl + `/slow3/sub`,
- Code: http.StatusForbidden,
+ BrokenLinks: map[string][]jarink.Link{
+ testUrl + `/slow1`: []jarink.Link{{
+ ParentUrl: parsedUrlSlow1,
+ Url: testUrl + `/slow3/sub`,
+ StatusCode: http.StatusForbidden,
+ Kind: int(atom.A),
}},
- testUrl + `/slow2`: []brokenlinks.Broken{{
- Link: testUrl + `/slow3/sub`,
- Code: http.StatusForbidden,
+ testUrl + `/slow2`: []jarink.Link{{
+ ParentUrl: parsedUrlSlow2,
+ Url: testUrl + `/slow3/sub`,
+ StatusCode: http.StatusForbidden,
+ Kind: int(atom.A),
}},
- testUrl + `/slow3`: []brokenlinks.Broken{{
- Link: testUrl + `/slow3/sub`,
- Code: http.StatusForbidden,
+ testUrl + `/slow3`: []jarink.Link{{
+ ParentUrl: parsedUrlSlow3,
+ Url: testUrl + `/slow3/sub`,
+ StatusCode: http.StatusForbidden,
+ Kind: int(atom.A),
}},
},
}
diff --git a/brokenlinks/link_queue.go b/brokenlinks/link_queue.go
deleted file mode 100644
index 14bf8c7..0000000
--- a/brokenlinks/link_queue.go
+++ /dev/null
@@ -1,38 +0,0 @@
-// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
-// SPDX-License-Identifier: GPL-3.0-only
-
-package brokenlinks
-
-import (
- "net/url"
-
- "golang.org/x/net/html/atom"
-)
-
-type linkQueue struct {
- parentUrl *url.URL
-
- // The error from scan.
- errScan error
-
- // url being scanned.
- url string
-
- // kind of url, its either an anchor or image.
- // It set to 0 if url is the first URL being scanned.
- kind atom.Atom
-
- // isExternal if true the scan will issue HTTP method HEAD instead of
- // GET.
- isExternal bool
-
- // Status of link after scan, its mostly used the HTTP status code.
- // 0: link is the result of scan, not processed yet.
- // StatusBadLink: link is invalid, not parseable or unreachable.
- // 200 - 211: OK.
- // 400 - 511: Error.
- status int
-
- // Size of the page, derived from HTTP response ContentLength.
- size int64
-}
diff --git a/brokenlinks/options.go b/brokenlinks/options.go
index 3e69daf..2703f8d 100644
--- a/brokenlinks/options.go
+++ b/brokenlinks/options.go
@@ -1,5 +1,5 @@
-// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
// SPDX-License-Identifier: GPL-3.0-only
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
package brokenlinks
diff --git a/brokenlinks/result.go b/brokenlinks/result.go
index 676859b..1cd49a3 100644
--- a/brokenlinks/result.go
+++ b/brokenlinks/result.go
@@ -1,37 +1,31 @@
-// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
// SPDX-License-Identifier: GPL-3.0-only
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
package brokenlinks
import (
"slices"
"strings"
-)
-// Broken store the broken link, HTTP status code, and the error message that
-// cause it.
-type Broken struct {
- Link string `json:"link"`
- Error string `json:"error,omitempty"`
- Code int `json:"code"`
-}
+ "git.sr.ht/~shulhan/jarink"
+)
// Result store the result of scanning for broken links.
type Result struct {
// BrokenLinks store the page and its broken links.
- BrokenLinks map[string][]Broken `json:"broken_links"`
+ BrokenLinks map[string][]jarink.Link `json:"broken_links"`
}
func newResult() *Result {
return &Result{
- BrokenLinks: map[string][]Broken{},
+ BrokenLinks: map[string][]jarink.Link{},
}
}
func (result *Result) sort() {
for _, listBroken := range result.BrokenLinks {
- slices.SortFunc(listBroken, func(a, b Broken) int {
- return strings.Compare(a.Link, b.Link)
+ slices.SortFunc(listBroken, func(a, b jarink.Link) int {
+ return strings.Compare(a.Url, b.Url)
})
}
}
diff --git a/brokenlinks/testdata/exp_cache.json b/brokenlinks/testdata/exp_cache.json
index 8b84ff7..f9aa32a 100644
--- a/brokenlinks/testdata/exp_cache.json
+++ b/brokenlinks/testdata/exp_cache.json
@@ -3,17 +3,17 @@
"http://127.0.0.1:11900": {
"url": "http://127.0.0.1:11900",
"size": 1064,
- "response_code": 200
+ "status_code": 200
},
"http://127.0.0.1:11900/page2": {
"url": "http://127.0.0.1:11900/page2",
"size": 410,
- "response_code": 200
+ "status_code": 200
},
"https://127.0.0.1:11838": {
"url": "https://127.0.0.1:11838",
"size": 1064,
- "response_code": 200
+ "status_code": 200
}
}
}
diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go
index c0a33dd..07bda88 100644
--- a/brokenlinks/worker.go
+++ b/brokenlinks/worker.go
@@ -29,7 +29,7 @@ type worker struct {
seenLink map[string]int
// queue contains list of link to be scanned.
- queue []linkQueue
+ queue []jarink.Link
// result contains the final result after all of the pages has been
// scanned.
@@ -124,15 +124,14 @@ func (wrk *worker) run() (result *Result, err error) {
// scanAll scan all pages start from [Options.Url].
func (wrk *worker) scanAll() (result *Result, err error) {
// Scan the first URL to make sure that the server is reachable.
- var linkq = linkQueue{
- parentUrl: nil,
- url: wrk.opts.scanUrl.String(),
+ var linkq = jarink.Link{
+ Url: wrk.opts.scanUrl.String(),
}
var resultq = wrk.scan(linkq)
- linkq = resultq[linkq.url]
- if linkq.errScan != nil {
- return nil, linkq.errScan
+ linkq = resultq[linkq.Url]
+ if linkq.ErrScan != nil {
+ return nil, linkq.ErrScan
}
wrk.processResult(resultq)
@@ -176,17 +175,16 @@ func (wrk *worker) scanPastResult() (result *Result, err error) {
// - Skip external link that has been checked before.
// - Skip link that has been seen.
// - Otherwise push it to queue.
-func (wrk *worker) processResult(resultq map[string]linkQueue) {
- var linkq linkQueue
+func (wrk *worker) processResult(resultq map[string]jarink.Link) {
var seen bool
- for _, linkq = range resultq {
- if linkq.status != 0 {
+ for _, linkq := range resultq {
+ if linkq.StatusCode != 0 {
wrk.seen(linkq)
continue
}
- if linkq.isExternal {
- var scannedLink = wrk.cache.Get(linkq.url)
+ if linkq.IsExternal {
+ var scannedLink = wrk.cache.Get(linkq.Url)
if scannedLink != nil {
// The external link has been scanned
// previously.
@@ -194,9 +192,9 @@ func (wrk *worker) processResult(resultq map[string]linkQueue) {
}
}
- linkq.status, seen = wrk.seenLink[linkq.url]
+ linkq.StatusCode, seen = wrk.seenLink[linkq.Url]
if seen {
- if linkq.status >= http.StatusBadRequest {
+ if linkq.StatusCode >= http.StatusBadRequest {
// Different pages may have the same broken
// link.
wrk.markAsBroken(linkq)
@@ -207,61 +205,61 @@ func (wrk *worker) processResult(resultq map[string]linkQueue) {
}
}
-func (wrk *worker) seen(linkq linkQueue) {
- wrk.seenLink[linkq.url] = linkq.status
+func (wrk *worker) seen(linkq jarink.Link) {
+ wrk.seenLink[linkq.Url] = linkq.StatusCode
- if linkq.isExternal {
- if linkq.status != StatusBadLink {
- wrk.cache.Set(linkq.url, linkq.status, linkq.size)
+ if linkq.IsExternal {
+ if linkq.StatusCode != StatusBadLink {
+ wrk.cache.Set(linkq)
}
}
- if linkq.status >= http.StatusBadRequest {
+ if linkq.StatusCode >= http.StatusBadRequest {
wrk.markAsBroken(linkq)
}
}
-func (wrk *worker) markAsBroken(linkq linkQueue) {
- if slices.Contains(wrk.opts.ignoreStatus, linkq.status) {
+func (wrk *worker) markAsBroken(linkq jarink.Link) {
+ if slices.Contains(wrk.opts.ignoreStatus, linkq.StatusCode) {
return
}
- var parentUrl = linkq.parentUrl.String()
+ var parentUrl = linkq.ParentUrl.String()
var listBroken = wrk.result.BrokenLinks[parentUrl]
- var brokenLink = Broken{
- Link: linkq.url,
- Code: linkq.status,
+ if linkq.ErrScan != nil {
+ linkq.Error = linkq.ErrScan.Error()
}
- if linkq.errScan != nil {
- brokenLink.Error = linkq.errScan.Error()
- }
- listBroken = append(listBroken, brokenLink)
+ listBroken = append(listBroken, linkq)
wrk.result.BrokenLinks[parentUrl] = listBroken
}
// scan the link to HTML page or image.
-func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
- resultq = make(map[string]linkQueue)
+func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) {
+ resultq = make(map[string]jarink.Link)
var (
httpResp *http.Response
err error
)
httpResp, err = wrk.fetch(linkq)
if err != nil {
- linkq.status = StatusBadLink
- linkq.errScan = err
- resultq[linkq.url] = linkq
+ linkq.StatusCode = StatusBadLink
+ linkq.ErrScan = err
+ resultq[linkq.Url] = linkq
return resultq
}
defer httpResp.Body.Close()
- linkq.status = httpResp.StatusCode
- linkq.size = httpResp.ContentLength
- resultq[linkq.url] = linkq
+ linkq.StatusCode = httpResp.StatusCode
+ resultq[linkq.Url] = linkq
if httpResp.StatusCode >= http.StatusBadRequest {
return resultq
}
- if linkq.kind == atom.Img || linkq.isExternal {
+ if linkq.Kind == int(atom.Img) {
+ return resultq
+ }
+ linkq.Size = httpResp.ContentLength
+ if linkq.IsExternal {
+ resultq[linkq.Url] = linkq
return resultq
}
@@ -276,11 +274,11 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
var parentUrl *url.URL
- parentUrl, err = url.Parse(linkq.url)
+ parentUrl, err = url.Parse(linkq.Url)
if err != nil {
- linkq.status = StatusBadLink
- linkq.errScan = err
- resultq[linkq.url] = linkq
+ linkq.StatusCode = StatusBadLink
+ linkq.ErrScan = err
+ resultq[linkq.Url] = linkq
return resultq
}
@@ -292,13 +290,13 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
if node.DataAtom != atom.A && node.DataAtom != atom.Img {
continue
}
- var nodeLink *linkQueue
+ var nodeLink *jarink.Link
if node.DataAtom == atom.A {
for _, attr := range node.Attr {
if attr.Key != `href` {
continue
}
- nodeLink = wrk.processLink(parentUrl, attr.Val, atom.A)
+ nodeLink = wrk.processLink(parentUrl, attr.Val, int(atom.A))
break
}
} else { // atom.Img
@@ -306,7 +304,7 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
if attr.Key != `src` {
continue
}
- nodeLink = wrk.processLink(parentUrl, attr.Val, atom.Img)
+ nodeLink = wrk.processLink(parentUrl, attr.Val, int(atom.Img))
break
}
}
@@ -314,30 +312,30 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) {
// Link is invalid.
continue
}
- _, seen := resultq[nodeLink.url]
+ _, seen := resultq[nodeLink.Url]
if seen {
// The same link already exist previously.
continue
}
- resultq[nodeLink.url] = *nodeLink
+ resultq[nodeLink.Url] = *nodeLink
}
return resultq
}
-func (wrk *worker) fetch(linkq linkQueue) (httpResp *http.Response, err error) {
+func (wrk *worker) fetch(linkq jarink.Link) (httpResp *http.Response, err error) {
const maxRetry = 5
var retry int
for retry < 5 {
- if linkq.kind == atom.Img {
+ if linkq.Kind == int(atom.Img) {
if wrk.opts.IsVerbose {
- wrk.log.Printf("fetch: HEAD %s", linkq.url)
+ wrk.log.Printf("fetch: HEAD %s", linkq.Url)
}
- httpResp, err = wrk.httpc.Head(linkq.url)
+ httpResp, err = wrk.httpc.Head(linkq.Url)
} else {
if wrk.opts.IsVerbose {
- wrk.log.Printf("fetch: GET %s", linkq.url)
+ wrk.log.Printf("fetch: GET %s", linkq.Url)
}
- httpResp, err = wrk.httpc.Get(linkq.url)
+ httpResp, err = wrk.httpc.Get(linkq.Url)
}
if err == nil {
return httpResp, nil
@@ -348,7 +346,7 @@ func (wrk *worker) fetch(linkq linkQueue) (httpResp *http.Response, err error) {
}
if errDNS.Timeout() {
retry++
- wrk.log.Printf(`fetch %s: %s (%d/%d)`, linkq.url, err, retry, maxRetry)
+ wrk.log.Printf(`fetch %s: %s (%d/%d)`, linkq.Url, err, retry, maxRetry)
continue
}
break
@@ -356,33 +354,33 @@ func (wrk *worker) fetch(linkq linkQueue) (httpResp *http.Response, err error) {
return nil, err
}
-// processLink given a parentURL and link value `val`
-// check if link `val` is valid and return it as linkQueue.
-func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
- linkq *linkQueue,
+// processLink given a parentURL, check if link `val` is valid, and return it
+// as [jarink.Link].
+func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) (
+ linkq *jarink.Link,
) {
if len(val) == 0 {
return nil
}
- linkq = &linkQueue{
- parentUrl: parentUrl,
- kind: kind,
+ linkq = &jarink.Link{
+ ParentUrl: parentUrl,
+ Kind: kind,
}
var newUrl *url.URL
var err error
newUrl, err = url.Parse(val)
if err != nil {
- linkq.errScan = err
- linkq.url = val
- linkq.status = StatusBadLink
+ linkq.ErrScan = err
+ linkq.Url = val
+ linkq.StatusCode = StatusBadLink
return linkq
}
newUrl.Fragment = ""
newUrl.RawFragment = ""
- if kind == atom.A && val[0] == '#' {
+ if kind == int(atom.A) && val[0] == '#' {
// Ignore link to ID, like `href="#element_id"`.
return nil
}
@@ -395,9 +393,9 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) (
newUrl = parentUrl.JoinPath(`/`, newUrl.Path)
}
}
- linkq.url = strings.TrimSuffix(newUrl.String(), `/`)
- if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) {
- linkq.isExternal = true
+ linkq.Url = strings.TrimSuffix(newUrl.String(), `/`)
+ if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) {
+ linkq.IsExternal = true
}
return linkq
}
diff --git a/cache.go b/cache.go
index 73b4d58..2eeb416 100644
--- a/cache.go
+++ b/cache.go
@@ -80,18 +80,13 @@ func (cache *Cache) Save() (err error) {
return nil
}
-func (cache *Cache) Set(url string, respCode int, size int64) {
+func (cache *Cache) Set(link Link) {
cache.mtx.Lock()
defer cache.mtx.Unlock()
- var scannedLink = cache.ScannedLinks[url]
+ var scannedLink = cache.ScannedLinks[link.Url]
if scannedLink != nil {
return
}
- scannedLink = &Link{
- Url: url,
- Size: size,
- ResponseCode: respCode,
- }
- cache.ScannedLinks[url] = scannedLink
+ cache.ScannedLinks[link.Url] = &link
}
diff --git a/jarink.go b/jarink.go
index 643a514..5c46e4e 100644
--- a/jarink.go
+++ b/jarink.go
@@ -1,5 +1,5 @@
-// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
// SPDX-License-Identifier: GPL-3.0-only
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
package jarink
diff --git a/link.go b/link.go
index a03808b..266bcdd 100644
--- a/link.go
+++ b/link.go
@@ -3,9 +3,34 @@
package jarink
+import (
+ "net/url"
+)
+
// Link store information about the link.
type Link struct {
- Url string `json:"url"`
- Size int64 `json:"size"`
- ResponseCode int `json:"response_code"`
+ ParentUrl *url.URL `json:"-"`
+
+ // The error from scan.
+ ErrScan error `json:"-"`
+
+ Url string `json:"url"`
+ Error string `json:"error,omitempty"`
+ Size int64 `json:"size,omitempty"`
+
+ // StatusCode is the status of the link after scan; it is mostly the HTTP
+ // status code.
+ // 0: link is the result of scan, not processed yet.
+ // StatusBadLink: link is invalid, not parseable or unreachable.
+ // 200 - 211: OK.
+ // 400 - 511: Error.
+ StatusCode int `json:"status_code"`
+
+ // Kind is the kind of url; it is either an anchor (atom.A) or an image (atom.Img).
+ // It is set to 0 if url is the first URL being scanned (parent URL).
+ Kind int `json:"-"`
+
+ // IsExternal if true the scan will use HTTP method HEAD instead of
+ // GET.
+ IsExternal bool `json:"-"`
}
diff --git a/url_test.go b/url_test.go
index 0b0bf03..70b6b90 100644
--- a/url_test.go
+++ b/url_test.go
@@ -1,5 +1,5 @@
-// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
// SPDX-License-Identifier: GPL-3.0-only
+// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info>
package jarink