diff options
| -rw-r--r-- | Makefile | 2 | ||||
| -rw-r--r-- | README.md | 4 | ||||
| -rw-r--r-- | brokenlinks/brokenlinks_test.go | 231 | ||||
| -rw-r--r-- | brokenlinks/link_queue.go | 38 | ||||
| -rw-r--r-- | brokenlinks/options.go | 2 | ||||
| -rw-r--r-- | brokenlinks/result.go | 20 | ||||
| -rw-r--r-- | brokenlinks/testdata/exp_cache.json | 6 | ||||
| -rw-r--r-- | brokenlinks/worker.go | 138 | ||||
| -rw-r--r-- | cache.go | 11 | ||||
| -rw-r--r-- | jarink.go | 2 | ||||
| -rw-r--r-- | link.go | 31 | ||||
| -rw-r--r-- | url_test.go | 2 |
12 files changed, 275 insertions, 212 deletions
@@ -1,5 +1,5 @@ -## SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> ## SPDX-License-Identifier: GPL-3.0-only +## SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> COVER_OUT:=cover.out COVER_HTML:=cover.html @@ -36,9 +36,9 @@ JSON format to standard output, ``` { "$PAGE": [{ - "link": <string>, + "url": <string>, "error": <string>, - "code": <integer> + "status_code": <integer> }, ... ], diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go index db3775a..a04be7f 100644 --- a/brokenlinks/brokenlinks_test.go +++ b/brokenlinks/brokenlinks_test.go @@ -4,8 +4,11 @@ package brokenlinks_test import ( + "errors" "log" + "net" "net/http" + "net/url" "os" "path/filepath" "testing" @@ -13,7 +16,9 @@ import ( libnet "git.sr.ht/~shulhan/pakakeh.go/lib/net" "git.sr.ht/~shulhan/pakakeh.go/lib/test" + "golang.org/x/net/html/atom" + "git.sr.ht/~shulhan/jarink" "git.sr.ht/~shulhan/jarink/brokenlinks" "git.sr.ht/~shulhan/jarink/internal" ) @@ -212,8 +217,27 @@ func runServerSlow() { func TestScan(t *testing.T) { var testUrl = `http://` + testAddress + // Generate ParentUrl. + + parsedTestUrl, err := url.Parse(testUrl) + if err != nil { + t.Fatal(err) + } + parsedUrlBrokenHtml, err := url.Parse(testUrl + `/broken.html`) + if err != nil { + t.Fatal(err) + } + parsedUrlPage2, err := url.Parse(testUrl + `/page2`) + if err != nil { + t.Fatal(err) + } + + // Generate error for ErrScan. + + _, errScanPort := url.Parse(`http://127.0.0.1:abc`) + type testCase struct { - exp map[string][]brokenlinks.Broken + exp map[string][]jarink.Link expError string desc string opts brokenlinks.Options @@ -246,92 +270,133 @@ func TestScan(t *testing.T) { }, expError: `Scan: Options: unknown status code "50"`, }, { - desc: `With ` + testUrl, + desc: `With Url=testUrl`, opts: brokenlinks.Options{ Url: testUrl, IgnoreStatus: `403`, Insecure: true, IsVerbose: true, }, - exp: map[string][]brokenlinks.Broken{ - testUrl: []brokenlinks.Broken{ + exp: map[string][]jarink.Link{ + testUrl: []jarink.Link{ { - Link: testUrl + `/broken.png`, - Code: http.StatusNotFound, + ParentUrl: parsedTestUrl, + Url: testUrl + `/broken.png`, + StatusCode: http.StatusNotFound, + Kind: int(atom.Img), }, { - Link: testUrl + `/brokenPage`, - Code: http.StatusNotFound, + ParentUrl: parsedTestUrl, + Url: testUrl + `/brokenPage`, + StatusCode: http.StatusNotFound, + Kind: int(atom.A), }, { - Link: `http://127.0.0.1:abc`, - Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`, - Code: brokenlinks.StatusBadLink, + ParentUrl: parsedTestUrl, + Url: `http://127.0.0.1:abc`, + ErrScan: errScanPort, + Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`, + StatusCode: brokenlinks.StatusBadLink, + Kind: int(atom.A), }, { - Link: `http:/127.0.0.1:11836`, - Error: `Get "http:/127.0.0.1:11836": http: no Host in request URL`, - Code: brokenlinks.StatusBadLink, + ParentUrl: parsedTestUrl, + Url: `http:/127.0.0.1:11836`, + ErrScan: &url.Error{ + Op: `Get`, + URL: `http:/127.0.0.1:11836`, + Err: errors.New(`http: no Host in request URL`), + }, + Error: `Get "http:/127.0.0.1:11836": http: no Host in request URL`, + StatusCode: brokenlinks.StatusBadLink, + Kind: int(atom.A), + IsExternal: true, }, { - Link: `https://domain`, - Error: `Get "https://domain": dial tcp: lookup domain: no such host`, - Code: 700, + ParentUrl: parsedTestUrl, + Url: `https://domain`, + ErrScan: &url.Error{ + Op: `Get`, + URL: `https://domain`, + Err: &net.OpError{ + Op: `dial`, + Net: `tcp`, + Err: &net.DNSError{ + Err: `no such host`, + Name: `domain`, + IsNotFound: true, + }, + }, + }, + Error: `Get "https://domain": dial tcp: lookup domain: no such host`, + StatusCode: 700, + Kind: int(atom.A), + IsExternal: true, }, }, - testUrl + `/broken.html`: []brokenlinks.Broken{ + testUrl + `/broken.html`: []jarink.Link{ { - Link: testUrl + `/brokenPage`, - Code: http.StatusNotFound, + ParentUrl: parsedUrlBrokenHtml, + Url: testUrl + `/brokenPage`, + StatusCode: http.StatusNotFound, + Kind: int(atom.A), }, }, - testUrl + `/page2`: []brokenlinks.Broken{ + testUrl + `/page2`: []jarink.Link{ { - Link: testUrl + `/broken.png`, - Code: http.StatusNotFound, + ParentUrl: parsedUrlPage2, + Url: testUrl + `/broken.png`, + StatusCode: http.StatusNotFound, + Kind: int(atom.Img), }, { - Link: testUrl + `/page2/broken/relative`, - Code: http.StatusNotFound, + ParentUrl: parsedUrlPage2, + Url: testUrl + `/page2/broken/relative`, + StatusCode: http.StatusNotFound, + Kind: int(atom.A), }, { - Link: testUrl + `/page2/broken2.png`, - Code: http.StatusNotFound, + ParentUrl: parsedUrlPage2, + Url: testUrl + `/page2/broken2.png`, + StatusCode: http.StatusNotFound, + Kind: int(atom.Img), }, }, }, }, { - desc: `With ` + testUrl + `/page2`, // Scanning on "/page2" should not scan the the "/" or other // pages other than below of "/page2" itself. + desc: `With Url=/page2`, opts: brokenlinks.Options{ Url: testUrl + `/page2`, IsVerbose: true, }, - exp: map[string][]brokenlinks.Broken{ - testUrl + `/page2`: []brokenlinks.Broken{ + exp: map[string][]jarink.Link{ + testUrl + `/page2`: []jarink.Link{ { - Link: testUrl + `/broken.png`, - Code: http.StatusNotFound, + ParentUrl: parsedUrlPage2, + Url: testUrl + `/broken.png`, + StatusCode: http.StatusNotFound, + Kind: int(atom.Img), + IsExternal: true, }, { - Link: testUrl + `/page2/broken/relative`, - Code: http.StatusNotFound, + ParentUrl: parsedUrlPage2, + Url: testUrl + `/page2/broken/relative`, + StatusCode: http.StatusNotFound, + Kind: int(atom.A), }, { - Link: testUrl + `/page2/broken2.png`, - Code: http.StatusNotFound, + ParentUrl: parsedUrlPage2, + Url: testUrl + `/page2/broken2.png`, + StatusCode: http.StatusNotFound, + Kind: int(atom.Img), }, }, }, }} - var ( - result *brokenlinks.Result - err error - ) for _, tcase := range listCase { t.Run(tcase.desc, func(tt *testing.T) { internal.CacheFile = func() (string, error) { return tt.TempDir() + `/cache.json`, nil } - result, err = brokenlinks.Scan(tcase.opts) + result, err := brokenlinks.Scan(tcase.opts) if err != nil { - test.Assert(tt, tcase.opts.Url+` error`, - tcase.expError, err.Error()) + test.Assert(tt, tcase.opts.Url+` error`, tcase.expError, err.Error()) return } @@ -348,8 +413,13 @@ func TestScan(t *testing.T) { func TestScan_pastResult(t *testing.T) { var testUrl = `http://` + testAddress + parsedUrlPage2, err := url.Parse(testUrl + `/page2`) + if err != nil { + t.Fatal(err) + } + type testCase struct { - exp map[string][]brokenlinks.Broken + exp map[string][]jarink.Link desc string expError string opts brokenlinks.Options @@ -369,33 +439,36 @@ func TestScan_pastResult(t *testing.T) { PastResultFile: `testdata/past_result.json`, IgnoreStatus: `403`, }, - exp: map[string][]brokenlinks.Broken{ - testUrl + `/page2`: []brokenlinks.Broken{ + exp: map[string][]jarink.Link{ + testUrl + `/page2`: []jarink.Link{ { - Link: testUrl + `/broken.png`, - Code: http.StatusNotFound, + ParentUrl: parsedUrlPage2, + Url: testUrl + `/broken.png`, + StatusCode: http.StatusNotFound, + Kind: int(atom.Img), + IsExternal: true, }, { - Link: testUrl + `/page2/broken/relative`, - Code: http.StatusNotFound, + ParentUrl: parsedUrlPage2, + Url: testUrl + `/page2/broken/relative`, + StatusCode: http.StatusNotFound, + Kind: int(atom.A), }, { - Link: testUrl + `/page2/broken2.png`, - Code: http.StatusNotFound, + ParentUrl: parsedUrlPage2, + Url: testUrl + `/page2/broken2.png`, + StatusCode: http.StatusNotFound, + Kind: int(atom.Img), }, }, }, }} - var ( - result *brokenlinks.Result - err error - ) for _, tcase := range listCase { t.Run(tcase.desc, func(tt *testing.T) { internal.CacheFile = func() (string, error) { return tt.TempDir() + `/cache.json`, nil } - result, err = brokenlinks.Scan(tcase.opts) + result, err := brokenlinks.Scan(tcase.opts) if err != nil { test.Assert(tt, tcase.opts.Url+` error`, tcase.expError, err.Error()) return @@ -410,14 +483,24 @@ func TestScan_pastResult(t *testing.T) { func TestScan_slow(t *testing.T) { const testUrl = `http://` + testAddressSlow + parsedUrlSlow1, err := url.Parse(testUrl + `/slow1`) + if err != nil { + t.Fatal(err) + } + parsedUrlSlow2, err := url.Parse(testUrl + `/slow2`) + if err != nil { + t.Fatal(err) + } + parsedUrlSlow3, err := url.Parse(testUrl + `/slow3`) + if err != nil { + t.Fatal(err) + } + var opts = brokenlinks.Options{ Url: testUrl, IsVerbose: true, } - - var gotResult *brokenlinks.Result - var err error - gotResult, err = brokenlinks.Scan(opts) + gotResult, err := brokenlinks.Scan(opts) if err != nil { t.Fatal(err) } @@ -426,18 +509,24 @@ func TestScan_slow(t *testing.T) { //t.Logf(`got=%s`, got) var expResult = &brokenlinks.Result{ - BrokenLinks: map[string][]brokenlinks.Broken{ - testUrl + `/slow1`: []brokenlinks.Broken{{ - Link: testUrl + `/slow3/sub`, - Code: http.StatusForbidden, + BrokenLinks: map[string][]jarink.Link{ + testUrl + `/slow1`: []jarink.Link{{ + ParentUrl: parsedUrlSlow1, + Url: testUrl + `/slow3/sub`, + StatusCode: http.StatusForbidden, + Kind: int(atom.A), }}, - testUrl + `/slow2`: []brokenlinks.Broken{{ - Link: testUrl + `/slow3/sub`, - Code: http.StatusForbidden, + testUrl + `/slow2`: []jarink.Link{{ + ParentUrl: parsedUrlSlow2, + Url: testUrl + `/slow3/sub`, + StatusCode: http.StatusForbidden, + Kind: int(atom.A), }}, - testUrl + `/slow3`: []brokenlinks.Broken{{ - Link: testUrl + `/slow3/sub`, - Code: http.StatusForbidden, + testUrl + `/slow3`: []jarink.Link{{ + ParentUrl: parsedUrlSlow3, + Url: testUrl + `/slow3/sub`, + StatusCode: http.StatusForbidden, + Kind: int(atom.A), }}, }, } diff --git a/brokenlinks/link_queue.go b/brokenlinks/link_queue.go deleted file mode 100644 index 14bf8c7..0000000 --- a/brokenlinks/link_queue.go +++ /dev/null @@ -1,38 +0,0 @@ -// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> -// SPDX-License-Identifier: GPL-3.0-only - -package brokenlinks - -import ( - "net/url" - - "golang.org/x/net/html/atom" -) - -type linkQueue struct { - parentUrl *url.URL - - // The error from scan. - errScan error - - // url being scanned. - url string - - // kind of url, its either an anchor or image. - // It set to 0 if url is the first URL being scanned. - kind atom.Atom - - // isExternal if true the scan will issue HTTP method HEAD instead of - // GET. - isExternal bool - - // Status of link after scan, its mostly used the HTTP status code. - // 0: link is the result of scan, not processed yet. - // StatusBadLink: link is invalid, not parseable or unreachable. - // 200 - 211: OK. - // 400 - 511: Error. - status int - - // Size of the page, derived from HTTP response ContentLength. - size int64 -} diff --git a/brokenlinks/options.go b/brokenlinks/options.go index 3e69daf..2703f8d 100644 --- a/brokenlinks/options.go +++ b/brokenlinks/options.go @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> // SPDX-License-Identifier: GPL-3.0-only +// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> package brokenlinks diff --git a/brokenlinks/result.go b/brokenlinks/result.go index 676859b..1cd49a3 100644 --- a/brokenlinks/result.go +++ b/brokenlinks/result.go @@ -1,37 +1,31 @@ -// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> // SPDX-License-Identifier: GPL-3.0-only +// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> package brokenlinks import ( "slices" "strings" -) -// Broken store the broken link, HTTP status code, and the error message that -// cause it. -type Broken struct { - Link string `json:"link"` - Error string `json:"error,omitempty"` - Code int `json:"code"` -} + "git.sr.ht/~shulhan/jarink" +) // Result store the result of scanning for broken links. type Result struct { // BrokenLinks store the page and its broken links. - BrokenLinks map[string][]Broken `json:"broken_links"` + BrokenLinks map[string][]jarink.Link `json:"broken_links"` } func newResult() *Result { return &Result{ - BrokenLinks: map[string][]Broken{}, + BrokenLinks: map[string][]jarink.Link{}, } } func (result *Result) sort() { for _, listBroken := range result.BrokenLinks { - slices.SortFunc(listBroken, func(a, b Broken) int { - return strings.Compare(a.Link, b.Link) + slices.SortFunc(listBroken, func(a, b jarink.Link) int { + return strings.Compare(a.Url, b.Url) }) } } diff --git a/brokenlinks/testdata/exp_cache.json b/brokenlinks/testdata/exp_cache.json index 8b84ff7..f9aa32a 100644 --- a/brokenlinks/testdata/exp_cache.json +++ b/brokenlinks/testdata/exp_cache.json @@ -3,17 +3,17 @@ "http://127.0.0.1:11900": { "url": "http://127.0.0.1:11900", "size": 1064, - "response_code": 200 + "status_code": 200 }, "http://127.0.0.1:11900/page2": { "url": "http://127.0.0.1:11900/page2", "size": 410, - "response_code": 200 + "status_code": 200 }, "https://127.0.0.1:11838": { "url": "https://127.0.0.1:11838", "size": 1064, - "response_code": 200 + "status_code": 200 } } } diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go index c0a33dd..07bda88 100644 --- a/brokenlinks/worker.go +++ b/brokenlinks/worker.go @@ -29,7 +29,7 @@ type worker struct { seenLink map[string]int // queue contains list of link to be scanned. - queue []linkQueue + queue []jarink.Link // result contains the final result after all of the pages has been // scanned. @@ -124,15 +124,14 @@ func (wrk *worker) run() (result *Result, err error) { // scanAll scan all pages start from [Options.Url]. func (wrk *worker) scanAll() (result *Result, err error) { // Scan the first URL to make sure that the server is reachable. - var linkq = linkQueue{ - parentUrl: nil, - url: wrk.opts.scanUrl.String(), + var linkq = jarink.Link{ + Url: wrk.opts.scanUrl.String(), } var resultq = wrk.scan(linkq) - linkq = resultq[linkq.url] - if linkq.errScan != nil { - return nil, linkq.errScan + linkq = resultq[linkq.Url] + if linkq.ErrScan != nil { + return nil, linkq.ErrScan } wrk.processResult(resultq) @@ -176,17 +175,16 @@ func (wrk *worker) scanPastResult() (result *Result, err error) { // - Skip external link that has been checked before. // - Skip link that has been seen. // - Otherwise push it to queue. -func (wrk *worker) processResult(resultq map[string]linkQueue) { - var linkq linkQueue +func (wrk *worker) processResult(resultq map[string]jarink.Link) { var seen bool - for _, linkq = range resultq { - if linkq.status != 0 { + for _, linkq := range resultq { + if linkq.StatusCode != 0 { wrk.seen(linkq) continue } - if linkq.isExternal { - var scannedLink = wrk.cache.Get(linkq.url) + if linkq.IsExternal { + var scannedLink = wrk.cache.Get(linkq.Url) if scannedLink != nil { // The external link has been scanned // previously. @@ -194,9 +192,9 @@ func (wrk *worker) processResult(resultq map[string]linkQueue) { } } - linkq.status, seen = wrk.seenLink[linkq.url] + linkq.StatusCode, seen = wrk.seenLink[linkq.Url] if seen { - if linkq.status >= http.StatusBadRequest { + if linkq.StatusCode >= http.StatusBadRequest { // Different pages may have the same broken // link. wrk.markAsBroken(linkq) @@ -207,61 +205,61 @@ func (wrk *worker) processResult(resultq map[string]linkQueue) { } } -func (wrk *worker) seen(linkq linkQueue) { - wrk.seenLink[linkq.url] = linkq.status +func (wrk *worker) seen(linkq jarink.Link) { + wrk.seenLink[linkq.Url] = linkq.StatusCode - if linkq.isExternal { - if linkq.status != StatusBadLink { - wrk.cache.Set(linkq.url, linkq.status, linkq.size) + if linkq.IsExternal { + if linkq.StatusCode != StatusBadLink { + wrk.cache.Set(linkq) } } - if linkq.status >= http.StatusBadRequest { + if linkq.StatusCode >= http.StatusBadRequest { wrk.markAsBroken(linkq) } } -func (wrk *worker) markAsBroken(linkq linkQueue) { - if slices.Contains(wrk.opts.ignoreStatus, linkq.status) { +func (wrk *worker) markAsBroken(linkq jarink.Link) { + if slices.Contains(wrk.opts.ignoreStatus, linkq.StatusCode) { return } - var parentUrl = linkq.parentUrl.String() + var parentUrl = linkq.ParentUrl.String() var listBroken = wrk.result.BrokenLinks[parentUrl] - var brokenLink = Broken{ - Link: linkq.url, - Code: linkq.status, + if linkq.ErrScan != nil { + linkq.Error = linkq.ErrScan.Error() } - if linkq.errScan != nil { - brokenLink.Error = linkq.errScan.Error() - } - listBroken = append(listBroken, brokenLink) + listBroken = append(listBroken, linkq) wrk.result.BrokenLinks[parentUrl] = listBroken } // scan the link to HTML page or image. -func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) { - resultq = make(map[string]linkQueue) +func (wrk *worker) scan(linkq jarink.Link) (resultq map[string]jarink.Link) { + resultq = make(map[string]jarink.Link) var ( httpResp *http.Response err error ) httpResp, err = wrk.fetch(linkq) if err != nil { - linkq.status = StatusBadLink - linkq.errScan = err - resultq[linkq.url] = linkq + linkq.StatusCode = StatusBadLink + linkq.ErrScan = err + resultq[linkq.Url] = linkq return resultq } defer httpResp.Body.Close() - linkq.status = httpResp.StatusCode - linkq.size = httpResp.ContentLength - resultq[linkq.url] = linkq + linkq.StatusCode = httpResp.StatusCode + resultq[linkq.Url] = linkq if httpResp.StatusCode >= http.StatusBadRequest { return resultq } - if linkq.kind == atom.Img || linkq.isExternal { + if linkq.Kind == int(atom.Img) { + return resultq + } + linkq.Size = httpResp.ContentLength + if linkq.IsExternal { + resultq[linkq.Url] = linkq return resultq } @@ -276,11 +274,11 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) { var parentUrl *url.URL - parentUrl, err = url.Parse(linkq.url) + parentUrl, err = url.Parse(linkq.Url) if err != nil { - linkq.status = StatusBadLink - linkq.errScan = err - resultq[linkq.url] = linkq + linkq.StatusCode = StatusBadLink + linkq.ErrScan = err + resultq[linkq.Url] = linkq return resultq } @@ -292,13 +290,13 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) { if node.DataAtom != atom.A && node.DataAtom != atom.Img { continue } - var nodeLink *linkQueue + var nodeLink *jarink.Link if node.DataAtom == atom.A { for _, attr := range node.Attr { if attr.Key != `href` { continue } - nodeLink = wrk.processLink(parentUrl, attr.Val, atom.A) + nodeLink = wrk.processLink(parentUrl, attr.Val, int(atom.A)) break } } else { // atom.Img @@ -306,7 +304,7 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) { if attr.Key != `src` { continue } - nodeLink = wrk.processLink(parentUrl, attr.Val, atom.Img) + nodeLink = wrk.processLink(parentUrl, attr.Val, int(atom.Img)) break } } @@ -314,30 +312,30 @@ func (wrk *worker) scan(linkq linkQueue) (resultq map[string]linkQueue) { // Link is invalid. continue } - _, seen := resultq[nodeLink.url] + _, seen := resultq[nodeLink.Url] if seen { // The same link already exist previously. continue } - resultq[nodeLink.url] = *nodeLink + resultq[nodeLink.Url] = *nodeLink } return resultq } -func (wrk *worker) fetch(linkq linkQueue) (httpResp *http.Response, err error) { +func (wrk *worker) fetch(linkq jarink.Link) (httpResp *http.Response, err error) { const maxRetry = 5 var retry int for retry < 5 { - if linkq.kind == atom.Img { + if linkq.Kind == int(atom.Img) { if wrk.opts.IsVerbose { - wrk.log.Printf("fetch: HEAD %s", linkq.url) + wrk.log.Printf("fetch: HEAD %s", linkq.Url) } - httpResp, err = wrk.httpc.Head(linkq.url) + httpResp, err = wrk.httpc.Head(linkq.Url) } else { if wrk.opts.IsVerbose { - wrk.log.Printf("fetch: GET %s", linkq.url) + wrk.log.Printf("fetch: GET %s", linkq.Url) } - httpResp, err = wrk.httpc.Get(linkq.url) + httpResp, err = wrk.httpc.Get(linkq.Url) } if err == nil { return httpResp, nil @@ -348,7 +346,7 @@ func (wrk *worker) fetch(linkq linkQueue) (httpResp *http.Response, err error) { } if errDNS.Timeout() { retry++ - wrk.log.Printf(`fetch %s: %s (%d/%d)`, linkq.url, err, retry, maxRetry) + wrk.log.Printf(`fetch %s: %s (%d/%d)`, linkq.Url, err, retry, maxRetry) continue } break @@ -356,33 +354,33 @@ func (wrk *worker) fetch(linkq linkQueue) (httpResp *http.Response, err error) { return nil, err } -// processLink given a parentURL and link value `val` -// check if link `val` is valid and return it as linkQueue. -func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) ( - linkq *linkQueue, +// processLink given a parentURL, check if link `val` is valid, and return it +// as [jarink.Link]. +func (wrk *worker) processLink(parentUrl *url.URL, val string, kind int) ( + linkq *jarink.Link, ) { if len(val) == 0 { return nil } - linkq = &linkQueue{ - parentUrl: parentUrl, - kind: kind, + linkq = &jarink.Link{ + ParentUrl: parentUrl, + Kind: kind, } var newUrl *url.URL var err error newUrl, err = url.Parse(val) if err != nil { - linkq.errScan = err - linkq.url = val - linkq.status = StatusBadLink + linkq.ErrScan = err + linkq.Url = val + linkq.StatusCode = StatusBadLink return linkq } newUrl.Fragment = "" newUrl.RawFragment = "" - if kind == atom.A && val[0] == '#' { + if kind == int(atom.A) && val[0] == '#' { // Ignore link to ID, like `href="#element_id"`. return nil } @@ -395,9 +393,9 @@ func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) ( newUrl = parentUrl.JoinPath(`/`, newUrl.Path) } } - linkq.url = strings.TrimSuffix(newUrl.String(), `/`) - if !strings.HasPrefix(linkq.url, wrk.opts.scanUrl.String()) { - linkq.isExternal = true + linkq.Url = strings.TrimSuffix(newUrl.String(), `/`) + if !strings.HasPrefix(linkq.Url, wrk.opts.scanUrl.String()) { + linkq.IsExternal = true } return linkq } @@ -80,18 +80,13 @@ func (cache *Cache) Save() (err error) { return nil } -func (cache *Cache) Set(url string, respCode int, size int64) { +func (cache *Cache) Set(link Link) { cache.mtx.Lock() defer cache.mtx.Unlock() - var scannedLink = cache.ScannedLinks[url] + var scannedLink = cache.ScannedLinks[link.Url] if scannedLink != nil { return } - scannedLink = &Link{ - Url: url, - Size: size, - ResponseCode: respCode, - } - cache.ScannedLinks[url] = scannedLink + cache.ScannedLinks[link.Url] = &link } @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> // SPDX-License-Identifier: GPL-3.0-only +// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> package jarink @@ -3,9 +3,34 @@ package jarink +import ( + "net/url" +) + // Link store information about the link. type Link struct { - Url string `json:"url"` - Size int64 `json:"size"` - ResponseCode int `json:"response_code"` + ParentUrl *url.URL `json:"-"` + + // The error from scan. + ErrScan error `json:"-"` + + Url string `json:"url"` + Error string `json:"error,omitempty"` + Size int64 `json:"size,omitempty"` + + // StatusCode status of link after scan, its mostly used the HTTP + // status code. + // 0: link is the result of scan, not processed yet. + // StatusBadLink: link is invalid, not parseable or unreachable. + // 200 - 211: OK. + // 400 - 511: Error. + StatusCode int `json:"status_code"` + + // kind of url, its either an anchor (atom.A) or image (atom.Img). + // It set to 0 if url is the first URL being scanned (parent URL). + Kind int `json:"-"` + + // IsExternal if true the scan will use HTTP method HEAD instead of + // GET. + IsExternal bool `json:"-"` } diff --git a/url_test.go b/url_test.go index 0b0bf03..70b6b90 100644 --- a/url_test.go +++ b/url_test.go @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> // SPDX-License-Identifier: GPL-3.0-only +// SPDX-FileCopyrightText: 2025 M. Shulhan <ms@kilabit.info> package jarink |
