From a02e915388723a5d8cc3b555fb3dfec477fc2a55 Mon Sep 17 00:00:00 2001 From: Shulhan Date: Thu, 12 Jun 2025 21:13:58 +0700 Subject: all: refactoring, move brokenlinks code to its own package When two or more structs share the same prefix, it is time to group them into their own package. Also, in the future each command will be grouped into its own package. --- brokenlinks.go | 69 ---- brokenlinks/brokenlinks.go | 39 +++ brokenlinks/brokenlinks_test.go | 227 +++++++++++++ brokenlinks/link_queue.go | 55 +++ brokenlinks/result.go | 37 ++ brokenlinks/testdata/past_result.json | 10 + brokenlinks/testdata/past_result.json.license | 2 + brokenlinks/testdata/web/broken.html | 7 + brokenlinks/testdata/web/gopher.png | Bin 0 -> 32775 bytes brokenlinks/testdata/web/index.html | 22 ++ brokenlinks/testdata/web/page2/index.html | 14 + brokenlinks/worker.go | 467 ++++++++++++++++++++++++++ brokenlinks_test.go | 166 --------- brokenlinks_worker.go | 467 -------------------------- cmd/jarink/main.go | 25 +- jarink_test.go | 70 ---- link_queue.go | 55 --- testdata/past_result.json | 10 - testdata/past_result.json.license | 2 - testdata/web/broken.html | 7 - testdata/web/gopher.png | Bin 32775 -> 0 bytes testdata/web/index.html | 22 -- testdata/web/page2/index.html | 14 - 23 files changed, 897 insertions(+), 890 deletions(-) delete mode 100644 brokenlinks.go create mode 100644 brokenlinks/brokenlinks.go create mode 100644 brokenlinks/brokenlinks_test.go create mode 100644 brokenlinks/link_queue.go create mode 100644 brokenlinks/result.go create mode 100644 brokenlinks/testdata/past_result.json create mode 100644 brokenlinks/testdata/past_result.json.license create mode 100644 brokenlinks/testdata/web/broken.html create mode 100644 brokenlinks/testdata/web/gopher.png create mode 100644 brokenlinks/testdata/web/index.html create mode 100644 brokenlinks/testdata/web/page2/index.html create mode 100644 brokenlinks/worker.go delete mode 100644 brokenlinks_test.go delete mode 100644 brokenlinks_worker.go 
delete mode 100644 jarink_test.go delete mode 100644 link_queue.go delete mode 100644 testdata/past_result.json delete mode 100644 testdata/past_result.json.license delete mode 100644 testdata/web/broken.html delete mode 100644 testdata/web/gopher.png delete mode 100644 testdata/web/index.html delete mode 100644 testdata/web/page2/index.html diff --git a/brokenlinks.go b/brokenlinks.go deleted file mode 100644 index 96580e5..0000000 --- a/brokenlinks.go +++ /dev/null @@ -1,69 +0,0 @@ -// SPDX-FileCopyrightText: 2025 M. Shulhan -// SPDX-License-Identifier: GPL-3.0-only - -package jarink - -import ( - "fmt" - "slices" - "strings" -) - -const Version = `0.1.0` - -// StatusBadLink status for link that is not parseable by [url.Parse] or not -// reachable during GET or HEAD, either timeout or IP or domain not exist. -const StatusBadLink = 700 - -// Broken store the broken link, HTTP status code, and the error message that -// cause it. -type Broken struct { - Link string `json:"link"` - Error string `json:"error,omitempty"` - Code int `json:"code"` -} - -// BrokenlinksOptions define the options for scanning broken links. -type BrokenlinksOptions struct { - Url string - PastResultFile string - IsVerbose bool -} - -// BrokenlinksResult store the result of scanning for broken links. -type BrokenlinksResult struct { - // BrokenLinks store the page and its broken links. - BrokenLinks map[string][]Broken `json:"broken_links"` -} - -func newBrokenlinksResult() *BrokenlinksResult { - return &BrokenlinksResult{ - BrokenLinks: map[string][]Broken{}, - } -} - -func (result *BrokenlinksResult) sort() { - for _, listBroken := range result.BrokenLinks { - slices.SortFunc(listBroken, func(a, b Broken) int { - return strings.Compare(a.Link, b.Link) - }) - } -} - -// Brokenlinks scan the URL for broken links. 
-func Brokenlinks(opts BrokenlinksOptions) (result *BrokenlinksResult, err error) { - var logp = `brokenlinks` - var wrk *brokenlinksWorker - - wrk, err = newWorker(opts) - if err != nil { - return nil, fmt.Errorf(`%s: %s`, logp, err) - } - - result, err = wrk.run() - if err != nil { - return nil, fmt.Errorf(`%s: %s`, logp, err) - } - - return result, nil -} diff --git a/brokenlinks/brokenlinks.go b/brokenlinks/brokenlinks.go new file mode 100644 index 0000000..8ac458f --- /dev/null +++ b/brokenlinks/brokenlinks.go @@ -0,0 +1,39 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan +// SPDX-License-Identifier: GPL-3.0-only + +package brokenlinks + +import ( + "fmt" +) + +const Version = `0.1.0` + +// StatusBadLink status for link that is not parseable by [url.Parse] or not +// reachable during GET or HEAD, either timeout or IP or domain not exist. +const StatusBadLink = 700 + +// Options define the options for scanning broken links. +type Options struct { + Url string + PastResultFile string + IsVerbose bool +} + +// Scan the URL for broken links. +func Scan(opts Options) (result *Result, err error) { + var logp = `brokenlinks` + var wrk *worker + + wrk, err = newWorker(opts) + if err != nil { + return nil, fmt.Errorf(`%s: %s`, logp, err) + } + + result, err = wrk.run() + if err != nil { + return nil, fmt.Errorf(`%s: %s`, logp, err) + } + + return result, nil +} diff --git a/brokenlinks/brokenlinks_test.go b/brokenlinks/brokenlinks_test.go new file mode 100644 index 0000000..367ae6c --- /dev/null +++ b/brokenlinks/brokenlinks_test.go @@ -0,0 +1,227 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan +// SPDX-License-Identifier: GPL-3.0-only + +package brokenlinks_test + +import ( + "encoding/json" + "log" + "net/http" + "os" + "testing" + "time" + + libnet "git.sr.ht/~shulhan/pakakeh.go/lib/net" + "git.sr.ht/~shulhan/pakakeh.go/lib/test" + + "git.sr.ht/~shulhan/jarink/brokenlinks" +) + +// The test run two web servers that serve content on "testdata/web/". 
+// The first web server is the one that we want to scan. +// The second web server is external web server, where HTML pages should not +// be parsed. + +const testAddress = `127.0.0.1:11836` +const testExternalAddress = `127.0.0.1:11900` + +func TestMain(m *testing.M) { + log.SetFlags(0) + var httpDirWeb = http.Dir(`testdata/web`) + var fshandle = http.FileServer(httpDirWeb) + + go func() { + var mux = http.NewServeMux() + mux.Handle(`/`, fshandle) + var testServer = &http.Server{ + Addr: testAddress, + Handler: mux, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + MaxHeaderBytes: 1 << 20, + } + var err = testServer.ListenAndServe() + if err != nil { + log.Fatal(err) + } + }() + go func() { + var mux = http.NewServeMux() + mux.Handle(`/`, fshandle) + var testServer = &http.Server{ + Addr: testExternalAddress, + Handler: mux, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + MaxHeaderBytes: 1 << 20, + } + var err = testServer.ListenAndServe() + if err != nil { + log.Fatal(err) + } + }() + + var err = libnet.WaitAlive(`tcp`, testAddress, 5*time.Second) + if err != nil { + log.Fatal(err) + } + err = libnet.WaitAlive(`tcp`, testExternalAddress, 5*time.Second) + if err != nil { + log.Fatal(err) + } + + os.Exit(m.Run()) +} + +func TestBrokenlinks(t *testing.T) { + var testUrl = `http://` + testAddress + + type testCase struct { + exp map[string][]brokenlinks.Broken + scanUrl string + expError string + } + + listCase := []testCase{{ + scanUrl: `127.0.0.1:14594`, + expError: `brokenlinks: invalid URL "127.0.0.1:14594"`, + }, { + scanUrl: `http://127.0.0.1:14594`, + expError: `brokenlinks: Get "http://127.0.0.1:14594": dial tcp 127.0.0.1:14594: connect: connection refused`, + }, { + scanUrl: testUrl, + exp: map[string][]brokenlinks.Broken{ + testUrl: []brokenlinks.Broken{ + { + Link: testUrl + `/broken.png`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/brokenPage`, + Code: http.StatusNotFound, + }, { + Link: 
`http://127.0.0.1:abc`, + Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`, + Code: brokenlinks.StatusBadLink, + }, { + Link: `http:/127.0.0.1:11836`, + Error: `Get "http:/127.0.0.1:11836": http: no Host in request URL`, + Code: brokenlinks.StatusBadLink, + }, + }, + testUrl + `/broken.html`: []brokenlinks.Broken{ + { + Link: testUrl + `/brokenPage`, + Code: http.StatusNotFound, + }, + }, + testUrl + `/page2`: []brokenlinks.Broken{ + { + Link: testUrl + `/broken.png`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/page2/broken/relative`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/page2/broken2.png`, + Code: http.StatusNotFound, + }, + }, + }, + }, { + // Scanning on "/path" should not scan the the "/" or other + // pages other than below of "/path" itself. + scanUrl: testUrl + `/page2`, + exp: map[string][]brokenlinks.Broken{ + testUrl + `/page2`: []brokenlinks.Broken{ + { + Link: testUrl + `/broken.png`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/page2/broken/relative`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/page2/broken2.png`, + Code: http.StatusNotFound, + }, + }, + }, + }} + + var ( + result *brokenlinks.Result + err error + ) + for _, tcase := range listCase { + t.Logf(`--- brokenlinks: %s`, tcase.scanUrl) + var opts = brokenlinks.Options{ + Url: tcase.scanUrl, + } + result, err = brokenlinks.Scan(opts) + if err != nil { + test.Assert(t, tcase.scanUrl+` error`, + tcase.expError, err.Error()) + continue + } + //got, _ := json.MarshalIndent(result.BrokenLinks, ``, ` `) + //t.Logf(`got=%s`, got) + test.Assert(t, tcase.scanUrl, tcase.exp, result.BrokenLinks) + } +} + +// Test running Brokenlinks with file PastResultFile is set. +// The PastResultFile is modified to only report errors on "/page2". 
+func TestBrokenlinks_pastResult(t *testing.T) { + var testUrl = `http://` + testAddress + + type testCase struct { + exp map[string][]brokenlinks.Broken + expError string + opts brokenlinks.Options + } + + listCase := []testCase{{ + // With invalid file. + opts: brokenlinks.Options{ + Url: testUrl, + PastResultFile: `testdata/invalid`, + }, + expError: `brokenlinks: open testdata/invalid: no such file or directory`, + }, { + // With valid file. + opts: brokenlinks.Options{ + Url: testUrl, + PastResultFile: `testdata/past_result.json`, + }, + exp: map[string][]brokenlinks.Broken{ + testUrl + `/page2`: []brokenlinks.Broken{ + { + Link: testUrl + `/broken.png`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/page2/broken/relative`, + Code: http.StatusNotFound, + }, { + Link: testUrl + `/page2/broken2.png`, + Code: http.StatusNotFound, + }, + }, + }, + }} + + var ( + result *brokenlinks.Result + err error + ) + for _, tcase := range listCase { + t.Logf(`--- brokenlinks: %s`, tcase.opts.Url) + result, err = brokenlinks.Scan(tcase.opts) + if err != nil { + test.Assert(t, tcase.opts.Url+` error`, + tcase.expError, err.Error()) + continue + } + got, _ := json.MarshalIndent(result.BrokenLinks, ``, ` `) + t.Logf(`got=%s`, got) + test.Assert(t, tcase.opts.Url, tcase.exp, result.BrokenLinks) + } +} diff --git a/brokenlinks/link_queue.go b/brokenlinks/link_queue.go new file mode 100644 index 0000000..164a902 --- /dev/null +++ b/brokenlinks/link_queue.go @@ -0,0 +1,55 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan +// SPDX-License-Identifier: GPL-3.0-only + +package brokenlinks + +import ( + "net/url" + "strings" + + "golang.org/x/net/html/atom" +) + +type linkQueue struct { + parentUrl *url.URL + + // The error from scan. + errScan error + + // url being scanned. + url string + + // kind of url, its either an anchor or image. + // It set to 0 if url is the first URL being scanned. 
+ kind atom.Atom + + // isExternal if true the scan will issue HTTP method HEAD instead of + // GET. + isExternal bool + + // Status of link after scan, its mostly used the HTTP status code. + // 0: link is the result of scan, not processed yet. + // StatusBadLink: link is invalid, not parseable or unreachable. + // 200 - 211: OK. + // 400 - 511: Error. + status int +} + +// checkExternal set the isExternal field to be true if +// +// (1) [linkQueue.url] does not start with [worker.scanUrl] +// +// (2) linkQueue is from scanPastResult, indicated by non-nil +// [worker.pastResult]. +// In this case, we did not want to scan the other pages from the same scanUrl +// domain. +func (linkq *linkQueue) checkExternal(wrk *worker) { + if !strings.HasPrefix(linkq.url, wrk.scanUrl.String()) { + linkq.isExternal = true + return + } + if wrk.pastResult != nil { + linkq.isExternal = true + return + } +} diff --git a/brokenlinks/result.go b/brokenlinks/result.go new file mode 100644 index 0000000..676859b --- /dev/null +++ b/brokenlinks/result.go @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan +// SPDX-License-Identifier: GPL-3.0-only + +package brokenlinks + +import ( + "slices" + "strings" +) + +// Broken store the broken link, HTTP status code, and the error message that +// cause it. +type Broken struct { + Link string `json:"link"` + Error string `json:"error,omitempty"` + Code int `json:"code"` +} + +// Result store the result of scanning for broken links. +type Result struct { + // BrokenLinks store the page and its broken links. 
+ BrokenLinks map[string][]Broken `json:"broken_links"` +} + +func newResult() *Result { + return &Result{ + BrokenLinks: map[string][]Broken{}, + } +} + +func (result *Result) sort() { + for _, listBroken := range result.BrokenLinks { + slices.SortFunc(listBroken, func(a, b Broken) int { + return strings.Compare(a.Link, b.Link) + }) + } +} diff --git a/brokenlinks/testdata/past_result.json b/brokenlinks/testdata/past_result.json new file mode 100644 index 0000000..ca29d35 --- /dev/null +++ b/brokenlinks/testdata/past_result.json @@ -0,0 +1,10 @@ +{ + "broken_links": { + "http://127.0.0.1:11836/page2": [ + { + "link": "http://127.0.0.1:11836/", + "code": 404 + } + ] + } +} diff --git a/brokenlinks/testdata/past_result.json.license b/brokenlinks/testdata/past_result.json.license new file mode 100644 index 0000000..22616a9 --- /dev/null +++ b/brokenlinks/testdata/past_result.json.license @@ -0,0 +1,2 @@ +SPDX-FileCopyrightText: 2025 M. Shulhan +SPDX-License-Identifier: GPL-3.0-only diff --git a/brokenlinks/testdata/web/broken.html b/brokenlinks/testdata/web/broken.html new file mode 100644 index 0000000..533e542 --- /dev/null +++ b/brokenlinks/testdata/web/broken.html @@ -0,0 +1,7 @@ + + + + + + diff --git a/brokenlinks/testdata/web/gopher.png b/brokenlinks/testdata/web/gopher.png new file mode 100644 index 0000000..79352be Binary files /dev/null and b/brokenlinks/testdata/web/gopher.png differ diff --git a/brokenlinks/testdata/web/index.html b/brokenlinks/testdata/web/index.html new file mode 100644 index 0000000..61a1f39 --- /dev/null +++ b/brokenlinks/testdata/web/index.html @@ -0,0 +1,22 @@ + + + + + Broken page + + + Page 2 + Broken HTML + External URL + + Invalid external URL + + Invalid URL port + + Same with href to "/" + Same with href to "/page2" + + diff --git a/brokenlinks/testdata/web/page2/index.html b/brokenlinks/testdata/web/page2/index.html new file mode 100644 index 0000000..ae6b4ea --- /dev/null +++ b/brokenlinks/testdata/web/page2/index.html @@ 
-0,0 +1,14 @@ + + + + + + broken relative link + Back with absolute path + Back with relative path + External URL page2 + + diff --git a/brokenlinks/worker.go b/brokenlinks/worker.go new file mode 100644 index 0000000..4ed56d2 --- /dev/null +++ b/brokenlinks/worker.go @@ -0,0 +1,467 @@ +// SPDX-FileCopyrightText: 2025 M. Shulhan +// SPDX-License-Identifier: GPL-3.0-only + +package brokenlinks + +import ( + "encoding/json" + "errors" + "fmt" + "log" + "net" + "net/http" + "net/url" + "os" + "strings" + "sync" + "time" + + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +type worker struct { + // seenLink store the URL being or has been scanned and its HTTP + // status code. + seenLink map[string]int + + // resultq channel that collect result from scanning. + resultq chan map[string]linkQueue + + // result contains the final result after all of the pages has been + // scanned. + result *Result + + // pastResult containts the past scan result, loaded from file + // [Options.PastResultFile]. + pastResult *Result + + // The base URL that will be joined to relative or absolute + // links or image. + baseUrl *url.URL + + // The URL to scan. + scanUrl *url.URL + + log *log.Logger + + opts Options + + // wg sync the goroutine scanner. + wg sync.WaitGroup +} + +func newWorker(opts Options) (wrk *worker, err error) { + wrk = &worker{ + opts: opts, + seenLink: map[string]int{}, + resultq: make(chan map[string]linkQueue, 100), + result: newResult(), + log: log.New(os.Stderr, ``, log.LstdFlags), + } + + wrk.scanUrl, err = url.Parse(opts.Url) + if err != nil { + return nil, fmt.Errorf(`invalid URL %q`, opts.Url) + } + wrk.scanUrl.Path = strings.TrimSuffix(wrk.scanUrl.Path, `/`) + wrk.scanUrl.Fragment = "" + wrk.scanUrl.RawFragment = "" + + wrk.baseUrl = &url.URL{ + Scheme: wrk.scanUrl.Scheme, + Host: wrk.scanUrl.Host, + } + + if opts.PastResultFile == "" { + // Run with normal scan. 
+ return wrk, nil + } + + pastresult, err := os.ReadFile(opts.PastResultFile) + if err != nil { + return nil, err + } + + wrk.pastResult = newResult() + err = json.Unmarshal(pastresult, &wrk.pastResult) + if err != nil { + return nil, err + } + + return wrk, nil +} + +func (wrk *worker) run() (result *Result, err error) { + if wrk.pastResult == nil { + result, err = wrk.scanAll() + } else { + result, err = wrk.scanPastResult() + } + return result, err +} + +// scanAll scan all pages start from [Options.Url]. +func (wrk *worker) scanAll() (result *Result, err error) { + // Scan the first URL to make sure that the server is reachable. + var firstLinkq = linkQueue{ + parentUrl: nil, + url: wrk.scanUrl.String(), + status: http.StatusProcessing, + } + wrk.seenLink[firstLinkq.url] = http.StatusProcessing + + wrk.wg.Add(1) + go wrk.scan(firstLinkq) + wrk.wg.Wait() + + var resultq = <-wrk.resultq + for _, linkq := range resultq { + if linkq.url == firstLinkq.url { + if linkq.errScan != nil { + return nil, linkq.errScan + } + wrk.seenLink[linkq.url] = linkq.status + continue + } + if linkq.status >= http.StatusBadRequest { + wrk.markBroken(linkq) + continue + } + + wrk.seenLink[linkq.url] = http.StatusProcessing + wrk.wg.Add(1) + go wrk.scan(linkq) + } + + var tick = time.NewTicker(500 * time.Millisecond) + var listWaitStatus []linkQueue + var isScanning = true + for isScanning { + select { + case resultq := <-wrk.resultq: + listWaitStatus = wrk.processResult(resultq, listWaitStatus) + + case <-tick.C: + wrk.wg.Wait() + if len(wrk.resultq) != 0 { + continue + } + if len(listWaitStatus) != 0 { + // There are links that still waiting for + // scanning to be completed. + continue + } + isScanning = false + } + } + wrk.result.sort() + return wrk.result, nil +} + +// scanPastResult scan only pages reported inside +// [Result.BrokenLinks]. 
+func (wrk *worker) scanPastResult() ( + result *Result, err error, +) { + go func() { + for page := range wrk.pastResult.BrokenLinks { + var linkq = linkQueue{ + parentUrl: nil, + url: page, + status: http.StatusProcessing, + } + wrk.seenLink[linkq.url] = http.StatusProcessing + wrk.wg.Add(1) + go wrk.scan(linkq) + } + }() + + var tick = time.NewTicker(500 * time.Millisecond) + var listWaitStatus []linkQueue + var isScanning = true + for isScanning { + select { + case resultq := <-wrk.resultq: + listWaitStatus = wrk.processResult(resultq, listWaitStatus) + + case <-tick.C: + wrk.wg.Wait() + if len(wrk.resultq) != 0 { + continue + } + if len(listWaitStatus) != 0 { + // There are links that still waiting for + // scanning to be completed. + continue + } + isScanning = false + } + } + wrk.result.sort() + return wrk.result, nil +} + +// processResult the resultq contains the original URL being scanned +// and its child links. +// For example, scanning "http://example.tld" result in +// +// "http://example.tld": {status=200} +// "http://example.tld/page": {status=0} +// "http://example.tld/image.png": {status=0} +// "http://bad:domain/image.png": {status=700} +func (wrk *worker) processResult( + resultq map[string]linkQueue, listWaitStatus []linkQueue, +) ( + newList []linkQueue, +) { + for _, linkq := range resultq { + if linkq.status >= http.StatusBadRequest { + wrk.markBroken(linkq) + continue + } + if linkq.status != 0 { + // linkq is the result of scan with + // non error status. + wrk.seenLink[linkq.url] = linkq.status + continue + } + + seenStatus, seen := wrk.seenLink[linkq.url] + if !seen { + wrk.seenLink[linkq.url] = http.StatusProcessing + wrk.wg.Add(1) + go wrk.scan(linkq) + continue + } + if seenStatus >= http.StatusBadRequest { + linkq.status = seenStatus + wrk.markBroken(linkq) + continue + } + if seenStatus >= http.StatusOK { + // The link has been processed and its + // not an error. + continue + } + // The link being processed by other goroutine. 
+ linkq.status = seenStatus + newList = append(newList, linkq) + } + for _, linkq := range listWaitStatus { + seenStatus := wrk.seenLink[linkq.url] + if seenStatus >= http.StatusBadRequest { + linkq.status = seenStatus + wrk.markBroken(linkq) + continue + } + if seenStatus >= http.StatusOK { + continue + } + if seenStatus == http.StatusProcessing { + // Scanning still in progress. + newList = append(newList, linkq) + continue + } + } + return newList +} + +func (wrk *worker) markBroken(linkq linkQueue) { + var parentUrl = linkq.parentUrl.String() + var listBroken = wrk.result.BrokenLinks[parentUrl] + var brokenLink = Broken{ + Link: linkq.url, + Code: linkq.status, + } + if linkq.errScan != nil { + brokenLink.Error = linkq.errScan.Error() + } + listBroken = append(listBroken, brokenLink) + wrk.result.BrokenLinks[parentUrl] = listBroken + + wrk.seenLink[linkq.url] = linkq.status +} + +// scan fetch the HTML page or image to check if its valid. +func (wrk *worker) scan(linkq linkQueue) { + defer func() { + if wrk.opts.IsVerbose && linkq.errScan != nil { + wrk.log.Printf("error: %d %s error=%v\n", linkq.status, + linkq.url, linkq.errScan) + } + wrk.wg.Done() + }() + + var ( + resultq = map[string]linkQueue{} + httpResp *http.Response + err error + ) + httpResp, err = wrk.fetch(linkq) + if err != nil { + linkq.status = StatusBadLink + linkq.errScan = err + resultq[linkq.url] = linkq + go wrk.pushResult(resultq) + return + } + defer httpResp.Body.Close() + + linkq.status = httpResp.StatusCode + resultq[linkq.url] = linkq + + if httpResp.StatusCode >= http.StatusBadRequest { + go wrk.pushResult(resultq) + return + } + if linkq.kind == atom.Img || linkq.isExternal { + go wrk.pushResult(resultq) + return + } + + var doc *html.Node + doc, _ = html.Parse(httpResp.Body) + + // After we check the code and test for [html.Parse] there are + // no case actual cases where HTML content will return an error. 
+ // The only possible error is when reading from body (io.Reader), and + // that is also almost impossible. + // + // [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347 + + var scanUrl *url.URL + + scanUrl, err = url.Parse(linkq.url) + if err != nil { + log.Fatal(err) + } + + var node *html.Node + for node = range doc.Descendants() { + if node.Type != html.ElementNode { + continue + } + var nodeLink *linkQueue + if node.DataAtom == atom.A { + for _, attr := range node.Attr { + if attr.Key != `href` { + continue + } + nodeLink = wrk.processLink(scanUrl, attr.Val, atom.A) + break + } + } else if node.DataAtom == atom.Img { + for _, attr := range node.Attr { + if attr.Key != `src` { + continue + } + nodeLink = wrk.processLink(scanUrl, attr.Val, atom.Img) + break + } + } else { + continue + } + if nodeLink == nil { + continue + } + _, seen := resultq[nodeLink.url] + if !seen { + nodeLink.checkExternal(wrk) + resultq[nodeLink.url] = *nodeLink + } + } + go wrk.pushResult(resultq) +} + +func (wrk *worker) fetch(linkq linkQueue) ( + httpResp *http.Response, + err error, +) { + const maxRetry = 5 + var retry int + for retry < 5 { + if linkq.kind == atom.Img { + if wrk.opts.IsVerbose { + wrk.log.Printf("scan: HEAD %s\n", linkq.url) + } + httpResp, err = http.Head(linkq.url) + } else { + if wrk.opts.IsVerbose { + wrk.log.Printf("scan: GET %s\n", linkq.url) + } + httpResp, err = http.Get(linkq.url) + } + if err == nil { + return httpResp, nil + } + var errDNS *net.DNSError + if !errors.As(err, &errDNS) { + return nil, err + } + if errDNS.Timeout() { + retry++ + } + } + return nil, err +} + +func (wrk *worker) processLink(parentUrl *url.URL, val string, kind atom.Atom) ( + linkq *linkQueue, +) { + if len(val) == 0 { + return nil + } + + var newUrl *url.URL + var err error + newUrl, err = url.Parse(val) + if err != nil { + return &linkQueue{ + parentUrl: parentUrl, + errScan: err, + url: val, + kind: kind, + status: StatusBadLink, + } + } + 
newUrl.Fragment = "" + newUrl.RawFragment = "" + + if kind == atom.A && val[0] == '#' { + // Ignore link to ID, like `href="#element_id"`. + return nil + } + if strings.HasPrefix(val, `http`) { + return &linkQueue{ + parentUrl: parentUrl, + url: strings.TrimSuffix(newUrl.String(), `/`), + kind: kind, + } + } + if val[0] == '/' { + // val is absolute to parent URL. + newUrl = wrk.baseUrl.JoinPath(newUrl.Path) + } else { + // val is relative to parent URL. + newUrl = parentUrl.JoinPath(`/`, newUrl.Path) + } + linkq = &linkQueue{ + parentUrl: parentUrl, + url: strings.TrimSuffix(newUrl.String(), `/`), + kind: kind, + } + return linkq +} + +func (wrk *worker) pushResult(resultq map[string]linkQueue) { + var tick = time.NewTicker(100 * time.Millisecond) + for { + select { + case wrk.resultq <- resultq: + tick.Stop() + return + case <-tick.C: + } + } +} diff --git a/brokenlinks_test.go b/brokenlinks_test.go deleted file mode 100644 index 3818fbc..0000000 --- a/brokenlinks_test.go +++ /dev/null @@ -1,166 +0,0 @@ -// SPDX-FileCopyrightText: 2025 M. 
Shulhan -// SPDX-License-Identifier: GPL-3.0-only - -package jarink_test - -import ( - "encoding/json" - "net/http" - "testing" - - "git.sr.ht/~shulhan/jarink" - "git.sr.ht/~shulhan/pakakeh.go/lib/test" -) - -func TestBrokenlinks(t *testing.T) { - var testUrl = `http://` + testAddress - - type testCase struct { - exp map[string][]jarink.Broken - scanUrl string - expError string - } - - listCase := []testCase{{ - scanUrl: `127.0.0.1:14594`, - expError: `brokenlinks: invalid URL "127.0.0.1:14594"`, - }, { - scanUrl: `http://127.0.0.1:14594`, - expError: `brokenlinks: Get "http://127.0.0.1:14594": dial tcp 127.0.0.1:14594: connect: connection refused`, - }, { - scanUrl: testUrl, - exp: map[string][]jarink.Broken{ - testUrl: []jarink.Broken{ - { - Link: testUrl + `/broken.png`, - Code: http.StatusNotFound, - }, { - Link: testUrl + `/brokenPage`, - Code: http.StatusNotFound, - }, { - Link: `http://127.0.0.1:abc`, - Error: `parse "http://127.0.0.1:abc": invalid port ":abc" after host`, - Code: jarink.StatusBadLink, - }, { - Link: `http:/127.0.0.1:11836`, - Error: `Get "http:/127.0.0.1:11836": http: no Host in request URL`, - Code: jarink.StatusBadLink, - }, - }, - testUrl + `/broken.html`: []jarink.Broken{ - { - Link: testUrl + `/brokenPage`, - Code: http.StatusNotFound, - }, - }, - testUrl + `/page2`: []jarink.Broken{ - { - Link: testUrl + `/broken.png`, - Code: http.StatusNotFound, - }, { - Link: testUrl + `/page2/broken/relative`, - Code: http.StatusNotFound, - }, { - Link: testUrl + `/page2/broken2.png`, - Code: http.StatusNotFound, - }, - }, - }, - }, { - // Scanning on "/path" should not scan the the "/" or other - // pages other than below of "/path" itself. 
- scanUrl: testUrl + `/page2`, - exp: map[string][]jarink.Broken{ - testUrl + `/page2`: []jarink.Broken{ - { - Link: testUrl + `/broken.png`, - Code: http.StatusNotFound, - }, { - Link: testUrl + `/page2/broken/relative`, - Code: http.StatusNotFound, - }, { - Link: testUrl + `/page2/broken2.png`, - Code: http.StatusNotFound, - }, - }, - }, - }} - - var ( - result *jarink.BrokenlinksResult - err error - ) - for _, tcase := range listCase { - t.Logf(`--- brokenlinks: %s`, tcase.scanUrl) - var brokenlinksOpts = jarink.BrokenlinksOptions{ - Url: tcase.scanUrl, - } - result, err = jarink.Brokenlinks(brokenlinksOpts) - if err != nil { - test.Assert(t, tcase.scanUrl+` error`, - tcase.expError, err.Error()) - continue - } - //got, _ := json.MarshalIndent(result.BrokenLinks, ``, ` `) - //t.Logf(`got=%s`, got) - test.Assert(t, tcase.scanUrl, tcase.exp, result.BrokenLinks) - } -} - -// Test running Brokenlinks with file PastResultFile is set. -// The PastResultFile is modified to only report errors on "/page2". -func TestBrokenlinks_pastResult(t *testing.T) { - var testUrl = `http://` + testAddress - - type testCase struct { - exp map[string][]jarink.Broken - expError string - opts jarink.BrokenlinksOptions - } - - listCase := []testCase{{ - // With invalid file. - opts: jarink.BrokenlinksOptions{ - Url: testUrl, - PastResultFile: `testdata/invalid`, - }, - expError: `brokenlinks: open testdata/invalid: no such file or directory`, - }, { - // With valid file. 
- opts: jarink.BrokenlinksOptions{ - Url: testUrl, - PastResultFile: `testdata/past_result.json`, - }, - exp: map[string][]jarink.Broken{ - testUrl + `/page2`: []jarink.Broken{ - { - Link: testUrl + `/broken.png`, - Code: http.StatusNotFound, - }, { - Link: testUrl + `/page2/broken/relative`, - Code: http.StatusNotFound, - }, { - Link: testUrl + `/page2/broken2.png`, - Code: http.StatusNotFound, - }, - }, - }, - }} - - var ( - result *jarink.BrokenlinksResult - err error - ) - for _, tcase := range listCase { - t.Logf(`--- brokenlinks: %s`, tcase.opts.Url) - result, err = jarink.Brokenlinks(tcase.opts) - if err != nil { - test.Assert(t, tcase.opts.Url+` error`, - tcase.expError, err.Error()) - continue - } - got, _ := json.MarshalIndent(result.BrokenLinks, ``, ` `) - t.Logf(`got=%s`, got) - test.Assert(t, tcase.opts.Url, tcase.exp, result.BrokenLinks) - } -} diff --git a/brokenlinks_worker.go b/brokenlinks_worker.go deleted file mode 100644 index a4e854d..0000000 --- a/brokenlinks_worker.go +++ /dev/null @@ -1,467 +0,0 @@ -// SPDX-FileCopyrightText: 2025 M. Shulhan -// SPDX-License-Identifier: GPL-3.0-only - -package jarink - -import ( - "encoding/json" - "errors" - "fmt" - "log" - "net" - "net/http" - "net/url" - "os" - "strings" - "sync" - "time" - - "golang.org/x/net/html" - "golang.org/x/net/html/atom" -) - -type brokenlinksWorker struct { - // seenLink store the URL being or has been scanned and its HTTP - // status code. - seenLink map[string]int - - // resultq channel that collect result from scanning. - resultq chan map[string]linkQueue - - // result contains the final result after all of the pages has been - // scanned. - result *BrokenlinksResult - - // pastResult containts the past scan result, loaded from file - // [BrokenlinksOptions.PastResultFile]. - pastResult *BrokenlinksResult - - // The base URL that will be joined to relative or absolute - // links or image. - baseUrl *url.URL - - // The URL to scan. 
- scanUrl *url.URL - - log *log.Logger - - opts BrokenlinksOptions - - // wg sync the goroutine scanner. - wg sync.WaitGroup -} - -func newWorker(opts BrokenlinksOptions) (wrk *brokenlinksWorker, err error) { - wrk = &brokenlinksWorker{ - opts: opts, - seenLink: map[string]int{}, - resultq: make(chan map[string]linkQueue, 100), - result: newBrokenlinksResult(), - log: log.New(os.Stderr, ``, log.LstdFlags), - } - - wrk.scanUrl, err = url.Parse(opts.Url) - if err != nil { - return nil, fmt.Errorf(`invalid URL %q`, opts.Url) - } - wrk.scanUrl.Path = strings.TrimSuffix(wrk.scanUrl.Path, `/`) - wrk.scanUrl.Fragment = "" - wrk.scanUrl.RawFragment = "" - - wrk.baseUrl = &url.URL{ - Scheme: wrk.scanUrl.Scheme, - Host: wrk.scanUrl.Host, - } - - if opts.PastResultFile == "" { - // Run with normal scan. - return wrk, nil - } - - pastresult, err := os.ReadFile(opts.PastResultFile) - if err != nil { - return nil, err - } - - wrk.pastResult = newBrokenlinksResult() - err = json.Unmarshal(pastresult, &wrk.pastResult) - if err != nil { - return nil, err - } - - return wrk, nil -} - -func (wrk *brokenlinksWorker) run() (result *BrokenlinksResult, err error) { - if wrk.pastResult == nil { - result, err = wrk.scanAll() - } else { - result, err = wrk.scanPastResult() - } - return result, err -} - -// scanAll scan all pages start from [BrokenlinksOptions.Url]. -func (wrk *brokenlinksWorker) scanAll() (result *BrokenlinksResult, err error) { - // Scan the first URL to make sure that the server is reachable. 
- var firstLinkq = linkQueue{ - parentUrl: nil, - url: wrk.scanUrl.String(), - status: http.StatusProcessing, - } - wrk.seenLink[firstLinkq.url] = http.StatusProcessing - - wrk.wg.Add(1) - go wrk.scan(firstLinkq) - wrk.wg.Wait() - - var resultq = <-wrk.resultq - for _, linkq := range resultq { - if linkq.url == firstLinkq.url { - if linkq.errScan != nil { - return nil, linkq.errScan - } - wrk.seenLink[linkq.url] = linkq.status - continue - } - if linkq.status >= http.StatusBadRequest { - wrk.markBroken(linkq) - continue - } - - wrk.seenLink[linkq.url] = http.StatusProcessing - wrk.wg.Add(1) - go wrk.scan(linkq) - } - - var tick = time.NewTicker(500 * time.Millisecond) - var listWaitStatus []linkQueue - var isScanning = true - for isScanning { - select { - case resultq := <-wrk.resultq: - listWaitStatus = wrk.processResult(resultq, listWaitStatus) - - case <-tick.C: - wrk.wg.Wait() - if len(wrk.resultq) != 0 { - continue - } - if len(listWaitStatus) != 0 { - // There are links that still waiting for - // scanning to be completed. - continue - } - isScanning = false - } - } - wrk.result.sort() - return wrk.result, nil -} - -// scanPastResult scan only pages reported inside -// [BrokenlinksResult.BrokenLinks]. -func (wrk *brokenlinksWorker) scanPastResult() ( - result *BrokenlinksResult, err error, -) { - go func() { - for page := range wrk.pastResult.BrokenLinks { - var linkq = linkQueue{ - parentUrl: nil, - url: page, - status: http.StatusProcessing, - } - wrk.seenLink[linkq.url] = http.StatusProcessing - wrk.wg.Add(1) - go wrk.scan(linkq) - } - }() - - var tick = time.NewTicker(500 * time.Millisecond) - var listWaitStatus []linkQueue - var isScanning = true - for isScanning { - select { - case resultq := <-wrk.resultq: - listWaitStatus = wrk.processResult(resultq, listWaitStatus) - - case <-tick.C: - wrk.wg.Wait() - if len(wrk.resultq) != 0 { - continue - } - if len(listWaitStatus) != 0 { - // There are links that still waiting for - // scanning to be completed. 
- continue - } - isScanning = false - } - } - wrk.result.sort() - return wrk.result, nil -} - -// processResult the resultq contains the original URL being scanned -// and its child links. -// For example, scanning "http://example.tld" result in -// -// "http://example.tld": {status=200} -// "http://example.tld/page": {status=0} -// "http://example.tld/image.png": {status=0} -// "http://bad:domain/image.png": {status=700} -func (wrk *brokenlinksWorker) processResult( - resultq map[string]linkQueue, listWaitStatus []linkQueue, -) ( - newList []linkQueue, -) { - for _, linkq := range resultq { - if linkq.status >= http.StatusBadRequest { - wrk.markBroken(linkq) - continue - } - if linkq.status != 0 { - // linkq is the result of scan with - // non error status. - wrk.seenLink[linkq.url] = linkq.status - continue - } - - seenStatus, seen := wrk.seenLink[linkq.url] - if !seen { - wrk.seenLink[linkq.url] = http.StatusProcessing - wrk.wg.Add(1) - go wrk.scan(linkq) - continue - } - if seenStatus >= http.StatusBadRequest { - linkq.status = seenStatus - wrk.markBroken(linkq) - continue - } - if seenStatus >= http.StatusOK { - // The link has been processed and its - // not an error. - continue - } - // The link being processed by other goroutine. - linkq.status = seenStatus - newList = append(newList, linkq) - } - for _, linkq := range listWaitStatus { - seenStatus := wrk.seenLink[linkq.url] - if seenStatus >= http.StatusBadRequest { - linkq.status = seenStatus - wrk.markBroken(linkq) - continue - } - if seenStatus >= http.StatusOK { - continue - } - if seenStatus == http.StatusProcessing { - // Scanning still in progress. 
- newList = append(newList, linkq) - continue - } - } - return newList -} - -func (wrk *brokenlinksWorker) markBroken(linkq linkQueue) { - var parentUrl = linkq.parentUrl.String() - var listBroken = wrk.result.BrokenLinks[parentUrl] - var brokenLink = Broken{ - Link: linkq.url, - Code: linkq.status, - } - if linkq.errScan != nil { - brokenLink.Error = linkq.errScan.Error() - } - listBroken = append(listBroken, brokenLink) - wrk.result.BrokenLinks[parentUrl] = listBroken - - wrk.seenLink[linkq.url] = linkq.status -} - -// scan fetch the HTML page or image to check if its valid. -func (wrk *brokenlinksWorker) scan(linkq linkQueue) { - defer func() { - if wrk.opts.IsVerbose && linkq.errScan != nil { - wrk.log.Printf("error: %d %s error=%v\n", linkq.status, - linkq.url, linkq.errScan) - } - wrk.wg.Done() - }() - - var ( - resultq = map[string]linkQueue{} - httpResp *http.Response - err error - ) - httpResp, err = wrk.fetch(linkq) - if err != nil { - linkq.status = StatusBadLink - linkq.errScan = err - resultq[linkq.url] = linkq - go wrk.pushResult(resultq) - return - } - defer httpResp.Body.Close() - - linkq.status = httpResp.StatusCode - resultq[linkq.url] = linkq - - if httpResp.StatusCode >= http.StatusBadRequest { - go wrk.pushResult(resultq) - return - } - if linkq.kind == atom.Img || linkq.isExternal { - go wrk.pushResult(resultq) - return - } - - var doc *html.Node - doc, _ = html.Parse(httpResp.Body) - - // After we check the code and test for [html.Parse] there are - // no case actual cases where HTML content will return an error. - // The only possible error is when reading from body (io.Reader), and - // that is also almost impossible. 
- // - // [html.Parse]: https://go.googlesource.com/net/+/refs/tags/v0.40.0/html/parse.go#2347 - - var scanUrl *url.URL - - scanUrl, err = url.Parse(linkq.url) - if err != nil { - log.Fatal(err) - } - - var node *html.Node - for node = range doc.Descendants() { - if node.Type != html.ElementNode { - continue - } - var nodeLink *linkQueue - if node.DataAtom == atom.A { - for _, attr := range node.Attr { - if attr.Key != `href` { - continue - } - nodeLink = wrk.processLink(scanUrl, attr.Val, atom.A) - break - } - } else if node.DataAtom == atom.Img { - for _, attr := range node.Attr { - if attr.Key != `src` { - continue - } - nodeLink = wrk.processLink(scanUrl, attr.Val, atom.Img) - break - } - } else { - continue - } - if nodeLink == nil { - continue - } - _, seen := resultq[nodeLink.url] - if !seen { - nodeLink.checkExternal(wrk) - resultq[nodeLink.url] = *nodeLink - } - } - go wrk.pushResult(resultq) -} - -func (wrk *brokenlinksWorker) fetch(linkq linkQueue) ( - httpResp *http.Response, - err error, -) { - const maxRetry = 5 - var retry int - for retry < 5 { - if linkq.kind == atom.Img { - if wrk.opts.IsVerbose { - wrk.log.Printf("scan: HEAD %s\n", linkq.url) - } - httpResp, err = http.Head(linkq.url) - } else { - if wrk.opts.IsVerbose { - wrk.log.Printf("scan: GET %s\n", linkq.url) - } - httpResp, err = http.Get(linkq.url) - } - if err == nil { - return httpResp, nil - } - var errDNS *net.DNSError - if !errors.As(err, &errDNS) { - return nil, err - } - if errDNS.Timeout() { - retry++ - } - } - return nil, err -} - -func (wrk *brokenlinksWorker) processLink(parentUrl *url.URL, val string, kind atom.Atom) ( - linkq *linkQueue, -) { - if len(val) == 0 { - return nil - } - - var newUrl *url.URL - var err error - newUrl, err = url.Parse(val) - if err != nil { - return &linkQueue{ - parentUrl: parentUrl, - errScan: err, - url: val, - kind: kind, - status: StatusBadLink, - } - } - newUrl.Fragment = "" - newUrl.RawFragment = "" - - if kind == atom.A && val[0] == '#' { - 
// Ignore link to ID, like `href="#element_id"`. - return nil - } - if strings.HasPrefix(val, `http`) { - return &linkQueue{ - parentUrl: parentUrl, - url: strings.TrimSuffix(newUrl.String(), `/`), - kind: kind, - } - } - if val[0] == '/' { - // val is absolute to parent URL. - newUrl = wrk.baseUrl.JoinPath(newUrl.Path) - } else { - // val is relative to parent URL. - newUrl = parentUrl.JoinPath(`/`, newUrl.Path) - } - linkq = &linkQueue{ - parentUrl: parentUrl, - url: strings.TrimSuffix(newUrl.String(), `/`), - kind: kind, - } - return linkq -} - -func (wrk *brokenlinksWorker) pushResult(resultq map[string]linkQueue) { - var tick = time.NewTicker(100 * time.Millisecond) - for { - select { - case wrk.resultq <- resultq: - tick.Stop() - return - case <-tick.C: - } - } -} diff --git a/cmd/jarink/main.go b/cmd/jarink/main.go index cba254f..b384032 100644 --- a/cmd/jarink/main.go +++ b/cmd/jarink/main.go @@ -12,17 +12,19 @@ import ( "strings" "git.sr.ht/~shulhan/jarink" + "git.sr.ht/~shulhan/jarink/brokenlinks" ) func main() { log.SetFlags(0) - var brokenlinksOpts = jarink.BrokenlinksOptions{} + var optIsVerbose bool + var optPastResult string - flag.BoolVar(&brokenlinksOpts.IsVerbose, `verbose`, false, + flag.BoolVar(&optIsVerbose, `verbose`, false, `Print additional information while running.`) - flag.StringVar(&brokenlinksOpts.PastResultFile, `past-result`, ``, + flag.StringVar(&optPastResult, `past-result`, ``, `Scan only pages with broken links from the past JSON result.`) flag.Parse() @@ -31,15 +33,22 @@ func main() { cmd = strings.ToLower(cmd) switch cmd { case `brokenlinks`: - brokenlinksOpts.Url = flag.Arg(1) - if brokenlinksOpts.Url == "" { + var opts = brokenlinks.Options{ + IsVerbose: optIsVerbose, + PastResultFile: optPastResult, + } + + opts.Url = flag.Arg(1) + if opts.Url == "" { log.Printf(`Missing argument URL to be scanned.`) goto invalid_command } - var result *jarink.BrokenlinksResult - var err error - result, err = 
jarink.Brokenlinks(brokenlinksOpts) + var ( + result *brokenlinks.Result + err error + ) + result, err = brokenlinks.Scan(opts) if err != nil { log.Fatal(err.Error()) } diff --git a/jarink_test.go b/jarink_test.go deleted file mode 100644 index 91d38a0..0000000 --- a/jarink_test.go +++ /dev/null @@ -1,70 +0,0 @@ -// SPDX-FileCopyrightText: 2025 M. Shulhan -// SPDX-License-Identifier: GPL-3.0-only - -package jarink_test - -import ( - "log" - "net/http" - "os" - "testing" - "time" - - libnet "git.sr.ht/~shulhan/pakakeh.go/lib/net" -) - -// The test run two web servers that serve content on "testdata/web/". -// The first web server is the one that we want to scan. -// The second web server is external web server, where HTML pages should not -// be parsed. - -const testAddress = `127.0.0.1:11836` -const testExternalAddress = `127.0.0.1:11900` - -func TestMain(m *testing.M) { - log.SetFlags(0) - var httpDirWeb = http.Dir(`testdata/web`) - var fshandle = http.FileServer(httpDirWeb) - - go func() { - var mux = http.NewServeMux() - mux.Handle(`/`, fshandle) - var testServer = &http.Server{ - Addr: testAddress, - Handler: mux, - ReadTimeout: 10 * time.Second, - WriteTimeout: 10 * time.Second, - MaxHeaderBytes: 1 << 20, - } - var err = testServer.ListenAndServe() - if err != nil { - log.Fatal(err) - } - }() - go func() { - var mux = http.NewServeMux() - mux.Handle(`/`, fshandle) - var testServer = &http.Server{ - Addr: testExternalAddress, - Handler: mux, - ReadTimeout: 10 * time.Second, - WriteTimeout: 10 * time.Second, - MaxHeaderBytes: 1 << 20, - } - var err = testServer.ListenAndServe() - if err != nil { - log.Fatal(err) - } - }() - - var err = libnet.WaitAlive(`tcp`, testAddress, 5*time.Second) - if err != nil { - log.Fatal(err) - } - err = libnet.WaitAlive(`tcp`, testExternalAddress, 5*time.Second) - if err != nil { - log.Fatal(err) - } - - os.Exit(m.Run()) -} diff --git a/link_queue.go b/link_queue.go deleted file mode 100644 index 1470115..0000000 --- a/link_queue.go 
+++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-FileCopyrightText: 2025 M. Shulhan -// SPDX-License-Identifier: GPL-3.0-only - -package jarink - -import ( - "net/url" - "strings" - - "golang.org/x/net/html/atom" -) - -type linkQueue struct { - parentUrl *url.URL - - // The error from scan. - errScan error - - // url being scanned. - url string - - // kind of url, its either an anchor or image. - // It set to 0 if url is the first URL being scanned. - kind atom.Atom - - // isExternal if true the scan will issue HTTP method HEAD instead of - // GET. - isExternal bool - - // Status of link after scan, its mostly used the HTTP status code. - // 0: link is the result of scan, not processed yet. - // StatusBadLink: link is invalid, not parseable or unreachable. - // 200 - 211: OK. - // 400 - 511: Error. - status int -} - -// checkExternal set the isExternal field to be true if -// -// (1) [linkQueue.url] does not start with [brokenlinksWorker.scanUrl] -// -// (2) linkQueue is from scanPastResult, indicated by non-nil -// [brokenlinksWorker.pastResult]. -// In this case, we did not want to scan the other pages from the same scanUrl -// domain. -func (linkq *linkQueue) checkExternal(wrk *brokenlinksWorker) { - if !strings.HasPrefix(linkq.url, wrk.scanUrl.String()) { - linkq.isExternal = true - return - } - if wrk.pastResult != nil { - linkq.isExternal = true - return - } -} diff --git a/testdata/past_result.json b/testdata/past_result.json deleted file mode 100644 index ca29d35..0000000 --- a/testdata/past_result.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "broken_links": { - "http://127.0.0.1:11836/page2": [ - { - "link": "http://127.0.0.1:11836/", - "code": 404 - } - ] - } -} diff --git a/testdata/past_result.json.license b/testdata/past_result.json.license deleted file mode 100644 index 22616a9..0000000 --- a/testdata/past_result.json.license +++ /dev/null @@ -1,2 +0,0 @@ -SPDX-FileCopyrightText: 2025 M. 
Shulhan -SPDX-License-Identifier: GPL-3.0-only diff --git a/testdata/web/broken.html b/testdata/web/broken.html deleted file mode 100644 index 533e542..0000000 --- a/testdata/web/broken.html +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - diff --git a/testdata/web/gopher.png b/testdata/web/gopher.png deleted file mode 100644 index 79352be..0000000 Binary files a/testdata/web/gopher.png and /dev/null differ diff --git a/testdata/web/index.html b/testdata/web/index.html deleted file mode 100644 index 61a1f39..0000000 --- a/testdata/web/index.html +++ /dev/null @@ -1,22 +0,0 @@ - - - - - Broken page - - - Page 2 - Broken HTML - External URL - - Invalid external URL - - Invalid URL port - - Same with href to "/" - Same with href to "/page2" - - diff --git a/testdata/web/page2/index.html b/testdata/web/page2/index.html deleted file mode 100644 index ae6b4ea..0000000 --- a/testdata/web/page2/index.html +++ /dev/null @@ -1,14 +0,0 @@ - - - - - - broken relative link - Back with absolute path - Back with relative path - External URL page2 - - -- cgit v1.3