all: improve the scan by loading the existing report

On the second and subsequent scans, load the existing spdxconv.report and check whether each file has already been scanned. If the file is in the regular or binary group, skip it; otherwise scan it again.
author: Shulhan <ms@kilabit.info> 2026-01-15 01:44:47 +0700
committer: Shulhan <ms@kilabit.info> 2026-01-15 01:44:47 +0700
commit: d0cb81287185db0c0fb088da5c35004af315cd42 (patch)
tree: 581c0cee6648c82f8fe9674702fab89c6bdc2616
parent: 56f2fb3751f73ec7bc04f19a7bb36587340de298 (diff)
download: spdxconv-d0cb81287185db0c0fb088da5c35004af315cd42.tar.xz
3 files changed, 114 insertions, 84 deletions
diff --git a/report.go b/report.go
index df77fa6..d8acc0c 100644
--- a/report.go
+++ b/report.go
@@ -9,7 +9,9 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"maps"
 	"os"
+	"slices"
 	"strconv"
 	"strings"
 )
@@ -30,9 +32,9 @@ const (
 const v1FieldsPerRecord = 8
 
 type report struct {
-	listRegular []*file
-	listBinary  []*file
-	listUnknown []*file
+	listBinary  map[string]*file
+	listRegular map[string]*file
+	listUnknown map[string]*file
 }
 
 // loadReport load the [ReportFile] from the current directory.
@@ -50,7 +52,7 @@ func loadReport() (rep *report, err error) {
 	csvr.ReuseRecord = true
 	csvr.TrimLeadingSpace = true
 
-	rep = &report{}
+	rep = newReport()
 	var group string
 	var record []string
 	var f *file
@@ -91,13 +93,13 @@ func loadReport() (rep *report, err error) {
 
 		switch group {
 		case reportGroupRegular:
-			rep.listRegular = append(rep.listRegular, f)
+			rep.listRegular[f.path] = f
 		case reportGroupBinary:
 			f.group = groupBinary
-			rep.listBinary = append(rep.listBinary, f)
+			rep.listBinary[f.path] = f
 		case reportGroupUnknown:
 			f.group = groupUnknown
-			rep.listUnknown = append(rep.listUnknown, f)
+			rep.listUnknown[f.path] = f
 		}
 	next:
 		record, err = csvr.Read()
@@ -108,44 +110,67 @@ func loadReport() (rep *report, err error) {
 }
 
 func newReport() (rep *report) {
-	rep = &report{}
+	rep = &report{
+		listBinary:  map[string]*file{},
+		listRegular: map[string]*file{},
+		listUnknown: map[string]*file{},
+	}
 	return rep
 }
 
 func (rep *report) scan(conv *SPDXConv, listFile []string) (err error) {
 	var logp = `report.scan`
 	for _, file := range listFile {
+		if rep.hasScanned(file) {
+			continue
+		}
+
 		f, err := newFile(file, conv.cfg.MaxLineMatch)
 		if err != nil {
 			return fmt.Errorf(`%s: %w`, logp, err)
 		}
 		if f.group == groupBinary {
-			rep.listBinary = append(rep.listBinary, f)
+			rep.listBinary[f.path] = f
 			continue
 		}
 		f.scan(conv)
 		if f.group == groupBinary {
 			// json file should be detected as binary, since its
 			// does not have comment syntax.
-			rep.listBinary = append(rep.listBinary, f)
+			rep.listBinary[f.path] = f
 			continue
 		}
 		if f.group == groupUnknown {
-			rep.listUnknown = append(rep.listUnknown, f)
+			rep.listUnknown[f.path] = f
 			continue
 		}
 		if f.licenseID == valExist && f.copyrightText == valExist {
 			continue
 		}
 		if f.copyrightYear == `` {
-			rep.listUnknown = append(rep.listUnknown, f)
+			rep.listUnknown[f.path] = f
 			continue
 		}
-		rep.listRegular = append(rep.listRegular, f)
+		rep.listRegular[f.path] = f
 	}
 	return nil
 }
 
+// hasScanned returns true if the file has already been reported in the
+// regular or binary group.
+func (rep *report) hasScanned(path string) bool {
+	var ok bool
+	_, ok = rep.listBinary[path]
+	if ok {
+		return true
+	}
+	_, ok = rep.listRegular[path]
+	if ok {
+		return true
+	}
+	return false
+}
+
 func (rep *report) write() (err error) {
 	var buf bytes.Buffer
 
@@ -161,19 +186,13 @@ func (rep *report) write() (err error) {
 	var csvw = csv.NewWriter(&buf)
 	var record = make([]string, v1FieldsPerRecord)
 
+	var f *file
 	buf.WriteString("//\n")
 	buf.WriteString(reportMetaPrefix + reportGroupRegular + "\n")
 	buf.WriteString("//\n")
-	for _, file := range rep.listRegular {
-		record[0] = file.path
-		record[1] = file.licenseID
-		record[2] = strconv.Itoa(file.idxLicenseID)
-		record[3] = file.copyrightYear
-		record[4] = file.copyrightText
-		record[5] = strconv.Itoa(file.idxCopyrightText)
-		record[6] = file.commentPrefix
-		record[7] = file.commentSuffix
-		err = csvw.Write(record)
+	for _, key := range slices.Sorted(maps.Keys(rep.listRegular)) {
+		f = rep.listRegular[key]
+		err = csvWrite(csvw, f, record)
 		if err != nil {
 			return err
 		}
@@ -183,16 +202,9 @@ func (rep *report) write() (err error) {
 	buf.WriteString("//\n")
 	buf.WriteString(reportMetaPrefix + reportGroupBinary + "\n")
 	buf.WriteString("//\n")
-	for _, file := range rep.listBinary {
-		record[0] = file.path
-		record[1] = file.licenseID
-		record[2] = strconv.Itoa(file.idxLicenseID)
-		record[3] = file.copyrightYear
-		record[4] = file.copyrightText
-		record[5] = strconv.Itoa(file.idxCopyrightText)
-		record[6] = file.commentPrefix
-		record[7] = file.commentSuffix
-		err = csvw.Write(record)
+	for _, key := range slices.Sorted(maps.Keys(rep.listBinary)) {
+		f = rep.listBinary[key]
+		err = csvWrite(csvw, f, record)
 		if err != nil {
 			return err
 		}
@@ -202,16 +214,9 @@ func (rep *report) write() (err error) {
 	buf.WriteString("//\n")
 	buf.WriteString(reportMetaPrefix + reportGroupUnknown + "\n")
 	buf.WriteString("//\n")
-	for _, file := range rep.listUnknown {
-		record[0] = file.path
-		record[1] = file.licenseID
-		record[2] = strconv.Itoa(file.idxLicenseID)
-		record[3] = file.copyrightYear
-		record[4] = file.copyrightText
-		record[5] = strconv.Itoa(file.idxCopyrightText)
-		record[6] = file.commentPrefix
-		record[7] = file.commentSuffix
-		err = csvw.Write(record)
+	for _, key := range slices.Sorted(maps.Keys(rep.listUnknown)) {
+		f = rep.listUnknown[key]
+		err = csvWrite(csvw, f, record)
 		if err != nil {
 			return err
 		}
@@ -224,3 +229,16 @@ func (rep *report) write() (err error) {
 	}
 	return nil
 }
+
+func csvWrite(csvw *csv.Writer, f *file, record []string) (err error) {
+	record[0] = f.path
+	record[1] = f.licenseID
+	record[2] = strconv.Itoa(f.idxLicenseID)
+	record[3] = f.copyrightYear
+	record[4] = f.copyrightText
+	record[5] = strconv.Itoa(f.idxCopyrightText)
+	record[6] = f.commentPrefix
+	record[7] = f.commentSuffix
+	err = csvw.Write(record)
+	return err
+}
diff --git a/report_test.go b/report_test.go
index abe7766..a042652 100644
--- a/report_test.go
+++ b/report_test.go
@@ -18,41 +18,49 @@ func TestLoadReport(t *testing.T) {
 		t.Fatal(err)
 	}
 	exp := &report{
-		listRegular: []*file{{
-			path:          `fileR1`,
-			licenseID:     valDefault,
-			copyrightText: valDefault,
-			commentPrefix: `# `,
-		}, {
-			path:             `file R2`,
-			licenseID:        valExist,
-			idxLicenseID:     1,
-			copyrightYear:    `2024`,
-			copyrightText:    valExist,
-			idxCopyrightText: -1,
-			commentPrefix:    `// `,
-		}, {
-			path:             `fileR3`,
-			licenseID:        valMatch,
-			idxLicenseID:     -2,
-			copyrightYear:    `2000-2026`,
-			copyrightText:    valMatch,
-			idxCopyrightText: -3,
-			commentPrefix:    `<!-- `,
-			commentSuffix:    ` -->`,
-		}},
-		listBinary: []*file{{
-			path:          `fileB1`,
-			licenseID:     valDefault,
-			copyrightText: valDefault,
-			group:         groupBinary,
-		}},
-		listUnknown: []*file{{
-			path:          `fileU1`,
-			licenseID:     valDefault,
-			copyrightText: valDefault,
-			group:         groupUnknown,
-		}},
+		listRegular: map[string]*file{
+			`fileR1`: &file{
+				path:          `fileR1`,
+				licenseID:     valDefault,
+				copyrightText: valDefault,
+				commentPrefix: `# `,
+			},
+			`file R2`: &file{
+				path:             `file R2`,
+				licenseID:        valExist,
+				idxLicenseID:     1,
+				copyrightYear:    `2024`,
+				copyrightText:    valExist,
+				idxCopyrightText: -1,
+				commentPrefix:    `// `,
+			},
+			`fileR3`: &file{
+				path:             `fileR3`,
+				licenseID:        valMatch,
+				idxLicenseID:     -2,
+				copyrightYear:    `2000-2026`,
+				copyrightText:    valMatch,
+				idxCopyrightText: -3,
+				commentPrefix:    `<!-- `,
+				commentSuffix:    ` -->`,
+			},
+		},
+		listBinary: map[string]*file{
+			`fileB1`: &file{
+				path:          `fileB1`,
+				licenseID:     valDefault,
+				copyrightText: valDefault,
+				group:         groupBinary,
+			},
+		},
+		listUnknown: map[string]*file{
+			`fileU1`: &file{
+				path:          `fileU1`,
+				licenseID:     valDefault,
+				copyrightText: valDefault,
+				group:         groupUnknown,
+			},
+		},
 	}
 	test.Assert(t, workDir, exp, got)
 }
diff --git a/spdxconv.go b/spdxconv.go
index 778ec51..64f138f 100644
--- a/spdxconv.go
+++ b/spdxconv.go
@@ -80,7 +80,11 @@ func Scan(path string) (err error) {
 		return fmt.Errorf(`%s: %w`, logp, err)
 	}
 
-	rep := newReport()
+	var rep *report
+	rep, err = loadReport()
+	if err != nil {
+		return fmt.Errorf(`%s: %w`, logp, err)
+	}
 	err = rep.scan(conv, listFile)
 	if err != nil {
 		return fmt.Errorf(`%s: %w`, logp, err)
@@ -114,18 +118,18 @@ func Apply() (err error) {
 		return fmt.Errorf(`%s: %w`, logp, err)
 	}
 
-	var listFail []*file
+	var listFail = make(map[string]*file)
 	for _, f := range rep.listRegular {
 		err = f.apply(conv)
 		if err != nil {
-			listFail = append(listFail, f)
+			listFail[f.path] = f
 			log.Printf(`%s: %s`, logp, err)
 			continue
 		}
 
 		err = f.write()
 		if err != nil {
-			listFail = append(listFail, f)
+			listFail[f.path] = f
 			log.Printf(`%s: %s`, logp, err)
 			continue
 		}
@@ -138,12 +142,12 @@ func Apply() (err error) {
 	fmt.Fprintf(&buf, "SPDX-FileCopyrightText: %s\n", conv.cfg.FileCopyrightText)
 	// REUSE-IgnoreEnd
 
-	listFail = nil
+	listFail = make(map[string]*file)
 	for _, f := range rep.listBinary {
 		pathLicense := f.path + suffixLicense
 		err = os.WriteFile(pathLicense, buf.Bytes(), 0600)
 		if err != nil {
-			listFail = append(listFail, f)
+			listFail[f.path] = f
 			log.Printf(`%s: failed to write %s`, logp, pathLicense)
 			continue
 		}
author	Shulhan <ms@kilabit.info>	2026-01-15 01:44:47 +0700
committer	Shulhan <ms@kilabit.info>	2026-01-15 01:44:47 +0700
commit	d0cb81287185db0c0fb088da5c35004af315cd42 (patch)
tree	581c0cee6648c82f8fe9674702fab89c6bdc2616
parent	56f2fb3751f73ec7bc04f19a7bb36587340de298 (diff)
download	spdxconv-d0cb81287185db0c0fb088da5c35004af315cd42.tar.xz