diff options
| author | Shulhan <ms@kilabit.info> | 2026-01-13 21:08:26 +0700 |
|---|---|---|
| committer | Shulhan <ms@kilabit.info> | 2026-01-14 04:10:44 +0700 |
| commit | a2bfa5ec9539f9063e33519e245ae00193783750 (patch) | |
| tree | dd15bf22d9b7df7b2079103fbd248c0eb717b59e | |
| parent | 3bc8f7ea570c726d88f4e0d31910b9eec999ce46 (diff) | |
| download | spdxconv-a2bfa5ec9539f9063e33519e245ae00193783750.tar.xz | |
all: get the copyright year from git history
If the line that matches the pattern in match-copyright does not contain
a year, or there is no match, try to get the year from the first commit of
the file using the "git log --follow ..." command.
If there is no commit history or the project is not using git, use the
default copyright year from the configuration.
| -rw-r--r-- | README.md | 13 | ||||
| -rw-r--r-- | config.go | 14 | ||||
| -rw-r--r-- | file.go | 64 | ||||
| -rw-r--r-- | go.mod | 2 | ||||
| -rw-r--r-- | go.sum | 4 | ||||
| -rw-r--r-- | report_test.go | 3 | ||||
| -rw-r--r-- | scm.go | 10 | ||||
| -rw-r--r-- | spdxconv.go | 2 | ||||
| -rw-r--r-- | spdxconv_test.go | 20 | ||||
| -rw-r--r-- | testdata/Apply_test.txt | 12 | ||||
| -rw-r--r-- | testdata/loadConfig/config_exists/spdxconv.cfg | 14 | ||||
| -rw-r--r-- | testdata/loadReport/ok/spdxconv.report | 6 | ||||
| -rw-r--r-- | testdata/scan/no_copyright_year.md | 7 | ||||
| -rw-r--r-- | testdata/scan/spdxconv.cfg | 16 | ||||
| -rw-r--r-- | testdata/scan/test.sh | 6 | ||||
| -rw-r--r-- | testdata/scan/test.sql | 5 |
16 files changed, 131 insertions, 67 deletions
@@ -19,6 +19,7 @@ Features, regex - Customizable pattern for searching and capturing existing copyright year, author, and contact through regex +- Derive the copyright year from the first commit in git history. ## Background @@ -210,7 +211,13 @@ The match-copyright section define the pattern to match with old copyright text. The regex must contains named group to capture copyright year, author, and contact. -For example, given the following old copyright text, + +If no copyright year found on the file, program will derive the year from +the date of the first commit in history of the file using the Source Code +Management (SCM). +In git SCM, it will run "git log --follow file". + +Example, given the following old copyright text, ``` Copyright 2022, John Doe <john.doe@email>. All rights reserved. @@ -258,7 +265,7 @@ path = { unicode_char } license_id = "default" | "exist" | "match" idx_license_id = 1 * decimal_digit -year = "unknown" | 4 * decimal_digit +year = 4 * decimal_digit ("," year) | 4 * decimal_digit "-" 4*decimal_digit copyright_id = "default" | "exist" | "match" @@ -285,8 +292,8 @@ found at the bottom. The `year` column define the copyright year for the work. The value is either, -- unknown - program cannot detect year - YYYY - single year, for example 2026 +- YYYY,YYYY,... - list of year, separated by comma - YYYY-YYYY - range of years, for example 2000-2026 The `copyright_id` define the author and contact. 
@@ -54,16 +54,16 @@ prefix = "%" pattern = "^.*\\.(apk|app|bz2|csv|doc|docx|exe|gif|gz|jpeg|jpg|json|pdf|png|ppt|pptx|svg|svgz|tar|tgz|xls|xlsx|zip)$" [match-license] -pattern = "^(//+|#+|/\\*+|<!--+)?\\s*(.*)governed by a BSD-style(.*)$" +pattern = "^(//+|#+|/\\*+|<!--+|--+)?\\s*(.*)governed by a BSD-style(.*)$" license_identifier = BSD-3-Clause -delete_line_before = "^(//+|#+|/\\*+|<!--+)$" -delete_line_after = "^(//+|#+|/\\*+|<!--+)?\\s*license that can(.*)$" -delete_line_after = "^(//+|#+|\\*+/|--+>)$" +delete_line_before = "^(//+|#+|/\\*+|<!--+|--+)$" +delete_line_after = "^(//+|#+|/\\*+|<!--+|--+)?\\s*license that can(.*)$" +delete_line_after = "^(//+|#+|\\*+/|--+>|--+)$" [match-copyright] -pattern = "^(//+|#+|/\\*+|<!--+)?\\s*Copyright\\s+(?<year>\\d{4}),?\\s+(?<author>.*)\\s+<(?<contact>.*)>.*$" -delete_line_before = "^(//+|#+|/\\*+|<!--+)$" -delete_line_after = "^(//+|#+|\\*+/|--+>)$" +pattern = "^(//+|#+|/\\*+|<!--+|--+)?\\s*Copyright\\s+(?<year>\\d{4}),?\\s+(?<author>.*)\\s+<(?<contact>.*)>.*$" +delete_line_before = "^(//+|#+|/\\*+|<!--+|--+)$" +delete_line_after = "^(//+|#+|\\*+/|--+>|--+)$" ` type config struct { @@ -6,11 +6,14 @@ package spdxconv import ( "bytes" "fmt" + "log" "os" "path/filepath" "regexp" "slices" + "strconv" "strings" + "time" libos "git.sr.ht/~shulhan/pakakeh.go/lib/os" ) @@ -20,17 +23,16 @@ const ( valDefault = `default` // Use the default value from configuration. valExist = `exist` // The license/copyright exist in the file. valMatch = `match` // One of the pattern in match-xxx found in file. - valUnknown = `unknown` ) // REUSE-IgnoreStart // reLicenseID regex to detect SPDX license identifier with or without // comment prefix. -var reLicenseID = regexp.MustCompile(`^(//+|#+|/\*+|<!--+)?\s?SPDX-License-Identifier:\s*(.*)\s*$`) +var reLicenseID = regexp.MustCompile(`^(//+|#+|/\*+|<!--+|--+)?\s?SPDX-License-Identifier:\s*(.*)\s*$`) // reCopyrightText regex to detect SPDX copyright text. 
-var reCopyrightText = regexp.MustCompile(`^(//+|#+|/\*+|<!--+)?\s?SPDX-FileCopyrightText:\s*(.*)\s*$`) +var reCopyrightText = regexp.MustCompile(`^(//+|#+|/\*+|<!--+|--+)?\s?SPDX-FileCopyrightText:\s*(.*)\s*$`) // REUSE-IgnoreEnd @@ -41,8 +43,6 @@ type file struct { path string // commentPrefix used as prefix to SPDX identifier. - // The comment prefix is detected automatically from the first N - // lines of file. commentPrefix string commentSuffix string @@ -77,11 +77,8 @@ func newFile(path string, maxLine int) (f *file, err error) { } f = &file{ - path: path, - licenseID: valDefault, - copyrightYear: valUnknown, - copyrightText: valDefault, - isBinary: libos.IsBinaryStream(content), + path: path, + isBinary: libos.IsBinaryStream(content), } if f.isBinary { return f, nil @@ -125,6 +122,7 @@ func (f *file) scan(conv *SPDXConv) { } f.scanLicenseID(conv) f.scanCopyrightText(conv) + f.getYearFromSCM(conv) } // detectComment get comment prefix and suffix using the "match-file-comment" @@ -149,6 +147,7 @@ func (f *file) detectComment(cfg *config) { } func (f *file) scanLicenseID(conv *SPDXConv) { + f.licenseID = valDefault for _, cml := range conv.cfg.MatchLicense { for x, line := range f.topLines { if reLicenseID.Match(line) { @@ -175,10 +174,10 @@ func (f *file) scanLicenseID(conv *SPDXConv) { } } } - f.licenseID = valDefault } func (f *file) scanCopyrightText(conv *SPDXConv) { + f.copyrightText = valDefault for _, cmc := range conv.cfg.MatchCopyright { for x, line := range f.topLines { if reCopyrightText.Match(line) { @@ -207,7 +206,45 @@ func (f *file) scanCopyrightText(conv *SPDXConv) { } } } - f.copyrightText = valDefault +} + +func (f *file) getYearFromSCM(conv *SPDXConv) { + if f.copyrightText == valExist { + // SPDX for copyright exist, skip it. Even though maybe there + // is no year. 
+ return + } + if f.copyrightYear != `` { + return + } + var logp = `getYearFromSCM` + + f.copyrightYear = conv.cfg.CopyrightYear + + logs, err := conv.scm.LogFollow(f.path, ``) + if err != nil { + log.Printf(`%s %s: %s`, logp, f.path, err) + return + } + if len(logs) == 0 { + log.Printf(`%s %s: empty commit history`, logp, f.path) + return + } + + first_commit := logs[len(logs)-1] + fields := strings.Split(first_commit, `,`) + if len(fields) < 2 { + log.Printf(`%s %s: cannot parse commit log %q`, logp, f.path, first_commit) + return + } + + sec, err := strconv.ParseInt(fields[1], 10, 64) + if err != nil { + log.Printf(`%s %s: cannot parse commit timestamp: %s`, logp, f.path, err) + return + } + + f.copyrightYear = strconv.Itoa(time.Unix(sec, 0).Year()) } // apply the SPDX identifier to file. @@ -242,9 +279,6 @@ func (f *file) apply(conv *SPDXConv) (err error) { } f.topLines = slices.Insert(f.topLines, f.idxLicenseID, rawline) - if f.copyrightYear == valUnknown { - f.copyrightYear = conv.cfg.CopyrightYear - } if f.copyrightYear != `` { f.copyrightYear += ` ` } @@ -5,7 +5,7 @@ module git.sr.ht/~shulhan/spdxconv go 1.24.0 -require git.sr.ht/~shulhan/pakakeh.go v0.60.3-0.20260111142917-cdfebe3f55dc +require git.sr.ht/~shulhan/pakakeh.go v0.60.3-0.20260113140641-d18ec5d8635b require ( golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93 // indirect @@ -1,5 +1,5 @@ -git.sr.ht/~shulhan/pakakeh.go v0.60.3-0.20260111142917-cdfebe3f55dc h1:9oi3sLX/kyePdZyCVmx0qoIAmcPy63v8U1D3Br23gLc= -git.sr.ht/~shulhan/pakakeh.go v0.60.3-0.20260111142917-cdfebe3f55dc/go.mod h1:1MkKXbLZRHTcnheeSEbRpGztkym4Yxzh90ep+jCxbDc= +git.sr.ht/~shulhan/pakakeh.go v0.60.3-0.20260113140641-d18ec5d8635b h1:tPRWSIjzQTWoQJ4WG2zJFP1GybZuvpkniKjjJc4gvJ8= +git.sr.ht/~shulhan/pakakeh.go v0.60.3-0.20260113140641-d18ec5d8635b/go.mod h1:1MkKXbLZRHTcnheeSEbRpGztkym4Yxzh90ep+jCxbDc= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod 
h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93 h1:fQsdNF2N+/YewlRZiricy4P1iimyPKZ/xwniHj8Q2a0= diff --git a/report_test.go b/report_test.go index db43739..2b31436 100644 --- a/report_test.go +++ b/report_test.go @@ -21,7 +21,6 @@ func TestLoadReport(t *testing.T) { listRegular: []*file{{ path: `fileR1`, licenseID: valDefault, - copyrightYear: valUnknown, copyrightText: valDefault, commentPrefix: `# `, }, { @@ -45,14 +44,12 @@ func TestLoadReport(t *testing.T) { listBinary: []*file{{ path: `fileB1`, licenseID: valDefault, - copyrightYear: valUnknown, copyrightText: valDefault, isBinary: true, }}, listUnknown: []*file{{ path: `fileU1`, licenseID: valDefault, - copyrightYear: valUnknown, copyrightText: valDefault, isUnknown: true, }}, @@ -6,12 +6,20 @@ package spdxconv // sourceCodeManagement define the interface for SCM tools. type sourceCodeManagement interface { IsIgnored(path string) bool + LogFollow(path, format string) ([]string, error) } // noSCM is a type to indicate working directory without SCM. // It always return false on IsIgnored. 
-type noSCM struct{} +type noSCM struct { +} + +var defaultNoSCM = &noSCM{} func (scm *noSCM) IsIgnored(path string) bool { return false } + +func (scm *noSCM) LogFollow(path, format string) ([]string, error) { + return nil, nil +} diff --git a/spdxconv.go b/spdxconv.go index f3da9c1..466b0ee 100644 --- a/spdxconv.go +++ b/spdxconv.go @@ -246,7 +246,7 @@ func (conv *SPDXConv) scanForSCM(dir, curDir string) (err error) { return nil } - conv.scm = &noSCM{} + conv.scm = defaultNoSCM return nil } diff --git a/spdxconv_test.go b/spdxconv_test.go index 11a2a72..842102c 100644 --- a/spdxconv_test.go +++ b/spdxconv_test.go @@ -64,23 +64,23 @@ func TestInit(t *testing.T) { Pattern: `^.*\.(apk|app|bz2|csv|doc|docx|exe|gif|gz|jpeg|jpg|json|pdf|png|ppt|pptx|svg|svgz|tar|tgz|xls|xlsx|zip)$`, }}, MatchLicense: []*matchLicense{{ - Pattern: `^(//+|#+|/\*+|<!--+)?\s*(.*)governed by a BSD-style(.*)$`, + Pattern: `^(//+|#+|/\*+|<!--+|--+)?\s*(.*)governed by a BSD-style(.*)$`, LicenseIdentifier: `BSD-3-Clause`, DeleteLineBefore: []string{ - `^(//+|#+|/\*+|<!--+)$`, + `^(//+|#+|/\*+|<!--+|--+)$`, }, DeleteLineAfter: []string{ - `^(//+|#+|/\*+|<!--+)?\s*license that can(.*)$`, - `^(//+|#+|\*+/|--+>)$`, + `^(//+|#+|/\*+|<!--+|--+)?\s*license that can(.*)$`, + `^(//+|#+|\*+/|--+>|--+)$`, }, }}, MatchCopyright: []*matchCopyright{{ - Pattern: `^(//+|#+|/\*+|<!--+)?\s*Copyright\s+(?<year>\d{4}),?\s+(?<author>.*)\s+<(?<contact>.*)>.*$`, + Pattern: `^(//+|#+|/\*+|<!--+|--+)?\s*Copyright\s+(?<year>\d{4}),?\s+(?<author>.*)\s+<(?<contact>.*)>.*$`, DeleteLineBefore: []string{ - `^(//+|#+|/\*+|<!--+)$`, + `^(//+|#+|/\*+|<!--+|--+)$`, }, DeleteLineAfter: []string{ - `^(//+|#+|\*+/|--+>)$`, + `^(//+|#+|\*+/|--+>|--+)$`, }, }}, } @@ -132,9 +132,11 @@ func TestScan(t *testing.T) { //spdxconv:version:v1 //spdxconv:header:path,license_id,idx_license_id,year,copyright_id,idx_copyright_id //spdxconv:regular +no_copyright_year.md,match,2,2026,default,0,<!-- ," -->" test.go,match,1,2022,match,0,// , 
test.html,match,4,2022,match,1,<!-- ," -->" -test.sh,match,1,2022,match,0,# , +test.sh,default,0,2026,default,0,# , +test.sql,match,1,2022,match,0,-- , //spdxconv:binary //spdxconv:unknown` @@ -295,9 +297,11 @@ func TestSPDXConv_scanDir(t *testing.T) { exp: []string{ `.gitignore`, `a/b/.gitignore`, + `no_copyright_year.md`, `test.go`, `test.html`, `test.sh`, + `test.sql`, `with_spdx.go`, }, }, { diff --git a/testdata/Apply_test.txt b/testdata/Apply_test.txt index 8107ee8..a8c3e6d 100644 --- a/testdata/Apply_test.txt +++ b/testdata/Apply_test.txt @@ -48,14 +48,16 @@ prefix = "%" pattern = "^.*\\.(apk|app|bz2|csv|doc|docx|exe|gif|gz|jpeg|jpg|json|pdf|png|ppt|pptx|svg|svgz|tar|tgz|xls|xlsx|zip)$" [match-license] -pattern = "^(//+|#+|/\\*+|<!--+)?\\s*(.*)governed by a BSD-style(.*)$" +pattern = "^(//+|#+|/\\*+|<!--+|--+)?\\s*(.*)governed by a BSD-style(.*)$" license_identifier = BSD-3-Clause -delete_line_after = "^(//+|#+|/\\*+|<!--+)?\\s*license that can(.*)$" -delete_line_after = "^(//+|#+|\\*+/|--+>)$" +delete_line_before = "^(//+|#+|/\\*+|<!--+|--+)$" +delete_line_after = "^(//+|#+|/\\*+|<!--+|--+)?\\s*license that can(.*)$" +delete_line_after = "^(//+|#+|\\*+/|--+>|--+)$" [match-copyright] -pattern = "^(//+|#+|/\\*+|<!--+)?\\s*Copyright\\s+(?<year>\\d{4}),?\\s+(?<author>.*)\\s+<(?<contact>.*)>.*$" -delete_line_before = "^(//+|#+|/\\*+|<!--+)$" +pattern = "^(//+|#+|/\\*+|<!--+|--+)?\\s*Copyright\\s+(?<year>\\d{4}),?\\s+(?<author>.*)\\s+<(?<contact>.*)>.*$" +delete_line_before = "^(//+|#+|/\\*+|<!--+|--+)$" +delete_line_after = "^(//+|#+|\\*+/|--+>|--+)$" >>> without_spdx_license_id.go 1 diff --git a/testdata/loadConfig/config_exists/spdxconv.cfg b/testdata/loadConfig/config_exists/spdxconv.cfg index 1f7ff68..b92f0c8 100644 --- a/testdata/loadConfig/config_exists/spdxconv.cfg +++ b/testdata/loadConfig/config_exists/spdxconv.cfg @@ -43,13 +43,13 @@ prefix = "%" pattern = 
"^.*\\.(apk|app|bz2|csv|doc|docx|exe|gif|gz|jpeg|jpg|json|pdf|png|ppt|pptx|svg|svgz|tar|tgz|xls|xlsx|zip)$" [match-license] -pattern = "^(//+|#+|/\\*+|<!--+)?\\s*(.*)governed by a BSD-style(.*)$" +pattern = "^(//+|#+|/\\*+|<!--+|--+)?\\s*(.*)governed by a BSD-style(.*)$" license_identifier = BSD-3-Clause -delete_line_before = "^(//+|#+|/\\*+|<!--+)$" -delete_line_after = "^(//+|#+|/\\*+|<!--+)?\\s*license that can(.*)$" -delete_line_after = "^(//+|#+|\\*+/|--+>)$" +delete_line_before = "^(//+|#+|/\\*+|<!--+|--+)$" +delete_line_after = "^(//+|#+|/\\*+|<!--+|--+)?\\s*license that can(.*)$" +delete_line_after = "^(//+|#+|\\*+/|--+>|--+)$" [match-copyright] -pattern = "^(//+|#+|/\\*+|<!--+)?\\s*Copyright\\s+(?<year>\\d{4}),?\\s+(?<author>.*)\\s+<(?<contact>.*)>.*$" -delete_line_before = "^(//+|#+|/\\*+|<!--+)$" -delete_line_after = "^(//+|#+|\\*+/|--+>)$" +pattern = "^(//+|#+|/\\*+|<!--+|--+)?\\s*Copyright\\s+(?<year>\\d{4}),?\\s+(?<author>.*)\\s+<(?<contact>.*)>.*$" +delete_line_before = "^(//+|#+|/\\*+|<!--+|--+)$" +delete_line_after = "^(//+|#+|\\*+/|--+>|--+)$" diff --git a/testdata/loadReport/ok/spdxconv.report b/testdata/loadReport/ok/spdxconv.report index ef027b8..cfb222d 100644 --- a/testdata/loadReport/ok/spdxconv.report +++ b/testdata/loadReport/ok/spdxconv.report @@ -3,14 +3,14 @@ // comment //spdxconv:regular -fileR1,default,0,unknown,default,0,# , +fileR1,default,0,,default,0,# , file R2,exist,1,2024,exist,-1,// , fileR3,match,-2,2000-2026,match,-3,<!-- ," -->" //spdxconv:binary -fileB1,default,0,unknown,default,0,, +fileB1,default,0,,default,0,, // comment //spdxconv:unknown // comment -fileU1,default,0,unknown,default,0,, +fileU1,default,0,,default,0,, diff --git a/testdata/scan/no_copyright_year.md b/testdata/scan/no_copyright_year.md new file mode 100644 index 0000000..be47d2e --- /dev/null +++ b/testdata/scan/no_copyright_year.md @@ -0,0 +1,7 @@ +<!-- +Copyright Shulhan <ms@kilabit.info>. All rights reserved. 
+Use of this source code is governed by a BSD-style +license that can be found in the LICENSE file. +--> + +The copyrightYear will use the default one. diff --git a/testdata/scan/spdxconv.cfg b/testdata/scan/spdxconv.cfg index 1cdfefb..082d482 100644 --- a/testdata/scan/spdxconv.cfg +++ b/testdata/scan/spdxconv.cfg @@ -3,7 +3,7 @@ [default] license_identifier = GPL-3.0-only -copyright_year = 2026 +copyright_year = 2025 file_copyright_text = Author <contact@email.local> max_line_match = 10 @@ -48,13 +48,13 @@ pattern = "^.*\\.(apk|app|bz2|csv|doc|docx|exe|gif|gz|jpeg|jpg|json|pdf|png|ppt| pattern = "^.*\\.(apk|app|bz2|csv|doc|docx|exe|gif|gz|jpeg|jpg|json|pdf|png|ppt|pptx|svg|svgz|tar|tgz|xls|xlsx|zip)$" [match-license] -pattern = "^(//+|#+|/\\*+|<!--+)?\\s*(.*)governed by a BSD-style(.*)$" +pattern = "^(//+|#+|/\\*+|<!--+|--+)?\\s*(.*)governed by a BSD-style(.*)$" license_identifier = BSD-3-Clause -delete_line_before = "^(//+|#+|/\\*+|<!--+)$" -delete_line_after = "^(//+|#+|/\\*+|<!--+)?\\s*license that can(.*)$" -delete_line_after = "^(//+|#+|\\*+/|--+>)$" +delete_line_before = "^(//+|#+|/\\*+|<!--+|--+)$" +delete_line_after = "^(//+|#+|/\\*+|<!--+|--+)?\\s*license that can(.*)$" +delete_line_after = "^(//+|#+|\\*+/|--+>|--+)$" [match-copyright] -pattern = "^(//+|#+|/\\*+|<!--+)?\\s*Copyright\\s+(?<year>\\d{4}),?\\s+(?<author>.*)\\s+<(?<contact>.*)>.*$" -delete_line_before = "^(//+|#+|/\\*+|<!--+)$" -delete_line_after = "^(//+|#+|\\*+/|--+>)$" +pattern = "^(//+|#+|/\\*+|<!--+|--+)?\\s*Copyright\\s+(?<year>\\d{4}),?\\s+(?<author>.*)\\s+<(?<contact>.*)>.*$" +delete_line_before = "^(//+|#+|/\\*+|<!--+|--+)$" +delete_line_after = "^(//+|#+|\\*+/|--+>|--+)$" diff --git a/testdata/scan/test.sh b/testdata/scan/test.sh index 8fd6949..13b4657 100644 --- a/testdata/scan/test.sh +++ b/testdata/scan/test.sh @@ -1,3 +1,3 @@ -# Copyright 2022, Shulhan <ms@kilabit.info>. All rights reserved. 
-# Use of this source code is governed by a BSD-style -# license that can be found in the LICENSE file.
\ No newline at end of file +#!/bin/sh + +# Test getYearFromSCM. diff --git a/testdata/scan/test.sql b/testdata/scan/test.sql new file mode 100644 index 0000000..86d55b3 --- /dev/null +++ b/testdata/scan/test.sql @@ -0,0 +1,5 @@ +-- Copyright 2022, Shulhan <ms@kilabit.info>. All rights reserved. +-- Use of this source code is governed by a BSD-style +-- license that can be found in the LICENSE file. + +-- comment |
