From 6d95e5a4ff9f2ffa12ca225974d0ce45fd2504d4 Mon Sep 17 00:00:00 2001
From: Roger Peppe <rogpeppe@gmail.com>
Date: Thu, 11 Feb 2021 13:54:45 +0000
Subject: encoding/csv: add FieldPos method
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This enables a consumer of a CSV to find out the position
of a CSV field without implementing an intermediate buffer.
This is useful to produce good higher level error messages when
the CSV syntax is OK but the field values don't match expectations.

This also changes the existing semantics of the `ParseError.Column`
field to bring it in line with precedent elsewhere in the Go
standard library (notably go/token.Position) - the column is
now 1-based and indicates a byte count rather than a rune count,
and the error position reporting at the end of a last line without
a newline is now fixed.

This change has some impact on performance:

```
name                                     old time/op    new time/op    delta
Read-8                                     2.14µs ± 0%    2.15µs ± 0%    ~     (p=0.056 n=5+5)
ReadWithFieldsPerRecord-8                  2.15µs ± 2%    2.15µs ± 1%    ~     (p=0.151 n=5+5)
ReadWithoutFieldsPerRecord-8               2.15µs ± 0%    2.15µs ± 0%  +0.37%  (p=0.024 n=5+5)
ReadLargeFields-8                          3.55µs ± 2%    3.59µs ± 0%    ~     (p=0.206 n=5+5)
ReadReuseRecord-8                          1.18µs ± 1%    1.22µs ± 1%  +2.93%  (p=0.008 n=5+5)
ReadReuseRecordWithFieldsPerRecord-8       1.18µs ± 0%    1.21µs ± 0%  +2.54%  (p=0.008 n=5+5)
ReadReuseRecordWithoutFieldsPerRecord-8    1.18µs ± 0%    1.22µs ± 1%  +3.66%  (p=0.008 n=5+5)
ReadReuseRecordLargeFields-8               2.53µs ± 1%    2.57µs ± 1%  +1.70%  (p=0.008 n=5+5)
Write-8                                    1.02µs ± 1%    1.01µs ± 0%  -1.18%  (p=0.016 n=5+4)
```

Fixes #44221.

Change-Id: Id37c50fc396024eef406c5bad45380ecd414f5ea
Reviewed-on: https://go-review.googlesource.com/c/go/+/291290
Run-TryBot: Ian Lance Taylor <iant@golang.org>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
Trust: Paul Jolly <paul@myitcv.org.uk>
---
 src/encoding/csv/reader.go | 71 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 60 insertions(+), 11 deletions(-)

(limited to 'src/encoding/csv/reader.go')

diff --git a/src/encoding/csv/reader.go b/src/encoding/csv/reader.go
index c40aa506b0..f860f4f25f 100644
--- a/src/encoding/csv/reader.go
+++ b/src/encoding/csv/reader.go
@@ -66,7 +66,7 @@ import (
 type ParseError struct {
 	StartLine int   // Line where the record starts
 	Line      int   // Line where the error occurred
-	Column    int   // Column (rune index) where the error occurred
+	Column    int   // Column (1-based byte index) where the error occurred
 	Err       error // The actual error
 }
 
@@ -162,6 +162,10 @@ type Reader struct {
 	// The i'th field ends at offset fieldIndexes[i] in recordBuffer.
 	fieldIndexes []int
 
+	// fieldPositions is an index of field positions for the
+	// last record returned by Read.
+	fieldPositions []position
+
 	// lastRecord is a record cache and only used when ReuseRecord == true.
 	lastRecord []string
 }
@@ -192,6 +196,25 @@ func (r *Reader) Read() (record []string, err error) {
 	return record, err
 }
 
+// FieldPos returns the line and column corresponding to
+// the start of the field with the given index in the slice most recently
+// returned by Read. Numbering of lines and columns starts at 1;
+// columns are counted in bytes, not runes.
+//
+// If this is called with an out-of-bounds index, it panics.
+func (r *Reader) FieldPos(field int) (line, column int) {
+	if field < 0 || field >= len(r.fieldPositions) {
+		panic("out of range index passed to FieldPos")
+	}
+	p := &r.fieldPositions[field]
+	return p.line, p.col
+}
+
+// pos holds the position of a field in the current line.
+type position struct {
+	line, col int
+}
+
 // ReadAll reads all the remaining records from r.
 // Each record is a slice of fields.
 // A successful call returns err == nil, not err == io.EOF. Because ReadAll is
@@ -260,7 +283,7 @@ func (r *Reader) readRecord(dst []string) ([]string, error) {
 	}
 
 	// Read line (automatically skipping past empty lines and any comments).
-	var line, fullLine []byte
+	var line []byte
 	var errRead error
 	for errRead == nil {
 		line, errRead = r.readLine()
@@ -272,7 +295,6 @@ func (r *Reader) readRecord(dst []string) ([]string, error) {
 			line = nil
 			continue // Skip empty lines
 		}
-		fullLine = line
 		break
 	}
 	if errRead == io.EOF {
@@ -286,10 +308,20 @@ func (r *Reader) readRecord(dst []string) ([]string, error) {
 	recLine := r.numLine // Starting line for record
 	r.recordBuffer = r.recordBuffer[:0]
 	r.fieldIndexes = r.fieldIndexes[:0]
+	r.fieldPositions = r.fieldPositions[:0]
+	pos := position{line: r.numLine, col: 1}
 parseField:
 	for {
 		if r.TrimLeadingSpace {
-			line = bytes.TrimLeftFunc(line, unicode.IsSpace)
+			i := bytes.IndexFunc(line, func(r rune) bool {
+				return !unicode.IsSpace(r)
+			})
+			if i < 0 {
+				i = len(line)
+				pos.col -= lengthNL(line)
+			}
+			line = line[i:]
+			pos.col += i
 		}
 		if len(line) == 0 || line[0] != '"' {
 			// Non-quoted string field
@@ -303,48 +335,56 @@ parseField:
 			// Check to make sure a quote does not appear in field.
 			if !r.LazyQuotes {
 				if j := bytes.IndexByte(field, '"'); j >= 0 {
-					col := utf8.RuneCount(fullLine[:len(fullLine)-len(line[j:])])
+					col := pos.col + j
 					err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrBareQuote}
 					break parseField
 				}
 			}
 			r.recordBuffer = append(r.recordBuffer, field...)
 			r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
+			r.fieldPositions = append(r.fieldPositions, pos)
 			if i >= 0 {
 				line = line[i+commaLen:]
+				pos.col += i + commaLen
 				continue parseField
 			}
 			break parseField
 		} else {
 			// Quoted string field
+			fieldPos := pos
 			line = line[quoteLen:]
+			pos.col += quoteLen
 			for {
 				i := bytes.IndexByte(line, '"')
 				if i >= 0 {
 					// Hit next quote.
 					r.recordBuffer = append(r.recordBuffer, line[:i]...)
 					line = line[i+quoteLen:]
+					pos.col += i + quoteLen
 					switch rn := nextRune(line); {
 					case rn == '"':
 						// `""` sequence (append quote).
 						r.recordBuffer = append(r.recordBuffer, '"')
 						line = line[quoteLen:]
+						pos.col += quoteLen
 					case rn == r.Comma:
 						// `",` sequence (end of field).
 						line = line[commaLen:]
+						pos.col += commaLen
 						r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
+						r.fieldPositions = append(r.fieldPositions, fieldPos)
 						continue parseField
 					case lengthNL(line) == len(line):
 						// `"\n` sequence (end of line).
 						r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
+						r.fieldPositions = append(r.fieldPositions, fieldPos)
 						break parseField
 					case r.LazyQuotes:
 						// `"` sequence (bare quote).
 						r.recordBuffer = append(r.recordBuffer, '"')
 					default:
 						// `"*` sequence (invalid non-escaped quote).
-						col := utf8.RuneCount(fullLine[:len(fullLine)-len(line)-quoteLen])
-						err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote}
+						err = &ParseError{StartLine: recLine, Line: r.numLine, Column: pos.col - quoteLen, Err: ErrQuote}
 						break parseField
 					}
 				} else if len(line) > 0 {
@@ -353,19 +393,23 @@ parseField:
 					if errRead != nil {
 						break parseField
 					}
+					pos.col += len(line)
 					line, errRead = r.readLine()
+					if len(line) > 0 {
+						pos.line++
+						pos.col = 1
+					}
 					if errRead == io.EOF {
 						errRead = nil
 					}
-					fullLine = line
 				} else {
 					// Abrupt end of file (EOF or error).
 					if !r.LazyQuotes && errRead == nil {
-						col := utf8.RuneCount(fullLine)
-						err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote}
+						err = &ParseError{StartLine: recLine, Line: pos.line, Column: pos.col, Err: ErrQuote}
 						break parseField
 					}
 					r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
+					r.fieldPositions = append(r.fieldPositions, fieldPos)
 					break parseField
 				}
 			}
@@ -392,7 +436,12 @@ parseField:
 	// Check or update the expected fields per record.
 	if r.FieldsPerRecord > 0 {
 		if len(dst) != r.FieldsPerRecord && err == nil {
-			err = &ParseError{StartLine: recLine, Line: recLine, Err: ErrFieldCount}
+			err = &ParseError{
+				StartLine: recLine,
+				Line:      recLine,
+				Column:    1,
+				Err:       ErrFieldCount,
+			}
 		}
 	} else if r.FieldsPerRecord == 0 {
 		r.FieldsPerRecord = len(dst)
-- 
cgit v1.3