From 89ccfe496224bc92f2d2af860cae2f5d7e830f8d Mon Sep 17 00:00:00 2001
From: Joe Tsai <joetsai@digital-static.net>
Date: Fri, 20 Oct 2017 05:26:43 -0700
Subject: encoding/csv: simplify and optimize Reader
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Reader implementation is slow because it operates on a rune-by-rune
basis via bufio.Reader.ReadRune. We speed this up by operating on entire
lines that we read from bufio.Reader.ReadSlice.

In order to ensure that we read the full line, we augment ReadSlice
in our Reader.readLine method to automatically expand the slice if
bufio.ErrBufferFull is every hit.

This change happens to fix #19410 because it no longer relies on
rune-by-rune parsing and only searches for the relevant delimiter rune.

In order to keep column accounting simple and consistent, this change
reverts parts of CL 52830.

This CL is an alternative to CL 36270 and builds on some of the ideas
from that change by Diogo Pinela.

name                                     old time/op    new time/op    delta
Read-8                                   3.12µs ± 1%    2.54µs ± 2%  -18.76%   (p=0.000 n=10+9)
ReadWithFieldsPerRecord-8                3.12µs ± 1%    2.53µs ± 1%  -18.91%    (p=0.000 n=9+9)
ReadWithoutFieldsPerRecord-8             3.13µs ± 0%    2.57µs ± 3%  -18.07%  (p=0.000 n=10+10)
ReadLargeFields-8                        52.3µs ± 1%     5.3µs ± 2%  -89.93%   (p=0.000 n=10+9)
ReadReuseRecord-8                        2.05µs ± 1%    1.40µs ± 1%  -31.48%   (p=0.000 n=10+9)
ReadReuseRecordWithFieldsPerRecord-8     2.05µs ± 1%    1.41µs ± 0%  -31.03%   (p=0.000 n=10+9)
ReadReuseRecordWithoutFieldsPerRecord-8  2.06µs ± 1%    1.40µs ± 1%  -31.70%   (p=0.000 n=9+10)
ReadReuseRecordLargeFields-8             50.9µs ± 0%     4.1µs ± 3%  -92.01%  (p=0.000 n=10+10)

name                                     old alloc/op   new alloc/op
Read-8                                       664B ± 0%      664B ± 0%
ReadWithFieldsPerRecord-8                    664B ± 0%      664B ± 0%
ReadWithoutFieldsPerRecord-8                 664B ± 0%      664B ± 0%
ReadLargeFields-8                          3.94kB ± 0%    3.94kB ± 0%
ReadReuseRecord-8                           24.0B ± 0%     24.0B ± 0%
ReadReuseRecordWithFieldsPerRecord-8        24.0B ± 0%     24.0B ± 0%
ReadReuseRecordWithoutFieldsPerRecord-8     24.0B ± 0%     24.0B ± 0%
ReadReuseRecordLargeFields-8               2.98kB ± 0%    2.98kB ± 0%

name                                     old allocs/op  new allocs/op
Read-8                                       18.0 ± 0%      18.0 ± 0%
ReadWithFieldsPerRecord-8                    18.0 ± 0%      18.0 ± 0%
ReadWithoutFieldsPerRecord-8                 18.0 ± 0%      18.0 ± 0%
ReadLargeFields-8                            24.0 ± 0%      24.0 ± 0%
ReadReuseRecord-8                            8.00 ± 0%      8.00 ± 0%
ReadReuseRecordWithFieldsPerRecord-8         8.00 ± 0%      8.00 ± 0%
ReadReuseRecordWithoutFieldsPerRecord-8      8.00 ± 0%      8.00 ± 0%
ReadReuseRecordLargeFields-8                 12.0 ± 0%      12.0 ± 0%

Updates #22352
Updates #19019
Fixes #16791
Fixes #19410

Change-Id: I31c27cfcc56880e6abac262f36c947179b550bbf
Reviewed-on: https://go-review.googlesource.com/72150
Reviewed-by: Ian Lance Taylor <iant@golang.org>
Run-TryBot: Joe Tsai <thebrokentoaster@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
---
 src/encoding/csv/reader_test.go | 135 +++++++++++++++++++++++++++++++---------
 1 file changed, 106 insertions(+), 29 deletions(-)

(limited to 'src/encoding/csv/reader_test.go')

diff --git a/src/encoding/csv/reader_test.go b/src/encoding/csv/reader_test.go
index 3811629aad..781847cefa 100644
--- a/src/encoding/csv/reader_test.go
+++ b/src/encoding/csv/reader_test.go
@@ -26,9 +26,8 @@ var readTests = []struct {
 	TrimLeadingSpace bool
 	ReuseRecord      bool
 
-	Error  string
-	Line   int // Expected error line if != 0
-	Column int // Expected error column if line != 0
+	Error error
+	Line  int // Expected error line if != 0
 }{
 	{
 		Name:   "Simple",
@@ -140,7 +139,7 @@ field"`,
 	{
 		Name:  "BadDoubleQuotes",
 		Input: `a""b,c`,
-		Error: `bare " in non-quoted-field`, Line: 1, Column: 1,
+		Error: &ParseError{Line: 1, Column: 1, Err: ErrBareQuote},
 	},
 	{
 		Name:             "TrimQuote",
@@ -151,30 +150,30 @@ field"`,
 	{
 		Name:  "BadBareQuote",
 		Input: `a "word","b"`,
-		Error: `bare " in non-quoted-field`, Line: 1, Column: 2,
+		Error: &ParseError{Line: 1, Column: 2, Err: ErrBareQuote},
 	},
 	{
 		Name:  "BadTrailingQuote",
 		Input: `"a word",b"`,
-		Error: `bare " in non-quoted-field`, Line: 1, Column: 10,
+		Error: &ParseError{Line: 1, Column: 10, Err: ErrBareQuote},
 	},
 	{
 		Name:  "ExtraneousQuote",
 		Input: `"a "word","b"`,
-		Error: `extraneous " in field`, Line: 1, Column: 3,
+		Error: &ParseError{Line: 1, Column: 3, Err: ErrQuote},
 	},
 	{
 		Name:               "BadFieldCount",
 		UseFieldsPerRecord: true,
 		Input:              "a,b,c\nd,e",
-		Error:              "wrong number of fields", Line: 2,
+		Error:              &ParseError{Line: 2, Err: ErrFieldCount},
 	},
 	{
 		Name:               "BadFieldCount1",
 		UseFieldsPerRecord: true,
 		FieldsPerRecord:    2,
 		Input:              `a,b,c`,
-		Error:              "wrong number of fields", Line: 1,
+		Error:              &ParseError{Line: 1, Err: ErrFieldCount},
 	},
 	{
 		Name:   "FieldCount",
@@ -271,18 +270,14 @@ x,,,
 		},
 	},
 	{ // issue 19019
-		Name:   "RecordLine1",
-		Input:  "a,\"b\nc\"d,e",
-		Error:  `extraneous " in field`,
-		Line:   1,
-		Column: 1,
+		Name:  "RecordLine1",
+		Input: "a,\"b\nc\"d,e",
+		Error: &ParseError{Line: 2, Column: 1, Err: ErrQuote},
 	},
 	{
-		Name:   "RecordLine2",
-		Input:  "a,b\n\"d\n\n,e",
-		Error:  `extraneous " in field`,
-		Line:   2,
-		Column: 2,
+		Name:  "RecordLine2",
+		Input: "a,b\n\"d\n\n,e",
+		Error: &ParseError{Line: 5, Column: 0, Err: ErrQuote},
 	},
 	{ // issue 21201
 		Name:  "CRLFInQuotedField",
@@ -291,6 +286,95 @@ x,,,
 			{"Hello\r\nHi"},
 		},
 	},
+	{ // issue 19410
+		Name:   "BinaryBlobField",
+		Input:  "x09\x41\xb4\x1c,aktau",
+		Output: [][]string{{"x09A\xb4\x1c", "aktau"}},
+	},
+	{
+		Name:   "TrailingCR",
+		Input:  "field1,field2\r",
+		Output: [][]string{{"field1", "field2\r"}},
+	},
+	{
+		Name:             "NonASCIICommaAndComment",
+		TrimLeadingSpace: true,
+		Comma:            '£',
+		Comment:          '€',
+		Input:            "a£b,c£ \td,e\n€ comment\n",
+		Output:           [][]string{{"a", "b,c", "d,e"}},
+	},
+	{
+		Name:    "NonASCIICommaAndCommentWithQuotes",
+		Comma:   '€',
+		Comment: 'λ',
+		Input:   "a€\"  b,\"€ c\nλ comment\n",
+		Output:  [][]string{{"a", "  b,", " c"}},
+	},
+	{
+		Name:    "NonASCIICommaConfusion",
+		Comma:   'λ',
+		Comment: '€',
+		// λ and θ start with the same byte. This test is intended to ensure the parser doesn't
+		// confuse such characters.
+		Input:  "\"abθcd\"λefθgh",
+		Output: [][]string{{"abθcd", "efθgh"}},
+	},
+	{
+		Name:    "NonASCIICommentConfusion",
+		Comment: 'θ',
+		Input:   "λ\nλ\nθ\nλ\n",
+		Output:  [][]string{{"λ"}, {"λ"}, {"λ"}},
+	},
+	{
+		Name:   "QuotedFieldMultipleLF",
+		Input:  "\"\n\n\n\n\"",
+		Output: [][]string{{"\n\n\n\n"}},
+	},
+	{
+		Name:  "MultipleCRLF",
+		Input: "\r\n\r\n\r\n\r\n",
+	},
+	{
+		// The implementation may read each line in several chunks if it doesn't fit entirely
+		// in the read buffer, so we should test the code to handle that condition.
+		Name:    "HugeLines",
+		Comment: '#',
+		Input:   strings.Repeat("#ignore\n", 10000) + strings.Repeat("@", 5000) + "," + strings.Repeat("*", 5000),
+		Output:  [][]string{{strings.Repeat("@", 5000), strings.Repeat("*", 5000)}},
+	},
+	{
+		Name:  "QuoteWithTrailingCRLF",
+		Input: "\"foo\"bar\"\r\n",
+		Error: &ParseError{Line: 1, Column: 4, Err: ErrQuote},
+	},
+	{
+		Name:       "LazyQuoteWithTrailingCRLF",
+		Input:      "\"foo\"bar\"\r\n",
+		LazyQuotes: true,
+		Output:     [][]string{{`foo"bar`}},
+	},
+	{
+		Name:   "DoubleQuoteWithTrailingCRLF",
+		Input:  "\"foo\"\"bar\"\r\n",
+		Output: [][]string{{`foo"bar`}},
+	},
+	{
+		Name:   "EvenQuotes",
+		Input:  `""""""""`,
+		Output: [][]string{{`"""`}},
+	},
+	{
+		Name:  "OddQuotes",
+		Input: `"""""""`,
+		Error: &ParseError{Line: 1, Column: 7, Err: ErrQuote},
+	},
+	{
+		Name:       "LazyOddQuotes",
+		Input:      `"""""""`,
+		LazyQuotes: true,
+		Output:     [][]string{{`"""`}},
+	},
 }
 
 func TestRead(t *testing.T) {
@@ -310,17 +394,10 @@ func TestRead(t *testing.T) {
 			r.Comma = tt.Comma
 		}
 		out, err := r.ReadAll()
-		perr, _ := err.(*ParseError)
-		if tt.Error != "" {
-			if err == nil || !strings.Contains(err.Error(), tt.Error) {
-				t.Errorf("%s: error %v, want error %q", tt.Name, err, tt.Error)
-			} else if tt.Line != 0 && (tt.Line != perr.Line || tt.Column != perr.Column) {
-				t.Errorf("%s: error at %d:%d expected %d:%d", tt.Name, perr.Line, perr.Column, tt.Line, tt.Column)
-			}
-		} else if err != nil {
-			t.Errorf("%s: unexpected error %v", tt.Name, err)
+		if !reflect.DeepEqual(err, tt.Error) {
+			t.Errorf("%s: ReadAll() error:\ngot  %v\nwant %v", tt.Name, err, tt.Error)
 		} else if !reflect.DeepEqual(out, tt.Output) {
-			t.Errorf("%s: out=%q want %q", tt.Name, out, tt.Output)
+			t.Errorf("%s: ReadAll() output:\ngot  %q\nwant %q", tt.Name, out, tt.Output)
 		}
 	}
 }
-- 
cgit v1.3