lib/email: refactoring Field parsing

Split the parsing into two methods: parseName and parseValue. The errors returned from those methods are prefixed with the name of the method.
author: Shulhan <ms@kilabit.info> 2023-06-01 17:45:55 +0700
committer: Shulhan <ms@kilabit.info> 2023-06-03 01:08:02 +0700
commit: a9198587b02ee060d8cacbe9d5ff19c5c1532a89 (patch)
tree: d27cb338e5484f45239b762dc1b9f259d97d4e12
parent: a9f6156024d5e7def26640bd6448001d3da19e4e (diff)
download: pakakeh.go-a9198587b02ee060d8cacbe9d5ff19c5c1532a89.tar.xz
8 files changed, 112 insertions, 86 deletions
diff --git a/_doc/RFC_5322__IMF.adoc b/_doc/RFC_5322__IMF.adoc
index fc2e9699..0359f6d2 100644
--- a/_doc/RFC_5322__IMF.adoc
+++ b/_doc/RFC_5322__IMF.adoc
@@ -12,26 +12,29 @@ Message Format as defined in {url-rfc5322}[RFC 5322^].
 ==  Syntax
 
 ....
-message         =   (fields / obs-fields)
+message         =   header
                     [CRLF body]
 
-fields          =   *(field-name ":" (field-body / unstructured) CRLF)
+header          =   *field
 
-field-name      =   1*ftext
+field           =   field-name ":" field-body CRLF
 
-field-body      =   (*([FWS] VCHAR) *WSP)
+field-name      =   1*(ftext / obs-ftext)
 
-unstructured    =   (*([FWS] VCHAR) *WSP) / obs-unstruct
+field-body      =   *(FWS / WSP / VCHAR)
 
 VCHAR           =   %d33-126
 
 WSP             =   %d9 / %d32
                 ; tab or space
 
+obs-ftext       =   %d32 / ftext
+                ; Allow space in obsolete syntax.
+
 ftext           =   %d33-57 / %d59-126
                 ; Printable US-ASCII, except %d0-32 and %d58 (":")
 
-body            =   (*(*998text CRLF) *998text) / obs-body
+body            =   (*(*998text CRLF) *998text)
 
 text            =   %d1-9 /            ; Characters excluding CR
                     %d11 /             ;  and LF
@@ -39,12 +42,11 @@ text            =   %d1-9 /            ; Characters excluding CR
                     %d14-127
 ....
 
-*  Each line MUST be no more than 998 characters, excluding CRLF.
-
-*  Each line SHOULD be no more than 78 characters, excluding the CRLF.
+*  Each line in a message (header and body) MUST be no more than 998
+   characters, excluding CRLF.
 
-*  CR and LF MUST only occur together as CRLF; they MUST NOT appear
-   independently in the body.
+*  Each line in a message SHOULD be no more than 78 characters, excluding the
+   CRLF.
 
 *  Each header field SHOULD be treated in its unfolded form for further
    syntactic and semantic evaluation.
@@ -52,13 +54,16 @@ text            =   %d1-9 /            ; Characters excluding CR
 *  "field-body" MUST NOT include CR and LF except when used in "folding" and
    "unfolding".
 
+*  CR and LF MUST only occur together as CRLF; they MUST NOT appear
+   independently in the body.
+
 
 ===   Folding White Space and Comments
 
 ....
 CFWS            =   (1*([FWS] comment) [FWS]) / FWS
 
-FWS             =   ([*WSP CRLF] 1*WSP) / obs-FWS
+FWS             =   CRLF 1*WSP / obs-FWS
                 ; Folding white space
 
 comment         =   "(" *([FWS] ccontent) [FWS] ")"
diff --git a/lib/email/body_test.go b/lib/email/body_test.go
index 409765dc..d53d217d 100644
--- a/lib/email/body_test.go
+++ b/lib/email/body_test.go
@@ -38,7 +38,7 @@ func TestParseBody(t *testing.T) {
 			"--boundary\r\n" +
 			"Content-Encoding:\r\n\r\n",
 		boundary: "boundary",
-		expErr:   "email: empty field value at 'Content-Encoding:\r\n'",
+		expErr:   `ParseField: parseValue: empty field value`,
 	}, {
 		desc: "With epilogue",
 		in: "preamble\r\n\r\n" +
diff --git a/lib/email/doc.go b/lib/email/doc.go
index 802b58a1..44f26f77 100644
--- a/lib/email/doc.go
+++ b/lib/email/doc.go
@@ -29,7 +29,7 @@
 //	| Name | Value | Type |
 //	+------+-------+------+
 //
-// [Field] is parsed line that contains Name and Value separated by ": ".
+// [Field] is a parsed line that contains Name and Value separated by a colon ':'.
 //
 // A [ContentType] is special Field where Name is "Content-Type", and its
 // Value is parsed from string "top/sub; <param>; ...".
diff --git a/lib/email/field.go b/lib/email/field.go
index cbd25530..ecf377e4 100644
--- a/lib/email/field.go
+++ b/lib/email/field.go
@@ -37,6 +37,10 @@ type Field struct {
 	// Type of field, the numeric representation of field name.
 	Type FieldType
 
+	// isFolded is set to true if the field line contains folding: a CRLF
+	// followed by a space or tab and the rest of the value.
+	isFolded bool
+
 	// true if field.unpack has been called, false when field.setValue is
 	// called again.
 	unpacked bool
@@ -54,54 +58,85 @@ func ParseField(raw []byte) (field *Field, rest []byte, err error) {
 		return nil, nil, nil
 	}
 
+	var logp = `ParseField`
+
 	field = &Field{}
-	isFolded := false
-	start := 0
 
-	// Get field's name.
-	// Valid values: %d33-57 / %d59-126 .
-	x := 0
+	raw, err = field.parseName(raw)
+	if err != nil {
+		return nil, nil, fmt.Errorf(`%s: %w`, logp, err)
+	}
+
+	raw, err = field.parseValue(raw)
+	if err != nil {
+		return nil, nil, fmt.Errorf(`%s: %w`, logp, err)
+	}
+
+	if !field.isFolded {
+		if (len(field.oriName) + len(field.oriValue) + 1) > 1000 {
+			return nil, nil, fmt.Errorf(`%s: field line greater than 998 characters`, logp)
+		}
+	}
+
+	rest = raw
+	return field, rest, nil
+}
+
+// parseName parses the field Name.
+// Format,
+//
+//	field-name = 1*(ftext / obs-ftext) ":"
+//	obs-ftext  = %d32 / ftext
+//	           ; space allowed in [obsolete] specification.
+//	[ftext]    = %d33-57 / %d59-126
+//	           ; printable ASCII character except colon (%d58).
+//
+// [ftext]: https://datatracker.ietf.org/doc/html/rfc5322#section-2.2
+// [obsolete]: https://datatracker.ietf.org/doc/html/rfc5322#section-4.5
+func (field *Field) parseName(raw []byte) (rest []byte, err error) {
+	var (
+		logp = `parseName`
+		x    int
+	)
 	for ; x < len(raw); x++ {
-		if raw[x] == ' ' || raw[x] == ':' {
+		if raw[x] == '\t' || raw[x] == ' ' || raw[x] == ':' {
 			break
 		}
 		if raw[x] < 33 || raw[x] > 126 {
-			err = fmt.Errorf("email: invalid field at '%s'", raw[:x])
-			goto invalid
+			return nil, fmt.Errorf(`%s: invalid character %q`, logp, raw[x])
 		}
 	}
-	if len(raw) == x {
-		err = fmt.Errorf("email: invalid field at '%s'", raw[:x])
-		goto invalid
-	}
-
 	// Skip WSP before ':'.
 	for ; x < len(raw) && (raw[x] == '\t' || raw[x] == ' '); x++ {
 	}
 	if len(raw) == x {
-		err = fmt.Errorf("email: invalid field at '%s'", raw[:x])
-		goto invalid
+		return nil, fmt.Errorf(`%s: missing value`, logp)
 	}
 	if raw[x] != ':' {
-		err = fmt.Errorf("email: missing field separator at '%s'", raw[:x])
-		goto invalid
+		return nil, fmt.Errorf(`%s: missing field separator`, logp)
 	}
 
 	field.setName(raw[:x])
-	x++
-	start = x
 
-	// Skip WSP after ':'.
-	for ; x < len(raw) && (raw[x] == '\t' || raw[x] == ' '); x++ {
-	}
+	rest = raw[x+1:]
 
-	if len(raw) == x {
-		err = fmt.Errorf("email: empty field value at '%s'", raw[:x])
-		goto invalid
-	}
+	return rest, nil
+}
+
+// parseValue parses the field value.
+// Format,
+//
+//	field-body = 1*(FWS / WSP / %d33-126) CRLF
+//	FWS        = CRLF WSP              ; \r\n followed by space.
+//	WSP        = %d9 / %d32            ; tab or space.
+//
+// [Reference]: https://datatracker.ietf.org/doc/html/rfc5322#section-2.2
+func (field *Field) parseValue(raw []byte) (rest []byte, err error) {
+	var (
+		logp = `parseValue`
+		x    int
+	)
 
-	// Get field's value.
-	// Valid values: WSP / %d33-126 .
 	for ; x < len(raw); x++ {
 		for ; x < len(raw); x++ {
 			if raw[x] == '\t' || raw[x] == ' ' {
@@ -112,48 +147,34 @@ func ParseField(raw []byte) (field *Field, rest []byte, err error) {
 				break
 			}
 			if raw[x] < 33 || raw[x] > 126 {
-				err = fmt.Errorf("email: invalid field value at '%s'", raw[:x])
-				goto invalid
+				return nil, fmt.Errorf(`%s: invalid field value %q`, logp, raw[x])
 			}
 		}
 		if x == len(raw) || raw[x] != lf {
-			err = fmt.Errorf("email: field value without CRLF at '%s'", raw[:x])
-			goto invalid
+			return nil, fmt.Errorf(`%s: invalid or missing termination`, logp)
 		}
-		if x++; x == len(raw) {
+		x++
+		if x == len(raw) {
 			break
 		}
-
 		// Unfolding ...
 		if raw[x] == '\t' || raw[x] == ' ' {
-			isFolded = true
+			field.isFolded = true
 			continue
 		}
+		// End with CRLF.
 		break
 	}
-	if !isFolded && x > 1000 {
-		err = fmt.Errorf("email: field line greater than 998 characters")
-		return nil, nil, err
-	}
 
-	field.setValue(raw[start:x])
+	field.setValue(raw[:x])
 
 	if len(field.Value) == 0 {
-		err = fmt.Errorf("email: empty field value at '%s'", raw[:x])
-		goto invalid
+		return nil, fmt.Errorf(`%s: empty field value`, logp)
 	}
 
-	if len(raw) > x {
-		rest = raw[x:]
-	}
-
-	return field, rest, nil
+	rest = raw[x:]
 
-invalid:
-	if x < len(raw) {
-		rest = raw[x:]
-	}
-	return nil, rest, err
+	return rest, nil
 }
 
 // addMailboxes append zero or more mailboxes to current mboxes.
@@ -213,7 +234,7 @@ func (field *Field) appendValue(raw []byte) {
 
 // setName set field Name by canonicalizing raw field name using "simple" and
 // "relaxed" algorithms.
-// .
+//
 // "simple" algorithm store raw field name as is.
 //
 // "relaxed" algorithm convert field name to lowercase and removing trailing
diff --git a/lib/email/field_test.go b/lib/email/field_test.go
index 672cf181..b8d7c5cb 100644
--- a/lib/email/field_test.go
+++ b/lib/email/field_test.go
@@ -26,59 +26,59 @@ func TestParseField(t *testing.T) {
 	}, {
 		desc:   "With long line",
 		raw:    []byte("name:" + longValue + "\r\n"),
-		expErr: "email: field line greater than 998 characters",
+		expErr: `ParseField: field line greater than 998 characters`,
 	}, {
 		desc:   "With only whitespaces",
 		raw:    []byte("  "),
-		expErr: "email: invalid field at '  '",
+		expErr: `ParseField: parseName: missing value`,
 	}, {
 		desc:   "With only CRLF",
 		raw:    []byte("\r\n"),
-		expErr: "email: invalid field at ''",
+		expErr: `ParseField: parseName: invalid character '\r'`,
 	}, {
 		desc:   "Without separator and CRLF",
 		raw:    []byte("name"),
-		expErr: "email: invalid field at 'name'",
+		expErr: `ParseField: parseName: missing value`,
 	}, {
 		desc:   "Without separator",
 		raw:    []byte("name\r\n"),
-		expErr: "email: invalid field at 'name'",
+		expErr: `ParseField: parseName: invalid character '\r'`,
 	}, {
 		desc:   "With space on name",
 		raw:    []byte("na me\r\n"),
-		expErr: "email: missing field separator at 'na '",
+		expErr: `ParseField: parseName: missing field separator`,
 	}, {
 		desc:   "Without value and CRLF",
 		raw:    []byte("name:"),
-		expErr: "email: empty field value at 'name:'",
+		expErr: `ParseField: parseValue: empty field value`,
 	}, {
 		desc:   "Without value and CRLF",
 		raw:    []byte("name: "),
-		expErr: "email: empty field value at 'name: '",
+		expErr: `ParseField: parseValue: invalid or missing termination`,
 	}, {
 		desc:   "Without value",
 		raw:    []byte("name:\r\n"),
-		expErr: "email: empty field value at 'name:\r\n'",
+		expErr: `ParseField: parseValue: empty field value`,
 	}, {
 		desc:   "Without value",
 		raw:    []byte("name: \r\n"),
-		expErr: "email: empty field value at 'name: \r\n'",
+		expErr: `ParseField: parseValue: empty field value`,
 	}, {
 		desc:   "Without CRLF",
 		raw:    []byte("name:value"),
-		expErr: "email: field value without CRLF at 'name:value'",
+		expErr: `ParseField: parseValue: invalid or missing termination`,
 	}, {
 		desc:   "Without CR",
 		raw:    []byte("name:value\n"),
-		expErr: "email: invalid field value at 'name:value'",
+		expErr: `ParseField: parseValue: invalid field value '\n'`,
 	}, {
 		desc:   "Without LF",
 		raw:    []byte("name:value\r"),
-		expErr: "email: field value without CRLF at 'name:value\r'",
+		expErr: `ParseField: parseValue: invalid or missing termination`,
 	}, {
 		desc:   "With CR inside value",
 		raw:    []byte("name:valu\re"),
-		expErr: "email: field value without CRLF at 'name:valu\r'",
+		expErr: `ParseField: parseValue: invalid or missing termination`,
 	}, {
 		desc: "With valid input",
 		raw:  []byte("NAME : VALUE\r\n"),
@@ -308,7 +308,7 @@ func TestUnpackMailboxList(t *testing.T) {
 		in     []byte
 	}{{
 		in:     []byte("From: \r\n"),
-		expErr: "email: empty field value at 'From: \r\n'",
+		expErr: `ParseField: parseValue: empty field value`,
 	}, {
 		in:  []byte("From: test@one, test@two\r\n"),
 		exp: "from:test@one, test@two\r\n",
diff --git a/lib/email/header_test.go b/lib/email/header_test.go
index f0b8c86a..ce5b9ead 100644
--- a/lib/email/header_test.go
+++ b/lib/email/header_test.go
@@ -69,7 +69,7 @@ func TestParseHeader(t *testing.T) {
 	}, {
 		desc:   "With whitespaces only",
 		raw:    []byte(" \t"),
-		expErr: "email: invalid field at ' \t'",
+		expErr: `ParseField: parseName: missing value`,
 	}, {
 		desc:    "With CRLF only",
 		raw:     []byte("\r\n"),
@@ -81,7 +81,7 @@ func TestParseHeader(t *testing.T) {
 	}, {
 		desc:   "With invalid field: missing value",
 		raw:    []byte("a:\r\n\t"),
-		expErr: "email: empty field value at 'a:\r\n\t'",
+		expErr: `ParseField: parseValue: empty field value`,
 	}, {
 		desc:       "With single field",
 		raw:        []byte("a:1\r\n"),
diff --git a/lib/email/message_test.go b/lib/email/message_test.go
index 7813929b..a323b0f8 100644
--- a/lib/email/message_test.go
+++ b/lib/email/message_test.go
@@ -90,7 +90,7 @@ func TestMessageParseMessage(t *testing.T) {
 		exp: "\r\n",
 	}, {
 		in:     "testdata/invalid-header.txt",
-		expErr: "ParseMessage: email: invalid field value at 'From  : John Doe <jdoe@machine(comment).  example>'",
+		expErr: `ParseMessage: ParseField: parseValue: invalid field value '\n'`,
 	}, {
 		in: "testdata/rfc5322-A.6.3.txt",
 		exp: "from:John Doe <jdoe@machine(comment). example>\r\n" +
diff --git a/lib/email/mime_test.go b/lib/email/mime_test.go
index 51826004..794faf67 100644
--- a/lib/email/mime_test.go
+++ b/lib/email/mime_test.go
@@ -55,7 +55,7 @@ func TestParseBodyPart(t *testing.T) {
 		in: "--boundary\r\n" +
 			"Content-Encoding:\r\n\r\n",
 		boundary: "boundary",
-		expErr:   "email: empty field value at 'Content-Encoding:\r\n'",
+		expErr:   `ParseField: parseValue: empty field value`,
 	}, {
 		desc: "With end of body",
 		in: "--boundary--\r\n\r\n" +
author	Shulhan <ms@kilabit.info>	2023-06-01 17:45:55 +0700
committer	Shulhan <ms@kilabit.info>	2023-06-03 01:08:02 +0700
commit	a9198587b02ee060d8cacbe9d5ff19c5c1532a89 (patch)
tree	d27cb338e5484f45239b762dc1b9f259d97d4e12
parent	a9f6156024d5e7def26640bd6448001d3da19e4e (diff)
download	pakakeh.go-a9198587b02ee060d8cacbe9d5ff19c5c1532a89.tar.xz