From e98aea8108d267fc5f8364d0e611698f413095c5 Mon Sep 17 00:00:00 2001
From: Shulhan <ms@kilabit.info>
Date: Mon, 12 Aug 2024 23:40:43 +0700
Subject: all: use strict document header format

Previously, an empty line before the document title caused the parser
to stop parsing the document header; now leading empty lines are
skipped.
Also, document attributes could previously be placed anywhere: before
or after the title, and in between the title, author, and revision
lines; now they can only be placed after the title, author, or
revision.
---
 _doc/SPECS.adoc                          |  18 +++--
 document_parser.go                       | 115 +++++++++++++++++++------------
 document_test.go                         |   5 +-
 testdata/document_title_test.txt         |   2 +-
 testdata/header_with_empty_line_test.txt |   4 +-
 testdata/test.adoc                       |   4 +-
 testdata/test.got.html                   |   2 +-
 7 files changed, 87 insertions(+), 63 deletions(-)

diff --git a/_doc/SPECS.adoc b/_doc/SPECS.adoc
index 3ceb8ae..82f81ba 100644
--- a/_doc/SPECS.adoc
+++ b/_doc/SPECS.adoc
@@ -67,21 +67,19 @@ REF_ID    = 1*ALPHA *("-" / "_" / ALPHA / DIGIT)
 {url_ref}/document/header/[Reference^].
 
 Document header consist of title and optional authors, a revision, and zero or
-more metadata.
-The document metadata can be in any order, before or after title, but the
-author and revision MUST be after title and in order.
+more attributes.
+The author and revision MUST be after title and in order.
+The document attributes can be in any order, but only after the title,
+author, or revision line, whichever comes last.
 
 ----
-DOC_HEADER     = *(DOC_ATTRIBUTE / COMMENTS)
-                 "=" SP DOC_TITLE LF
-                 (*DOC_ATTRIBUTE)
-                 DOC_AUTHORS LF
-                 (*DOC_ATTRIBUTE)
-                 DOC_REVISION LF
+DOC_HEADER     = [ "=" SP DOC_TITLE LF
+                 [ DOC_AUTHORS LF
+                 [ DOC_REVISION LF ]]]
                  (*DOC_ATTRIBUTE)
+                 LF
 ----
 
-There are no empty line before and after the document header.
 An empty line mark as the end of document header.
 
 ===  Title
diff --git a/document_parser.go b/document_parser.go
index 5dc4a2e..3b529fc 100644
--- a/document_parser.go
+++ b/document_parser.go
@@ -686,72 +686,76 @@ func (docp *documentParser) parseBlock(parent *element, term int) {
 // The document attributes can be in any order, but the author and revision
 // MUST be in order.
 //
-//	DOC_HEADER  = *(DOC_ATTRIBUTE / COMMENTS)
-//	              "=" SP *ADOC_WORD LF
-//	              (*DOC_ATTRIBUTE)
-//	              DOC_AUTHORS LF
-//	              (*DOC_ATTRIBUTE)
-//	              DOC_REVISION LF
+//	DOC_HEADER  = [ "=" SP *ADOC_WORD LF
+//	              [ DOC_AUTHORS LF
+//	              [ DOC_REVISION LF ]]]
 //	              (*DOC_ATTRIBUTE)
+//	              LF
 func (docp *documentParser) parseHeader() {
-	const (
-		stateBegin int = iota
-		stateTitle
-		stateAuthor
-		stateRevision
+	var (
+		logp = `parseHeader`
+		line []byte
+		ok   bool
 	)
 
-	var (
-		logp  = `parseHeader`
-		state = stateBegin
+	line, ok = docp.skipCommentAndEmptyLine()
+	if !ok {
+		return
+	}
+	if docp.kind == lineKindText && isTitle(line) {
+		docp.doc.header.Write(bytes.TrimSpace(line[2:]))
+		docp.doc.Title.raw = string(docp.doc.header.raw)
 
-		key   string
-		value string
-		line  []byte
-		ok    bool
-	)
-	for {
 		_, line, ok = docp.line(logp)
 		if !ok {
 			return
 		}
-		if len(line) == 0 {
+		if docp.kind == lineKindText {
+			docp.doc.rawAuthors = string(line)
+
+			_, line, ok = docp.line(logp)
+			if !ok {
+				return
+			}
+			if docp.kind == lineKindText {
+				docp.doc.rawRevision = string(line)
+				line = nil
+			}
+		}
+	}
+
+	// Parse the rest of attributes until we found an empty line or
+	// line with non-attribute.
+	for {
+		if line == nil {
+			_, line, ok = docp.line(logp)
+			if !ok {
+				return
+			}
+		}
+		if docp.kind == lineKindEmpty {
 			return
 		}
-		if bytes.HasPrefix(line, []byte(`////`)) {
+		if docp.kind == lineKindBlockComment {
 			docp.parseIgnoreCommentBlock()
+			line = nil
 			continue
 		}
-		if bytes.HasPrefix(line, []byte(`//`)) {
+		if docp.kind == lineKindComment {
+			line = nil
 			continue
 		}
-		if line[0] == ':' {
+		if docp.kind == lineKindAttribute {
+			var key, value string
 			key, value, ok = docp.parseAttribute(line, false)
 			if ok {
 				docp.doc.Attributes.apply(key, value)
 			}
+			line = nil
 			continue
 		}
-		if state == stateBegin {
-			if isTitle(line) {
-				docp.doc.header.Write(bytes.TrimSpace(line[2:]))
-				docp.doc.Title.raw = string(docp.doc.header.raw)
-				state = stateTitle
-			} else {
-				docp.doc.rawAuthors = string(line)
-				state = stateAuthor
-			}
-			continue
-		}
-		switch state {
-		case stateTitle:
-			docp.doc.rawAuthors = string(line)
-			state = stateAuthor
-
-		case stateAuthor:
-			docp.doc.rawRevision = string(line)
-			state = stateRevision
-		}
+		docp.lineNum--
+		break
 	}
 }
 
@@ -1572,3 +1576,26 @@ func (docp *documentParser) parseParagraph(parent, el *element, line []byte, ter
 	el.parseInlineMarkup(docp.doc, elKindText)
 	return line
 }
+
+func (docp *documentParser) skipCommentAndEmptyLine() (line []byte, ok bool) {
+	var logp = `skipCommentAndEmptyLine`
+
+	for {
+		_, line, ok = docp.line(logp)
+		if !ok {
+			return nil, false
+		}
+		if docp.kind == lineKindEmpty {
+			continue
+		}
+		if docp.kind == lineKindBlockComment {
+			docp.parseIgnoreCommentBlock()
+			continue
+		}
+		if docp.kind == lineKindComment {
+			continue
+		}
+		break
+	}
+	return line, true
+}
diff --git a/document_test.go b/document_test.go
index b15781c..c4cc995 100644
--- a/document_test.go
+++ b/document_test.go
@@ -71,8 +71,9 @@ func TestParse_document_title(t *testing.T) {
 		expString: `a: b: c`,
 	}, {
 		// With custom separator.
-		content: `:title-separator: x
-= Mainx sub`,
+		content: `
+= Mainx sub
+:title-separator: x`,
 		exp: DocumentTitle{
 			Main: `Main`,
 			Sub:  `sub`,
diff --git a/testdata/document_title_test.txt b/testdata/document_title_test.txt
index 9856204..95d1e35 100644
--- a/testdata/document_title_test.txt
+++ b/testdata/document_title_test.txt
@@ -25,8 +25,8 @@ output_call: htmlWriteHeader
 </div>
 
 >>> With custom separator
-:title-separator: x
 = Mainx sub
+:title-separator: x
 
 <<< With custom separator
 <div id="header">
diff --git a/testdata/header_with_empty_line_test.txt b/testdata/header_with_empty_line_test.txt
index c164769..30c7e7c 100644
--- a/testdata/header_with_empty_line_test.txt
+++ b/testdata/header_with_empty_line_test.txt
@@ -12,11 +12,9 @@ Below is empty line with spaces.
 
 <<<
 <div id="header">
+<h1>Title</h1>
 </div>
 <div id="content">
-<div class="paragraph">
-<p>= Title</p>
-</div>
 </div>
 <div id="footer">
 <div id="footer-text">
diff --git a/testdata/test.adoc b/testdata/test.adoc
index ee99271..e5c4109 100644
--- a/testdata/test.adoc
+++ b/testdata/test.adoc
@@ -1,10 +1,10 @@
 // SPDX-FileCopyrightText: 2020 M. Shulhan <ms@kilabit.info>
 // SPDX-License-Identifier: GPL-3.0-or-later
 = _Example `Document` **title**_
-:metadata key: value
 Author A <a@a.com>; Author mid_dle B <b@b.com>
-:unclosed metadata:
 v1.1.1, 18 July 2020: remark
+:metadata key: value
+:unclosed metadata:
 :sectnums:
 :sectlinks:
 :sectanchors:
diff --git a/testdata/test.got.html b/testdata/test.got.html
index b567740..2299f90 100644
--- a/testdata/test.got.html
+++ b/testdata/test.got.html
@@ -3056,7 +3056,7 @@ this sidebar.</p>
 <div id="footer">
 <div id="footer-text">
  1.1.1<br>
-Last updated 2024-04-04 21:22:35 +0700
+Last updated 2024-08-12 23:31:24 +0700
 </div>
 </div>
 </body>
-- 
cgit v1.3