4 files changed, 420 insertions, 0 deletions
diff --git a/doc/0/grammar/abnf.txt b/doc/0/grammar/abnf.txt
new file mode 100644
index 0000000..5ab3c89
--- /dev/null
+++ b/doc/0/grammar/abnf.txt
@@ -0,0 +1,139 @@
+; Standards-compliant ABNF (RFC 5234, RFC 7405)
+
+; Compatible with: https://www.quut.com/abnfgen/
+
+; Unlike PEG, grammar rules in BNF are non-deterministic, which makes
+; it much more challenging to express our naive parse logic.  Whether
+; this ABNF file is truly accurate is difficult to assess.
+
+; The abnfgen(1) tool linked above can be used to generate arbitrary
+; strings matching the grammar in this file.  These can be fed into
+; the Zisp parser to reveal potential bugs, either in the parser
+; itself or in this ABNF grammar.
+
+; Note that the tool may generate Zisp string literals with Unicode
+; escape sequences corresponding to surrogate code points; the parser
+; may reject these.  This is expected; it's difficult to rewrite this
+; ABNF grammar to exclude those Unicode values.
+
+; Other minor inaccuracies that aren't important include: This ABNF
+; forces line comments to be terminated with an LF character, when in
+; fact the end-of-file may also terminate them; the same applies to
+; hash-bang parsing which doesn't actually have to end in LF.  These
+; discrepancies won't make abnfgen(1) generate invalid strings; they
+; only make this ABNF more strict than the Zisp parser, so it won't
+; generate some strings that the parser would actually accept.
+
+
+Stream        = [ Unit *( Blank Unit ) ] *Blank [Trail]
+
+
+Unit          = *Blank Datum
+
+Blank         = HTAB / LF / %x0b / %x0c / CR / SP / Comment
+
+Trail         = SkipLine / SkipUnit / ";" "~" *Blank
+
+
+Datum         = BareString / SpecialStr / CladDatum / Rune / RuneStr
+              / RuneDotStr / RuneClad / LabelRef / LabelDef / HashStr
+              / HashDotStr / HashClad / QuoteExpr / JoinExpr
+
+Comment       = SkipLine LF / SkipUnit Blank
+
+SkipLine      = ";" [ SkipLStart *AnyButLF ]
+
+SkipUnit      = ";" "~" Unit
+
+SkipLStart    = %x00-09 / %x0b-7d / %x7f-ff ; any but LF or "~"
+
+AnyButLF      = %x00-09 / %x0b-ff
+
+
+BareString    = BareChar *( BareChar / Numeric )
+
+SpecialStr    = SpecStrChar *( SpecStrChar / BareChar )
+
+CladDatum     = "|" *( PipeStrChar / "\" StringEsc ) "|"
+              / DQUOTE *( QuotStrChar / "\" StringEsc ) DQUOTE
+              / "(" List ")"
+              / "[" List "]"
+              / "{" List "}"
+
+Rune          = "#" RuneName
+
+RuneStr       = "#" RuneName "\" BareString
+
+RuneDotStr    = "#" RuneName "\" SpecialStr
+
+RuneClad      = "#" RuneName CladDatum
+
+HashBang      = "#" "!" *( SP / HTAB ) HBLine LF
+
+LabelRef      = "#" "%" Label "%"
+
+LabelDef      = "#" "%" Label "=" Datum
+
+HashStr       = "#" "\" BareString
+
+HashDotStr    = "#" "\" SpecialStr
+
+HashClad      = "#" CladDatum
+
+QuoteExpr     = "'" Datum
+              / "`" Datum
+              / "," Datum
+
+JoinExpr      = Datum RJoinDatum
+              / LJoinDatum NoStartDot
+              / Datum ":" Datum
+              / NoEndDot "." Datum
+
+
+BareChar      = "!" / "$" / "%" / "&" / "*" / "/" / "<" / "=" / ">"
+              / "?" / "^" / "_" / "~" / ALPHA
+
+Numeric       = "+" / "-" / DIGIT
+
+SpecStrChar   = "." / ":" / Numeric
+
+PipeStrChar   = %x00-5b / %x5d-7b / %x7d-ff ; any but "|" or "\"
+
+QuotStrChar   = %x00-21 / %x23-5b / %x5d-ff ; any but DQUOTE or "\"
+
+StringEsc     = "\" / "|" / DQUOTE / *( HTAB / SP ) LF *( HTAB / SP )
+              / "0" / %s"a" / %s"b" / %s"t" / %s"n"
+              / %s"v" / %s"f" / %s"r" / %s"e"
+              / %s"x" *( 2HEXDIG ) ";"
+              / %s"u" ["0"] 1*5HEXDIG ";"
+              / %s"u" "1" "0" 4HEXDIG ";"
+
+List          = [ Unit *( Blank Unit ) ] *Blank [SkipUnit]
+
+
+RuneName      = ALPHA *5( ALPHA / DIGIT )
+
+Label         = 1*12( HEXDIG )
+
+HBLine        = 1*HBChar [ 1*( SP / HTAB ) *HBChar ]
+
+HBChar        = %x00-08 / %x0b-1f / %x21-ff ; any but HT, LF, SP
+
+
+RJoinDatum    = CladDatum / Rune / RuneStr / RuneDotStr / RuneClad
+              / LabelRef / LabelDef / HashStr / HashDotStr / HashClad
+              / QuoteExpr
+
+LJoinDatum    = CladDatum / RuneClad / LabelRef / HashClad
+
+NoStartDot    = BareString / CladDatum / Rune / RuneStr / RuneDotStr
+              / RuneClad / LabelRef / LabelDef / HashStr / HashDotStr
+              / HashClad / QuoteExpr
+
+NoEndDot      = BareString / Rune / RuneStr / RuneClad / LabelRef
+              / HashStr / HashClad
+
+
+;; Local Variables:
+;; eval: (flyspell-mode -1)
+;; End:
diff --git a/doc/0/grammar/index.md b/doc/0/grammar/index.md
new file mode 100644
index 0000000..e3716ea
--- /dev/null
+++ b/doc/0/grammar/index.md
@@ -0,0 +1,115 @@
+# Zisp S-Expression Grammar
+
+The grammar is available in several different formats:
+
+* [ZBNF](zbnf.txt): See below for the rules of this notation
+* [ABNF](abnf.txt): Compatible with the `abnfgen` tool
+* [PEG](peg.txt): Compatible with the `peg/leg` tool
+
+
+## ZBNF notation
+
+The ZBNF grammar specification uses a BNF-like notation with PEG-like
+semantics:
+
+* Concatenation of expressions is implicit: `foo bar` means `foo`
+  followed by `bar`.
+
+* Parentheses are used for grouping, and the pipe symbol `|` is used
+  for alternatives.
+
+* The suffixes `?`, `*`, and `+` have the same meaning as in regular
+  expressions, although `[foo]` is used in place of `(foo)?`.
+
+* The syntax is defined in terms of bytes, not characters.  Terminals
+  `'c'` and `"c"` refer to the ASCII value of the given character `c`.
+  Standard C escape sequences are supported.
+
+* The prefix `~` means NOT.  It only applies to rules that match one
+  byte, and negates them.  For example, `~( 'a' | 'b' )` matches any
+  byte other than 'a' and 'b'.
+
+* Ranges of terminal values are expressed as `x...y` (inclusive).
+
+* ABNF "core rules" like `ALPHA` and `HEXDIG` are supported.
+
+* There is no ambiguity, or look-ahead / backtracking beyond one byte.
+  Rules match left to right, depth-first, and greedy.  As soon as the
+  input matches the first terminal of a rule --explicit or implied by
+  recursively descending into the first non-terminal-- it must match
+  that rule to the end or a syntax error is reported.
+
+The last point makes the notation simple to translate to code.
+
+
+## Limitations outside the grammar
+
+The following limits are not represented in the grammar:
+
+* A `UnicodeSV` is the hexadecimal representation of a Unicode scalar
+  value; it must represent a value in the range 0 to D7FF, or E000 to
+  10FFFF, inclusive.  Any other value signals an error.  Valid values
+  are converted into a UTF-8 byte sequence encoding the value.
+
+* A `Rune` longer than 6 bytes is grammatical, but signals an error.
+  This is important because runes are not self-terminating; defining
+  their grammar as ending after a maximum of 6 bytes would allow
+  another datum beginning with an alphabetic character to follow a
+  rune immediately without any visual delineation, which would be
+  terribly confusing for a human reader.  Consider: `#foobarbaz`.
+  With that limit, it would parse as a `Datum` joining `#foobar` and `baz`.
+
+  (The ABNF does not suffer from this issue, since it explicitly
+   enumerates the join possibilities anyway.)
+
+* A `Label` is the hexadecimal representation of a 48-bit integer,
+  meaning it allows for a maximum of 12 hexadecimal digits.  Longer
+  values are grammatical, but signal an out-of-range error, so as to
+  avoid signaling a confusing "invalid character" error on input that
+  appears grammatical.  Consider: `#%123456789abcd=foo`.  This would
+  signal an invalid character error at the letter `d` if the grammar
+  limited a `Label` to 12 hexadecimal digits.
+
+  (As above, the ABNF doesn't care about this.  You probably don't
+   want to use the ABNF to generate a parser anyway.)
+
+
+## At-quoted strings
+
+The mechanism of at-quoted strings is not represented in any of the
+grammars, since it essentially has 256 variants.  Representing it
+sanely in a grammar requires the ability to save and reference
+variables.
+
+
+## Stream-parsing strategy
+
+The parser consumes one `Unit` from the input stream every time it's
+called; it returns the `Datum` therein if found, or else it returns
+the Zisp EOF token.
+
+Since a `Datum` is not self-terminating, the parser must read beyond
+it to realize that it has ended (if not followed by the EOF).  Thus,
+it will consume one more `Blank` following the `Unit` that it parsed.
+If this `Blank` is a comment, it will be consumed entirely, ensuring
+that parsing resumes properly on a subsequent parser call on the same
+input stream, without needing to store any state in between.
+
+Since comments of type `SkipUnit` are likewise not self-terminating,
+an arbitrary number of chained `SkipUnit` comments may need to be
+consumed before the parser is finally allowed to return.
+
+The following illustration shows the positions at which the parser
+will stop consuming input when called repeatedly on the same input
+stream.  The dots represent the extent of each `Unit` being parsed,
+while the caret points at the last byte the parser will consume in
+that parse cycle.
+
+```
+foo (bar)[baz] foo;~bar foo;~bar;~baz;~bat foobar
+...^..........^...     ^...               ^......^
+```
+
+Notice how, in the fourth cycle, the parser is forced to consume all
+commented-out units before it can return, since it would otherwise
+leave the stream in an inappropriate state.
diff --git a/doc/0/grammar/peg.txt b/doc/0/grammar/peg.txt
new file mode 100644
index 0000000..1541da6
--- /dev/null
+++ b/doc/0/grammar/peg.txt
@@ -0,0 +1,91 @@
+# Standard PEG notation
+
+Stream       <- Unit ( Blank Unit )* !.
+
+
+Unit         <- Blank* Datum
+
+Blank        <- [\t-\r ] / Comment
+
+
+Datum        <- OneDatum ( JoinChar? OneDatum )*
+
+JoinChar     <- '.' / ':'
+
+
+Comment      <- ';' ( SkipUnit / SkipLine )
+
+SkipUnit     <- '~' Unit
+
+SkipLine     <- (!'\n' .)* '\n'?
+
+
+OneDatum     <- BareString / CladDatum
+
+
+BareString   <- SpecBareChar ( BareChar / JoinChar )*
+              / BareChar+
+
+SpecBareChar <- '+' / '-' / JoinChar / DIGIT
+
+BareChar     <- ALPHA / DIGIT
+              / '!' / '$' / '%' / '&' / '*' / '+' / '-' / '/'
+              / '<' / '=' / '>' / '?' / '^' / '_' / '~'
+
+
+CladDatum    <- PipeStr / QuoteStr / HashExpr / QuoteExpr / List
+
+PipeStr      <- '|' ( PipeStrChar / '\\' StringEsc )* '|'
+QuoteStr     <- '"' ( QuotStrChar / '\\' StringEsc )* '"'
+HashExpr     <- '#' HashExprs
+QuoteExpr    <- "'" Datum / '`' Datum / ',' Datum
+List         <- ParenList / SquareList / BraceList
+
+
+PipeStrChar  <- (![|\\] .)
+QuotStrChar  <- (!["\\] .)
+
+StringEsc    <- '\\' / '|' / '"' / [\t ]* '\n' [\t ]*
+              / '0' / 'a' / 'b' / 't' / 'n' / 'v' / 'f' / 'r' / 'e'
+              / 'x' HexByte* ';'
+              / 'u' UnicodeSV ';'
+
+HexByte      <- HEXDIG HEXDIG
+UnicodeSV    <- HEXDIG+
+
+
+HashExprs    <- '!' [\t ]* HBangLine '\n'?
+              / '%' Label ( '%' / '=' Datum )
+              / '\\' BareString / CladDatum
+              / Rune ( '\\' BareString / CladDatum )?
+
+HBangLine    <- HBChars+ [\t ]* ( HBChars+ )?
+HBChars      <- (![\t\n ] .)
+Label        <- HEXDIG+
+Rune         <- ALPHA ( ALPHA / DIGIT )*
+
+
+ParenList    <- '(' Unit* ')'
+SquareList   <- '[' Unit* ']'
+BraceList    <- '{' Unit* '}'
+
+
+DIGIT        <- [0-9]
+ALPHA        <- [a-zA-Z]
+HEXDIG       <- [0-9a-fA-F]
+
+
+# Keep this in sync line-for-line with the ZBNF grammar for easy
+# comparison between the two.
+
+# This file is meant to be compatible with:
+# https://piumarta.com/software/peg
+
+# Due to a quirk in the peg tool this file is used with, the grammar
+# must not allow an empty stream.  Therefore, the Unit rule has its
+# Datum declared as mandatory rather than optional.
+
+
+# Local Variables:
+# eval: (flyspell-mode -1)
+# End:
diff --git a/doc/0/grammar/zbnf.txt b/doc/0/grammar/zbnf.txt
new file mode 100644
index 0000000..c04b813
--- /dev/null
+++ b/doc/0/grammar/zbnf.txt
@@ -0,0 +1,75 @@
+; Custom notation with PEG semantics
+
+Stream        : Unit ( Blank Unit )*
+
+
+Unit          : Blank* [Datum]
+
+Blank         : '\t'...'\r' | SP | Comment
+
+
+Datum         : OneDatum ( [JoinChar] OneDatum )*
+
+JoinChar      : '.' | ':'
+
+
+Comment       : ';' ( SkipUnit | SkipLine )
+
+SkipUnit      : '~' Unit
+
+SkipLine      : ( ~LF )* [LF]
+
+
+OneDatum      : BareString | CladDatum
+
+
+BareString    : SpecBareChar ( BareChar | JoinChar )*
+              | BareChar+
+
+SpecBareChar  : '+' | '-' | JoinChar | DIGIT
+
+BareChar      : ALPHA | DIGIT
+              | '!' | '$' | '%' | '&' | '*' | '+' | '-' | '/'
+              | '<' | '=' | '>' | '?' | '^' | '_' | '~'
+
+
+CladDatum     : PipeStr | QuoteStr | HashExpr | QuoteExpr | List
+
+PipeStr       : '|' ( PipeStrChar | '\' StringEsc )* '|'
+QuoteStr      : '"' ( QuotStrChar | '\' StringEsc )* '"'
+HashExpr      : '#' HashExprs
+QuoteExpr     : "'" Datum | '`' Datum | ',' Datum
+List          : ParenList | SquareList | BraceList
+
+
+PipeStrChar   : ~( '|' | '\' )
+QuotStrChar   : ~( '"' | '\' )
+
+StringEsc     : '\' | '|' | '"' | ( HTAB | SP )* LF ( HTAB | SP )*
+              | '0' | 'a' | 'b' | 't' | 'n' | 'v' | 'f' | 'r' | 'e'
+              | 'x' HexByte* ';'
+              | 'u' UnicodeSV ';'
+
+HexByte       : HEXDIG HEXDIG
+UnicodeSV     : HEXDIG+
+
+
+HashExprs     : '!' ( SP | HTAB )* HBangLine [ LF ]
+              | '%' Label ( '%' | '=' Datum )
+              | '\' BareString | CladDatum
+              | Rune [ '\' BareString | CladDatum ]
+
+HBangLine     : HBChars+ ( SP | HTAB )* [ HBChars+ ]
+HBChars       : ~( SP | HTAB | LF )
+Label         : HEXDIG+
+Rune          : ALPHA ( ALPHA | DIGIT )*
+
+
+ParenList     : '(' Unit* ')'
+SquareList    : '[' Unit* ']'
+BraceList     : '{' Unit* '}'
+
+
+;; Local Variables:
+;; eval: (flyspell-mode -1)
+;; End: