diff options
Diffstat (limited to 'doc/0/grammar')
| -rw-r--r-- | doc/0/grammar/abnf.txt | 139 | ||||
| -rw-r--r-- | doc/0/grammar/index.md | 115 | ||||
| -rw-r--r-- | doc/0/grammar/peg.txt | 91 | ||||
| -rw-r--r-- | doc/0/grammar/zbnf.txt | 75 |
4 files changed, 420 insertions, 0 deletions
diff --git a/doc/0/grammar/abnf.txt b/doc/0/grammar/abnf.txt new file mode 100644 index 0000000..5ab3c89 --- /dev/null +++ b/doc/0/grammar/abnf.txt @@ -0,0 +1,139 @@ +; Standards-compliant ABNF (RFC 5234, RFC 7405) + +; Compatible with: https://www.quut.com/abnfgen/ + +; Unlike PEG, grammar rules in BNF are non-deterministic, which makes +; it much more challenging to express our naive parse logic. Whether +; this ABNF file is truly accurate is difficult to assess. + +; The abnfgen(1) tool linked above can be used to generate arbitrary +; strings matching the grammar in this file. These can be fed into +; the Zisp parser to reveal some potential bugs; either in the parser +; itself, or this ABNF grammar. + +; Note that the tool may generate Zisp string literals with Unicode +; escape sequences corresponding to surrogate code points; the parser +; may reject these. This is expected; it's difficult to rewrite this +; ABNF grammar to exclude those Unicode values. + +; Other minor inaccuracies that aren't important include: This ABNF +; forces line comments to be terminated with an LF character, when in +; fact the end-of-file may also terminate them; the same applies to +; hash-bang parsing which doesn't actually have to end in LF. These +; discrepancies won't make abnfgen(1) generate invalid strings; they +; only make this ABNF more strict than the Zisp parser, so it won't +; generate some strings that the parser would actually accept. + + +Stream = [ Unit *( Blank Unit ) ] *Blank [Trail] + + +Unit = *Blank Datum + +Blank = HTAB / LF / %x0b / %x0c / CR / SP / Comment + +Trail = SkipLine / SkipUnit / ";" "~" *Blank + + +Datum = BareString / SpecialStr / CladDatum / Rune / RuneStr + / RuneDotStr / RuneClad / LabelRef / LabelDef / HashStr + / HashDotStr / HashClad / QuoteExpr / JoinExpr + +Comment = SkipLine LF / SkipUnit Blank + +SkipLine = ";" [ SkipLStart *AnyButLF ] + +SkipUnit = ";" "~" Unit + +SkipLStart = %x00-09 / %x0b-7d / %x7f-ff ; any but LF or "~" + +AnyButLF = %x00-09 / %x0b-ff + + +BareString = BareChar *( BareChar / Numeric ) + +SpecialStr = SpecStrChar *( SpecStrChar / BareChar ) + +CladDatum = "|" *( PipeStrChar / "\" StringEsc ) "|" + / DQUOTE *( QuotStrChar / "\" StringEsc ) DQUOTE + / "(" List ")" + / "[" List "]" + / "{" List "}" + +Rune = "#" RuneName + +RuneStr = "#" RuneName "\" BareString + +RuneDotStr = "#" RuneName "\" SpecialStr + +RuneClad = "#" RuneName CladDatum + +HashBang = "#" "!" *( SP / HTAB ) HBLine LF + +LabelRef = "#" "%" Label "%" + +LabelDef = "#" "%" Label "=" Datum + +HashStr = "#" "\" BareString + +HashDotStr = "#" "\" SpecialStr + +HashClad = "#" CladDatum + +QuoteExpr = "'" Datum + / "`" Datum + / "," Datum + +JoinExpr = Datum RJoinDatum + / LJoinDatum NoStartDot + / Datum ":" Datum + / NoEndDot "." Datum + + +BareChar = "!" / "$" / "%" / "&" / "*" / "/" / "<" / "=" / ">" + / "?" / "^" / "_" / "~" / ALPHA + +Numeric = "+" / "-" / DIGIT + +SpecStrChar = "." / ":" / Numeric + +PipeStrChar = %x00-5b / %x5d-7b / %x7d-ff ; any but "|" or "\" + +QuotStrChar = %x00-21 / %x23-5b / %x5d-ff ; any but DQUOTE or "\" + +StringEsc = "\" / "|" / DQUOTE / *( HTAB / SP ) LF *( HTAB / SP ) + / %s"a" / %s"b" / %s"t" / %s"n" + / %s"v" / %s"f" / %s"r" / %s"e" + / %s"x" *( 2HEXDIG ) ";" + / %s"u" ["0"] 1*5HEXDIG ";" + / %s"u" "1" "0" 4HEXDIG ";" + +List = [ Unit *( Blank Unit ) ] *Blank [SkipUnit] + + +RuneName = ALPHA *5( ALPHA / DIGIT ) + +Label = 1*12( HEXDIG ) + +HBLine = 1*HBChar [ 1*( SP / HTAB ) *HBChar ] + +HBChar = %x00-08 / %x0b-1f / %x21-ff ; any but HT, LF, SP + + +RJoinDatum = CladDatum / Rune / RuneStr / RuneDotStr / RuneClad + / LabelRef / LabelDef / HashStr / HashDotStr / HashClad + / QuoteExpr + +LJoinDatum = CladDatum / RuneClad / LabelRef / HashClad + +NoStartDot = BareString / CladDatum / Rune / RuneStr / RuneDotStr + / RuneClad / LabelRef / LabelDef / HashStr / HashDotStr + / HashClad / QuoteExpr + +NoEndDot = BareString / Rune / RuneStr / RuneClad / LabelRef + / HashStr / HashClad + + +;; Local Variables: +;; eval: (flyspell-mode -1) +;; End: diff --git a/doc/0/grammar/index.md b/doc/0/grammar/index.md new file mode 100644 index 0000000..e3716ea --- /dev/null +++ b/doc/0/grammar/index.md @@ -0,0 +1,115 @@ +# Zisp S-Expression Grammar + +The grammar is available in several different formats: + +* [ZBNF](zbnf.txt): See below for the rules of this notation +* [ABNF](abnf.txt): Compatible with the `abnfgen` tool +* [PEG](peg.txt): Compatible with `peg/leg` tool + + +## ZBNF notation + +The ZBNF grammar specification uses a BNF-like notation with PEG-like +semantics: + +* Concatenation of expressions is implicit: `foo bar` means `foo` + followed by `bar`. + +* Parentheses are used for grouping, and the pipe symbol `|` is used + for alternatives. + +* The suffixes `?`, `*`, and `+` have the same meaning as in regular + expressions, although `[foo]` is used in place of `(foo)?`. + +* The syntax is defined in terms of bytes, not characters. Terminals + `'c'` and `"c"` refer to the ASCII value of the given character `c`. + Standard C escape sequences are supported. + +* The prefix `~` means NOT. It only applies to rules that match one + byte, and negates them. For example, `~( 'a' | 'b' )` matches any + byte other than 'a' and 'b'. + +* Ranges of terminal values are expressed as `x...y` (inclusive). + +* ABNF "core rules" like `ALPHA` and `HEXDIG` are supported. + +* There is no ambiguity, or look-ahead / backtracking beyond one byte. + Rules match left to right, depth-first, and greedy. As soon as the + input matches the first terminal of a rule --explicit or implied by + recursively descending into the first non-terminal-- it must match + that rule to the end or a syntax error is reported. + +The last point makes the notation simple to translate to code. + + +## Limitations outside the grammar + +The following limits are not represented in the grammar: + +* A `UnicodeSV` is the hexadecimal representation of a Unicode scalar + value; it must represent a value in the range 0 to D7FF, or E000 to + 10FFFF, inclusive. Any other value signals an error. Valid values + are converted into a UTF-8 byte sequence encoding the value. + +* A `Rune` longer than 6 bytes is grammatical, but signals an error. + This is important because runes are not self-terminating; defining + their grammar as ending after a maximum of 6 bytes would allow + another datum beginning with an alphabetic character to follow a + rune immediately without any visual delineation, which would be + terribly confusing for a human reader. Consider: `#foobarbaz`. + This would parse as a `Datum` joining `#foobar` and `baz`. + + (The ABNF does not suffer from this issue, since it explicitly + enumerates the join possibilities anyway.) + +* A `Label` is the hexadecimal representation of a 48-bit integer, + meaning it allows for a maximum of 12 hexadecimal digits. Longer + values are grammatical, but signal an out-of-range error, so as to + avoid signaling a confusing "invalid character" error on input that + appears grammatical. Consider: `#%123456789abcd=foo`. This would + signal an invalid character error at the letter `d` if the grammar + limited a `Label` to 12 hexadecimal digits. + + (As above, the ABNF doesn't care about this. You probably don't + want to use the ABNF to generate a parser anyway.) + + +## At-quoted strings + +The mechanism of at-quoted strings is not represented in any of the +grammars, since it essentially has 256 variants. Representing it +sanely in a grammar requires the ability to save and reference +variables. + + +## Stream-parsing strategy + +The parser consumes one `Unit` from the input stream every time it's +called; it returns the `Datum` therein if found, or else it returns +the Zisp EOF token. + +Since a `Datum` is not self-terminating, the parser must read beyond +it to realize that it has ended (if not followed by the EOF). Thus, +it will consume one more `Blank` following the `Unit` that it parsed. +If this `Blank` is a comment, it will be consumed entirely, ensuring +that parsing resumes properly on a subsequent parser call on the same +input stream, without needing to store any state in between. + +Since comments of type `SkipUnit` are likewise not self-terminating, +an arbitrary number of chained `SkipUnit` comments may need to be +consumed before the parser is finally allowed to return. + +The following illustration shows the positions at which the parser +will stop consuming input when called repeatedly on the same input +stream. The dots represent the extent of each `Unit` being parsed, +while the caret points at the last byte the parser will consume in +that parse cycle. + +``` +foo (bar)[baz] foo;~bar foo;~bar;~baz;~bat foobar +...^..........^... ^... ^......^ +``` + +Notice how, in the fourth cycle, the parser is forced to consume all +commented-out units before it can return, since it would otherwise +leave the stream in an inappropriate state. diff --git a/doc/0/grammar/peg.txt b/doc/0/grammar/peg.txt new file mode 100644 index 0000000..1541da6 --- /dev/null +++ b/doc/0/grammar/peg.txt @@ -0,0 +1,91 @@ +# Standard PEG notation + +Stream <- Unit ( Blank Unit )* !. + + +Unit <- Blank* Datum + +Blank <- [\t-\r ] / Comment + + +Datum <- OneDatum ( JoinChar? OneDatum )* + +JoinChar <- '.' / ':' + + +Comment <- ';' ( SkipUnit / SkipLine ) + +SkipUnit <- '~' Unit + +SkipLine <- (!'\n' .)* '\n'? + + +OneDatum <- BareString / CladDatum + + +BareString <- SpecBareChar ( BareChar / JoinChar )* + / BareChar+ + +SpecBareChar <- '+' / '-' / JoinChar / DIGIT + +BareChar <- ALPHA / DIGIT + / '!' / '$' / '%' / '&' / '*' / '+' / '-' / '/' + / '<' / '=' / '>' / '?' / '^' / '_' / '~' + + +CladDatum <- PipeStr / QuoteStr / HashExpr / QuoteExpr / List + +PipeStr <- '|' ( PipeStrChar / '\' StringEsc )* '|' +QuoteStr <- '"' ( QuotStrChar / '\' StringEsc )* '"' +HashExpr <- '#' HashExprs +QuoteExpr <- "'" Datum / '`' Datum / ',' Datum +List <- ParenList / SquareList / BraceList + + +PipeStrChar <- (![|\\] .) +QuotStrChar <- (!["\\] .) + +StringEsc <- '\' / '|' / '"' / ( HTAB / SP )* LF ( HTAB / SP )* + / '0' / 'a' / 'b' / 't' / 'n' / 'v' / 'f' / 'r' / 'e' + / 'x' HexByte* ';' + / 'u' UnicodeSV ';' + +HexByte <- HEXDIG HEXDIG +UnicodeSV <- HEXDIG+ + + +HashExprs <- '!' [\t ]* HBangLine '\n'? + / '%' Label ( '%' / '=' Datum ) + / '\' BareString / CladDatum + / Rune ( '\' BareString / CladDatum )? + +HBangLine <- HBChars+ [\t ]* ( HBChars+ )? +HBChars <- (![\t\n ] .) +Label <- HEXDIG+ +Rune <- ALPHA ( ALPHA / DIGIT )* + + +ParenList <- '(' Unit* ')' +SquareList <- '[' Unit* ']' +BraceList <- '{' Unit* '}' + + +DIGIT <- [0-9] +ALPHA <- [a-zA-Z] +HEXDIG <- [0-9a-fA-F] + + +# Keep this in sync line-for-line with the ZBNF grammar for easy +# comparison between the two. + +# This file is meant to be compatible with: +# https://piumarta.com/software/peg + +# Due to a quirk in the peg tool this file is used with, the grammar +# must not allow an empty stream. Therefore, the Unit rule has its +# Datum declared as mandatory rather than optional. + + +# Local Variables: +# eval: (flyspell-mode -1) +# End: diff --git a/doc/0/grammar/zbnf.txt b/doc/0/grammar/zbnf.txt new file mode 100644 index 0000000..c04b813 --- /dev/null +++ b/doc/0/grammar/zbnf.txt @@ -0,0 +1,75 @@ +; Custom notation with PEG semantics + +Stream : Unit ( Blank Unit )* + + +Unit : Blank* [Datum] + +Blank : '\t'...'\r' | SP | Comment + + +Datum : OneDatum ( [JoinChar] OneDatum )* + +JoinChar : '.' | ':' + + +Comment : ';' ( SkipUnit | SkipLine ) + +SkipUnit : '~' Unit + +SkipLine : ( ~LF )* [LF] + + +OneDatum : BareString | CladDatum + + +BareString : SpecBareChar ( BareChar | JoinChar )* + | BareChar+ + +SpecBareChar : '+' | '-' | JoinChar | DIGIT + +BareChar : ALPHA | DIGIT + | '!' | '$' | '%' | '&' | '*' | '+' | '-' | '/' + | '<' | '=' | '>' | '?' | '^' | '_' | '~' + + +CladDatum : PipeStr | QuoteStr | HashExpr | QuoteExpr | List + +PipeStr : '|' ( PipeStrChar | '\' StringEsc )* '|' +QuoteStr : '"' ( QuotStrChar | '\' StringEsc )* '"' +HashExpr : '#' HashExprs +QuoteExpr : "'" Datum | '`' Datum | ',' Datum +List : ParenList | SquareList | BraceList + + +PipeStrChar : ~( '|' | '\' ) +QuotStrChar : ~( '"' | '\' ) + +StringEsc : '\' | '|' | '"' | ( HTAB | SP )* LF ( HTAB | SP )* + | '0' | 'a' | 'b' | 't' | 'n' | 'v' | 'f' | 'r' | 'e' + | 'x' HexByte* ';' + | 'u' UnicodeSV ';' + +HexByte : HEXDIG HEXDIG +UnicodeSV : HEXDIG+ + + +HashExprs : '!' ( SP | HTAB )* HBangLine [ LF ] + | '%' Label ( '%' | '=' Datum ) + | '\' BareString | CladDatum + | Rune [ '\' BareString | CladDatum ] + +HBangLine : HBChars+ ( SP | HTAB )* [ HBChars+ ] +HBChars : ~( SP | HTAB | LF ) +Label : HEXDIG+ +Rune : ALPHA ( ALPHA | DIGIT )* + + +ParenList : '(' Unit* ')' +SquareList : '[' Unit* ']' +BraceList : '{' Unit* '}' + + +;; Local Variables: +;; eval: (flyspell-mode -1) +;; End: |
