diff options
Diffstat (limited to 'doc/c1/grammar')
| -rw-r--r-- | doc/c1/grammar/abnf.txt | 141 | ||||
| -rw-r--r-- | doc/c1/grammar/index.md | 115 | ||||
| -rw-r--r-- | doc/c1/grammar/peg.txt | 93 | ||||
| -rw-r--r-- | doc/c1/grammar/zbnf.txt | 77 |
4 files changed, 0 insertions, 426 deletions
diff --git a/doc/c1/grammar/abnf.txt b/doc/c1/grammar/abnf.txt deleted file mode 100644 index aa67646..0000000 --- a/doc/c1/grammar/abnf.txt +++ /dev/null @@ -1,141 +0,0 @@ -; Standards-compliant ABNF (RFC 5234, RFC 7405) - -; Compatible with: https://www.quut.com/abnfgen/ - -; Unlike PEG, grammar rules in BNF are non-deterministic, which makes -; it much more challenging to express our naive parse logic. Whether -; this ABNF file is truly accurate is difficult to assess. - -; The abnfgen(1) tool linked above can be used to generate arbitrary -; strings matching the grammar in this file. These can be fed into -; the Zisp parser to reveal some potential bugs; either in the parser -; itself, or this ABNF grammar. - -; Note that the tool may generate Zisp string literals with Unicode -; escape sequences corresponding to surrogate code points; the parser -; may reject these. This is expected; it's difficult to rewrite this -; ABNF grammar to exclude those Unicode values. - -; Other minor inaccuracies that aren't important include: This ABNF -; forces line comments to be terminated with an LF character, when in -; fact the end-of-file may also terminate them; the same applies to -; hash-bang parsing which doesn't actually have to end in LF. These -; discrepancies won't make abnfgen(1) generate invalid strings; they -; only make this ABNF more strict than the Zisp parser, so it won't -; generate some strings that the parser would actually accept. - - -Stream = [ Unit *( Blank Unit ) ] *Blank [Trail] - - -Unit = *Blank Datum - -Blank = HTAB / LF / %x0b / %x0c / CR / SP / Comment - -Trail = SkipLine / SkipUnit / ";" "~" *Blank - - -Datum = BareString / SpecialStr / CladDatum / Rune / RuneStr - / RuneDotStr / RuneClad / LabelRef / LabelDef / HashStr - / HashDotStr / HashClad / QuoteExpr / JoinExpr - -Comment = SkipLine LF / SkipUnit Blank - -SkipLine = ";" [ SkipLStart *AnyButLF ] - -SkipUnit = ";" "~" Unit - -SkipLStart = %x00-09 / %x0b-7d / %x7f-ff ; any but LF or "~" - -AnyButLF = %x00-09 / %x0b-ff - - -BareString = BareChar *( BareChar / Numeric ) - -SpecialStr = SpecStrChar *( SpecStrChar / BareChar ) - -CladDatum = "|" *( PipeStrChar / "\" StringEsc ) "|" - / DQUOTE *( QuotStrChar / "\" StringEsc ) DQUOTE - / "(" List ")" - / "[" List "]" - / "{" List "}" - -Rune = "#" RuneName - -RuneStr = "#" RuneName "\" BareString - -RuneDotStr = "#" RuneName "\" SpecialStr - -RuneClad = "#" RuneName CladDatum - -HashBang = "#" "!" *( SP / HTAB ) HBLine LF - -LabelRef = "#" "%" Label "%" - -LabelDef = "#" "%" Label "=" Datum - -HashStr = "#" "\" BareString - -HashDotStr = "#" "\" SpecialStr - -HashClad = "#" CladDatum - -QuoteExpr = "'" Datum - / "`" Datum - / "," Datum - -JoinExpr = Datum RJoinDatum - / LJoinDatum NoStartDot - / Datum ":" Datum - / NoEndDot "." Datum - - -BareChar = "!" / "$" / "%" / "*" / "/" / "<" / "=" / ">" - / "?" / "^" / "_" / "~" / ALPHA - -Numeric = "+" / "-" / DIGIT - -SpecStrChar = "." / ":" / Numeric - -PipeStrChar = %x00-5b / %x5d-7b / %x7d-ff ; any but "|" or "\" - -QuotStrChar = %x00-21 / %x23-5b / %x5d-ff ; any but DQUOTE or "\" - -StringEsc = "\" / "|" / DQUOTE / *( HTAB / SP ) LF *( HTAB / SP ) - / %s"a" / %s"b" / %s"t" / %s"n" - / %s"v" / %s"f" / %s"r" / %s"e" - / %s"x" *( 2HEXDIG ) ";" - / %s"u" ["0"] 1*5HEXDIG ";" - / %s"u" "1" "0" 4HEXDIG ";" - -List = [ Unit *( Blank Unit ) ] *Blank [Tail] [SkipUnit] - -Tail = "&" Unit *Blank - - -RuneName = ALPHA *5( ALPHA / DIGIT ) - -Label = 1*12( HEXDIG ) - -HBLine = 1*HBChar [ 1*( SP / HTAB ) *HBChar ] - -HBChar = %x00-08 / %x0b-1f / %x21-ff ; any but HT, LF, SP - - -RJoinDatum = CladDatum / Rune / RuneStr / RuneDotStr / RuneClad - / LabelRef / LabelDef / HashStr / HashDotStr / HashClad - / QuoteExpr - -LJoinDatum = CladDatum / RuneClad / LabelRef / HashClad - -NoStartDot = BareString / CladDatum / Rune / RuneStr / RuneDotStr - / RuneClad / LabelRef / LabelDef / HashStr / HashDotStr - / HashClad / QuoteExpr - -NoEndDot = BareString / Rune / RuneStr / RuneClad / LabelRef - / HashStr / HashClad - - -;; Local Variables: -;; eval: (flyspell-mode -1) -;; End: diff --git a/doc/c1/grammar/index.md b/doc/c1/grammar/index.md deleted file mode 100644 index e3716ea..0000000 --- a/doc/c1/grammar/index.md +++ /dev/null @@ -1,115 +0,0 @@ -# Zisp S-Expression Grammar - -The grammar is available in several different formats: - -* [ZBNF](zbnf.txt): See below for the rules of this notation -* [ABNF](abnf.txt): Compatible with the `abnfgen` tool -* [PEG](peg.txt): Compatible with `peg/leg` tool - - -## ZBNF notation - -The ZBNF grammar specification uses a BNF-like notation with PEG-like -semantics: - -* Concatenation of expressions is implicit: `foo bar` means `foo` - followed by `bar`. - -* Parentheses are used for grouping, and the pipe symbol `|` is used - for alternatives. - -* The suffixes `?`, `*`, and `+` have the same meaning as in regular - expressions, although `[foo]` is used in place of `(foo)?`. - -* The syntax is defined in terms of bytes, not characters. Terminals - `'c'` and `"c"` refer to the ASCII value of the given character `c`. - Standard C escape sequences are supported. - -* The prefix `~` means NOT. It only applies to rules that match one - byte, and negates them. For example, `~( 'a' | 'b' )` matches any - byte other than 'a' and 'b'. - -* Ranges of terminal values are expressed as `x...y` (inclusive). - -* ABNF "core rules" like `ALPHA` and `HEXDIG` are supported. - -* There is no ambiguity, or look-ahead / backtracking beyond one byte. - Rules match left to right, depth-first, and greedy. As soon as the - input matches the first terminal of a rule --explicit or implied by - recursively descending into the first non-terminal-- it must match - that rule to the end or a syntax error is reported. - -The last point makes the notation simple to translate to code. - - -## Limitations outside the grammar - -The following limits are not represented in the grammar: - -* A `UnicodeSV` is the hexadecimal representation of a Unicode scalar - value; it must represent a value in the range 0 to D7FF, or E000 to - 10FFFF, inclusive. Any other value signals an error. Valid values - are converted into a UTF-8 byte sequence encoding the value. - -* A `Rune` longer than 6 bytes is grammatical, but signals an error. - This is important because runes are not self-terminating; defining - their grammar as ending after a maximum of 6 bytes would allow - another datum beginning with an alphabetic character to follow a - rune immediately without any visual delineation, which would be - terribly confusing for a human reader. Consider: `#foobarbaz`. - This would parse as a `Datum` joining `#foobar` and `baz`. - - (The ABNF does not suffer from this issue, since it explicitly - enumerates the join possibilities anyway.) - -* A `Label` is the hexadecimal representation of a 48-bit integer, - meaning it allows for a maximum of 12 hexadecimal digits. Longer - values are grammatical, but signal an out-of-range error, so as to - avoid signaling a confusing "invalid character" error on input that - appears grammatical. Consider: `#%123456789abcd=foo`. This would - signal an invalid character error at the letter `d` if the grammar - limited a `Label` to 12 hexadecimal digits. - - (As above, the ABNF doesn't care about this. You probably don't - want to use the ABNF to generate a parser anyway.) - - -## At-quoted strings - -The mechanism of at-quoted strings is not represented in any of the -grammars, since it essentially has 256 variants. Representing it -sanely in a grammar requires the ability to save and reference -variables. - - -## Stream-parsing strategy - -The parser consumes one `Unit` from the input stream every time it's -called; it returns the `Datum` therein if found, or else it returns -the Zisp EOF token. - -Since a `Datum` is not self-terminating, the parser must read beyond -it to realize that it has ended (if not followed by the EOF). Thus, -it will consume one more `Blank` following the `Unit` that it parsed. -If this `Blank` is a comment, it will be consumed entirely, ensuring -that parsing resumes properly on a subsequent parser call on the same -input stream, without needing to store any state in between. - -Since comments of type `SkipUnit` are likewise not self-terminating, -an arbitrary number of chained `SkipUnit` comments may need to be -consumed before the parser is finally allowed to return. - -The following illustration shows the positions at which the parser -will stop consuming input when called repeatedly on the same input -stream. The dots represent the extent of each `Unit` being parsed, -while the caret points at the last byte the parser will consume in -that parse cycle. - -``` -foo (bar)[baz] foo;~bar foo;~bar;~baz;~bat foobar -...^..........^... ^... ^......^ -``` - -Notice how, in the fourth cycle, the parser is forced to consume all -commented-out units before it can return, since it would otherwise -leave the stream in an inappropriate state. diff --git a/doc/c1/grammar/peg.txt b/doc/c1/grammar/peg.txt deleted file mode 100644 index 7b28a99..0000000 --- a/doc/c1/grammar/peg.txt +++ /dev/null @@ -1,93 +0,0 @@ -# Standard PEG notation - -Stream <- Unit ( Blank Unit )* !. - - -Unit <- Blank* Datum - -Blank <- [\t-\r ] / Comment - - -Datum <- OneDatum ( JoinChar? OneDatum )* - -JoinChar <- '.' / ':' - - -Comment <- ';' ( SkipUnit / SkipLine ) - -SkipUnit <- '~' Unit - -SkipLine <- (!'\n' .)* '\n'? - - -OneDatum <- BareString / CladDatum - - -BareString <- SpecBareChar ( BareChar / JoinChar )* - / BareChar+ - -SpecBareChar <- '+' / '-' / JoinChar / DIGIT - -BareChar <- ALPHA / DIGIT - / '!' / '$' / '%' / '*' / '+' / '-' / '/' - / '<' / '=' / '>' / '?' / '^' / '_' / '~' - - -CladDatum <- PipeStr / QuoteStr / HashExpr / QuoteExpr / List - -PipeStr <- '|' ( PipeStrChar / '\' StringEsc )* '|' -QuoteStr <- '"' ( QuotStrChar / '\' StringEsc )* '"' -HashExpr <- '#' HashExprs -QuoteExpr <- "'" Datum / '`' Datum / ',' Datum -List <- ParenList / SquareList / BraceList - - -PipeStrChar <- (![|\\] .) -QuotStrChar <- (!["\\] .) - -StringEsc <- '\' / '|' / '"' / ( HTAB / SP )* LF ( HTAB / SP )* - / '0' / 'a' / 'b' / 't' / 'n' / 'v' / 'f' / 'r' / 'e' - / 'x' HexByte* ';' - / 'u' UnicodeSV ';' - -HexByte <- HEXDIG HEXDIG -UnicodeSV <- HEXDIG+ - - -HashExprs <- '!' [\t ]* HBangLine '\n'? - / '%' Label ( '%' / '=' Datum ) - / '\' BareString / CladDatum - / Rune ( '\' BareString / CladDatum )? - -HBangLine <- HBChars+ [\t ]* ( HBChars+ )? -HBChars <- (![\t\n ] .) -Label <- HEXDIG+ -Rune <- ALPHA ( ALPHA / DIGIT )* - - -ParenList <- '(' ListBody ')' -SquareList <- '[' ListBody ']' -BraceList <- '{' ListBody '}' - -ListBody <- Unit* ( Blank* '&' Unit )? Blank* - - -DIGIT <- [0-9] -ALPHA <- [a-zA-Z] -HEXDIG <- [0-9a-fA-F] - - -# Keep this in sync line-for-line with the ZBNF grammar for easy -# comparison between the two. - -# This file is meant to be compatible with: -# https://piumarta.com/software/peg - -# Due to a quirk in the peg tool this file is used with, the grammar -# must not allow an empty stream. Therefore, the Unit rule has its -# Datum declared as mandatory rather than optional. - - -# Local Variables: -# eval: (flyspell-mode -1) -# End: diff --git a/doc/c1/grammar/zbnf.txt b/doc/c1/grammar/zbnf.txt deleted file mode 100644 index 923ac83..0000000 --- a/doc/c1/grammar/zbnf.txt +++ /dev/null @@ -1,77 +0,0 @@ -; Custom notation with PEG semantics - -Stream : Unit ( Blank Unit )* - - -Unit : Blank* [Datum] - -Blank : '\t'...'\r' | SP | Comment - - -Datum : OneDatum ( [JoinChar] OneDatum )* - -JoinChar : '.' | ':' - - -Comment : ';' ( SkipUnit | SkipLine ) - -SkipUnit : '~' Unit - -SkipLine : ( ~LF )* [LF] - - -OneDatum : BareString | CladDatum - - -BareString : SpecBareChar ( BareChar | JoinChar )* - | BareChar+ - -SpecBareChar : '+' | '-' | JoinChar | DIGIT - -BareChar : ALPHA | DIGIT - | '!' | '$' | '%' | '*' | '+' | '-' | '/' - | '<' | '=' | '>' | '?' | '^' | '_' | '~' - - -CladDatum : PipeStr | QuoteStr | HashExpr | QuoteExpr | List - -PipeStr : '|' ( PipeStrChar | '\' StringEsc )* '|' -QuoteStr : '"' ( QuotStrChar | '\' StringEsc )* '"' -HashExpr : '#' HashExprs -QuoteExpr : "'" Datum | '`' Datum | ',' Datum -List : ParenList | SquareList | BraceList - - -PipeStrChar : ~( '|' | '\' ) -QuotStrChar : ~( '"' | '\' ) - -StringEsc : '\' | '|' | '"' | ( HTAB | SP )* LF ( HTAB | SP )* - | '0' | 'a' | 'b' | 't' | 'n' | 'v' | 'f' | 'r' | 'e' - | 'x' HexByte* ';' - | 'u' UnicodeSV ';' - -HexByte : HEXDIG HEXDIG -UnicodeSV : HEXDIG+ - - -HashExprs : '!' ( SP | HTAB )* HBangLine [ LF ] - | '%' Label ( '%' | '=' Datum ) - | '\' BareString | CladDatum - | Rune [ '\' BareString | CladDatum ] - -HBangLine : HBChars+ ( SP | HTAB )* [ HBChars+ ] -HBChars : ~( SP | HTAB | LF ) -Label : HEXDIG+ -Rune : ALPHA ( ALPHA | DIGIT )* - - -ParenList : '(' ListBody ')' -SquareList : '[' ListBody ']' -BraceList : '{' ListBody '}' - -ListBody : Unit* [ Blank* '&' Unit ] Blank* - - -;; Local Variables: -;; eval: (flyspell-mode -1) -;; End: |
