summaryrefslogtreecommitdiff
path: root/docs/c1/grammar
diff options
context:
space:
mode:
Diffstat (limited to 'docs/c1/grammar')
-rw-r--r--docs/c1/grammar/abnf.txt112
-rw-r--r--docs/c1/grammar/index.md101
-rw-r--r--docs/c1/grammar/peg.txt78
-rw-r--r--docs/c1/grammar/zbnf.txt63
4 files changed, 354 insertions, 0 deletions
diff --git a/docs/c1/grammar/abnf.txt b/docs/c1/grammar/abnf.txt
new file mode 100644
index 0000000..6daaceb
--- /dev/null
+++ b/docs/c1/grammar/abnf.txt
@@ -0,0 +1,112 @@
+; Standards-compliant ABNF (RFC 5234, RFC 7405)
+
+; Compatible with: https://www.quut.com/abnfgen/
+
+; It's unclear whether this grammar is truly complete. It has been
+; verified not to produce text that is rejected by the Zisp parser
+; --except for Unicode escape sequences for surrogate code points--
+; but there may be some text that is accepted by the parser despite
+; not being grammatical according to these rules.
+
+
+Stream = [ Unit *( Blank Unit ) ] *Blank [Trail]
+
+
+Unit = *Blank Datum
+
+Blank = HTAB / LF / %x0b / %x0c / CR / SP / Comment
+
+Trail = SkipLine / SkipUnit / ";" "~" *Blank
+
+
+Datum = BareString / DottedStr / CladDatum / Rune / RuneStr
+ / RuneDotStr / RuneClad / LabelRef / LabelDef / HashStr
+ / HashDotStr / HashClad / QuoteExpr / JoinExpr
+
+Comment = SkipLine LF / SkipUnit Blank
+
+SkipLine = ";" [ SkipLStart *AnyButLF ]
+
+SkipUnit = ";" "~" Unit
+
+SkipLStart = %x00-09 / %x0b-7d / %x7f-ff ; any but LF or "~"
+
+AnyButLF = %x00-09 / %x0b-ff
+
+
+BareString = BareChar *( BareChar / Numeric )
+
+DottedStr = ( "." / Numeric ) *( "." / Numeric / BareChar )
+
+CladDatum = "|" *( PipeStrChar / "\" StringEsc ) "|"
+ / DQUOTE *( QuotStrChar / "\" StringEsc ) DQUOTE
+ / "(" List ")"
+ / "[" List "]"
+ / "{" List "}"
+
+Rune = "#" RuneName
+
+RuneStr = "#" RuneName "\" BareString
+
+RuneDotStr = "#" RuneName "\" DottedStr
+
+RuneClad = "#" RuneName CladDatum
+
+LabelRef = "#" "%" Label "%"
+
+LabelDef = "#" "%" Label "=" Datum
+
+HashStr = "#" "\" BareString
+
+HashDotStr = "#" "\" DottedStr
+
+HashClad = "#" CladDatum
+
+QuoteExpr = "'" Datum
+ / "`" Datum
+ / "," Datum
+
+JoinExpr = Datum RJoinDatum
+ / LJoinDatum NoStartDot
+ / Datum ":" Datum
+ / NoEndDot "." Datum
+
+
+BareChar = "!" / "$" / "%" / "*" / "/" / "<" / "=" / ">"
+ / "?" / "@" / "^" / "_" / "~" / ALPHA
+
+Numeric = "+" / "-" / DIGIT
+
+PipeStrChar = %x00-5b / %x5d-7b / %x7d-ff ; any but "|" or "\"
+
+QuotStrChar = %x00-21 / %x23-5b / %x5d-ff ; any but DQUOTE or "\"
+
+StringEsc = "\" / "|" / DQUOTE / *( HTAB / SP ) LF *( HTAB / SP )
+ / %s"a" / %s"b" / %s"t" / %s"n"
+ / %s"v" / %s"f" / %s"r" / %s"e"
+ / %s"x" 1*( 2HEXDIG ) ";"
+ / %s"u" ["0"] 1*5HEXDIG ";"
+ / %s"u" "1" "0" 4HEXDIG ";"
+
+List = [ Unit *( Blank Unit ) ] *Blank [Tail] [SkipUnit]
+
+Tail = "&" Unit *Blank
+
+
+RuneName = ALPHA *5( ALPHA / DIGIT )
+
+Label = 1*12( HEXDIG )
+
+
+RJoinDatum = CladDatum / Rune / RuneStr / RuneDotStr / RuneClad
+ / LabelRef / LabelDef / HashStr / HashDotStr / HashClad
+ / QuoteExpr
+
+LJoinDatum = CladDatum / RuneClad / LabelRef / HashClad
+
+NoStartDot = BareString / CladDatum / Rune / RuneStr / RuneDotStr
+ / RuneClad / LabelRef / LabelDef / HashStr / HashDotStr
+ / HashClad / QuoteExpr
+
+NoEndDot = BareString / Rune / RuneStr / RuneClad / LabelRef
+ / HashStr / HashClad
diff --git a/docs/c1/grammar/index.md b/docs/c1/grammar/index.md
new file mode 100644
index 0000000..5bedbfc
--- /dev/null
+++ b/docs/c1/grammar/index.md
@@ -0,0 +1,101 @@
+# Zisp S-Expression Grammar
+
+The grammar is available in several different formats:
+
+* [ZBNF](zbnf.txt): See below for the rules of this notation
+* [ABNF](abnf.txt): Compatible with the `abnfgen` tool
+* [PEG](peg.txt): Compatible with the `peg/leg` tool
+
+
+## ZBNF notation
+
+The ZBNF grammar specification uses a BNF-like notation with PEG-like
+semantics:
+
+* Concatenation of expressions is implicit: `foo bar` means `foo`
+ followed by `bar`.
+
+* Parentheses are used for grouping, and the pipe symbol `|` is used
+ for alternatives.
+
+* The suffixes `?`, `*`, and `+` have the same meaning as in regular
+ expressions, although `[foo]` is used in place of `(foo)?`.
+
+* The syntax is defined in terms of bytes, not characters. Terminals
+ `'c'` and `"c"` refer to the ASCII value of the given character `c`.
+ Standard C escape sequences are supported.
+
+* The prefix `~` means NOT. It only applies to rules that match one
+ byte, and negates them. For example, `~( 'a' | 'b' )` matches any
+ byte other than 'a' and 'b'.
+
+* Ranges of terminal values are expressed as `x...y` (inclusive).
+
+* ABNF "core rules" like `ALPHA` and `HEXDIG` are supported.
+
+* There is no ambiguity, or look-ahead / backtracking beyond one byte.
+ Rules match left to right, depth-first, and greedy. As soon as the
+ input matches the first terminal of a rule --explicit or implied by
+ recursively descending into the first non-terminal-- it must match
+ that rule to the end or a syntax error is reported.
+
+The last point makes the notation simple to translate to code.
+
+
+## Limitations outside the grammar
+
+The following limits are not represented in the grammar:
+
+* A `UnicodeSV` is the hexadecimal representation of a Unicode scalar
+ value; it must represent a value in the range 0 to D7FF, or E000 to
+ 10FFFF, inclusive. Any other value signals an error. Valid values
+ are converted into a UTF-8 byte sequence encoding the value.
+
+* A `Rune` longer than 6 bytes is grammatical, but signals an error.
+ This is important because runes are not self-terminating; defining
+ their grammar as ending after a maximum of 6 bytes would allow
+ another datum beginning with an alphabetic character to follow a
+ rune immediately without any visual delineation, which would be
+ terribly confusing for a human reader. Consider: `#foobarbaz`.
+ Such a limit would parse this as a `Datum` joining `#foobar` and `baz`.
+
+* A `Label` is the hexadecimal representation of a 48-bit integer,
+ meaning it allows for a maximum of 12 hexadecimal digits. Longer
+ values are grammatical, but signal an out-of-range error, so as to
+ avoid signaling a confusing "invalid character" error on input that
+ appears grammatical. Consider: `#%123456789abcd=foo`. This would
+ signal an invalid character error at the letter `d` if the grammar
+ limited a `Label` to 12 hexadecimal digits.
+
+
+## Stream-parsing strategy
+
+The parser consumes one `Unit` from the input stream every time it's
+called; it returns the `Datum` therein if found, or else it returns
+the Zisp EOF token.
+
+Since a `Datum` is not self-terminating, the parser must read beyond
+it to realize that it has ended (if not followed by the EOF). Thus,
+it will consume one more `Blank` following the `Unit` that it parsed.
+If this `Blank` is a comment, it will be consumed entirely, ensuring
+that parsing resumes properly on a subsequent parser call on the same
+input stream, without needing to store any state in between.
+
+Since comments of type `SkipUnit` are likewise not self-terminating,
+an arbitrary number of chained `SkipUnit` comments may need to be
+consumed before the parser is finally allowed to return.
+
+The following illustration shows the positions at which the parser
+will stop consuming input when called repeatedly on the same input
+stream. The dots represent the extent of each `Unit` being parsed,
+while the caret points at the last byte the parser will consume in
+that parse cycle.
+
+```
+foo (bar)[baz] foo;~bar foo;~bar;~baz;~bat foobar
+...^..........^...     ^...               ^......^
+```
+
+Notice how, in the fourth cycle, the parser is forced to consume all
+commented-out units before it can return, since it would otherwise
+leave the stream in an inappropriate state.
diff --git a/docs/c1/grammar/peg.txt b/docs/c1/grammar/peg.txt
new file mode 100644
index 0000000..1e060ec
--- /dev/null
+++ b/docs/c1/grammar/peg.txt
@@ -0,0 +1,78 @@
+# Standard PEG notation
+
+Stream <- Unit ( Blank Unit )* Blank* !.   # Blank*: allow trailing blanks/comments before EOF
+
+Unit <- Blank* Datum
+
+
+Blank <- [\t-\r ] / Comment
+
+Datum <- OneDatum ( JoinChar? OneDatum )*
+
+JoinChar <- '.' / ':'
+
+
+Comment <- ';' ( SkipUnit / SkipLine )
+
+SkipUnit <- '~' Unit
+
+SkipLine <- (!'\n' .)* '\n'?
+
+
+OneDatum <- BareString / CladDatum
+
+
+BareString <- ( '.' / '+' / '-' / DIGIT ) ( BareChar / '.' )*
+ / BareChar+
+
+CladDatum <- PipeStr / QuoteStr / HashExpr / QuoteExpr / List
+
+PipeStr <- '|' ( PipeStrChar / '\\' StringEsc )* '|'
+QuoteStr <- '"' ( QuotStrChar / '\\' StringEsc )* '"'
+HashExpr <- '#' ( RuneExpr / LabelExpr / HashDatum )
+QuoteExpr <- "'" Datum / '`' Datum / ',' Datum
+List <- ParenList / SquareList / BraceList
+
+BareChar <- ALPHA / DIGIT
+ / '!' / '$' / '%' / '*' / '+'
+ / '-' / '/' / '<' / '=' / '>'
+ / '?' / '@' / '^' / '_' / '~'
+
+PipeStrChar <- (![|\\] .)
+QuotStrChar <- (!["\\] .)
+
+StringEsc <- '\\' / '|' / '"' / [\t ]* '\n' [\t ]*   # ( HTAB / SP )* LF ( HTAB / SP )*
+ / 'a' / 'b' / 't' / 'n' / 'v' / 'f' / 'r' / 'e'
+ / 'x' HexByte+ ';'
+ / 'u' UnicodeSV ';'
+
+HexByte <- HEXDIG HEXDIG
+UnicodeSV <- HEXDIG+
+
+RuneExpr <- Rune ( '\\' BareString / CladDatum )?
+LabelExpr <- '%' Label ( '%' / '=' Datum )
+HashDatum <- '\\' BareString / CladDatum
+
+Rune <- ALPHA ( ALPHA / DIGIT )*
+Label <- HEXDIG+
+
+ParenList <- '(' ListBody ')'
+SquareList <- '[' ListBody ']'
+BraceList <- '{' ListBody '}'
+
+ListBody <- Unit* ( Blank* '&' Unit )? Blank*
+
+DIGIT <- [0-9]
+ALPHA <- [a-zA-Z]
+HEXDIG <- [0-9a-fA-F]
+
+
+# Keep this in sync line-for-line with the ZBNF grammar for easy
+# comparison between the two.
+
+# This file is meant to be compatible with:
+# https://piumarta.com/software/peg
+
+# Due to a quirk in the peg tool this file is used with, the grammar
+# must not allow an empty stream. Therefore, the Unit rule has its
+# Datum declared as mandatory rather than optional.
diff --git a/docs/c1/grammar/zbnf.txt b/docs/c1/grammar/zbnf.txt
new file mode 100644
index 0000000..551c319
--- /dev/null
+++ b/docs/c1/grammar/zbnf.txt
@@ -0,0 +1,63 @@
+# Custom notation with PEG semantics
+
+Stream : Unit ( Blank Unit )*
+
+Unit : Blank* [Datum]
+
+
+Blank : '\t'...'\r' | SP | Comment
+
+Datum : OneDatum ( [JoinChar] OneDatum )*
+
+JoinChar : '.' | ':'
+
+
+Comment : ';' ( SkipUnit | SkipLine )
+
+SkipUnit : '~' Unit
+
+SkipLine : ( ~LF )* [LF]
+
+
+OneDatum : BareString | CladDatum
+
+
+BareString : ( '.' | '+' | '-' | DIGIT ) ( BareChar | '.' )*
+ | BareChar+
+
+CladDatum : PipeStr | QuoteStr | HashExpr | QuoteExpr | List
+
+PipeStr : '|' ( PipeStrChar | '\' StringEsc )* '|'
+QuoteStr : '"' ( QuotStrChar | '\' StringEsc )* '"'
+HashExpr : '#' ( RuneExpr | LabelExpr | HashDatum )
+QuoteExpr : "'" Datum | '`' Datum | ',' Datum
+List : ParenList | SquareList | BraceList
+
+BareChar : ALPHA | DIGIT
+ | '!' | '$' | '%' | '*' | '+'
+ | '-' | '/' | '<' | '=' | '>'
+ | '?' | '@' | '^' | '_' | '~'
+
+PipeStrChar : ~( '|' | '\' )
+QuotStrChar : ~( '"' | '\' )
+
+StringEsc : '\' | '|' | '"' | ( HTAB | SP )* LF ( HTAB | SP )*
+ | 'a' | 'b' | 't' | 'n' | 'v' | 'f' | 'r' | 'e'
+ | 'x' HexByte+ ';'
+ | 'u' UnicodeSV ';'
+
+HexByte : HEXDIG HEXDIG
+UnicodeSV : HEXDIG+
+
+RuneExpr : Rune [ '\' BareString | CladDatum ]
+LabelExpr : '%' Label ( '%' | '=' Datum )
+HashDatum : '\' BareString | CladDatum
+
+Rune : ALPHA ( ALPHA | DIGIT )*
+Label : HEXDIG+
+
+ParenList : '(' ListBody ')'
+SquareList : '[' ListBody ']'
+BraceList : '{' ListBody '}'
+
+ListBody : Unit* [ Blank* '&' Unit ] Blank*