diff options
| author | Taylan Kammer <taylan.kammer@gmail.com> | 2026-01-10 17:33:14 +0100 |
|---|---|---|
| committer | Taylan Kammer <taylan.kammer@gmail.com> | 2026-01-10 17:33:14 +0100 |
| commit | b737130c059e8e5566caa7aa3144f910d43999ae (patch) | |
| tree | 5dfbfface09aed028efbeacd119e6a78e7530f61 | |
| parent | 29dfeb175122d66bf71fc67ecd80f8df90a9a462 (diff) | |
More grammar shite.
| -rw-r--r-- | docs/c1/1-parse.md | 13 | ||||
| -rw-r--r-- | docs/c1/2-decode.md | 2 | ||||
| -rw-r--r-- | docs/c1/grammar.abnf.txt | 109 | ||||
| -rw-r--r-- | docs/c1/grammar.md | 101 | ||||
| -rw-r--r-- | docs/c1/grammar.peg.txt (renamed from spec/syntax.peg) | 20 | ||||
| -rw-r--r-- | docs/c1/grammar.zbnf.txt (renamed from spec/syntax.zbnf) | 8 | ||||
| -rw-r--r-- | spec/syntax.abnf | 96 | ||||
| -rw-r--r-- | spec/syntax.md | 117 |
8 files changed, 238 insertions, 228 deletions
diff --git a/docs/c1/1-parse.md b/docs/c1/1-parse.md index a23ebbc..e04240b 100644 --- a/docs/c1/1-parse.md +++ b/docs/c1/1-parse.md @@ -1,8 +1,7 @@ # Parser for Code & Data -Zisp s-expressions are defined in terms of an extremely minimal set of data -types; only that which is necessary to build representations of more complex -expressions and data types: +Zisp S-Expressions represent an extremely minimal set of data types; only that +which is necessary to strategically construct more complex code and data: +--------+-----------------+--------+----------+------+ | TYPE | String | Rune | Pair | Nil | @@ -10,12 +9,8 @@ expressions and data types: | E.G. | foo, |foo bar| | #name | (X & Y) | () | +--------+-----------------+--------+----------+------+ -Note that the ampersand replaces the period in pair notation. This simplifies -the grammar: periods are a regular constituent of strings, while the ampersand -cannot appear in unquoted strings. - The parser can also output non-negative integers, but this is only used for -datum labels; number literals are handled by the *decoder*. +datum labels; number literals are handled by the *decoder* (see next). The parser recognizes various "syntax sugar" and transforms it into uses of the above data types. The most ubiquitous example is of course the list: @@ -110,6 +105,8 @@ Further notes about the syntax sugar table and examples above: Zisp's internal use and standard library; users can use lowercase runes with custom meaning without worrying about clashes. +For an exact specification of the grammar, see [grammar](grammar.html). + <!-- ;; Local Variables: ;; fill-column: 80 diff --git a/docs/c1/2-decode.md b/docs/c1/2-decode.md index 0b34204..379c74b 100644 --- a/docs/c1/2-decode.md +++ b/docs/c1/2-decode.md @@ -1,7 +1,7 @@ # Decoding A separate process called "decoding" can transform simple data structures, -consisting of only the datum types, into a richer set of Zisp types. 
+consisting of only the base datum types, into a richer set of Zisp types. For example, the decoder may turn `(#HASH ...)` into a vector, as one would expect a vector literal like `#(...)` to work in Scheme. Bytevector syntax diff --git a/docs/c1/grammar.abnf.txt b/docs/c1/grammar.abnf.txt new file mode 100644 index 0000000..ad68a16 --- /dev/null +++ b/docs/c1/grammar.abnf.txt @@ -0,0 +1,109 @@ +; Compatible with https://www.quut.com/abnfgen/ + +; It's unclear whether this grammar is truly complete. It has been +; verified not to produce text that is rejected by the Zisp parser +; --except for Unicode escape sequences for surrogate code points-- +; but there may be some text that is accepted by the parser despite +; not being grammatical according to these rules. + + +Stream = [ Unit *( Blank Unit ) ] *Blank [Trail] + + +Unit = *Blank Datum + +Blank = HTAB / LF / %x0b / %x0c / CR / SP / Comment + +Trail = SkipLine / SkipUnit / ';' '~' *Blank + + +Datum = BareString / DottedStr / CladDatum / Rune / RuneStr + / RuneDotStr / RuneClad / LabelRef / LabelDef / HashStr + / HashDotStr / HashClad / QuoteExpr / JoinExpr + +Comment = SkipLine LF / SkipUnit Blank + +SkipLine = ';' [ SkipLStart *AnyButLF ] + +SkipUnit = ';' '~' Unit + +SkipLStart = %x00-09 / %x0b-7d / %x7f-ff ; any but LF or '~' + +AnyButLF = %x00-09 / %x0b-ff + + +BareString = BareChar *( BareChar / Numeric ) + +DottedStr = ( '.' / Numeric ) *( '.' 
/ Numeric / BareChar ) + +CladDatum = '|' *( PipeStrChar / '\' StringEsc ) '|' + / '"' *( QuotStrChar / '\' StringEsc ) '"' + / '(' List ')' + / '[' List ']' + / '{' List '}' + +Rune = '#' RuneName + +RuneStr = '#' RuneName '\' BareString + +RuneDotStr = '#' RuneName '\' DottedStr + +RuneClad = '#' RuneName CladDatum + +LabelRef = '#' '%' Label '%' + +LabelDef = '#' '%' Label '=' Datum + +HashStr = '#' '\' BareString + +HashDotStr = '#' '\' DottedStr + +HashClad = '#' CladDatum + +QuoteExpr = "'" Datum + / '`' Datum + / ',' Datum + +JoinExpr = Datum RJoinDatum + / LJoinDatum NoStartDot + / Datum ':' Datum + / NoEndDot '.' Datum + + +BareChar = '!' / '$' / '%' / '*' / '/' / '<' / '=' / '>' + / '?' / '@' / '^' / '_' / '~' / ALPHA + +Numeric = '+' / '-' / DIGIT + +PipeStrChar = %x00-5b / %x5d-7b / %x7d-ff ; any but '|' or '\' + +QuotStrChar = %x00-21 / %x23-5b / %x5d-ff ; any but '"' or '\' + +StringEsc = '\' / '|' / '"' / *( HTAB / SP ) LF *( HTAB / SP ) + / 'a' / 'b' / 't' / 'n' / 'v' / 'f' / 'r' / 'e' + / 'x' 1*( 2HEXDIG ) ';' + / 'u' ['0'] 1*5HEXDIG ';' + / 'u' '1' '0' 4HEXDIG ';' + +List = [ Unit *( Blank Unit ) ] *Blank [Tail] [SkipUnit] + +Tail = '&' Unit *Blank + + +RuneName = ALPHA *5( ALPHA / DIGIT ) + +Label = 1*12( HEXDIG ) + + +RJoinDatum = CladDatum / Rune / RuneStr / RuneDotStr / RuneClad + / LabelRef / LabelDef / HashStr / HashDotStr / HashClad + / QuoteExpr + +LJoinDatum = CladDatum / RuneClad / LabelRef / HashClad + +NoStartDot = BareString / CladDatum / Rune / RuneStr / RuneDotStr + / RuneClad / LabelRef / LabelDef / HashStr / HashDotStr + / HashClad / QuoteExpr + +NoEndDot = BareString / Rune / RuneStr / RuneClad / LabelRef + / HashStr / HashClad diff --git a/docs/c1/grammar.md b/docs/c1/grammar.md new file mode 100644 index 0000000..3364150 --- /dev/null +++ b/docs/c1/grammar.md @@ -0,0 +1,101 @@ +# Zisp S-Expression Grammar + +The grammar is available in several different formats: + +* [ZBNF](grammar.zbnf.txt): See below for the rules of this 
notation +* [ABNF](grammar.abnf.txt): Compatible with the `abnfgen` tool +* [PEG](grammar.peg.txt): Compatible with the `peg/leg` tool + + +## ZBNF notation + +The ZBNF grammar specification uses a BNF-like notation with PEG-like +semantics: + +* Concatenation of expressions is implicit: `foo bar` means `foo` + followed by `bar`. + +* Parentheses are used for grouping, and the pipe symbol `|` is used + for alternatives. + +* The suffixes `?`, `*`, and `+` have the same meaning as in regular + expressions, although `[foo]` is used in place of `(foo)?`. + +* The syntax is defined in terms of bytes, not characters. Terminals + `'c'` and `"c"` refer to the ASCII value of the given character `c`. + Standard C escape sequences are supported. + +* The prefix `~` means NOT. It only applies to rules that match one + byte, and negates them. For example, `~( 'a' | 'b' )` matches any + byte other than 'a' and 'b'. + +* Ranges of terminal values are expressed as `x...y` (inclusive). + +* ABNF "core rules" like `ALPHA` and `HEXDIG` are supported. + +* There is no ambiguity, or look-ahead / backtracking beyond one byte. + Rules match left to right, depth-first, and greedy. As soon as the + input matches the first terminal of a rule --explicit or implied by + recursively descending into the first non-terminal-- it must match + that rule to the end or a syntax error is reported. + +The last point makes the notation simple to translate to code. + + +## Limitations outside the grammar + +The following limits are not represented in the grammar: + +* A `UnicodeSV` is the hexadecimal representation of a Unicode scalar + value; it must represent a value in the range 0 to D7FF, or E000 to + 10FFFF, inclusive. Any other value signals an error. Valid values + are converted into a UTF-8 byte sequence encoding the value. + +* A `Rune` longer than 6 bytes is grammatical, but signals an error. 
+ This is important because runes are not self-terminating; defining + their grammar as ending after a maximum of 6 bytes would allow + another datum beginning with an alphabetic character to follow a + rune immediately without any visual delineation, which would be + terribly confusing for a human reader. Consider: `#foobarbaz`. + This would parse as a `Datum` joining `#foobar` and `baz`. + +* A `Label` is the hexadecimal representation of a 48-bit integer, + meaning it allows for a maximum of 12 hexadecimal digits. Longer + values are grammatical, but signal an out-of-range error, so as to + avoid signaling a confusing "invalid character" error on input that + appears grammatical. Consider: `#%123456789abcd=foo`. This would + signal an invalid character error at the letter `d` if the grammar + limited a `Label` to 12 hexadecimal digits. + + +## Stream-parsing strategy + +The parser consumes one `Unit` from the input stream every time it's +called; it returns the `Datum` therein if found, or else it returns +the Zisp EOF token. + +Since a `Datum` is not self-terminating, the parser must read beyond +it to realize that it has ended (if not followed by the EOF). Thus, +it will consume one more `Blank` following the `Unit` that it parsed. +If this `Blank` is a comment, it will be consumed entirely, ensuring +that parsing resumes properly on a subsequent parser call on the same +input stream, without needing to store any state in between. + +Since comments of type `SkipUnit` are likewise not self-terminating, +an arbitrary number of chained `SkipUnit` comments may need to be +consumed before the parser is finally allowed to return. + +The following illustration shows the positions at which the parser +will stop consuming input when called repeatedly on the same input +stream. The dots represent the extent of each `Unit` being parsed, +while the caret points at the last byte the parser will consume in +that parse cycle. 
+ +``` +foo (bar)[baz] foo;~bar foo;~bar;~baz;~bat foobar +...^..........^... ^... ^......^ +``` + +Notice how, in the fourth cycle, the parser is forced to consume all +commented-out units before it can return, since it would otherwise +leave the stream in an inappropriate state. diff --git a/spec/syntax.peg b/docs/c1/grammar.peg.txt index 97b9632..d194652 100644 --- a/spec/syntax.peg +++ b/docs/c1/grammar.peg.txt @@ -1,7 +1,11 @@ -Unit <- Blank* ( Datum Blank? )? +# Compatible with https://piumarta.com/software/peg +Stream <- Unit ( Blank Unit )* !. -Blank <- ' ' / '\t' / '\n' / Comment +Unit <- Blank* Datum + + +Blank <- [\t-\r ] / Comment Datum <- OneDatum ( JoinChar? OneDatum )* @@ -45,7 +49,7 @@ StringEsc <- '\' / '|' / '"' / ( HTAB / SP )* LF ( HTAB / SP )* HexByte <- HEXDIG HEXDIG UnicodeSV <- HEXDIG+ -RuneExpr <- Rune [ '\' BareString / CladDatum ] +RuneExpr <- Rune ( '\' BareString / CladDatum )? LabelExpr <- '%' Label ( '%' / '=' Datum ) HashDatum <- '\' BareString / CladDatum @@ -56,8 +60,16 @@ ParenList <- '(' ListBody ')' SquareList <- '[' ListBody ']' BraceList <- '{' ListBody '}' -ListBody <- Unit* [ Blank* '&' Unit ] Blank* +ListBody <- Unit* ( Blank* '&' Unit )? Blank* DIGIT <- [0-9] ALPHA <- [a-zA-Z] HEXDIG <- [0-9a-fA-F] + + +# This file should be kept in perfect sync with zbnf.txt for easy +# comparison between the two. + +# Due to a quirk in the peg tool this file is used with, the grammar +# must not allow an empty stream. Therefore, the Unit rule has its +# Datum declared as mandatory rather than optional. 
diff --git a/spec/syntax.zbnf b/docs/c1/grammar.zbnf.txt index b87efb5..a8792f0 100644 --- a/spec/syntax.zbnf +++ b/docs/c1/grammar.zbnf.txt @@ -1,7 +1,11 @@ -Unit : Blank* [ Datum [Blank] ] +# Custom notation with PEG semantics; see grammar.html +Stream : Unit ( Blank Unit )* -Blank : 9...13 | SP | Comment +Unit : Blank* [Datum] + + +Blank : '\t'...'\r' | SP | Comment Datum : OneDatum ( [JoinChar] OneDatum )* diff --git a/spec/syntax.abnf b/spec/syntax.abnf deleted file mode 100644 index 132deeb..0000000 --- a/spec/syntax.abnf +++ /dev/null @@ -1,96 +0,0 @@ -;This file follows strict ABNF rules and can be used with abnfgen. - - -File = [Unit] *( Blank Unit ) *Blank [Trail] - - -Unit = *Blank Datum - -Blank = HTAB / LF / %x0b / %x0c / CR / SP / Comment - -Trail = SkipLine / SkipUnit - - -Datum = BareString - / DottedString - / CladDatum - / HashExpr - / HashDotExpr - / QuoteExpr - / JoinExpr - -Comment = SkipLine LF / SkipUnit Blank - -SkipLine = ';' [ SkipLStart *AnyButLF ] - -SkipLStart = %x00-09 / %x0b-7d / %x7f-ff - ; any but LF or '~' - -AnyButLF = %x00-09 / %x0b-ff - -SkipUnit = ';' '~' Unit - - -BareString = BareChar *( BareChar / Numeric ) - -DottedString = ( '.' / Numeric ) *( '.' / Numeric / BareChar ) - -CladDatum = '|' *( PipeStrChar / '\' StringEsc ) '|' - / '"' *( QuotStrChar / '\' StringEsc ) '"' - / '(' List ')' / '[' List ']' / '{' List '}' - -HashExpr = LabelExpr / RuneExpr / HashDatum - -HashDotExpr = RuneDotExpr / HashDotDatum - -QuoteExpr = "'" Datum / '`' Datum / ',' Datum - -JoinExpr = Datum LeftCladDatum - / Datum ':' Datum - / DotlessDatum '.' Datum - -LeftCladDatum = CladDatum / HashExpr / QuoteExpr - -DotlessDatum = BareString / CladDatum / RuneExpr / HashDatum - - -BareChar = '!' / '$' / '%' / '*' / '/' / '<' / '=' / '>' - / '?' 
/ '@' / '^' / '_' / '~' / ALPHA - -Numeric = '+' / '-' / DIGIT - -PipeStrChar = %x00-5b / %x5d-7b / %x7d-ff - ; any but '|' or '\' - -QuotStrChar = %x00-21 / %x23-5b / %x5d-ff - ; any but '"' or '\' - -List = [Unit] *( Blank Unit ) *Blank [Tail] [SkipUnit] - -Tail = '&' Unit *Blank - -LabelExpr = '#' '%' Label ( '%' / '=' Datum ) - -RuneExpr = '#' Rune [ '\' BareString / CladDatum ] - -RuneDotExpr = '#' Rune '\' DottedString - -HashDatum = '#' '\' BareString / CladDatum - -HashDotDatum = '#' '\' DottedString - - -; Unicode escapes must not represent surrogate code points. -; This is difficult to express in ABNF. But we do at least -; disallow code points greater than \u10FFFF which are also -; invalid, since U+10FFFF is the highest allowed. -StringEsc = '\' / '|' / '"' / *( HTAB / SP ) LF *( HTAB / SP ) - / 'a' / 'b' / 't' / 'n' / 'v' / 'f' / 'r' / 'e' - / 'x' 1*( 2HEXDIG ) ';' - / 'u' ['0'] 1*5HEXDIG ';' - / 'u' '1' '0' 4HEXDIG ';' - - -Rune = ALPHA *5( ALPHA / DIGIT ) - -Label = 1*12( HEXDIG ) diff --git a/spec/syntax.md b/spec/syntax.md deleted file mode 100644 index d1a17ad..0000000 --- a/spec/syntax.md +++ /dev/null @@ -1,117 +0,0 @@ -# Zisp S-Expression Syntax - -We use a BNF-like grammar notation with the following rules: - -* Concatenation of expressions is implicit: `foo bar` means `foo` - followed by `bar`. - -* The suffixes `?`, `*`, and `+` have the same meaning as in regular - expressions, although `[foo]` is used in place of `(foo)?`. - -* The syntax is defined in terms of bytes, not characters. Terminals - `'c'` and `"c"` refer to the ASCII value of the given character `c`. - Numbers are in decimal and refer to a byte with the given value. - -* The prefix `~` means NOT. It only applies to rules that match one - byte, and negates them. For example, `~( 'a' | 'b' )` matches any - byte other than 97 and 98. - -* Ranges of terminal values are expressed as `x...y` (inclusive). - -* ABNF "core rules" like `ALPHA` and `HEXDIG` are supported. 
- -* There is no ambiguity, or look-ahead / backtracking beyond one byte. - Rules match left to right, depth-first, and greedy. As soon as the - input matches the first terminal of a rule (explicit or implied by - recursively descending into the first non-terminal), it must match - that rule to the end, or it is considered a syntax error. - -The last rule means that the notation is simple to translate to code. -It ostensibly makes the notation equivalent to PEG in expression. - -The parser consumes one `Unit` from an input stream every time it's -called; it returns the `Datum` therein, or EOF. The final optional -`Blank` represents the fact that the parser will consume one more -blank at the end if it finds one; this is because `Datum` is not -self-closing so the parser has to check if it goes on. - -The following limits are not represented in the grammar: - -* A `UnicodeSV` is the hexadecimal representation of a Unicode scalar - value; it must represent a value in the range 0 to D7FF, or E000 to - 10FFFF, inclusive. Any other value signals an error. Valid values - are converted into a UTF-8 byte sequence encoding the value. - -* A `Rune` longer than 6 bytes is grammatical, but signals an error. - This is important because runes are not self-terminating; defining - their grammar as ending after a maximum of 6 bytes would allow - another datum beginning with an alphabetic character to follow a - rune immediately without any visual delineation, which would be - terribly confusing for a human reader. Consider: `#foo123bar`. - This would parse as a concatenation of `#foo123` and `bar`. - -* A `Label` is the hexadecimal representation of a 48-bit integer, - meaning it allows for a maximum of 12 hexadecimal digits. Longer - values are grammatical, but signal an out-of-range error. - -``` -Unit : Blank* [ Datum [Blank] ] - - -Blank : 9...13 | SP | Comment - -Datum : OneDatum ( [JoinChar] OneDatum )* - -JoinChar : '.' 
| ':' - - -Comment : ';' ( SkipUnit | SkipLine ) - -SkipUnit : '~' Unit - -SkipLine : ( ~LF )* [LF] - - -OneDatum : BareString | CladDatum - - -BareString : ( '.' | '+' | '-' | DIGIT ) ( BareChar | '.' )* - | BareChar+ - -CladDatum : PipeStr | QuoteStr | HashExpr | QuoteExpr | List - -PipeStr : '|' ( PipeStrChar | '\' StringEsc )* '|' -QuoteStr : '"' ( QuotStrChar | '\' StringEsc )* '"' -HashExpr : '#' ( RuneExpr | LabelExpr | HashDatum ) -QuoteExpr : "'" Datum | '`' Datum | ',' Datum -List : ParenList | SquareList | BraceList - -BareChar : ALPHA | DIGIT - | '!' | '$' | '%' | '*' | '+' - | '-' | '/' | '<' | '=' | '>' - | '?' | '@' | '^' | '_' | '~' - -PipeStrChar : ~( '|' | '\' ) -QuotStrChar : ~( '"' | '\' ) - -StringEsc : '\' | '|' | '"' | ( HTAB | SP )* LF ( HTAB | SP )* - | 'a' | 'b' | 't' | 'n' | 'v' | 'f' | 'r' | 'e' - | 'x' HexByte+ ';' - | 'u' UnicodeSV ';' - -HexByte : HEXDIG HEXDIG -UnicodeSV : HEXDIG+ - -RuneExpr : Rune [ '\' BareString | CladDatum ] -LabelExpr : '%' Label ( '%' | '=' Datum ) -HashDatum : '\' BareString | CladDatum - -Rune : ALPHA ( ALPHA | DIGIT )* -Label : HEXDIG+ - -ParenList : '(' ListBody ')' -SquareList : '[' ListBody ']' -BraceList : '{' ListBody '}' - -ListBody : Unit* [ Blank* '&' Unit ] Blank* -``` |
