summaryrefslogtreecommitdiff
path: root/spec
diff options
context:
space:
mode:
Diffstat (limited to 'spec')
-rw-r--r--spec/syntax.abnf65
-rw-r--r--spec/syntax.md81
-rw-r--r--spec/syntax.peg63
-rw-r--r--spec/syntax.zbnf59
4 files changed, 214 insertions, 54 deletions
diff --git a/spec/syntax.abnf b/spec/syntax.abnf
index a083eda..132deeb 100644
--- a/spec/syntax.abnf
+++ b/spec/syntax.abnf
@@ -6,42 +6,52 @@ File = [Unit] *( Blank Unit ) *Blank [Trail]
Unit = *Blank Datum
-Blank = HTAB / LF / %x0b / %x0c / CR / Comment
+Blank = HTAB / LF / %x0b / %x0c / CR / SP / Comment
Trail = SkipLine / SkipUnit
+Datum = BareString
+ / DottedString
+ / CladDatum
+ / HashExpr
+ / HashDotExpr
+ / QuoteExpr
+ / JoinExpr
+
Comment = SkipLine LF / SkipUnit Blank
SkipLine = ';' [ SkipLStart *AnyButLF ]
-SkipUnit = ';' '~' Unit
-
-
SkipLStart = %x00-09 / %x0b-7d / %x7f-ff
; any but LF or '~'
AnyButLF = %x00-09 / %x0b-ff
-
-Datum = SingleDatum
- / JoinedDatum *( [ '.' / ':' ] JoinedDatum )
-
-
-SingleDatum = BareString / CladDatum / DottedString
-
-JoinedDatum = BareString / CladDatum
+SkipUnit = ';' '~' Unit
BareString = BareChar *( BareChar / Numeric )
+DottedString = ( '.' / Numeric ) *( '.' / Numeric / BareChar )
+
CladDatum = '|' *( PipeStrChar / '\' StringEsc ) '|'
/ '"' *( QuotStrChar / '\' StringEsc ) '"'
- / '#' HashExpr
/ '(' List ')' / '[' List ']' / '{' List '}'
- / "'" Datum / '`' Datum / ',' Datum
-DottedString = ( '.' / Numeric ) *( '.' / Numeric / BareChar )
+HashExpr = LabelExpr / RuneExpr / HashDatum
+
+HashDotExpr = RuneDotExpr / HashDotDatum
+
+QuoteExpr = "'" Datum / '`' Datum / ',' Datum
+
+JoinExpr = Datum LeftCladDatum
+ / Datum ':' Datum
+ / DotlessDatum '.' Datum
+
+LeftCladDatum = CladDatum / HashExpr / QuoteExpr
+
+DotlessDatum = BareString / CladDatum / RuneExpr / HashDatum
BareChar = '!' / '$' / '%' / '*' / '/' / '<' / '=' / '>'
@@ -49,29 +59,36 @@ BareChar = '!' / '$' / '%' / '*' / '/' / '<' / '=' / '>'
Numeric = '+' / '-' / DIGIT
-
PipeStrChar = %x00-5b / %x5d-7b / %x7d-ff
; any but '|' or '\'
QuotStrChar = %x00-21 / %x23-5b / %x5d-ff
; any but '"' or '\'
-HashExpr = Rune [ '\' BareString / CladDatum ]
- / '\' BareString
- / '%' Label ( '%' / '=' Datum )
- / CladDatum
-
List = [Unit] *( Blank Unit ) *Blank [Tail] [SkipUnit]
Tail = '&' Unit *Blank
+LabelExpr = '#' '%' Label ( '%' / '=' Datum )
+
+RuneExpr = '#' Rune [ '\' BareString / CladDatum ]
+
+RuneDotExpr = '#' Rune '\' DottedString
+
+HashDatum = '#' ( '\' BareString / CladDatum )
+
+HashDotDatum = '#' '\' DottedString
+
+; Unicode escapes must not represent surrogate code points.
+; This is difficult to express in ABNF. But we do at least
+; disallow code points greater than \u10FFFF which are also
+; invalid, since U+10FFFF is the highest allowed.
StringEsc = '\' / '|' / '"' / *( HTAB / SP ) LF *( HTAB / SP )
/ 'a' / 'b' / 't' / 'n' / 'v' / 'f' / 'r' / 'e'
/ 'x' 1*( 2HEXDIG ) ';'
- / 'u' 1*5HEXDIG ';'
- / 'u' '0' 1*5HEXDIG ';'
- / 'u' '1' '0' 1*4HEXDIG ';'
+ / 'u' ['0'] 1*5HEXDIG ';'
+ / 'u' '1' '0' 4HEXDIG ';'
Rune = ALPHA *5( ALPHA / DIGIT )
diff --git a/spec/syntax.md b/spec/syntax.md
index 7f3561c..d1a17ad 100644
--- a/spec/syntax.md
+++ b/spec/syntax.md
@@ -1,20 +1,18 @@
# Zisp S-Expression Syntax
-We use a BNF notation with the following rules:
+We use a BNF-like grammar notation with the following rules:
* Concatenation of expressions is implicit: `foo bar` means `foo`
followed by `bar`.
-* Expressions may be followed by `?`, `*`, `+`, `{N}`, or `{N,M}`,
- which have the same meanings as in regular expressions.
-
-* The syntax `[foo]` is shorthand for `(foo)?`.
+* The suffixes `?`, `*`, and `+` have the same meaning as in regular
+ expressions, although `[foo]` is used in place of `(foo)?`.
* The syntax is defined in terms of bytes, not characters. Terminals
`'c'` and `"c"` refer to the ASCII value of the given character `c`.
Numbers are in decimal and refer to a byte with the given value.
-* The `~` prefix means NOT. It only applies to rules that match one
+* The prefix `~` means NOT. It only applies to rules that match one
byte, and negates them. For example, `~( 'a' | 'b' )` matches any
byte other than 97 and 98.
@@ -24,11 +22,12 @@ We use a BNF notation with the following rules:
* There is no ambiguity, or look-ahead / backtracking beyond one byte.
Rules match left to right, depth-first, and greedy. As soon as the
- input matches the first terminal of a rule, it must match that rule
- to the end or it is considered a syntax error.
+ input matches the first terminal of a rule (explicit or implied by
+ recursively descending into the first non-terminal), it must match
+ that rule to the end, or it is considered a syntax error.
-The last rule means that the BNF is very simple to translate to code.
-It also probably makes it equivalent to PEG.
+The last rule means that the notation is simple to translate to code.
+It ostensibly makes the notation equivalent to PEG in expressive power.
The parser consumes one `Unit` from an input stream every time it's
called; it returns the `Datum` therein, or EOF. The final optional
@@ -36,11 +35,30 @@ called; it returns the `Datum` therein, or EOF. The final optional
blank at the end if it finds one; this is because `Datum` is not
self-closing so the parser has to check if it goes on.
+The following limits are not represented in the grammar:
+
+* A `UnicodeSV` is the hexadecimal representation of a Unicode scalar
+ value; it must represent a value in the range 0 to D7FF, or E000 to
+ 10FFFF, inclusive. Any other value signals an error. Valid values
+ are converted into a UTF-8 byte sequence encoding the value.
+
+* A `Rune` longer than 6 bytes is grammatical, but signals an error.
+ This is important because runes are not self-terminating; defining
+ their grammar as ending after a maximum of 6 bytes would allow
+ another datum beginning with an alphabetic character to follow a
+ rune immediately without any visual delineation, which would be
+ terribly confusing for a human reader. Consider: `#foo123bar`.
+ This would parse as a concatenation of `#foo123` and `bar`.
+
+* A `Label` is the hexadecimal representation of a 48-bit integer,
+ meaning it allows for a maximum of 12 hexadecimal digits. Longer
+ values are grammatical, but signal an out-of-range error.
+
```
Unit : Blank* [ Datum [Blank] ]
-Blank : 9...13 | Comment
+Blank : 9...13 | SP | Comment
Datum : OneDatum ( [JoinChar] OneDatum )*
@@ -56,41 +74,44 @@ SkipLine : ( ~LF )* [LF]
OneDatum : BareString | CladDatum
+
BareString : ( '.' | '+' | '-' | DIGIT ) ( BareChar | '.' )*
| BareChar+
-CladDatum : '|' ( PipeStrChar | '\' StringEsc )* '|'
- | '"' ( QuotStrChar | '\' StringEsc )* '"'
- | '#' HashExpr
- | '(' List ')' | '[' List ']' | '{' List '}'
- | "'" Datum | '`' Datum | ',' Datum
+CladDatum : PipeStr | QuoteStr | HashExpr | QuoteExpr | List
+PipeStr : '|' ( PipeStrChar | '\' StringEsc )* '|'
+QuoteStr : '"' ( QuotStrChar | '\' StringEsc )* '"'
+HashExpr : '#' ( RuneExpr | LabelExpr | HashDatum )
+QuoteExpr : "'" Datum | '`' Datum | ',' Datum
+List : ParenList | SquareList | BraceList
BareChar : ALPHA | DIGIT
| '!' | '$' | '%' | '*' | '+'
| '-' | '/' | '<' | '=' | '>'
| '?' | '@' | '^' | '_' | '~'
-
PipeStrChar : ~( '|' | '\' )
-
QuotStrChar : ~( '"' | '\' )
-HashExpr : Rune [ '\' BareString | CladDatum ]
- | '\' BareString
- | '%' Label ( '%' | '=' Datum )
- | CladDatum
-
-List : Unit* [ Blank* '&' Unit ] Blank*
-
-
StringEsc : '\' | '|' | '"' | ( HTAB | SP )* LF ( HTAB | SP )*
| 'a' | 'b' | 't' | 'n' | 'v' | 'f' | 'r' | 'e'
- | 'x' ( HEXDIG{2} )+ ';'
- | 'u' HEXDIG{1,6} ';'
+ | 'x' HexByte+ ';'
+ | 'u' UnicodeSV ';'
+
+HexByte : HEXDIG HEXDIG
+UnicodeSV : HEXDIG+
+
+RuneExpr : Rune [ '\' BareString | CladDatum ]
+LabelExpr : '%' Label ( '%' | '=' Datum )
+HashDatum : '\' BareString | CladDatum
+Rune : ALPHA ( ALPHA | DIGIT )*
+Label : HEXDIG+
-Rune : ALPHA ( ALPHA | DIGIT ){0,5}
+ParenList : '(' ListBody ')'
+SquareList : '[' ListBody ']'
+BraceList : '{' ListBody '}'
-Label : HEXDIG{1,12}
+ListBody : Unit* [ Blank* '&' Unit ] Blank*
```
diff --git a/spec/syntax.peg b/spec/syntax.peg
new file mode 100644
index 0000000..97b9632
--- /dev/null
+++ b/spec/syntax.peg
@@ -0,0 +1,63 @@
+Unit <- Blank* ( Datum Blank? )?
+
+
+Blank <- [\t-\r] / ' ' / Comment
+
+Datum <- OneDatum ( JoinChar? OneDatum )*
+
+JoinChar <- '.' / ':'
+
+
+Comment <- ';' ( SkipUnit / SkipLine )
+
+SkipUnit <- '~' Unit
+
+SkipLine <- (!'\n' .)* '\n'?
+
+
+OneDatum <- BareString / CladDatum
+
+
+BareString <- ( '.' / '+' / '-' / DIGIT ) ( BareChar / '.' )*
+ / BareChar+
+
+CladDatum <- PipeStr / QuoteStr / HashExpr / QuoteExpr / List
+
+PipeStr <- '|' ( PipeStrChar / '\\' StringEsc )* '|'
+QuoteStr <- '"' ( QuotStrChar / '\\' StringEsc )* '"'
+HashExpr <- '#' ( RuneExpr / LabelExpr / HashDatum )
+QuoteExpr <- "'" Datum / '`' Datum / ',' Datum
+List <- ParenList / SquareList / BraceList
+
+BareChar <- ALPHA / DIGIT
+ / '!' / '$' / '%' / '*' / '+'
+ / '-' / '/' / '<' / '=' / '>'
+ / '?' / '@' / '^' / '_' / '~'
+
+PipeStrChar <- (![|\\] .)
+QuotStrChar <- (!["\\] .)
+
+StringEsc <- '\\' / '|' / '"' / ( '\t' / ' ' )* '\n' ( '\t' / ' ' )*
+ / 'a' / 'b' / 't' / 'n' / 'v' / 'f' / 'r' / 'e'
+ / 'x' HexByte+ ';'
+ / 'u' UnicodeSV ';'
+
+HexByte <- HEXDIG HEXDIG
+UnicodeSV <- HEXDIG+
+
+RuneExpr <- Rune ( '\\' BareString / CladDatum )?
+LabelExpr <- '%' Label ( '%' / '=' Datum )
+HashDatum <- '\\' BareString / CladDatum
+
+Rune <- ALPHA ( ALPHA / DIGIT )*
+Label <- HEXDIG+
+
+ParenList <- '(' ListBody ')'
+SquareList <- '[' ListBody ']'
+BraceList <- '{' ListBody '}'
+
+ListBody <- Unit* ( Blank* '&' Unit )? Blank*
+
+DIGIT <- [0-9]
+ALPHA <- [a-zA-Z]
+HEXDIG <- [0-9a-fA-F]
diff --git a/spec/syntax.zbnf b/spec/syntax.zbnf
new file mode 100644
index 0000000..b87efb5
--- /dev/null
+++ b/spec/syntax.zbnf
@@ -0,0 +1,59 @@
+Unit : Blank* [ Datum [Blank] ]
+
+
+Blank : 9...13 | SP | Comment
+
+Datum : OneDatum ( [JoinChar] OneDatum )*
+
+JoinChar : '.' | ':'
+
+
+Comment : ';' ( SkipUnit | SkipLine )
+
+SkipUnit : '~' Unit
+
+SkipLine : ( ~LF )* [LF]
+
+
+OneDatum : BareString | CladDatum
+
+
+BareString : ( '.' | '+' | '-' | DIGIT ) ( BareChar | '.' )*
+ | BareChar+
+
+CladDatum : PipeStr | QuoteStr | HashExpr | QuoteExpr | List
+
+PipeStr : '|' ( PipeStrChar | '\' StringEsc )* '|'
+QuoteStr : '"' ( QuotStrChar | '\' StringEsc )* '"'
+HashExpr : '#' ( RuneExpr | LabelExpr | HashDatum )
+QuoteExpr : "'" Datum | '`' Datum | ',' Datum
+List : ParenList | SquareList | BraceList
+
+BareChar : ALPHA | DIGIT
+ | '!' | '$' | '%' | '*' | '+'
+ | '-' | '/' | '<' | '=' | '>'
+ | '?' | '@' | '^' | '_' | '~'
+
+PipeStrChar : ~( '|' | '\' )
+QuotStrChar : ~( '"' | '\' )
+
+StringEsc : '\' | '|' | '"' | ( HTAB | SP )* LF ( HTAB | SP )*
+ | 'a' | 'b' | 't' | 'n' | 'v' | 'f' | 'r' | 'e'
+ | 'x' HexByte+ ';'
+ | 'u' UnicodeSV ';'
+
+HexByte : HEXDIG HEXDIG
+UnicodeSV : HEXDIG+
+
+RuneExpr : Rune [ '\' BareString | CladDatum ]
+LabelExpr : '%' Label ( '%' | '=' Datum )
+HashDatum : '\' BareString | CladDatum
+
+Rune : ALPHA ( ALPHA | DIGIT )*
+Label : HEXDIG+
+
+ParenList : '(' ListBody ')'
+SquareList : '[' ListBody ']'
+BraceList : '{' ListBody '}'
+
+ListBody : Unit* [ Blank* '&' Unit ] Blank*