From 2d72a1aa64a66c486a2329999123c14afcddeb32 Mon Sep 17 00:00:00 2001
From: Taylan Kammer <taylan.kammer@gmail.com>
Date: Fri, 9 Jan 2026 18:09:59 +0100
Subject: More grammar fuckery.  BNF is horrible!

---
 spec/syntax.abnf | 65 ++++++++++++++++++++++++++++-----------------
 spec/syntax.md   | 81 +++++++++++++++++++++++++++++++++++---------------------
 spec/syntax.peg  | 63 +++++++++++++++++++++++++++++++++++++++++++
 spec/syntax.zbnf | 59 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 214 insertions(+), 54 deletions(-)
 create mode 100644 spec/syntax.peg
 create mode 100644 spec/syntax.zbnf

(limited to 'spec')

diff --git a/spec/syntax.abnf b/spec/syntax.abnf
index a083eda..132deeb 100644
--- a/spec/syntax.abnf
+++ b/spec/syntax.abnf
@@ -6,42 +6,52 @@ File          = [Unit] *( Blank Unit ) *Blank [Trail]
 
 Unit          = *Blank Datum
 
-Blank         = HTAB / LF / %x0b / %x0c / CR / Comment
+Blank         = HTAB / LF / %x0b / %x0c / CR / SP / Comment
 
 Trail         = SkipLine / SkipUnit
 
 
+Datum         = BareString
+              / DottedString
+              / CladDatum
+              / HashExpr
+              / HashDotExpr
+              / QuoteExpr
+              / JoinExpr
+
 Comment       = SkipLine LF / SkipUnit Blank
 
 SkipLine      = ';' [ SkipLStart *AnyButLF ]
 
-SkipUnit      = ';' '~' Unit
-
-
 SkipLStart    = %x00-09 / %x0b-7d / %x7f-ff
               ; any but LF or '~'
 
 AnyButLF      = %x00-09 / %x0b-ff
 
-
-Datum         = SingleDatum
-              / JoinedDatum *( [ '.' / ':' ] JoinedDatum )
-
-
-SingleDatum   = BareString / CladDatum / DottedString
-
-JoinedDatum   = BareString / CladDatum
+SkipUnit      = ';' '~' Unit
 
 
 BareString    = BareChar *( BareChar / Numeric )
 
+DottedString  = ( '.' / Numeric ) *( '.' / Numeric / BareChar )
+
 CladDatum     = '|' *( PipeStrChar / '\' StringEsc ) '|'
               / '"' *( QuotStrChar / '\' StringEsc ) '"'
-              / '#' HashExpr
               / '(' List ')' / '[' List ']' / '{' List '}'
-              / "'" Datum / '`' Datum / ',' Datum
 
-DottedString  = ( '.' / Numeric ) *( '.' / Numeric / BareChar )
+HashExpr      = LabelExpr / RuneExpr / HashDatum
+
+HashDotExpr   = RuneDotExpr / HashDotDatum
+
+QuoteExpr     = "'" Datum / '`' Datum / ',' Datum
+
+JoinExpr      = Datum LeftCladDatum
+              / Datum ':' Datum
+              / DotlessDatum '.' Datum
+
+LeftCladDatum = CladDatum / HashExpr / QuoteExpr
+
+DotlessDatum  = BareString / CladDatum / RuneExpr / HashDatum
 
 
 BareChar      = '!' / '$' / '%' / '*' / '/' / '<' / '=' / '>'
@@ -49,29 +59,36 @@ BareChar      = '!' / '$' / '%' / '*' / '/' / '<' / '=' / '>'
 
 Numeric       = '+' / '-' / DIGIT
 
-
 PipeStrChar   = %x00-5b / %x5d-7b / %x7d-ff
               ; any but '|' or '\'
 
 QuotStrChar   = %x00-21 / %x23-5b / %x5d-ff
               ; any but '"' or '\'
 
-HashExpr      = Rune [ '\' BareString / CladDatum ]
-              / '\' BareString
-              / '%' Label ( '%' / '=' Datum )
-              / CladDatum
-
 List          = [Unit] *( Blank Unit ) *Blank [Tail] [SkipUnit]
 
 Tail          = '&' Unit *Blank
 
+LabelExpr     = '#' '%' Label ( '%' / '=' Datum )
+
+RuneExpr      = '#' Rune [ '\' BareString / CladDatum ]
+
+RuneDotExpr   = '#' Rune '\' DottedString
+
+HashDatum     = '#' ( '\' BareString / CladDatum )
+
+HashDotDatum  = '#' '\' DottedString
+
 
+; Unicode escapes must not represent surrogate code points.
+; This is difficult to express in ABNF.  But we do at least
+; disallow code points greater than \u10FFFF which are also
+; invalid, since U+10FFFF is the highest allowed.
 StringEsc     = '\' / '|' / '"' / *( HTAB / SP ) LF *( HTAB / SP )
               / 'a' / 'b' / 't' / 'n' / 'v' / 'f' / 'r' / 'e'
               / 'x' 1*( 2HEXDIG ) ';'
-              / 'u' 1*5HEXDIG ';'
-              / 'u' '0' 1*5HEXDIG ';'
-              / 'u' '1' '0' 1*4HEXDIG ';'
+              / 'u' ['0'] 1*5HEXDIG ';'
+              / 'u' '1' '0' 4HEXDIG ';'
 
 
 Rune          = ALPHA *5( ALPHA / DIGIT )
diff --git a/spec/syntax.md b/spec/syntax.md
index 7f3561c..d1a17ad 100644
--- a/spec/syntax.md
+++ b/spec/syntax.md
@@ -1,20 +1,18 @@
 # Zisp S-Expression Syntax
 
-We use a BNF notation with the following rules:
+We use a BNF-like grammar notation with the following rules:
 
 * Concatenation of expressions is implicit: `foo bar` means `foo`
   followed by `bar`.
 
-* Expressions may be followed by `?`, `*`, `+`, `{N}`, or `{N,M}`,
-  which have the same meanings as in regular expressions.
-
-* The syntax `[foo]` is shorthand for `(foo)?`.
+* The suffixes `?`, `*`, and `+` have the same meaning as in regular
+  expressions, although `[foo]` is used in place of `(foo)?`.
 
 * The syntax is defined in terms of bytes, not characters.  Terminals
   `'c'` and `"c"` refer to the ASCII value of the given character `c`.
   Numbers are in decimal and refer to a byte with the given value.
 
-* The `~` prefix means NOT.  It only applies to rules that match one
+* The prefix `~` means NOT.  It only applies to rules that match one
   byte, and negates them.  For example, `~( 'a' | 'b' )` matches any
   byte other than 97 and 98.
 
@@ -24,11 +22,12 @@ We use a BNF notation with the following rules:
 
 * There is no ambiguity, or look-ahead / backtracking beyond one byte.
   Rules match left to right, depth-first, and greedy.  As soon as the
-  input matches the first terminal of a rule, it must match that rule
-  to the end or it is considered a syntax error.
+  input matches the first terminal of a rule (explicit or implied by
+  recursively descending into the first non-terminal), it must match
+  that rule to the end, or it is considered a syntax error.
 
-The last rule means that the BNF is very simple to translate to code.
-It also probably makes it equivalent to PEG.
+The last rule means that the notation is simple to translate to code.
+It ostensibly makes the notation equivalent to PEG in expressive power.
 
 The parser consumes one `Unit` from an input stream every time it's
 called; it returns the `Datum` therein, or EOF.  The final optional
@@ -36,11 +35,30 @@ called; it returns the `Datum` therein, or EOF.  The final optional
 blank at the end if it finds one; this is because `Datum` is not
 self-closing so the parser has to check if it goes on.
 
+The following limits are not represented in the grammar:
+
+* A `UnicodeSV` is the hexadecimal representation of a Unicode scalar
+  value; it must represent a value in the range 0 to D7FF, or E000 to
+  10FFFF, inclusive.  Any other value signals an error.  Valid values
+  are converted into a UTF-8 byte sequence encoding the value.
+
+* A `Rune` longer than 6 bytes is grammatical, but signals an error.
+  This is important because runes are not self-terminating; defining
+  their grammar as ending after a maximum of 6 bytes would allow
+  another datum beginning with an alphabetic character to follow a
+  rune immediately without any visual delineation, which would be
+  terribly confusing for a human reader.  Consider: `#foo123bar`.
+  This would parse as a concatenation of `#foo123` and `bar`.
+
+* A `Label` is the hexadecimal representation of a 48-bit integer,
+  meaning it allows for a maximum of 12 hexadecimal digits.  Longer
+  values are grammatical, but signal an out-of-range error.
+
 ```
 Unit          : Blank* [ Datum [Blank] ]
 
 
-Blank         : 9...13 | Comment
+Blank         : 9...13 | SP | Comment
 
 Datum         : OneDatum ( [JoinChar] OneDatum )*
 
@@ -56,41 +74,44 @@ SkipLine      : ( ~LF )* [LF]
 
 OneDatum      : BareString | CladDatum
 
+
 BareString    : ( '.' | '+' | '-' | DIGIT ) ( BareChar | '.' )*
               | BareChar+
 
-CladDatum     : '|' ( PipeStrChar | '\' StringEsc )* '|'
-              | '"' ( QuotStrChar | '\' StringEsc )* '"'
-              | '#' HashExpr
-              | '(' List ')' | '[' List ']' | '{' List '}'
-              | "'" Datum | '`' Datum | ',' Datum
+CladDatum     : PipeStr | QuoteStr | HashExpr | QuoteExpr | List
 
+PipeStr       : '|' ( PipeStrChar | '\' StringEsc )* '|'
+QuoteStr      : '"' ( QuotStrChar | '\' StringEsc )* '"'
+HashExpr      : '#' ( RuneExpr | LabelExpr | HashDatum )
+QuoteExpr     : "'" Datum | '`' Datum | ',' Datum
+List          : ParenList | SquareList | BraceList
 
 BareChar      : ALPHA | DIGIT
               | '!' | '$' | '%' | '*' | '+'
               | '-' | '/' | '<' | '=' | '>'
               | '?' | '@' | '^' | '_' | '~'
 
-
 PipeStrChar   : ~( '|' | '\' )
-
 QuotStrChar   : ~( '"' | '\' )
 
-HashExpr      : Rune [ '\' BareString | CladDatum ]
-              | '\' BareString
-              | '%' Label ( '%' | '=' Datum )
-              | CladDatum
-
-List          : Unit* [ Blank* '&' Unit ] Blank*
-
-
 StringEsc     : '\' | '|' | '"' | ( HTAB | SP )* LF ( HTAB | SP )*
               | 'a' | 'b' | 't' | 'n' | 'v' | 'f' | 'r' | 'e'
-              | 'x' ( HEXDIG{2} )+ ';'
-              | 'u' HEXDIG{1,6} ';'
+              | 'x' HexByte+ ';'
+              | 'u' UnicodeSV ';'
+
+HexByte       : HEXDIG HEXDIG
+UnicodeSV     : HEXDIG+
+
+RuneExpr      : Rune [ '\' BareString | CladDatum ]
+LabelExpr     : '%' Label ( '%' | '=' Datum )
+HashDatum     : '\' BareString | CladDatum
 
+Rune          : ALPHA ( ALPHA | DIGIT )*
+Label         : HEXDIG+
 
-Rune          : ALPHA ( ALPHA | DIGIT ){0,5}
+ParenList     : '(' ListBody ')'
+SquareList    : '[' ListBody ']'
+BraceList     : '{' ListBody '}'
 
-Label         : HEXDIG{1,12}
+ListBody      : Unit* [ Blank* '&' Unit ] Blank*
 ```
diff --git a/spec/syntax.peg b/spec/syntax.peg
new file mode 100644
index 0000000..97b9632
--- /dev/null
+++ b/spec/syntax.peg
@@ -0,0 +1,63 @@
+Unit         <- Blank* ( Datum Blank? )?
+
+
+Blank        <- [\t-\r] / ' ' / Comment   # bytes 9-13, SP
+
+Datum        <- OneDatum ( JoinChar? OneDatum )*
+
+JoinChar     <- '.' / ':'
+
+
+Comment      <- ';' ( SkipUnit / SkipLine )
+
+SkipUnit     <- '~' Unit
+
+SkipLine     <- (!'\n' .)* '\n'?
+
+
+OneDatum     <- BareString / CladDatum
+
+
+BareString   <- ( '.' / '+' / '-' / DIGIT ) ( BareChar / '.' )*
+              / BareChar+
+
+CladDatum    <- PipeStr / QuoteStr / HashExpr / QuoteExpr / List
+
+PipeStr      <- '|' ( PipeStrChar / '\\' StringEsc )* '|'
+QuoteStr     <- '"' ( QuotStrChar / '\\' StringEsc )* '"'
+HashExpr     <- '#' ( RuneExpr / LabelExpr / HashDatum )
+QuoteExpr    <- "'" Datum / '`' Datum / ',' Datum
+List         <- ParenList / SquareList / BraceList
+
+BareChar     <- ALPHA / DIGIT
+              / '!' / '$' / '%' / '*' / '+'
+              / '-' / '/' / '<' / '=' / '>'
+              / '?' / '@' / '^' / '_' / '~'
+
+PipeStrChar  <- (![|\\] .)
+QuotStrChar  <- (!["\\] .)
+
+StringEsc    <- '\\' / '|' / '"' / [ \t]* '\n' [ \t]*
+              / 'a' / 'b' / 't' / 'n' / 'v' / 'f' / 'r' / 'e'
+              / 'x' HexByte+ ';'
+              / 'u' UnicodeSV ';'
+
+HexByte      <- HEXDIG HEXDIG
+UnicodeSV    <- HEXDIG+
+
+RuneExpr     <- Rune ( '\\' BareString / CladDatum )?
+LabelExpr    <- '%' Label ( '%' / '=' Datum )
+HashDatum    <- '\\' BareString / CladDatum
+
+Rune         <- ALPHA ( ALPHA / DIGIT )*
+Label        <- HEXDIG+
+
+ParenList    <- '(' ListBody ')'
+SquareList   <- '[' ListBody ']'
+BraceList    <- '{' ListBody '}'
+
+ListBody     <- Unit* ( Blank* '&' Unit )? Blank*
+
+DIGIT        <- [0-9]
+ALPHA        <- [a-zA-Z]
+HEXDIG       <- [0-9a-fA-F]
diff --git a/spec/syntax.zbnf b/spec/syntax.zbnf
new file mode 100644
index 0000000..b87efb5
--- /dev/null
+++ b/spec/syntax.zbnf
@@ -0,0 +1,59 @@
+Unit          : Blank* [ Datum [Blank] ]
+
+
+Blank         : 9...13 | SP | Comment
+
+Datum         : OneDatum ( [JoinChar] OneDatum )*
+
+JoinChar      : '.' | ':'
+
+
+Comment       : ';' ( SkipUnit | SkipLine )
+
+SkipUnit      : '~' Unit
+
+SkipLine      : ( ~LF )* [LF]
+
+
+OneDatum      : BareString | CladDatum
+
+
+BareString    : ( '.' | '+' | '-' | DIGIT ) ( BareChar | '.' )*
+              | BareChar+
+
+CladDatum     : PipeStr | QuoteStr | HashExpr | QuoteExpr | List
+
+PipeStr       : '|' ( PipeStrChar | '\' StringEsc )* '|'
+QuoteStr      : '"' ( QuotStrChar | '\' StringEsc )* '"'
+HashExpr      : '#' ( RuneExpr | LabelExpr | HashDatum )
+QuoteExpr     : "'" Datum | '`' Datum | ',' Datum
+List          : ParenList | SquareList | BraceList
+
+BareChar      : ALPHA | DIGIT
+              | '!' | '$' | '%' | '*' | '+'
+              | '-' | '/' | '<' | '=' | '>'
+              | '?' | '@' | '^' | '_' | '~'
+
+PipeStrChar   : ~( '|' | '\' )
+QuotStrChar   : ~( '"' | '\' )
+
+StringEsc     : '\' | '|' | '"' | ( HTAB | SP )* LF ( HTAB | SP )*
+              | 'a' | 'b' | 't' | 'n' | 'v' | 'f' | 'r' | 'e'
+              | 'x' HexByte+ ';'
+              | 'u' UnicodeSV ';'
+
+HexByte       : HEXDIG HEXDIG
+UnicodeSV     : HEXDIG+
+
+RuneExpr      : Rune [ '\' BareString | CladDatum ]
+LabelExpr     : '%' Label ( '%' | '=' Datum )
+HashDatum     : '\' BareString | CladDatum
+
+Rune          : ALPHA ( ALPHA | DIGIT )*
+Label         : HEXDIG+
+
+ParenList     : '(' ListBody ')'
+SquareList    : '[' ListBody ']'
+BraceList     : '{' ListBody '}'
+
+ListBody      : Unit* [ Blank* '&' Unit ] Blank*
-- 
cgit v1.2.3