More grammar fuckery. BNF is horrible!

author: Taylan Kammer <taylan.kammer@gmail.com> 2026-01-09 18:09:59 +0100
committer: Taylan Kammer <taylan.kammer@gmail.com> 2026-01-09 18:09:59 +0100
commit: 2d72a1aa64a66c486a2329999123c14afcddeb32 (patch)
tree: 4eba98eb1240d3d445e2d35c61bad63d352e413b
parent: a2ece405cc61341122fc075d499420e894c56909 (diff)
6 files changed, 239 insertions, 77 deletions
diff --git a/spec/syntax.abnf b/spec/syntax.abnf
index a083eda..132deeb 100644
--- a/spec/syntax.abnf
+++ b/spec/syntax.abnf
@@ -6,42 +6,52 @@ File          = [Unit] *( Blank Unit ) *Blank [Trail]
 
 Unit          = *Blank Datum
 
-Blank         = HTAB / LF / %x0b / %x0c / CR / Comment
+Blank         = HTAB / LF / %x0b / %x0c / CR / SP / Comment
 
 Trail         = SkipLine / SkipUnit
 
 
+Datum         = BareString
+              / DottedString
+              / CladDatum
+              / HashExpr
+              / HashDotExpr
+              / QuoteExpr
+              / JoinExpr
+
 Comment       = SkipLine LF / SkipUnit Blank
 
 SkipLine      = ';' [ SkipLStart *AnyButLF ]
 
-SkipUnit      = ';' '~' Unit
-
-
 SkipLStart    = %x00-09 / %x0b-7d / %x7f-ff
               ; any but LF or '~'
 
 AnyButLF      = %x00-09 / %x0b-ff
 
-
-Datum         = SingleDatum
-              / JoinedDatum *( [ '.' / ':' ] JoinedDatum )
-
-
-SingleDatum   = BareString / CladDatum / DottedString
-
-JoinedDatum   = BareString / CladDatum
+SkipUnit      = ';' '~' Unit
 
 
 BareString    = BareChar *( BareChar / Numeric )
 
+DottedString  = ( '.' / Numeric ) *( '.' / Numeric / BareChar )
+
 CladDatum     = '|' *( PipeStrChar / '\' StringEsc ) '|'
               / '"' *( QuotStrChar / '\' StringEsc ) '"'
-              / '#' HashExpr
               / '(' List ')' / '[' List ']' / '{' List '}'
-              / "'" Datum / '`' Datum / ',' Datum
 
-DottedString  = ( '.' / Numeric ) *( '.' / Numeric / BareChar )
+HashExpr      = LabelExpr / RuneExpr / HashDatum
+
+HashDotExpr   = RuneDotExpr / HashDotDatum
+
+QuoteExpr     = "'" Datum / '`' Datum / ',' Datum
+
+JoinExpr      = Datum LeftCladDatum
+              / Datum ':' Datum
+              / DotlessDatum '.' Datum
+
+LeftCladDatum = CladDatum / HashExpr / QuoteExpr
+
+DotlessDatum  = BareString / CladDatum / RuneExpr / HashDatum
 
 
 BareChar      = '!' / '$' / '%' / '*' / '/' / '<' / '=' / '>'
@@ -49,29 +59,36 @@ BareChar      = '!' / '$' / '%' / '*' / '/' / '<' / '=' / '>'
 
 Numeric       = '+' / '-' / DIGIT
 
-
 PipeStrChar   = %x00-5b / %x5d-7b / %x7d-ff
               ; any but '|' or '\'
 
 QuotStrChar   = %x00-21 / %x23-5b / %x5d-ff
               ; any but '"' or '\'
 
-HashExpr      = Rune [ '\' BareString / CladDatum ]
-              / '\' BareString
-              / '%' Label ( '%' / '=' Datum )
-              / CladDatum
-
 List          = [Unit] *( Blank Unit ) *Blank [Tail] [SkipUnit]
 
 Tail          = '&' Unit *Blank
 
+LabelExpr     = '#' '%' Label ( '%' / '=' Datum )
+
+RuneExpr      = '#' Rune [ '\' BareString / CladDatum ]
+
+RuneDotExpr   = '#' Rune '\' DottedString
+
+HashDatum     = '#' ( '\' BareString / CladDatum )
+
+HashDotDatum  = '#' '\' DottedString
+
 
+; Unicode escapes must not represent surrogate code points.
+; This is difficult to express in ABNF.  But we do at least
+; disallow code points greater than \u10FFFF which are also
+; invalid, since U+10FFFF is the highest allowed.
 StringEsc     = '\' / '|' / '"' / *( HTAB / SP ) LF *( HTAB / SP )
               / 'a' / 'b' / 't' / 'n' / 'v' / 'f' / 'r' / 'e'
               / 'x' 1*( 2HEXDIG ) ';'
-              / 'u' 1*5HEXDIG ';'
-              / 'u' '0' 1*5HEXDIG ';'
-              / 'u' '1' '0' 1*4HEXDIG ';'
+              / 'u' ['0'] 1*5HEXDIG ';'
+              / 'u' '1' '0' 4HEXDIG ';'
 
 
 Rune          = ALPHA *5( ALPHA / DIGIT )
diff --git a/spec/syntax.md b/spec/syntax.md
index 7f3561c..d1a17ad 100644
--- a/spec/syntax.md
+++ b/spec/syntax.md
@@ -1,20 +1,18 @@
 # Zisp S-Expression Syntax
 
-We use a BNF notation with the following rules:
+We use a BNF-like grammar notation with the following rules:
 
 * Concatenation of expressions is implicit: `foo bar` means `foo`
   followed by `bar`.
 
-* Expressions may be followed by `?`, `*`, `+`, `{N}`, or `{N,M}`,
-  which have the same meanings as in regular expressions.
-
-* The syntax `[foo]` is shorthand for `(foo)?`.
+* The suffixes `?`, `*`, and `+` have the same meaning as in regular
+  expressions, although `[foo]` is used in place of `(foo)?`.
 
 * The syntax is defined in terms of bytes, not characters.  Terminals
   `'c'` and `"c"` refer to the ASCII value of the given character `c`.
   Numbers are in decimal and refer to a byte with the given value.
 
-* The `~` prefix means NOT.  It only applies to rules that match one
+* The prefix `~` means NOT.  It only applies to rules that match one
   byte, and negates them.  For example, `~( 'a' | 'b' )` matches any
   byte other than 97 and 98.
 
@@ -24,11 +22,12 @@ We use a BNF notation with the following rules:
 
 * There is no ambiguity, or look-ahead / backtracking beyond one byte.
   Rules match left to right, depth-first, and greedy.  As soon as the
-  input matches the first terminal of a rule, it must match that rule
-  to the end or it is considered a syntax error.
+  input matches the first terminal of a rule (explicit or implied by
+  recursively descending into the first non-terminal), it must match
+  that rule to the end, or it is considered a syntax error.
 
-The last rule means that the BNF is very simple to translate to code.
-It also probably makes it equivalent to PEG.
+The last rule means that the notation is simple to translate to code.
+It ostensibly makes the notation equivalent to PEG in expressive power.
 
 The parser consumes one `Unit` from an input stream every time it's
 called; it returns the `Datum` therein, or EOF.  The final optional
@@ -36,11 +35,30 @@ called; it returns the `Datum` therein, or EOF.  The final optional
 blank at the end if it finds one; this is because `Datum` is not
 self-closing so the parser has to check if it goes on.
 
+The following limits are not represented in the grammar:
+
+* A `UnicodeSV` is the hexadecimal representation of a Unicode scalar
+  value; it must represent a value in the range 0 to D7FF, or E000 to
+  10FFFF, inclusive.  Any other value signals an error.  Valid values
+  are converted into a UTF-8 byte sequence encoding the value.
+
+* A `Rune` longer than 6 bytes is grammatical, but signals an error.
+  This is important because runes are not self-terminating; defining
+  their grammar as ending after a maximum of 6 bytes would allow
+  another datum beginning with an alphabetic character to follow a
+  rune immediately without any visual delineation, which would be
+  terribly confusing for a human reader.  Consider: `#foo123bar`.
+  This would parse as a concatenation of `#foo123` and `bar`.
+
+* A `Label` is the hexadecimal representation of a 48-bit integer,
+  meaning it allows for a maximum of 12 hexadecimal digits.  Longer
+  values are grammatical, but signal an out-of-range error.
+
 ```
 Unit          : Blank* [ Datum [Blank] ]
 
 
-Blank         : 9...13 | Comment
+Blank         : 9...13 | SP | Comment
 
 Datum         : OneDatum ( [JoinChar] OneDatum )*
 
@@ -56,41 +74,44 @@ SkipLine      : ( ~LF )* [LF]
 
 OneDatum      : BareString | CladDatum
 
+
 BareString    : ( '.' | '+' | '-' | DIGIT ) ( BareChar | '.' )*
               | BareChar+
 
-CladDatum     : '|' ( PipeStrChar | '\' StringEsc )* '|'
-              | '"' ( QuotStrChar | '\' StringEsc )* '"'
-              | '#' HashExpr
-              | '(' List ')' | '[' List ']' | '{' List '}'
-              | "'" Datum | '`' Datum | ',' Datum
+CladDatum     : PipeStr | QuoteStr | HashExpr | QuoteExpr | List
 
+PipeStr       : '|' ( PipeStrChar | '\' StringEsc )* '|'
+QuoteStr      : '"' ( QuotStrChar | '\' StringEsc )* '"'
+HashExpr      : '#' ( RuneExpr | LabelExpr | HashDatum )
+QuoteExpr     : "'" Datum | '`' Datum | ',' Datum
+List          : ParenList | SquareList | BraceList
 
 BareChar      : ALPHA | DIGIT
               | '!' | '$' | '%' | '*' | '+'
               | '-' | '/' | '<' | '=' | '>'
               | '?' | '@' | '^' | '_' | '~'
 
-
 PipeStrChar   : ~( '|' | '\' )
-
 QuotStrChar   : ~( '"' | '\' )
 
-HashExpr      : Rune [ '\' BareString | CladDatum ]
-              | '\' BareString
-              | '%' Label ( '%' | '=' Datum )
-              | CladDatum
-
-List          : Unit* [ Blank* '&' Unit ] Blank*
-
-
 StringEsc     : '\' | '|' | '"' | ( HTAB | SP )* LF ( HTAB | SP )*
               | 'a' | 'b' | 't' | 'n' | 'v' | 'f' | 'r' | 'e'
-              | 'x' ( HEXDIG{2} )+ ';'
-              | 'u' HEXDIG{1,6} ';'
+              | 'x' HexByte+ ';'
+              | 'u' UnicodeSV ';'
+
+HexByte       : HEXDIG HEXDIG
+UnicodeSV     : HEXDIG+
+
+RuneExpr      : Rune [ '\' BareString | CladDatum ]
+LabelExpr     : '%' Label ( '%' | '=' Datum )
+HashDatum     : '\' BareString | CladDatum
 
+Rune          : ALPHA ( ALPHA | DIGIT )*
+Label         : HEXDIG+
 
-Rune          : ALPHA ( ALPHA | DIGIT ){0,5}
+ParenList     : '(' ListBody ')'
+SquareList    : '[' ListBody ']'
+BraceList     : '{' ListBody '}'
 
-Label         : HEXDIG{1,12}
+ListBody      : Unit* [ Blank* '&' Unit ] Blank*
 ```
diff --git a/spec/syntax.peg b/spec/syntax.peg
new file mode 100644
index 0000000..97b9632
--- /dev/null
+++ b/spec/syntax.peg
@@ -0,0 +1,63 @@
+Unit         <- Blank* ( Datum Blank? )?
+
+
+Blank        <- ' ' / [\t-\r] / Comment
+
+Datum        <- OneDatum ( JoinChar? OneDatum )*
+
+JoinChar     <- '.' / ':'
+
+
+Comment      <- ';' ( SkipUnit / SkipLine )
+
+SkipUnit     <- '~' Unit
+
+SkipLine     <- (!'\n' .)* '\n'?
+
+
+OneDatum     <- BareString / CladDatum
+
+
+BareString   <- ( '.' / '+' / '-' / DIGIT ) ( BareChar / '.' )*
+              / BareChar+
+
+CladDatum    <- PipeStr / QuoteStr / HashExpr / QuoteExpr / List
+
+PipeStr      <- '|' ( PipeStrChar / '\\' StringEsc )* '|'
+QuoteStr     <- '"' ( QuotStrChar / '\\' StringEsc )* '"'
+HashExpr     <- '#' ( RuneExpr / LabelExpr / HashDatum )
+QuoteExpr    <- "'" Datum / '`' Datum / ',' Datum
+List         <- ParenList / SquareList / BraceList
+
+BareChar     <- ALPHA / DIGIT
+              / '!' / '$' / '%' / '*' / '+'
+              / '-' / '/' / '<' / '=' / '>'
+              / '?' / '@' / '^' / '_' / '~'
+
+PipeStrChar  <- (![|\\] .)
+QuotStrChar  <- (!["\\] .)
+
+StringEsc    <- '\\' / '|' / '"' / ( '\t' / ' ' )* '\n' ( '\t' / ' ' )*
+              / 'a' / 'b' / 't' / 'n' / 'v' / 'f' / 'r' / 'e'
+              / 'x' HexByte+ ';'
+              / 'u' UnicodeSV ';'
+
+HexByte      <- HEXDIG HEXDIG
+UnicodeSV    <- HEXDIG+
+
+RuneExpr     <- Rune ( '\\' BareString / CladDatum )?
+LabelExpr    <- '%' Label ( '%' / '=' Datum )
+HashDatum    <- '\\' BareString / CladDatum
+
+Rune         <- ALPHA ( ALPHA / DIGIT )*
+Label        <- HEXDIG+
+
+ParenList    <- '(' ListBody ')'
+SquareList   <- '[' ListBody ']'
+BraceList    <- '{' ListBody '}'
+
+ListBody     <- Unit* ( Blank* '&' Unit )? Blank*
+
+DIGIT        <- [0-9]
+ALPHA        <- [a-zA-Z]
+HEXDIG       <- [0-9a-fA-F]
diff --git a/spec/syntax.zbnf b/spec/syntax.zbnf
new file mode 100644
index 0000000..b87efb5
--- /dev/null
+++ b/spec/syntax.zbnf
@@ -0,0 +1,59 @@
+Unit          : Blank* [ Datum [Blank] ]
+
+
+Blank         : 9...13 | SP | Comment
+
+Datum         : OneDatum ( [JoinChar] OneDatum )*
+
+JoinChar      : '.' | ':'
+
+
+Comment       : ';' ( SkipUnit | SkipLine )
+
+SkipUnit      : '~' Unit
+
+SkipLine      : ( ~LF )* [LF]
+
+
+OneDatum      : BareString | CladDatum
+
+
+BareString    : ( '.' | '+' | '-' | DIGIT ) ( BareChar | '.' )*
+              | BareChar+
+
+CladDatum     : PipeStr | QuoteStr | HashExpr | QuoteExpr | List
+
+PipeStr       : '|' ( PipeStrChar | '\' StringEsc )* '|'
+QuoteStr      : '"' ( QuotStrChar | '\' StringEsc )* '"'
+HashExpr      : '#' ( RuneExpr | LabelExpr | HashDatum )
+QuoteExpr     : "'" Datum | '`' Datum | ',' Datum
+List          : ParenList | SquareList | BraceList
+
+BareChar      : ALPHA | DIGIT
+              | '!' | '$' | '%' | '*' | '+'
+              | '-' | '/' | '<' | '=' | '>'
+              | '?' | '@' | '^' | '_' | '~'
+
+PipeStrChar   : ~( '|' | '\' )
+QuotStrChar   : ~( '"' | '\' )
+
+StringEsc     : '\' | '|' | '"' | ( HTAB | SP )* LF ( HTAB | SP )*
+              | 'a' | 'b' | 't' | 'n' | 'v' | 'f' | 'r' | 'e'
+              | 'x' HexByte+ ';'
+              | 'u' UnicodeSV ';'
+
+HexByte       : HEXDIG HEXDIG
+UnicodeSV     : HEXDIG+
+
+RuneExpr      : Rune [ '\' BareString | CladDatum ]
+LabelExpr     : '%' Label ( '%' | '=' Datum )
+HashDatum     : '\' BareString | CladDatum
+
+Rune          : ALPHA ( ALPHA | DIGIT )*
+Label         : HEXDIG+
+
+ParenList     : '(' ListBody ')'
+SquareList    : '[' ListBody ']'
+BraceList     : '{' ListBody '}'
+
+ListBody      : Unit* [ Blank* '&' Unit ] Blank*
diff --git a/src/test/parse.zig b/src/test/parse.zig
index 94ccbe5..272c92a 100644
--- a/src/test/parse.zig
+++ b/src/test/parse.zig
@@ -44,9 +44,9 @@ test "parse short bare string" {
     try expect(parse("|x()|").eq(str("x()")));
     try expect(parse("|{\\|}|").eq(str("{|}")));
     try expect(parse("foobar").eq(str("foobar")));
-    try expect(parse("!$%.*+").eq(str("!$%.*+")));
-    try expect(parse("-/<=>?").eq(str("-/<=>?")));
-    try expect(parse("@^_~00").eq(str("@^_~00")));
+    try expect(parse("!$%*+-").eq(str("!$%*+-")));
+    try expect(parse("/<=>?@").eq(str("/<=>?@")));
+    try expect(parse("^_~000").eq(str("^_~000")));
 }
 
 test "parse long bare string" {
@@ -56,7 +56,7 @@ test "parse long bare string" {
     try expect(parse("+foo.bar.baz").eq(str("+foo.bar.baz")));
     try expect(parse("-foo.bar.baz").eq(str("-foo.bar.baz")));
     try expect(parse("0foo.bar.baz").eq(str("0foo.bar.baz")));
-    try expect(parse("!$%*+-./<=>?@^_~").eq(str("!$%*+-./<=>?@^_~")));
+    try expect(parse("!$%*+-/<=>?@^_~").eq(str("!$%*+-/<=>?@^_~")));
     try expect(parse("|foo\\x20;bar\\x0a;baz|").eq(str("foo bar\nbaz")));
 }
 
diff --git a/src/zisp/io/Parser.zig b/src/zisp/io/Parser.zig
index 0fea5f3..d18150e 100644
--- a/src/zisp/io/Parser.zig
+++ b/src/zisp/io/Parser.zig
@@ -96,13 +96,14 @@ const real_cons = &value.pair.cons;
 const fake_cons = &dummyCons;
 
 pub const Error = enum {
+    ReadError,
+    UnexpectedEof,
     InvalidCharacter,
     UnclosedString,
-    UnexpectedEof,
-    OutOfRange,
-    ReadError,
     UnicodeLengthError,
     UnicodeEncodeError,
+    RuneTooLong,
+    OutOfRange,
 };
 
 pub const Context = struct {
@@ -410,8 +411,7 @@ fn returnContext(p: *Parser) !void {
 fn parseDatum(p: *Parser) !void {
     const c = p.getUnread() orelse try p.readNoEof("datum");
     if (isBareChar(c) or c == '.') {
-        const s = try p.parseBareString(c);
-        return p.jump(.parseJoin, s);
+        return p.jump(.parseJoin, try p.getBareString(c));
     } else {
         return p.parseCladDatum(c, .parseJoin);
     }
@@ -456,7 +456,7 @@ fn endJoinDatum(p: *Parser) !void {
     return p.jump(.parseJoin, joined);
 }
 
-fn parseBareString(p: *Parser, c1: u8) !Value {
+fn getBareString(p: *Parser, c1: u8) !Value {
     const allow_dots = std.ascii.isDigit(c1) or switch (c1) {
         '.', '+', '-' => true,
         else => false,
@@ -475,8 +475,8 @@ fn parseBareString(p: *Parser, c1: u8) !Value {
 
 fn parseCladDatum(p: *Parser, c: u8, next: Fn) !void {
     return switch (c) {
-        '|' => p.jump(next, try p.parseString('|')),
-        '"' => p.jump(next, try p.parseString('"')),
+        '|' => p.jump(next, try p.getString('|')),
+        '"' => p.jump(next, try p.getString('"')),
         '#' => p.parseHashExpr(next),
         '(', '[', '{' => p.parseList(c, next),
         '\'', '`', ',' => p.parseQuoteExpr(c, next),
@@ -484,7 +484,7 @@ fn parseCladDatum(p: *Parser, c: u8, next: Fn) !void {
     };
 }
 
-fn parseString(p: *Parser, comptime close: u8) !Value {
+fn getString(p: *Parser, comptime close: u8) !Value {
     while (try p.read()) |c| sw: switch (c) {
         close => {
             const s = p.getCharsAsString();
@@ -577,12 +577,11 @@ fn parseStringCharEsc(p: *Parser, c: u8) !void {
 fn parseHashExpr(p: *Parser, next: Fn) !void {
     const c = try p.readNoEof("hash expression");
     if (std.ascii.isAlphabetic(c)) {
-        const r = try p.parseRune(c);
-        return p.parseRuneEnd(r, next);
+        return p.parseRuneEnd(try p.getRune(c), next);
     }
     if (c == '\\') {
         const c1 = try p.readNoEof("bare string after hash");
-        return p.jump(next, p.cons(HASH, try p.parseBareString(c1)));
+        return p.jump(next, p.cons(HASH, try p.getBareString(c1)));
     }
     if (c == '%') {
         return p.parseLabel(next);
@@ -602,14 +601,17 @@ fn endHashDatum(p: *Parser) !void {
     return p.retval(p.cons(HASH, p.result));
 }
 
-fn parseRune(p: *Parser, c1: u8) !Value {
+fn getRune(p: *Parser, c1: u8) !Value {
     try p.addChar(c1);
     var len: usize = 1;
     while (try p.read()) |c| : (len += 1) {
-        if (len == 6 or !std.ascii.isAlphanumeric(c)) {
+        if (!std.ascii.isAlphanumeric(c)) {
             p.unread(c);
             return p.getCharsAsRune();
         }
+        if (len == 6) {
+            return p.err(.RuneTooLong, "rune");
+        }
         try p.addChar(c);
     }
     return p.getCharsAsRune();
@@ -619,13 +621,13 @@ fn parseRuneEnd(p: *Parser, r: Value, next: Fn) !void {
     const c = p.getUnread() orelse return p.jump(next, r);
     if (c == '\\') {
         const c1 = try p.readNoEof("bare string at rune end");
-        return p.jump(next, p.cons(r, try p.parseBareString(c1)));
+        return p.jump(next, p.cons(r, try p.getBareString(c1)));
     }
     if (c == '"') {
-        return p.jump(next, p.cons(r, try p.parseString('"')));
+        return p.jump(next, p.cons(r, try p.getString('"')));
     }
     if (c == '|') {
-        return p.jump(next, p.cons(r, try p.parseString('|')));
+        return p.jump(next, p.cons(r, try p.getString('|')));
     }
     p.unread(c);
     switch (c) {
@@ -772,8 +774,8 @@ fn checkBlank(p: *Parser, c: u8) !enum { yes, skip_unit, no } {
 fn isBareChar(c: u8) bool {
     return switch (c) {
         // zig fmt: off
-        'a'...'z' , 'A'...'Z' , '0'...'9' , '!' , '$' , '%' , '*' ,'+' ,
-        '-' , '.' , '/' , '<' , '=' , '>' , '?' , '@' , '^' , '_' , '~' => true,
+        'a'...'z' , 'A'...'Z' , '0'...'9' , '!' , '$' , '%' , '*' , '+' ,
+        '-' , '/' , '<' , '=' , '>' , '?' , '@' , '^' , '_' , '~' , => true,
         // zig fmt: on
         else => false,
     };
author	Taylan Kammer <taylan.kammer@gmail.com>	2026-01-09 18:09:59 +0100
committer	Taylan Kammer <taylan.kammer@gmail.com>	2026-01-09 18:09:59 +0100
commit	2d72a1aa64a66c486a2329999123c14afcddeb32 (patch)
tree	4eba98eb1240d3d445e2d35c61bad63d352e413b
parent	a2ece405cc61341122fc075d499420e894c56909 (diff)