summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTaylan Kammer <taylan.kammer@gmail.com>2026-01-09 18:09:59 +0100
committerTaylan Kammer <taylan.kammer@gmail.com>2026-01-09 18:09:59 +0100
commit2d72a1aa64a66c486a2329999123c14afcddeb32 (patch)
tree4eba98eb1240d3d445e2d35c61bad63d352e413b
parenta2ece405cc61341122fc075d499420e894c56909 (diff)
More grammar fuckery. BNF is horrible!
-rw-r--r--spec/syntax.abnf65
-rw-r--r--spec/syntax.md81
-rw-r--r--spec/syntax.peg63
-rw-r--r--spec/syntax.zbnf59
-rw-r--r--src/test/parse.zig8
-rw-r--r--src/zisp/io/Parser.zig40
6 files changed, 239 insertions, 77 deletions
diff --git a/spec/syntax.abnf b/spec/syntax.abnf
index a083eda..132deeb 100644
--- a/spec/syntax.abnf
+++ b/spec/syntax.abnf
@@ -6,42 +6,52 @@ File = [Unit] *( Blank Unit ) *Blank [Trail]
Unit = *Blank Datum
-Blank = HTAB / LF / %x0b / %x0c / CR / Comment
+Blank = HTAB / LF / %x0b / %x0c / CR / SP / Comment
Trail = SkipLine / SkipUnit
+Datum = BareString
+ / DottedString
+ / CladDatum
+ / HashExpr
+ / HashDotExpr
+ / QuoteExpr
+ / JoinExpr
+
Comment = SkipLine LF / SkipUnit Blank
SkipLine = ';' [ SkipLStart *AnyButLF ]
-SkipUnit = ';' '~' Unit
-
-
SkipLStart = %x00-09 / %x0b-7d / %x7f-ff
; any but LF or '~'
AnyButLF = %x00-09 / %x0b-ff
-
-Datum = SingleDatum
- / JoinedDatum *( [ '.' / ':' ] JoinedDatum )
-
-
-SingleDatum = BareString / CladDatum / DottedString
-
-JoinedDatum = BareString / CladDatum
+SkipUnit = ';' '~' Unit
BareString = BareChar *( BareChar / Numeric )
+DottedString = ( '.' / Numeric ) *( '.' / Numeric / BareChar )
+
CladDatum = '|' *( PipeStrChar / '\' StringEsc ) '|'
/ '"' *( QuotStrChar / '\' StringEsc ) '"'
- / '#' HashExpr
/ '(' List ')' / '[' List ']' / '{' List '}'
- / "'" Datum / '`' Datum / ',' Datum
-DottedString = ( '.' / Numeric ) *( '.' / Numeric / BareChar )
+HashExpr = LabelExpr / RuneExpr / HashDatum
+
+HashDotExpr = RuneDotExpr / HashDotDatum
+
+QuoteExpr = "'" Datum / '`' Datum / ',' Datum
+
+JoinExpr = Datum LeftCladDatum
+ / Datum ':' Datum
+ / DotlessDatum '.' Datum
+
+LeftCladDatum = CladDatum / HashExpr / QuoteExpr
+
+DotlessDatum = BareString / CladDatum / RuneExpr / HashDatum
BareChar = '!' / '$' / '%' / '*' / '/' / '<' / '=' / '>'
@@ -49,29 +59,36 @@ BareChar = '!' / '$' / '%' / '*' / '/' / '<' / '=' / '>'
Numeric = '+' / '-' / DIGIT
-
PipeStrChar = %x00-5b / %x5d-7b / %x7d-ff
; any but '|' or '\'
QuotStrChar = %x00-21 / %x23-5b / %x5d-ff
; any but '"' or '\'
-HashExpr = Rune [ '\' BareString / CladDatum ]
- / '\' BareString
- / '%' Label ( '%' / '=' Datum )
- / CladDatum
-
List = [Unit] *( Blank Unit ) *Blank [Tail] [SkipUnit]
Tail = '&' Unit *Blank
+LabelExpr = '#' '%' Label ( '%' / '=' Datum )
+
+RuneExpr = '#' Rune [ '\' BareString / CladDatum ]
+
+RuneDotExpr = '#' Rune '\' DottedString
+
+; Grouped so that '#' prefixes both alternatives; in ABNF,
+; concatenation binds tighter than '/'.
+HashDatum = '#' ( '\' BareString / CladDatum )
+
+HashDotDatum = '#' '\' DottedString
+
+; Unicode escapes must not represent surrogate code points
+; (D800 to DFFF). This is difficult to express in ABNF, but
+; we do at least disallow values greater than 10FFFF, which
+; is the highest valid Unicode code point.
StringEsc = '\' / '|' / '"' / *( HTAB / SP ) LF *( HTAB / SP )
/ 'a' / 'b' / 't' / 'n' / 'v' / 'f' / 'r' / 'e'
/ 'x' 1*( 2HEXDIG ) ';'
- / 'u' 1*5HEXDIG ';'
- / 'u' '0' 1*5HEXDIG ';'
- / 'u' '1' '0' 1*4HEXDIG ';'
+ / 'u' ['0'] 1*5HEXDIG ';'
+ / 'u' '1' '0' 4HEXDIG ';'
Rune = ALPHA *5( ALPHA / DIGIT )
diff --git a/spec/syntax.md b/spec/syntax.md
index 7f3561c..d1a17ad 100644
--- a/spec/syntax.md
+++ b/spec/syntax.md
@@ -1,20 +1,18 @@
# Zisp S-Expression Syntax
-We use a BNF notation with the following rules:
+We use a BNF-like grammar notation with the following rules:
* Concatenation of expressions is implicit: `foo bar` means `foo`
followed by `bar`.
-* Expressions may be followed by `?`, `*`, `+`, `{N}`, or `{N,M}`,
- which have the same meanings as in regular expressions.
-
-* The syntax `[foo]` is shorthand for `(foo)?`.
+* The suffixes `?`, `*`, and `+` have the same meaning as in regular
+ expressions, although `[foo]` is used in place of `(foo)?`.
* The syntax is defined in terms of bytes, not characters. Terminals
`'c'` and `"c"` refer to the ASCII value of the given character `c`.
Numbers are in decimal and refer to a byte with the given value.
-* The `~` prefix means NOT. It only applies to rules that match one
+* The prefix `~` means NOT. It only applies to rules that match one
byte, and negates them. For example, `~( 'a' | 'b' )` matches any
byte other than 97 and 98.
@@ -24,11 +22,12 @@ We use a BNF notation with the following rules:
* There is no ambiguity, or look-ahead / backtracking beyond one byte.
Rules match left to right, depth-first, and greedy. As soon as the
- input matches the first terminal of a rule, it must match that rule
- to the end or it is considered a syntax error.
+ input matches the first terminal of a rule (explicit or implied by
+ recursively descending into the first non-terminal), it must match
+ that rule to the end, or it is considered a syntax error.
-The last rule means that the BNF is very simple to translate to code.
-It also probably makes it equivalent to PEG.
+The last rule means that the notation is simple to translate to code.
+It ostensibly also makes the notation equivalent to PEG in expressive power.
The parser consumes one `Unit` from an input stream every time it's
called; it returns the `Datum` therein, or EOF. The final optional
@@ -36,11 +35,30 @@ called; it returns the `Datum` therein, or EOF. The final optional
blank at the end if it finds one; this is because `Datum` is not
self-closing so the parser has to check if it goes on.
+The following limits are not represented in the grammar:
+
+* A `UnicodeSV` is the hexadecimal representation of a Unicode scalar
+ value; it must represent a value in the range 0 to D7FF, or E000 to
+ 10FFFF, inclusive. Any other value signals an error. Valid values
+ are converted into a UTF-8 byte sequence encoding the value.
+
+* A `Rune` longer than 6 bytes is grammatical, but signals an error.
+ This is important because runes are not self-terminating; defining
+ their grammar as ending after a maximum of 6 bytes would allow
+ another datum beginning with an alphabetic character to follow a
+ rune immediately without any visual delineation, which would be
+ terribly confusing for a human reader. Consider: `#foo123bar`.
+ This would parse as a concatenation of `#foo123` and `bar`.
+
+* A `Label` is the hexadecimal representation of a 48-bit integer,
+ meaning it allows for a maximum of 12 hexadecimal digits. Longer
+ values are grammatical, but signal an out-of-range error.
+
```
Unit : Blank* [ Datum [Blank] ]
-Blank : 9...13 | Comment
+Blank : 9...13 | SP | Comment
Datum : OneDatum ( [JoinChar] OneDatum )*
@@ -56,41 +74,44 @@ SkipLine : ( ~LF )* [LF]
OneDatum : BareString | CladDatum
+
BareString : ( '.' | '+' | '-' | DIGIT ) ( BareChar | '.' )*
| BareChar+
-CladDatum : '|' ( PipeStrChar | '\' StringEsc )* '|'
- | '"' ( QuotStrChar | '\' StringEsc )* '"'
- | '#' HashExpr
- | '(' List ')' | '[' List ']' | '{' List '}'
- | "'" Datum | '`' Datum | ',' Datum
+CladDatum : PipeStr | QuoteStr | HashExpr | QuoteExpr | List
+PipeStr : '|' ( PipeStrChar | '\' StringEsc )* '|'
+QuoteStr : '"' ( QuotStrChar | '\' StringEsc )* '"'
+HashExpr : '#' ( RuneExpr | LabelExpr | HashDatum )
+QuoteExpr : "'" Datum | '`' Datum | ',' Datum
+List : ParenList | SquareList | BraceList
BareChar : ALPHA | DIGIT
| '!' | '$' | '%' | '*' | '+'
| '-' | '/' | '<' | '=' | '>'
| '?' | '@' | '^' | '_' | '~'
-
PipeStrChar : ~( '|' | '\' )
-
QuotStrChar : ~( '"' | '\' )
-HashExpr : Rune [ '\' BareString | CladDatum ]
- | '\' BareString
- | '%' Label ( '%' | '=' Datum )
- | CladDatum
-
-List : Unit* [ Blank* '&' Unit ] Blank*
-
-
StringEsc : '\' | '|' | '"' | ( HTAB | SP )* LF ( HTAB | SP )*
| 'a' | 'b' | 't' | 'n' | 'v' | 'f' | 'r' | 'e'
- | 'x' ( HEXDIG{2} )+ ';'
- | 'u' HEXDIG{1,6} ';'
+ | 'x' HexByte+ ';'
+ | 'u' UnicodeSV ';'
+
+HexByte : HEXDIG HEXDIG
+UnicodeSV : HEXDIG+
+
+RuneExpr : Rune [ '\' BareString | CladDatum ]
+LabelExpr : '%' Label ( '%' | '=' Datum )
+HashDatum : '\' BareString | CladDatum
+Rune : ALPHA ( ALPHA | DIGIT )*
+Label : HEXDIG+
-Rune : ALPHA ( ALPHA | DIGIT ){0,5}
+ParenList : '(' ListBody ')'
+SquareList : '[' ListBody ']'
+BraceList : '{' ListBody '}'
-Label : HEXDIG{1,12}
+ListBody : Unit* [ Blank* '&' Unit ] Blank*
```
diff --git a/spec/syntax.peg b/spec/syntax.peg
new file mode 100644
index 0000000..97b9632
--- /dev/null
+++ b/spec/syntax.peg
@@ -0,0 +1,63 @@
+Unit <- Blank* ( Datum Blank? )?
+
+# Blank bytes are 9-13 (HTAB, LF, VT, FF, CR) and SP; comments also count.
+Blank <- [\t-\r] / ' ' / Comment
+
+Datum <- OneDatum ( JoinChar? OneDatum )*
+
+JoinChar <- '.' / ':'
+
+# After ';', a '~' skips one whole Unit; otherwise the rest of the line is skipped.
+Comment <- ';' ( SkipUnit / SkipLine )
+
+SkipUnit <- '~' Unit
+
+SkipLine <- (!'\n' .)* '\n'?
+
+
+OneDatum <- BareString / CladDatum
+
+# Dots may only appear in a bare string that starts with '.', '+', '-' or a digit.
+BareString <- ( '.' / '+' / '-' / DIGIT ) ( BareChar / '.' )*
+ / BareChar+
+
+CladDatum <- PipeStr / QuoteStr / HashExpr / QuoteExpr / List
+
+PipeStr <- '|' ( PipeStrChar / '\\' StringEsc )* '|'
+QuoteStr <- '"' ( QuotStrChar / '\\' StringEsc )* '"'
+HashExpr <- '#' ( RuneExpr / LabelExpr / HashDatum )
+QuoteExpr <- "'" Datum / '`' Datum / ',' Datum
+List <- ParenList / SquareList / BraceList
+
+BareChar <- ALPHA / DIGIT
+ / '!' / '$' / '%' / '*' / '+'
+ / '-' / '/' / '<' / '=' / '>'
+ / '?' / '@' / '^' / '_' / '~'
+
+PipeStrChar <- (![|\\] .)
+QuotStrChar <- (!["\\] .)
+
+StringEsc <- '\\' / '|' / '"' / [ \t]* '\n' [ \t]*
+ / 'a' / 'b' / 't' / 'n' / 'v' / 'f' / 'r' / 'e'
+ / 'x' HexByte+ ';'
+ / 'u' UnicodeSV ';'
+
+HexByte <- HEXDIG HEXDIG
+UnicodeSV <- HEXDIG+
+
+RuneExpr <- Rune ( '\\' BareString / CladDatum )?
+LabelExpr <- '%' Label ( '%' / '=' Datum )
+HashDatum <- '\\' BareString / CladDatum
+
+Rune <- ALPHA ( ALPHA / DIGIT )*
+Label <- HEXDIG+
+
+ParenList <- '(' ListBody ')'
+SquareList <- '[' ListBody ']'
+BraceList <- '{' ListBody '}'
+
+ListBody <- Unit* ( Blank* '&' Unit )? Blank*
+
+DIGIT <- [0-9]
+ALPHA <- [a-zA-Z]
+HEXDIG <- [0-9a-fA-F]
diff --git a/spec/syntax.zbnf b/spec/syntax.zbnf
new file mode 100644
index 0000000..b87efb5
--- /dev/null
+++ b/spec/syntax.zbnf
@@ -0,0 +1,59 @@
+Unit : Blank* [ Datum [Blank] ]
+
+
+Blank : 9...13 | SP | Comment
+
+Datum : OneDatum ( [JoinChar] OneDatum )*
+
+JoinChar : '.' | ':'
+
+
+Comment : ';' ( SkipUnit | SkipLine )
+
+SkipUnit : '~' Unit
+
+SkipLine : ( ~LF )* [LF]
+
+
+OneDatum : BareString | CladDatum
+
+
+BareString : ( '.' | '+' | '-' | DIGIT ) ( BareChar | '.' )*
+ | BareChar+
+
+CladDatum : PipeStr | QuoteStr | HashExpr | QuoteExpr | List
+
+PipeStr : '|' ( PipeStrChar | '\' StringEsc )* '|'
+QuoteStr : '"' ( QuotStrChar | '\' StringEsc )* '"'
+HashExpr : '#' ( RuneExpr | LabelExpr | HashDatum )
+QuoteExpr : "'" Datum | '`' Datum | ',' Datum
+List : ParenList | SquareList | BraceList
+
+BareChar : ALPHA | DIGIT
+ | '!' | '$' | '%' | '*' | '+'
+ | '-' | '/' | '<' | '=' | '>'
+ | '?' | '@' | '^' | '_' | '~'
+
+PipeStrChar : ~( '|' | '\' )
+QuotStrChar : ~( '"' | '\' )
+
+StringEsc : '\' | '|' | '"' | ( HTAB | SP )* LF ( HTAB | SP )*
+ | 'a' | 'b' | 't' | 'n' | 'v' | 'f' | 'r' | 'e'
+ | 'x' HexByte+ ';'
+ | 'u' UnicodeSV ';'
+
+HexByte : HEXDIG HEXDIG
+UnicodeSV : HEXDIG+
+
+RuneExpr : Rune [ '\' BareString | CladDatum ]
+LabelExpr : '%' Label ( '%' | '=' Datum )
+HashDatum : '\' BareString | CladDatum
+
+Rune : ALPHA ( ALPHA | DIGIT )*
+Label : HEXDIG+
+
+ParenList : '(' ListBody ')'
+SquareList : '[' ListBody ']'
+BraceList : '{' ListBody '}'
+
+ListBody : Unit* [ Blank* '&' Unit ] Blank*
diff --git a/src/test/parse.zig b/src/test/parse.zig
index 94ccbe5..272c92a 100644
--- a/src/test/parse.zig
+++ b/src/test/parse.zig
@@ -44,9 +44,9 @@ test "parse short bare string" {
try expect(parse("|x()|").eq(str("x()")));
try expect(parse("|{\\|}|").eq(str("{|}")));
try expect(parse("foobar").eq(str("foobar")));
- try expect(parse("!$%.*+").eq(str("!$%.*+")));
- try expect(parse("-/<=>?").eq(str("-/<=>?")));
- try expect(parse("@^_~00").eq(str("@^_~00")));
+ try expect(parse("!$%*+-").eq(str("!$%*+-")));
+ try expect(parse("/<=>?@").eq(str("/<=>?@")));
+ try expect(parse("^_~000").eq(str("^_~000")));
}
test "parse long bare string" {
@@ -56,7 +56,7 @@ test "parse long bare string" {
try expect(parse("+foo.bar.baz").eq(str("+foo.bar.baz")));
try expect(parse("-foo.bar.baz").eq(str("-foo.bar.baz")));
try expect(parse("0foo.bar.baz").eq(str("0foo.bar.baz")));
- try expect(parse("!$%*+-./<=>?@^_~").eq(str("!$%*+-./<=>?@^_~")));
+ try expect(parse("!$%*+-/<=>?@^_~").eq(str("!$%*+-/<=>?@^_~")));
try expect(parse("|foo\\x20;bar\\x0a;baz|").eq(str("foo bar\nbaz")));
}
diff --git a/src/zisp/io/Parser.zig b/src/zisp/io/Parser.zig
index 0fea5f3..d18150e 100644
--- a/src/zisp/io/Parser.zig
+++ b/src/zisp/io/Parser.zig
@@ -96,13 +96,14 @@ const real_cons = &value.pair.cons;
const fake_cons = &dummyCons;
pub const Error = enum {
+ ReadError,
+ UnexpectedEof,
InvalidCharacter,
UnclosedString,
- UnexpectedEof,
- OutOfRange,
- ReadError,
UnicodeLengthError,
UnicodeEncodeError,
+ RuneTooLong,
+ OutOfRange,
};
pub const Context = struct {
@@ -410,8 +411,7 @@ fn returnContext(p: *Parser) !void {
fn parseDatum(p: *Parser) !void {
const c = p.getUnread() orelse try p.readNoEof("datum");
if (isBareChar(c) or c == '.') {
- const s = try p.parseBareString(c);
- return p.jump(.parseJoin, s);
+ return p.jump(.parseJoin, try p.getBareString(c));
} else {
return p.parseCladDatum(c, .parseJoin);
}
@@ -456,7 +456,7 @@ fn endJoinDatum(p: *Parser) !void {
return p.jump(.parseJoin, joined);
}
-fn parseBareString(p: *Parser, c1: u8) !Value {
+fn getBareString(p: *Parser, c1: u8) !Value {
const allow_dots = std.ascii.isDigit(c1) or switch (c1) {
'.', '+', '-' => true,
else => false,
@@ -475,8 +475,8 @@ fn parseBareString(p: *Parser, c1: u8) !Value {
fn parseCladDatum(p: *Parser, c: u8, next: Fn) !void {
return switch (c) {
- '|' => p.jump(next, try p.parseString('|')),
- '"' => p.jump(next, try p.parseString('"')),
+ '|' => p.jump(next, try p.getString('|')),
+ '"' => p.jump(next, try p.getString('"')),
'#' => p.parseHashExpr(next),
'(', '[', '{' => p.parseList(c, next),
'\'', '`', ',' => p.parseQuoteExpr(c, next),
@@ -484,7 +484,7 @@ fn parseCladDatum(p: *Parser, c: u8, next: Fn) !void {
};
}
-fn parseString(p: *Parser, comptime close: u8) !Value {
+fn getString(p: *Parser, comptime close: u8) !Value {
while (try p.read()) |c| sw: switch (c) {
close => {
const s = p.getCharsAsString();
@@ -577,12 +577,11 @@ fn parseStringCharEsc(p: *Parser, c: u8) !void {
fn parseHashExpr(p: *Parser, next: Fn) !void {
const c = try p.readNoEof("hash expression");
if (std.ascii.isAlphabetic(c)) {
- const r = try p.parseRune(c);
- return p.parseRuneEnd(r, next);
+ return p.parseRuneEnd(try p.getRune(c), next);
}
if (c == '\\') {
const c1 = try p.readNoEof("bare string after hash");
- return p.jump(next, p.cons(HASH, try p.parseBareString(c1)));
+ return p.jump(next, p.cons(HASH, try p.getBareString(c1)));
}
if (c == '%') {
return p.parseLabel(next);
@@ -602,14 +601,17 @@ fn endHashDatum(p: *Parser) !void {
return p.retval(p.cons(HASH, p.result));
}
-fn parseRune(p: *Parser, c1: u8) !Value {
+fn getRune(p: *Parser, c1: u8) !Value {
try p.addChar(c1);
var len: usize = 1;
while (try p.read()) |c| : (len += 1) {
- if (len == 6 or !std.ascii.isAlphanumeric(c)) {
+ if (!std.ascii.isAlphanumeric(c)) {
p.unread(c);
return p.getCharsAsRune();
}
+ if (len == 6) {
+ return p.err(.RuneTooLong, "rune");
+ }
try p.addChar(c);
}
return p.getCharsAsRune();
@@ -619,13 +621,13 @@ fn parseRuneEnd(p: *Parser, r: Value, next: Fn) !void {
const c = p.getUnread() orelse return p.jump(next, r);
if (c == '\\') {
const c1 = try p.readNoEof("bare string at rune end");
- return p.jump(next, p.cons(r, try p.parseBareString(c1)));
+ return p.jump(next, p.cons(r, try p.getBareString(c1)));
}
if (c == '"') {
- return p.jump(next, p.cons(r, try p.parseString('"')));
+ return p.jump(next, p.cons(r, try p.getString('"')));
}
if (c == '|') {
- return p.jump(next, p.cons(r, try p.parseString('|')));
+ return p.jump(next, p.cons(r, try p.getString('|')));
}
p.unread(c);
switch (c) {
@@ -772,8 +774,8 @@ fn checkBlank(p: *Parser, c: u8) !enum { yes, skip_unit, no } {
fn isBareChar(c: u8) bool {
return switch (c) {
// zig fmt: off
- 'a'...'z' , 'A'...'Z' , '0'...'9' , '!' , '$' , '%' , '*' ,'+' ,
- '-' , '.' , '/' , '<' , '=' , '>' , '?' , '@' , '^' , '_' , '~' => true,
+ 'a'...'z' , 'A'...'Z' , '0'...'9' , '!' , '$' , '%' , '*' , '+' ,
+ '-' , '/' , '<' , '=' , '>' , '?' , '@' , '^' , '_' , '~' , => true,
// zig fmt: on
else => false,
};