diff options
| author | Taylan Kammer <taylan.kammer@gmail.com> | 2026-01-09 18:09:59 +0100 |
|---|---|---|
| committer | Taylan Kammer <taylan.kammer@gmail.com> | 2026-01-09 18:09:59 +0100 |
| commit | 2d72a1aa64a66c486a2329999123c14afcddeb32 (patch) | |
| tree | 4eba98eb1240d3d445e2d35c61bad63d352e413b | |
| parent | a2ece405cc61341122fc075d499420e894c56909 (diff) | |
More grammar fuckery. BNF is horrible!
| -rw-r--r-- | spec/syntax.abnf | 65 | ||||
| -rw-r--r-- | spec/syntax.md | 81 | ||||
| -rw-r--r-- | spec/syntax.peg | 63 | ||||
| -rw-r--r-- | spec/syntax.zbnf | 59 | ||||
| -rw-r--r-- | src/test/parse.zig | 8 | ||||
| -rw-r--r-- | src/zisp/io/Parser.zig | 40 |
6 files changed, 239 insertions, 77 deletions
diff --git a/spec/syntax.abnf b/spec/syntax.abnf index a083eda..132deeb 100644 --- a/spec/syntax.abnf +++ b/spec/syntax.abnf @@ -6,42 +6,52 @@ File = [Unit] *( Blank Unit ) *Blank [Trail] Unit = *Blank Datum -Blank = HTAB / LF / %x0b / %x0c / CR / Comment +Blank = HTAB / LF / %x0b / %x0c / CR / SP / Comment Trail = SkipLine / SkipUnit +Datum = BareString + / DottedString + / CladDatum + / HashExpr + / HashDotExpr + / QuoteExpr + / JoinExpr + Comment = SkipLine LF / SkipUnit Blank SkipLine = ';' [ SkipLStart *AnyButLF ] -SkipUnit = ';' '~' Unit - - SkipLStart = %x00-09 / %x0b-7d / %x7f-ff ; any but LF or '~' AnyButLF = %x00-09 / %x0b-ff - -Datum = SingleDatum - / JoinedDatum *( [ '.' / ':' ] JoinedDatum ) - - -SingleDatum = BareString / CladDatum / DottedString - -JoinedDatum = BareString / CladDatum +SkipUnit = ';' '~' Unit BareString = BareChar *( BareChar / Numeric ) +DottedString = ( '.' / Numeric ) *( '.' / Numeric / BareChar ) + CladDatum = '|' *( PipeStrChar / '\' StringEsc ) '|' / '"' *( QuotStrChar / '\' StringEsc ) '"' - / '#' HashExpr / '(' List ')' / '[' List ']' / '{' List '}' - / "'" Datum / '`' Datum / ',' Datum -DottedString = ( '.' / Numeric ) *( '.' / Numeric / BareChar ) +HashExpr = LabelExpr / RuneExpr / HashDatum + +HashDotExpr = RuneDotExpr / HashDotDatum + +QuoteExpr = "'" Datum / '`' Datum / ',' Datum + +JoinExpr = Datum LeftCladDatum + / Datum ':' Datum + / DotlessDatum '.' Datum + +LeftCladDatum = CladDatum / HashExpr / QuoteExpr + +DotlessDatum = BareString / CladDatum / RuneExpr / HashDatum BareChar = '!' / '$' / '%' / '*' / '/' / '<' / '=' / '>' @@ -49,29 +59,36 @@ BareChar = '!' / '$' / '%' / '*' / '/' / '<' / '=' / '>' Numeric = '+' / '-' / DIGIT - PipeStrChar = %x00-5b / %x5d-7b / %x7d-ff ; any but '|' or '\' QuotStrChar = %x00-21 / %x23-5b / %x5d-ff ; any but '"' or '\' -HashExpr = Rune [ '\' BareString / CladDatum ] - / '\' BareString - / '%' Label ( '%' / '=' Datum ) - / CladDatum - List = [Unit] *( Blank Unit ) *Blank [Tail] [SkipUnit] Tail = '&' Unit *Blank +LabelExpr = '#' '%' Label ( '%' / '=' Datum ) + +RuneExpr = '#' Rune [ '\' BareString / CladDatum ] + +RuneDotExpr = '#' Rune '\' DottedString + +HashDatum = '#' '\' BareString / CladDatum + +HashDotDatum = '#' '\' DottedString + +; Unicode escapes must not represent surrogate code points. +; This is difficult to express in ABNF. But we do at least +; disallow code points greater than \u10FFFF which are also +; invalid, since U+10FFFF is the highest allowed. StringEsc = '\' / '|' / '"' / *( HTAB / SP ) LF *( HTAB / SP ) / 'a' / 'b' / 't' / 'n' / 'v' / 'f' / 'r' / 'e' / 'x' 1*( 2HEXDIG ) ';' - / 'u' 1*5HEXDIG ';' - / 'u' '0' 1*5HEXDIG ';' - / 'u' '1' '0' 1*4HEXDIG ';' + / 'u' ['0'] 1*5HEXDIG ';' + / 'u' '1' '0' 4HEXDIG ';' Rune = ALPHA *5( ALPHA / DIGIT ) diff --git a/spec/syntax.md b/spec/syntax.md index 7f3561c..d1a17ad 100644 --- a/spec/syntax.md +++ b/spec/syntax.md @@ -1,20 +1,18 @@ # Zisp S-Expression Syntax -We use a BNF notation with the following rules: +We use a BNF-like grammar notation with the following rules: * Concatenation of expressions is implicit: `foo bar` means `foo` followed by `bar`. -* Expressions may be followed by `?`, `*`, `+`, `{N}`, or `{N,M}`, - which have the same meanings as in regular expressions. - -* The syntax `[foo]` is shorthand for `(foo)?`. +* The suffixes `?`, `*`, and `+` have the same meaning as in regular + expressions, although `[foo]` is used in place of `(foo)?`. * The syntax is defined in terms of bytes, not characters. Terminals `'c'` and `"c"` refer to the ASCII value of the given character `c`. Numbers are in decimal and refer to a byte with the given value. -* The `~` prefix means NOT. It only applies to rules that match one +* The prefix `~` means NOT. It only applies to rules that match one byte, and negates them. For example, `~( 'a' | 'b' )` matches any byte other than 97 and 98. @@ -24,11 +22,12 @@ We use a BNF notation with the following rules: * There is no ambiguity, or look-ahead / backtracking beyond one byte. Rules match left to right, depth-first, and greedy. As soon as the - input matches the first terminal of a rule, it must match that rule - to the end or it is considered a syntax error. + input matches the first terminal of a rule (explicit or implied by + recursively descending into the first non-terminal), it must match + that rule to the end, or it is considered a syntax error. -The last rule means that the BNF is very simple to translate to code. -It also probably makes it equivalent to PEG. +The last rule means that the notation is simple to translate to code. +It ostensibly makes the notation equivalent to PEG in expression. The parser consumes one `Unit` from an input stream every time it's called; it returns the `Datum` therein, or EOF. The final optional @@ -36,11 +35,30 @@ called; it returns the `Datum` therein, or EOF. The final optional blank at the end if it finds one; this is because `Datum` is not self-closing so the parser has to check if it goes on. +The following limits are not represented in the grammar: + +* A `UnicodeSV` is the hexadecimal representation of a Unicode scalar + value; it must represent a value in the range 0 to D7FF, or E000 to + 10FFFF, inclusive. Any other value signals an error. Valid values + are converted into a UTF-8 byte sequence encoding the value. + +* A `Rune` longer than 6 bytes is grammatical, but signals an error. + This is important because runes are not self-terminating; defining + their grammar as ending after a maximum of 6 bytes would allow + another datum beginning with an alphabetic character to follow a + rune immediately without any visual delineation, which would be + terribly confusing for a human reader. Consider: `#foo123bar`. + This would parse as a concatenation of `#foo123` and `bar`. + +* A `Label` is the hexadecimal representation of a 48-bit integer, + meaning it allows for a maximum of 12 hexadecimal digits. Longer + values are grammatical, but signal an out-of-range error. + ``` Unit : Blank* [ Datum [Blank] ] -Blank : 9...13 | Comment +Blank : 9...13 | SP | Comment Datum : OneDatum ( [JoinChar] OneDatum )* @@ -56,41 +74,44 @@ SkipLine : ( ~LF )* [LF] OneDatum : BareString | CladDatum + BareString : ( '.' | '+' | '-' | DIGIT ) ( BareChar | '.' )* | BareChar+ -CladDatum : '|' ( PipeStrChar | '\' StringEsc )* '|' - | '"' ( QuotStrChar | '\' StringEsc )* '"' - | '#' HashExpr - | '(' List ')' | '[' List ']' | '{' List '}' - | "'" Datum | '`' Datum | ',' Datum +CladDatum : PipeStr | QuoteStr | HashExpr | QuoteExpr | List +PipeStr : '|' ( PipeStrChar | '\' StringEsc )* '|' +QuoteStr : '"' ( QuotStrChar | '\' StringEsc )* '"' +HashExpr : '#' ( RuneExpr | LabelExpr | HashDatum ) +QuoteExpr : "'" Datum | '`' Datum | ',' Datum +List : ParenList | SquareList | BraceList BareChar : ALPHA | DIGIT | '!' | '$' | '%' | '*' | '+' | '-' | '/' | '<' | '=' | '>' | '?' | '@' | '^' | '_' | '~' - PipeStrChar : ~( '|' | '\' ) - QuotStrChar : ~( '"' | '\' ) -HashExpr : Rune [ '\' BareString | CladDatum ] - | '\' BareString - | '%' Label ( '%' | '=' Datum ) - | CladDatum - -List : Unit* [ Blank* '&' Unit ] Blank* - - StringEsc : '\' | '|' | '"' | ( HTAB | SP )* LF ( HTAB | SP )* | 'a' | 'b' | 't' | 'n' | 'v' | 'f' | 'r' | 'e' - | 'x' ( HEXDIG{2} )+ ';' - | 'u' HEXDIG{1,6} ';' + | 'x' HexByte+ ';' + | 'u' UnicodeSV ';' + +HexByte : HEXDIG HEXDIG +UnicodeSV : HEXDIG+ + +RuneExpr : Rune [ '\' BareString | CladDatum ] +LabelExpr : '%' Label ( '%' | '=' Datum ) +HashDatum : '\' BareString | CladDatum +Rune : ALPHA ( ALPHA | DIGIT )* +Label : HEXDIG+ -Rune : ALPHA ( ALPHA | DIGIT ){0,5} +ParenList : '(' ListBody ')' +SquareList : '[' ListBody ']' +BraceList : '{' ListBody '}' -Label : HEXDIG{1,12} +ListBody : Unit* [ Blank* '&' Unit ] Blank* ``` diff --git a/spec/syntax.peg b/spec/syntax.peg new file mode 100644 index 0000000..97b9632 --- /dev/null +++ b/spec/syntax.peg @@ -0,0 +1,63 @@ +Unit <- Blank* ( Datum Blank? )? + + +Blank <- ' ' / '\t' / '\n' / Comment + +Datum <- OneDatum ( JoinChar? OneDatum )* + +JoinChar <- '.' / ':' + + +Comment <- ';' ( SkipUnit / SkipLine ) + +SkipUnit <- '~' Unit + +SkipLine <- (!'\n' .)* '\n'? + + +OneDatum <- BareString / CladDatum + + +BareString <- ( '.' / '+' / '-' / DIGIT ) ( BareChar / '.' )* + / BareChar+ + +CladDatum <- PipeStr / QuoteStr / HashExpr / QuoteExpr / List + +PipeStr <- '|' ( PipeStrChar / '\' StringEsc )* '|' +QuoteStr <- '"' ( QuotStrChar / '\' StringEsc )* '"' +HashExpr <- '#' ( RuneExpr / LabelExpr / HashDatum ) +QuoteExpr <- "'" Datum / '`' Datum / ',' Datum +List <- ParenList / SquareList / BraceList + +BareChar <- ALPHA / DIGIT + / '!' / '$' / '%' / '*' / '+' + / '-' / '/' / '<' / '=' / '>' + / '?' / '@' / '^' / '_' / '~' + +PipeStrChar <- (![|\\] .) +QuotStrChar <- (!["\\] .) + +StringEsc <- '\' / '|' / '"' / ( HTAB / SP )* LF ( HTAB / SP )* + / 'a' / 'b' / 't' / 'n' / 'v' / 'f' / 'r' / 'e' + / 'x' HexByte+ ';' + / 'u' UnicodeSV ';' + +HexByte <- HEXDIG HEXDIG +UnicodeSV <- HEXDIG+ + +RuneExpr <- Rune [ '\' BareString / CladDatum ] +LabelExpr <- '%' Label ( '%' / '=' Datum ) +HashDatum <- '\' BareString / CladDatum + +Rune <- ALPHA ( ALPHA / DIGIT )* +Label <- HEXDIG+ + +ParenList <- '(' ListBody ')' +SquareList <- '[' ListBody ']' +BraceList <- '{' ListBody '}' + +ListBody <- Unit* [ Blank* '&' Unit ] Blank* + +DIGIT <- [0-9] +ALPHA <- [a-zA-Z] +HEXDIG <- [0-9a-fA-F] diff --git a/spec/syntax.zbnf b/spec/syntax.zbnf new file mode 100644 index 0000000..b87efb5 --- /dev/null +++ b/spec/syntax.zbnf @@ -0,0 +1,59 @@ +Unit : Blank* [ Datum [Blank] ] + + +Blank : 9...13 | SP | Comment + +Datum : OneDatum ( [JoinChar] OneDatum )* + +JoinChar : '.' | ':' + + +Comment : ';' ( SkipUnit | SkipLine ) + +SkipUnit : '~' Unit + +SkipLine : ( ~LF )* [LF] + + +OneDatum : BareString | CladDatum + + +BareString : ( '.' | '+' | '-' | DIGIT ) ( BareChar | '.' )* + | BareChar+ + +CladDatum : PipeStr | QuoteStr | HashExpr | QuoteExpr | List + +PipeStr : '|' ( PipeStrChar | '\' StringEsc )* '|' +QuoteStr : '"' ( QuotStrChar | '\' StringEsc )* '"' +HashExpr : '#' ( RuneExpr | LabelExpr | HashDatum ) +QuoteExpr : "'" Datum | '`' Datum | ',' Datum +List : ParenList | SquareList | BraceList + +BareChar : ALPHA | DIGIT + | '!' | '$' | '%' | '*' | '+' + | '-' | '/' | '<' | '=' | '>' + | '?' | '@' | '^' | '_' | '~' + +PipeStrChar : ~( '|' | '\' ) +QuotStrChar : ~( '"' | '\' ) + +StringEsc : '\' | '|' | '"' | ( HTAB | SP )* LF ( HTAB | SP )* + | 'a' | 'b' | 't' | 'n' | 'v' | 'f' | 'r' | 'e' + | 'x' HexByte+ ';' + | 'u' UnicodeSV ';' + +HexByte : HEXDIG HEXDIG +UnicodeSV : HEXDIG+ + +RuneExpr : Rune [ '\' BareString | CladDatum ] +LabelExpr : '%' Label ( '%' | '=' Datum ) +HashDatum : '\' BareString | CladDatum + +Rune : ALPHA ( ALPHA | DIGIT )* +Label : HEXDIG+ + +ParenList : '(' ListBody ')' +SquareList : '[' ListBody ']' +BraceList : '{' ListBody '}' + +ListBody : Unit* [ Blank* '&' Unit ] Blank* diff --git a/src/test/parse.zig b/src/test/parse.zig index 94ccbe5..272c92a 100644 --- a/src/test/parse.zig +++ b/src/test/parse.zig @@ -44,9 +44,9 @@ test "parse short bare string" { try expect(parse("|x()|").eq(str("x()"))); try expect(parse("|{\\|}|").eq(str("{|}"))); try expect(parse("foobar").eq(str("foobar"))); - try expect(parse("!$%.*+").eq(str("!$%.*+"))); - try expect(parse("-/<=>?").eq(str("-/<=>?"))); - try expect(parse("@^_~00").eq(str("@^_~00"))); + try expect(parse("!$%*+-").eq(str("!$%*+-"))); + try expect(parse("/<=>?@").eq(str("/<=>?@"))); + try expect(parse("^_~000").eq(str("^_~000"))); } test "parse long bare string" { @@ -56,7 +56,7 @@ test "parse long bare string" { try expect(parse("+foo.bar.baz").eq(str("+foo.bar.baz"))); try expect(parse("-foo.bar.baz").eq(str("-foo.bar.baz"))); try expect(parse("0foo.bar.baz").eq(str("0foo.bar.baz"))); - try expect(parse("!$%*+-./<=>?@^_~").eq(str("!$%*+-./<=>?@^_~"))); + try expect(parse("!$%*+-/<=>?@^_~").eq(str("!$%*+-/<=>?@^_~"))); try expect(parse("|foo\\x20;bar\\x0a;baz|").eq(str("foo bar\nbaz"))); } diff --git a/src/zisp/io/Parser.zig b/src/zisp/io/Parser.zig index 0fea5f3..d18150e 100644 --- a/src/zisp/io/Parser.zig +++ b/src/zisp/io/Parser.zig @@ -96,13 +96,14 @@ const real_cons = &value.pair.cons; const fake_cons = &dummyCons; pub const Error = enum { + ReadError, + UnexpectedEof, InvalidCharacter, UnclosedString, - UnexpectedEof, - OutOfRange, - ReadError, UnicodeLengthError, UnicodeEncodeError, + RuneTooLong, + OutOfRange, }; pub const Context = struct { @@ -410,8 +411,7 @@ fn returnContext(p: *Parser) !void { fn parseDatum(p: *Parser) !void { const c = p.getUnread() orelse try p.readNoEof("datum"); if (isBareChar(c) or c == '.') { - const s = try p.parseBareString(c); - return p.jump(.parseJoin, s); + return p.jump(.parseJoin, try p.getBareString(c)); } else { return p.parseCladDatum(c, .parseJoin); } @@ -456,7 +456,7 @@ fn endJoinDatum(p: *Parser) !void { return p.jump(.parseJoin, joined); } -fn parseBareString(p: *Parser, c1: u8) !Value { +fn getBareString(p: *Parser, c1: u8) !Value { const allow_dots = std.ascii.isDigit(c1) or switch (c1) { '.', '+', '-' => true, else => false, @@ -475,8 +475,8 @@ fn parseBareString(p: *Parser, c1: u8) !Value { fn parseCladDatum(p: *Parser, c: u8, next: Fn) !void { return switch (c) { - '|' => p.jump(next, try p.parseString('|')), - '"' => p.jump(next, try p.parseString('"')), + '|' => p.jump(next, try p.getString('|')), + '"' => p.jump(next, try p.getString('"')), '#' => p.parseHashExpr(next), '(', '[', '{' => p.parseList(c, next), '\'', '`', ',' => p.parseQuoteExpr(c, next), @@ -484,7 +484,7 @@ fn parseCladDatum(p: *Parser, c: u8, next: Fn) !void { }; } -fn parseString(p: *Parser, comptime close: u8) !Value { +fn getString(p: *Parser, comptime close: u8) !Value { while (try p.read()) |c| sw: switch (c) { close => { const s = p.getCharsAsString(); @@ -577,12 +577,11 @@ fn parseStringCharEsc(p: *Parser, c: u8) !void { fn parseHashExpr(p: *Parser, next: Fn) !void { const c = try p.readNoEof("hash expression"); if (std.ascii.isAlphabetic(c)) { - const r = try p.parseRune(c); - return p.parseRuneEnd(r, next); + return p.parseRuneEnd(try p.getRune(c), next); } if (c == '\\') { const c1 = try p.readNoEof("bare string after hash"); - return p.jump(next, p.cons(HASH, try p.parseBareString(c1))); + return p.jump(next, p.cons(HASH, try p.getBareString(c1))); } if (c == '%') { return p.parseLabel(next); @@ -602,14 +601,17 @@ fn endHashDatum(p: *Parser) !void { return p.retval(p.cons(HASH, p.result)); } -fn parseRune(p: *Parser, c1: u8) !Value { +fn getRune(p: *Parser, c1: u8) !Value { try p.addChar(c1); var len: usize = 1; while (try p.read()) |c| : (len += 1) { - if (len == 6 or !std.ascii.isAlphanumeric(c)) { + if (!std.ascii.isAlphanumeric(c)) { p.unread(c); return p.getCharsAsRune(); } + if (len == 6) { + return p.err(.RuneTooLong, "rune"); + } try p.addChar(c); } return p.getCharsAsRune(); @@ -619,13 +621,13 @@ fn parseRuneEnd(p: *Parser, r: Value, next: Fn) !void { const c = p.getUnread() orelse return p.jump(next, r); if (c == '\\') { const c1 = try p.readNoEof("bare string at rune end"); - return p.jump(next, p.cons(r, try p.parseBareString(c1))); + return p.jump(next, p.cons(r, try p.getBareString(c1))); } if (c == '"') { - return p.jump(next, p.cons(r, try p.parseString('"'))); + return p.jump(next, p.cons(r, try p.getString('"'))); } if (c == '|') { - return p.jump(next, p.cons(r, try p.parseString('|'))); + return p.jump(next, p.cons(r, try p.getString('|'))); } p.unread(c); switch (c) { @@ -772,8 +774,8 @@ fn checkBlank(p: *Parser, c: u8) !enum { yes, skip_unit, no } { fn isBareChar(c: u8) bool { return switch (c) { // zig fmt: off - 'a'...'z' , 'A'...'Z' , '0'...'9' , '!' , '$' , '%' , '*' ,'+' , - '-' , '.' , '/' , '<' , '=' , '>' , '?' , '@' , '^' , '_' , '~' => true, + 'a'...'z' , 'A'...'Z' , '0'...'9' , '!' , '$' , '%' , '*' , '+' , + '-' , '/' , '<' , '=' , '>' , '?' , '@' , '^' , '_' , '~' , => true, // zig fmt: on else => false, }; |
