diff options
| author | Taylan Kammer <taylan.kammer@gmail.com> | 2025-03-29 23:45:03 +0100 |
|---|---|---|
| committer | Taylan Kammer <taylan.kammer@gmail.com> | 2025-03-29 23:53:26 +0100 |
| commit | fc23b42c6e2183c8ca8b6c42dc4e90d8061f835d (patch) | |
| tree | a88f496bad87f881fcecd247a489438bc0c79779 | |
| parent | 451aa92846b5fd5c8a0739336de3aa26d741d750 (diff) | |
new new parser
| -rw-r--r-- | spec/parser.ebnf | 74 | ||||
| -rw-r--r-- | src/libzisp.zig | 17 | ||||
| -rw-r--r-- | src/libzisp/io.zig | 4 | ||||
| -rw-r--r-- | src/libzisp/io/Parser.zig | 834 | ||||
| -rw-r--r-- | src/libzisp/io/parser.zig | 1131 | ||||
| -rw-r--r-- | src/main.zig | 84 | ||||
| -rw-r--r-- | test-data/parser-torture.scm | 4 |
7 files changed, 939 insertions, 1209 deletions
diff --git a/spec/parser.ebnf b/spec/parser.ebnf index ce7fa83..caa24f3 100644 --- a/spec/parser.ebnf +++ b/spec/parser.ebnf @@ -1,42 +1,33 @@ -unit : empty_unit | datum_unit ; - - -empty_unit : blank* EOF ; - -datum_unit : blank* datum blank? ; +unit : blank* ( datum blank? | EOF ) ; blank : 9...13 | comment ; -datum : join_data | dot_string ; +datum : one_datum ( join_char? one_datum )* ; + +join_char : '.' | ':' ; comment : ';' ( skip_unit | skip_line ) ; skip_unit : '~' unit ; -skip_line : ( ~10 )* 10? ; - - -join_data : one_datum ( join_char? one_datum )* - -join_char : '.' | ':' | '|' ; - -dot_string : '.'{2,} +skip_line : ( ~LF )* LF? ; -one_datum : ( num_string | bare_string | clad_datum ) ; +one_datum : bare_string | clad_datum ; -num_string : ( '+' | '-' )? digit ( bare_str_elt | '.' )* ; - -bare_string : bare_str_elt+ ; +bare_string : ( '.' | '+' | '-' | DIGIT ) ( bare_str_elt | '.' )* + | bare_str_elt+ + ; clad_datum : '\' bare_esc_str - | '"' quoted_str '"' + | '|' pipe_str_elt* '|' + | '"' quot_str_elt* '"' | '#' hash_expr - | '(' blank* list? ')' - | '[' blank* list? ']' - | '{' blank* list? '}' + | '(' list ')' + | '[' list ']' + | '{' list '}' | quote_expr ; @@ -46,44 +37,39 @@ bare_str_elt : bare_char | '\' bare_esc ; bare_esc_str : bare_esc bare_str_elt* ; -quoted_str : ( quoted_char | '\' quoted_esc )* ; +pipe_str_elt : ~( '|' | '\' ) | '\' pipe_esc ; + +quot_str_elt : ~( '"' | '\' ) | '\' quot_esc ; hash_expr : rune clad_datum? - | '%' label ( '%' | '=' blank* datum ) + | '%' label ( '%' | '=' datum ) | clad_datum ; -list : datum_unit+ list_tail? blank* ; +list : unit* ( '.' unit )? blank* ; -list_tail : '.' blank+ datum_unit +quote_expr : ( "'" | "`" | "," ) datum ; -quote_expr : ( "'" | "`" | "," ) blank* datum ; +bare_char : ALPHA | DIGIT | bare_punct ; -bare_char : letter | digit - | '!' | '$' | '%' | '&' | '*' | '+' | '-' | '/' +bare_punct : '!' | '$' | '%' | '&' | '*' | '+' | '-' | '/' | '<' | '=' | '>' | '?' | '@' | '^' | '_' | '~' ; bare_esc : 33...126 ; -quoted_char : ~( '"' | '\' ) ; - -quoted_esc : '\' | '"' | 'a' | 'b' | 'e' - | 'f' | 'n' | 'r' | 't' | 'v' - | 'x' hex_digit{2} - | 'u' '{' hex_digit+ '}' - ; - +pipe_esc : string_esc | '|' ; -rune : letter ( letter | digit ){0,5} ; - -label : hex_digit{1,12} ; +quot_esc : string_esc | '"' ; +string_esc : '\' | 'a' | 'b' | 'e' | 'f' | 'n' | 'r' | 't' | 'v' + | 'x' HEXDIG{2} + | 'u' '{' HEXDIG+ '}' + ; -letter : 'a'...'z' | 'A'...'Z' ; -digit : '0'...'9' ; +rune : ALPHA ( ALPHA | DIGIT ){0,5} ; -hex_digit : '0'...'9' | 'a'...'f' | 'A'...'F' ; +label : HEXDIG{1,12} ; diff --git a/src/libzisp.zig b/src/libzisp.zig index f6eb77c..ced15f0 100644 --- a/src/libzisp.zig +++ b/src/libzisp.zig @@ -399,20 +399,29 @@ fn parseBench(path: []const u8, iters: usize) !void { const file = try std.fs.cwd().openFile(path, .{}); defer file.close(); + var fb_alloc: std.mem.Allocator = undefined; + var stack_sfa: io.parser.DefaultStackSfa = undefined; + var chars_sfa: io.parser.DefaultCharsSfa = undefined; + var parser = try io.parser.default(&fb_alloc, &stack_sfa, &chars_sfa); + defer parser.deinit(); + defer io.parser.deinit(&fb_alloc); + var timer = try std.time.Timer.start(); for (0..iters) |i| { _ = i; - // var br = std.io.bufferedReader(file.reader()); - // const reader = br.reader().any(); - const reader = file.reader().any(); + var br = std.io.bufferedReader(file.reader()); + const reader = br.reader().any(); + // const reader = file.reader().any(); var v: Value = undefined; while (true) { - v = io.parser._parse(reader) catch |e| { + // std.debug.print("hihi {s}\n", .{path}); + v = parser.run(reader) catch |e| { std.debug.print("\nfile pos: {}\n", .{ try file.getPos(), }); return e; }; + // try io.unparser.unparse(std.io.getStdOut().writer().any(), v); if (value.eof.check(v)) { break; } diff --git a/src/libzisp/io.zig b/src/libzisp/io.zig index 3d6d384..8adf495 100644 --- a/src/libzisp/io.zig +++ b/src/libzisp/io.zig @@ -6,3 +6,7 @@ pub const encoder = @import("io/encoder.zig"); pub const reader = @import("io/reader.zig"); pub const writer = @import("io/writer.zig"); + +pub const parse = parser.parse; + +// pub const defaultUnparser = ... diff --git a/src/libzisp/io/Parser.zig b/src/libzisp/io/Parser.zig new file mode 100644 index 0000000..d9eeca9 --- /dev/null +++ b/src/libzisp/io/Parser.zig @@ -0,0 +1,834 @@ +// === Syntax === +// +// See docs/parser.md and spec/syntax.bnf to understand the syntax. +// +// +// === Trampolining strategy === +// +// Instead of using recursion directly, the parser is written in a "trampoline" +// style, which ensures that parse depth is not limited by stack size. +// +// All state and context is passed around via a struct pointer. The parser has +// a main loop, which calls a function as dictated by parser.context.next, and +// the function may update the state to have another function called next. +// +// If a function wants to call a recursive subroutine, it pushes some of the +// current context onto a stack, including what function the subroutine should +// return to, and then updates the state to instruct the main loop to call one +// of the entry point subroutines. +// +// If a function wants to make the parser return, either from a subroutine or +// from the main loop, it sets the .result field, and tries to pop the saved +// context. If the context stack was empty, the main loop returns. +// +// +// === Buffering === +// +// For efficiency, call the parser on an input stream with implicit buffering. +// +// The parser does not use its own buffer, beyond one character that may be +// written back into the unused_char field, which is checked at the end to +// ensure it's nothing other than a trailing blank or comment. +// +// This lack of buffering is to ensure that the parser never reads more bytes +// from the input than what it needs to parse a datum. Consider the input: +// +// (a b c) (x y z) +// +// If we used proper buffering, like reading up to 4K bytes per read, then the +// whole stream would be consumed at once before it's parsed. Then, the parser +// would return the first datum, and the rest of the stream would be lost. The +// parser would need some way to reset the input stream's read head to the end +// of the first datum, but not all stream types may support this. +// +// Although in a scenario like this it would be wise to create a single Parser +// instance to call multiple times (in which case it could continue using its +// internal buffer from where it left), this may not always be practical. +// +// One could even have an input stream with Zisp s-expressions and other data +// intertwined, in which case this lack of buffering is also crucial, since one +// needs to alternate between calling the Zisp parser and some other parser on +// the same input stream. +// + +const builtin = @import("builtin"); +const std = @import("std"); + +const lib = @import("../lib.zig"); +const value = @import("../value.zig"); + +const List = std.ArrayListUnmanaged; + +const Value = value.Value; + +const Parser = @This(); + +const is_test = builtin.is_test; +const is_debug = builtin.mode == .Debug; +const detailed_debug = false; + +// zig fmt: off +const DOT = value.rune.pack("DOT"); +const COLON = value.rune.pack("COLON"); +const JOIN = value.rune.pack("JOIN"); +const LABEL = value.rune.pack("LABEL"); +const HASH = value.rune.pack("HASH"); +const QUOTE = value.rune.pack("QUOTE"); +const GRAVE = value.rune.pack("GRAVE"); +const COMMA = value.rune.pack("COMMA"); +const SQUARE = value.rune.pack("SQUARE"); +const BRACE = value.rune.pack("BRACE"); +const VOID = value.rune.packForced(""); +const LSTAIL = value.sstr.pack("."); +// zig fmt: on + +const Cons = *const fn (v1: Value, v2: Value) Value; + +fn dummyCons(v1: Value, v2: Value) Value { + _ = v1; + _ = v2; + return value.undef; +} + +const real_cons = &value.pair.cons; +const fake_cons = &dummyCons; + +pub const ParseError = enum { + InvalidCharacter, + UnclosedString, + UnexpectedEof, + UnicodeError, + RuneTooLong, + OutOfMemory, + OutOfRange, + ReadError, +}; + +const Fn = *const fn (*Parser) anyerror!void; + +pub const Context = struct { + // What to do next. + next: ?Fn = undefined, + // For storing a context value, like accumulated list elements. + val: Value = undefined, + // For storing a context char, like list opening bracket. + char: u8 = undefined, +}; + +// parameters +stack_alloc: std.mem.Allocator, +chars_alloc: std.mem.Allocator, + +// state +input: std.io.AnyReader = undefined, +context: Context = .{}, +stack: List(Context) = undefined, +chars: List(u8) = undefined, +cons: Cons = real_cons, +result: Value = undefined, +unused_char: ?u8 = null, +err_msg: []const u8 = undefined, + +pub fn init( + stack_alloc: std.mem.Allocator, + chars_alloc: std.mem.Allocator, + init_stack_mem: usize, + init_chars_mem: usize, +) !Parser { + var p: Parser = .{ + .stack_alloc = stack_alloc, + .chars_alloc = chars_alloc, + }; + const csize = @sizeOf(Context); + p.stack = try .initCapacity(stack_alloc, init_stack_mem / csize); + p.chars = try .initCapacity(chars_alloc, init_chars_mem); + return p; +} + +pub fn deinit(p: *Parser) void { + p.stack.deinit(p.stack_alloc); + p.chars.deinit(p.chars_alloc); +} + +// +// Input reading & unreading +// + +fn read(p: *Parser) !?u8 { + if (is_debug and p.unused_char != null) { + @panic("Called read() while there was an unused character!"); + } + const c = p.input.readByte() catch |e| switch (e) { + error.EndOfStream => return null, + else => return p.err(.ReadError, "???"), + }; + if (detailed_debug) { + std.debug.print("{c}", .{c}); + } + return c; +} + +fn readNoEof(p: *Parser, comptime emsg: []const u8) !u8 { + return if (try p.read()) |c| c else p.err(.UnexpectedEof, emsg); +} + +fn unread(p: *Parser, c: u8) void { + p.unused_char = c; +} + +fn getUnused(p: *Parser) ?u8 { + if (p.unused_char) |c| { + p.unused_char = null; + return c; + } + return null; +} + +// +// String accumulation & creation +// + +fn addChar(p: *Parser, c: u8) !void { + try p.chars.append(p.chars_alloc, c); +} + +fn getBareString(p: *Parser) Value { + defer p.chars.clearRetainingCapacity(); + return if (p.chars.items.len <= 6) + value.sstr.pack(p.chars.items) + else + value.istr.intern(p.chars.items, false); +} + +fn getQuotedString(p: *Parser) Value { + defer p.chars.clearRetainingCapacity(); + return if (p.chars.items.len <= 6) + value.sstr.packQuoted(p.chars.items) + else + value.istr.intern(p.chars.items, true); +} + +fn getRune(p: *Parser) Value { + defer p.chars.clearRetainingCapacity(); + return value.rune.pack(p.chars.items); +} + +// +// Stack management & control flow +// + +pub fn run(p: *Parser, input: std.io.AnyReader) !Value { + p.input = input; + p.context.next = &parseUnit; + while (p.context.next) |next| { + if (detailed_debug) printStack(p); + try next(p); + } + if (p.getUnused()) |_| { + return p.err(.InvalidCharacter, "top-level"); + } + return p.result; +} + +fn printStack(p: *Parser) void { + const stack = p.stack.items; + std.debug.print("\n\n{}:{} ctx:'{c}' unused:'{c}' \n", .{ + stack.len, + p.context.next, + p.context.char, + p.unused_char orelse '_', + }); + if (stack.len > 0) { + var i = stack.len; + while (i > 0) : (i -= 1) { + const prev = stack[i - 1]; + std.debug.print("{}:{} ctx:'{c}'\n", .{ + i - 1, + prev.next, + prev.char, + }); + } + } +} + +fn err( + p: *Parser, + comptime e: ParseError, + comptime msg: []const u8, +) error{ParseError} { + p.err_msg = @tagName(e) ++ " at: " ++ msg; + return error.ParseError; +} + +fn push(p: *Parser, next: Fn) !void { + try p.stack.append(p.stack_alloc, .{ .next = next }); +} + +fn pushContext(p: *Parser, next: Fn) !void { + try p.stack.append(p.stack_alloc, .{ + .next = next, + .val = p.context.val, + .char = p.context.char, + }); +} + +fn ret(p: *Parser) void { + if (p.stack.pop()) |c| { + p.context = c; + } else { + p.context.next = null; + } +} + +fn subr(p: *Parser, start: Fn, next: Fn) !void { + try p.pushContext(next); + p.context.next = start; +} + +fn jump(p: *Parser, next: Fn, val: ?Value) void { + if (val) |v| p.result = v; + p.context.next = next; +} + +fn abort(p: *Parser, next: Fn, unused_c: u8) void { + p.result = VOID; + p.unused_char = unused_c; + p.context.next = next; +} + +fn retval(p: *Parser, val: Value) void { + p.result = val; + p.ret(); +} + +// +// Start parser functions +// + +fn parseUnit(p: *Parser) !void { + var c1 = p.getUnused() orelse try p.read(); + while (c1) |c| : (c1 = try p.read()) { + switch (try checkBlanks(p, c)) { + .yes => {}, + .skip_unit => try p.push(&parseUnit), + .no => { + p.unread(c); + return p.subr(&parseDatum, &endUnit); + }, + } + } + return p.retval(value.eof.eof); +} + +fn endUnit(p: *Parser) !void { + if (p.getUnused()) |c| { + switch (try checkBlanks(p, c)) { + .yes => {}, + .skip_unit => return skipUnitAndReturn(p), + .no => p.unread(c), + } + } + return p.ret(); +} + +fn skipUnitAndReturn(p: *Parser) !void { + p.context.val = p.result; + return p.subr(&parseUnit, &returnContext); +} + +fn returnContext(p: *Parser) !void { + return p.retval(p.context.val); +} + +fn parseDatum(p: *Parser) !void { + return parseOneDatum(p, p.getUnused().?, &endFirstDatum); +} + +fn endFirstDatum(p: *Parser) !void { + if (p.result.eq(VOID)) { + return p.ret(); + } + return parseJoin(p); +} + +fn parseJoin(p: *Parser) !void { + const c = p.getUnused() orelse try p.read() orelse return p.ret(); + switch (c) { + '.', ':' => { + p.context.char = c; + p.unread(try p.readNoEof("join datum")); + }, + else => { + p.context.char = 0; + p.unread(c); + }, + } + p.context.val = p.result; + return p.subr(&parseDatum, &endJoinDatum); +} + +fn endJoinDatum(p: *Parser) !void { + const prev = p.context.val; + const join = p.context.char; + if (p.result.eq(VOID)) { + if (join == 0) { + return p.retval(prev); + } else { + return p.err(.InvalidCharacter, "join datum"); + } + } + const rune = switch (join) { + 0 => JOIN, + '.' => DOT, + ':' => COLON, + else => unreachable, + }; + const joined = p.cons(rune, p.cons(prev, p.result)); + return p.jump(&parseJoin, joined); +} + +fn parseOneDatum(p: *Parser, c: u8, next: Fn) !void { + if (isBareChar(c) or c == '.') { + return p.jump(next, try parseBareString(p, c)); + } + return parseCladDatum(p, c, next); +} + +fn parseBareString(p: *Parser, c: u8) !Value { + const allow_dots = std.ascii.isDigit(c) or switch (c) { + '.', '+', '-' => true, + else => false, + }; + try p.addChar(c); + return parseBareStringRest(p, allow_dots); +} + +fn parseBareEscString(p: *Parser) !Value { + try p.addChar(try parseBareEsc(p)); + return parseBareStringRest(p, false); +} + +fn parseBareStringRest(p: *Parser, allow_dots: bool) !Value { + while (try p.read()) |c| { + if (isBareChar(c) or (allow_dots and c == '.')) { + try p.addChar(c); + } else if (c == '\\') { + try p.addChar(try parseBareEsc(p)); + } else { + p.unread(c); + break; + } + } + return p.getBareString(); +} + +fn parseBareEsc(p: *Parser) !u8 { + const c = try p.readNoEof("bare escape"); + if (isBareEsc(c)) { + return c; + } else { + return p.err(.InvalidCharacter, "bare escape"); + } +} + +fn parseCladDatum(p: *Parser, c: u8, next: Fn) !void { + if (c == '\\') { + return p.jump(next, try parseBareEscString(p)); + } + if (c == '"') { + return p.jump(next, try parseQuotedString(p, '"')); + } + if (c == '|') { + return p.jump(next, try parseQuotedString(p, '|')); + } + return switch (c) { + '#' => parseHashExpression(p, next), + '(', '[', '{' => parseList(p, c, next), + '\'', '`', ',' => parseQuoteExpr(p, c, next), + else => p.abort(next, c), + }; +} + +fn parseQuotedString(p: *Parser, close: u8) !Value { + while (try p.read()) |c| { + if (c == close) { + return p.getQuotedString(); + } + if (c != '\\') { + try p.addChar(c); + } else { + try parseQuotedEsc(p, close); + } + } + return error.UnclosedString; +} + +fn parseQuotedEsc(p: *Parser, close: u8) !void { + const c = try p.readNoEof("quoted escape"); + if (c == close) { + return p.addChar(close); + } + if (c == 'u') { + return parseUniHexHandleErrors(p); + } + try p.addChar(switch (c) { + '\\' => c, + '0' => 0, + 'a' => 7, + 'b' => 8, + 't' => 9, + 'n' => 10, + 'v' => 11, + 'f' => 12, + 'r' => 13, + 'e' => 27, + 'x' => try parseHexByte(p, "hex escape"), + else => return p.err(.InvalidCharacter, "quoted escape"), + }); +} + +fn parseUniHexHandleErrors(p: *Parser) !void { + return parseUniHex(p) catch |e| switch (e) { + error.Utf8CannotEncodeSurrogateHalf => p.err( + .UnicodeError, + "unicode escape", + ), + else => e, + }; +} + +fn parseUniHex(p: *Parser) !void { + const msg = "unicode escape"; + + if (try p.readNoEof(msg) != '{') { + return p.err(.InvalidCharacter, msg); + } + + const uc = try parseHex(p, u21, msg); + if (p.getUnused()) |c| { + if (c != '}') { + return p.err(.InvalidCharacter, msg); + } + } else { + return p.err(.UnexpectedEof, msg); + } + + const n = try std.unicode.utf8CodepointSequenceLength(uc); + const buf = try p.chars.addManyAsSlice(p.chars_alloc, n); + _ = try std.unicode.utf8Encode(uc, buf); +} + +fn parseHashExpression(p: *Parser, next: Fn) !void { + const c = try p.readNoEof("hash expression"); + if (try checkBlanks(p, c) != .no) { + return p.err(.InvalidCharacter, "hash expression"); + } + if (std.ascii.isAlphabetic(c)) { + const r = try parseRune(p, c); + return parseRuneEnd(p, r, next); + } + if (c == '%') { + const l = try parseLabel(p); + return parseLabelEnd(p, l, next); + } + if (isBareChar(c)) { + // Reserved for future extensions to syntax sugar. + return p.err(.InvalidCharacter, "hash expression"); + } + // fast-path to avoid subr + if (c == '\\') { + return p.jump(next, p.cons(HASH, try parseBareEscString(p))); + } + if (c == '"') { + return p.jump(next, p.cons(HASH, try parseQuotedString(p, '"'))); + } + if (c == '|') { + return p.jump(next, p.cons(HASH, try parseQuotedString(p, '|'))); + } + p.unread(c); + return p.subr(&parseHashDatum, next); +} + +fn parseHashDatum(p: *Parser) !void { + return parseCladDatum(p, p.getUnused().?, &endHashDatum); +} + +fn endHashDatum(p: *Parser) !void { + if (p.result.eq(VOID)) { + return p.err(.InvalidCharacter, "hash datum"); + } + return p.retval(p.cons(HASH, p.result)); +} + +fn parseRune(p: *Parser, c1: u8) !Value { + try p.addChar(c1); + var len: usize = 1; + while (try p.read()) |c| : (len += 1) { + if (len == 6 or !std.ascii.isAlphanumeric(c)) { + p.unread(c); + return p.getRune(); + } + try p.addChar(c); + } + return p.getRune(); +} + +fn parseRuneEnd(p: *Parser, r: Value, next: Fn) !void { + const c = p.getUnused() orelse return p.jump(next, r); + if (c == '\\') { + return p.jump(next, p.cons(r, try parseBareString(p, c))); + } + if (c == '"') { + return p.jump(next, p.cons(r, try parseQuotedString(p, '"'))); + } + if (c == '|') { + return p.jump(next, p.cons(r, try parseQuotedString(p, '|'))); + } + p.unread(c); + switch (c) { + '#', '(', '[', '{', '\'', '`', ',' => { + try p.push(next); + p.context.val = r; + // Use jump to prevent recursion. + return p.jump(&parseRuneDatum, null); + }, + else => return p.jump(next, r), + } +} + +fn parseRuneDatum(p: *Parser) !void { + return parseCladDatum(p, p.getUnused().?, &endRuneDatum); +} + +fn endRuneDatum(p: *Parser) !void { + if (p.result.eq(VOID)) { + p.retval(p.context.val); + } + return p.retval(p.cons(p.context.val, p.result)); +} + +fn parseLabel(p: *Parser) !Value { + const label = try parseHex(p, u48, "datum label"); + return value.fixnum.pack(label); +} + +fn parseLabelEnd(p: *Parser, l: Value, next: Fn) !void { + const c = p.getUnused() orelse return p.err(.UnexpectedEof, "datum label"); + if (c == '%') { + return p.jump(next, p.cons(LABEL, l)); + } + if (c == '=') { + try p.push(next); + p.context.val = l; + return p.subr(&parseDatum, &endLabelDatum); + } + return p.err(.InvalidCharacter, "datum label"); +} + +fn endLabelDatum(p: *Parser) !void { + if (p.result.eq(VOID)) { + return p.err(.InvalidCharacter, "label datum"); + } + return p.retval(p.cons(LABEL, p.cons(p.context.val, p.result))); +} + +fn parseList(p: *Parser, open: u8, next: Fn) !void { + const head = switch (open) { + '(' => value.nil.nil, + '[' => p.cons(SQUARE, value.nil.nil), + '{' => p.cons(BRACE, value.nil.nil), + else => unreachable, + }; + const close: u8 = switch (open) { + '(' => ')', + '[' => ']', + '{' => '}', + else => unreachable, + }; + while (try p.read()) |c| { + if (c == close) { + return p.jump(next, head); + } + switch (try checkBlanks(p, c)) { + .yes => {}, + .skip_unit => { + try listParserSetup(p, head, close, next); + return p.subr(&parseUnit, &parseUnit); + }, + .no => { + try listParserSetup(p, head, close, next); + p.unread(c); + return p.jump(&parseDatum, null); + }, + } + } + return p.err(.UnexpectedEof, "list"); +} + +fn listParserSetup(p: *Parser, head: Value, close: u8, next: Fn) !void { + try p.push(next); + p.context.val = head; + p.context.char = close; + try p.pushContext(&continueList); +} + +fn continueList(p: *Parser) !void { + const close = p.context.char; + + if (p.result.eq(VOID)) { + const c = p.getUnused().?; + if (c == close) { + return endList(p); + } + return p.err(.InvalidCharacter, "list"); + } + + if (p.result.eq(LSTAIL)) { + return p.subr(&parseUnit, &endImproperList); + } + + p.context.val = p.cons(p.result, p.context.val); + + var c1 = p.getUnused() orelse try p.read(); + while (c1) |c| : (c1 = try p.read()) { + if (c == close) { + return endList(p); + } + switch (try checkBlanks(p, c)) { + .yes => {}, + .skip_unit => { + try p.pushContext(&continueList); + return p.subr(&parseUnit, &parseUnit); + }, + .no => { + p.unread(c); + return p.subr(&parseDatum, &continueList); + }, + } + } + return p.err(.UnexpectedEof, "list"); +} + +fn endList(p: *Parser) !void { + return p.retval(lib.list.reverse(p.context.val)); +} + +fn endImproperList(p: *Parser) !void { + if (p.result.eq(VOID)) { + return p.err(.InvalidCharacter, "list tail"); + } + p.context.val = lib.list.reverseWithTail(p.context.val, p.result); + return closeImproperList(p); +} + +fn closeImproperList(p: *Parser) !void { + const result = p.context.val; + const close = p.context.char; + var c1 = p.getUnused() orelse try p.read(); + while (c1) |c| : (c1 = try p.readNoEof("after list tail")) { + if (c == close) { + return p.retval(result); + } + switch (try checkBlanks(p, c)) { + .yes => {}, + .skip_unit => return p.subr(&parseUnit, &closeImproperList), + .no => return p.err(.InvalidCharacter, "after list tail"), + } + } + unreachable; +} + +fn parseQuoteExpr(p: *Parser, c1: u8, next: Fn) !void { + const q = switch (c1) { + '\'' => QUOTE, + '`' => GRAVE, + ',' => COMMA, + else => unreachable, + }; + + // fast-path to avoid subr + const c = try p.readNoEof("quote expression"); + if (isBareChar(c) or c == '\\') { + return p.jump(next, p.cons(q, try parseBareString(p, c))); + } + + try p.push(next); + p.context.val = q; + p.unread(c); + return p.subr(&parseDatum, &endQuoteExpr); +} + +fn endQuoteExpr(p: *Parser) !void { + if (p.result.eq(VOID)) { + return p.err(.InvalidCharacter, "quote expression datum"); + } + return p.retval(p.cons(p.context.val, p.result)); +} + +// Helpers + +fn checkBlanks(p: *Parser, c: u8) !enum { yes, skip_unit, no } { + switch (c) { + '\t'...'\r', ' ' => return .yes, + ';' => { + const c2 = try p.read() orelse return .yes; + if (c2 == '\n') return .yes; + if (c2 == '~') return .skip_unit; + while (try p.read()) |c3| if (c3 == '\n') break; + return .yes; + }, + else => return .no, + } +} + +fn isBareChar(c: u8) bool { + return switch (c) { + // zig fmt: off + 'a'...'z' , 'A'...'Z' , '0'...'9' , '!' , '$' , '%' , '&' , '*' , + '+' , '-' , '/' , '<' , '=' , '>' , '?' , '@' , '^' , '_' , '~' => true, + // zig fmt: on + else => false, + }; +} + +fn isBareEsc(c: u8) bool { + return switch (c) { + 33...126 => true, + else => false, + }; +} + +fn parseHex(p: *Parser, T: type, comptime emsg: []const u8) !T { + var uc: T = undefined; + + const c1 = try p.readNoEof(emsg); + uc = try parseHexDigit(p, c1, emsg); + + while (try p.read()) |c| { + if (!std.ascii.isHex(c)) { + p.unread(c); + return uc; + } + const shl = std.math.shlExact; + uc = shl(T, uc, 4) catch return p.err(.OutOfRange, emsg); + uc |= try parseHexDigit(p, c, emsg); + } + return uc; +} + +fn parseHexByte(p: *Parser, comptime emsg: []const u8) !u8 { + const h1 = try p.readNoEof(emsg); + const h2 = try p.readNoEof(emsg); + const hi = try parseHexDigit(p, h1, emsg); + const lo = try parseHexDigit(p, h2, emsg); + return hi << 4 | lo; +} + +fn parseHexDigit(p: *Parser, c: u8, comptime emsg: []const u8) !u8 { + return switch (c) { + '0'...'9' => c - '0', + 'A'...'F' => c - 'A' + 10, + 'a'...'f' => c - 'a' + 10, + else => p.err(.InvalidCharacter, emsg), + }; +} diff --git a/src/libzisp/io/parser.zig b/src/libzisp/io/parser.zig index 99bf943..bd45ae1 100644 --- a/src/libzisp/io/parser.zig +++ b/src/libzisp/io/parser.zig @@ -1,1123 +1,80 @@ -// -// === Parser for Code & Data === -// -// Zisp s-expressions are defined in terms of an extremely minimal set of data -// types; only that which is necessary to build representations of more complex -// expressions and data types: -// -// +---------+-------------+---------------+--------+---------+--------+ -// | TYPE | Bare String | Quoted String | Rune | Pair | Nil | -// +---------+-------------+---------------+--------+---------+--------+ -// | EXAMPLE | foobar | "foo bar" | #name | (X . Y) | () | -// +---------+-------------+---------------+--------+---------+--------+ -// -// Bare strings and quoted strings are polymorphic sub-types of the generic -// string type. This lets the compiler distinguish between identifiers and -// string literals. -// -// There are also non-negative integers, but they are only used for datum -// labels; regular number literals are handled by the "decoder" (see below). -// -// The parser recognizes various "syntax sugar" and transforms it into uses of -// the above data types. The most ubiquitous example is of course the list: -// -// (datum1 datum2 ...) -> (datum1 . (datum2 . (... . ()))) -// -// The following table summarizes the other supported transformations: -// -// #datum -> (#HASH . datum) #rune(...) -> (#rune ...) -// -// [...] -> (#SQUARE ...) dat1dat2 -> (#JOIN dat1 . dat2) -// -// {...} -> (#BRACE ...) dat1.dat2 -> (#DOT dat1 . dat2) -// -// 'datum -> (#QUOTE . datum) dat1:dat2 -> (#COLON dat1 . dat2) -// -// `datum -> (#GRAVE . datum) dat1|dat2 -> (#PIPE dat1 . dat2) -// -// ,datum -> (#COMMA . datum) #%n=datum -> (#LABEL hex . datum) -// -// #%n -> (#LABEL . hex) -// -// A separate process called "decoding" can transform these objects into other -// data types. For example, (#HASH x y z) could become a vector, so that the -// expression #(x y z) works just like in Scheme. -// -// Decoding also resolves datum labels, and goes over bare strings to find ones -// that are actually a number literal. This lets us offload the complexity of -// number parsing elsewhere, so the parser remains extremely simple. -// -// Further notes about the syntax sugar table and examples above: -// -// * The terms datum, dat1, and dat2 each refer to an arbitrary datum; ellipsis -// means zero or more data; hex is a hexadecimal number of up to 12 digits. -// -// * The #datum form only applies to expressions that cannot be mistaken for a -// rune, such as #(...) or #"..." or #'string etc.; a hash sign followed by a -// bare string will parse as a rune instead (or raise an error if it contains -// characters that are illegal in rune names). -// -// * A backslash causes the immediately following character to lose any special -// meaning it would normally have, and be considered as part of a bare string -// instead. (This does not apply to whitespace.) For example, the following -// three character sequences are each a valid bare string: -// -// foo\(bar\) \]blah \#\'xyz -// -// This can be used to follow a hash with a bare string without it being -// parsed as a rune: -// -// #\foo -> (#HASH . foo) -// -// * Though not represented in the table due to notational difficulty, the -// format "#rune(...)" doesn't require a list in the second position; any -// datum works, so long as there's no ambiguity; for example: -// -// #rune1#rune2 -> (#rune1 . #rune2) -// -// #rune"text" -> (#rune . "text") -// -// #rune'string -> (#rune #QUOTE . string) -// -// As a counter-example, following a rune immediately with a bare string is -// not possible, since it's ambiguous: -// -// #abcdefgh ;Could be (#abcdef . gh) or (#abcde . fgh) or ... -// -// The parser will see this as an attempt to use an 8-letter rune name, and -// raise an error, since rune names are limited to 6 characters. -// -// * Syntax sugar can combine arbitrarily; some examples follow: -// -// #{...} -> (#HASH #BRACE ...) -// -// #'foo -> (#HASH #QUOTE . foo) -// -// ##'[...] -> (#HASH #HASH #QUOTE #SQUARE ...) -// -// {x y}[i j] -> (#JOIN (#BRACE x y) #SQUARE i j) -// -// foo.bar.baz{x y} -> (#JOIN (#DOT (#DOT foo . bar) . baz) #BRACE x y) -// -// * While 'foo parses as (quote foo) in Scheme, it parses as (#QUOTE . foo) in -// Zisp; the operand of #QUOTE is the entire cdr. The same principle is used -// when parsing other sugar; some examples follow: -// -// Incorrect Correct -// -// #(x y z) -> (#HASH (x y z)) #(x y z) -> (#HASH x y z) -// -// [x y z] -> (#SQUARE (x y z)) [x y z] -> (#SQUARE x y z) -// -// #{x} -> (#HASH (#BRACE (x))) #{x} -> (#HASH #BRACE x) -// -// foo(x y) -> (#JOIN foo (x y)) foo(x y) -> (#JOIN foo x y) -// -// * Runes are case-sensitive, and the parser only emits runes using upper-case -// letters when expressing syntax sugar. This way, there can be no accidental -// clash with runes that appear verbatim in code, as long as only lower-case -// letters are used for rune literals in code. -// -// -// === Decoding === -// -// A separate process called "decoding" can transform simple data structures, -// consisting of only the above types, into a richer set of Zisp data types. -// -// For example, the decoder may turn (#HASH ...) into a vector, as one would -// expect a vector literal like #(...) to work in Scheme. -// -// Runes may be decoded in isolation as well, rather than transforming a list -// whose head they appear in. This can implement #true and #false. -// -// The decoder may also perform arbitrary transforms on any type; for example, -// it may turn bare strings into numbers wher appropriate. This can implement -// number literals. -// -// The decoder recognizes (#QUOTE ...) to implement the traditional quoting -// mechanism, but with a significant difference: -// -// Traditional quote is "unhygienic" in Scheme terms. An expression such as -// '(foo bar) will always be read as (quote (foo bar)) regardless of what sort -// of lexical context it appears in, so the semantics will depend on whatever -// the identifier "quote" is bound to in that lexical context, meaning that the -// expression may end up evaluating to something other than the list (foo bar). -// -// The Zisp decoder, which transforms not text to text, but objects to objects, -// can turn (#QUOTE ...) into an abstract object which encapsulates the notion -// of quoting, which the Zisp evaluator can recognize and act upon, ensuring -// that an expression like '(foo bar) always turns into the list (foo bar). -// -// One way to think about this, in Scheme (R6RS / syntax-case) terms, is that -// expressions like '(foo bar) turn directly into a *syntax object* when read, -// rather than a regular list object. -// -// The decoder is, of course, configurable and extensible. The transformations -// mentioned above would be performed only when it's told to decode data which -// represents Zisp code. The decoder may be given a different configuration, -// telling it to decode, for example, data which represents a different kind of -// domain-specific data, such as application settings, build system commands, -// complex data records with non-standard data types, and so on. -// -// -// === Trampolining strategy === -// -// Instead of using recursion directly, the parser is written in a "trampoline" -// style, which ensures that parsing depth is not limited by the stack size. -// -// All state and context is passed around via a struct pointer. The parser has -// a main loop, which calls a function as dictated by state.context.next, and -// the function may update the state to have another function called next. -// -// If a function wants to call the parser recursively, it pushes some of the -// current context onto a stack, including what function the recursive parser -// should "return" to, and then updates the state to instruct the main loop to -// call one of the starting functions. -// -// If a function wants to make the parser return, either from a recursive parse -// or from the main loop, it sets the .result field, and tries to pop the saved -// context. If the context stack was empty, the main loop returns. -// -// While it's possible to just set .next and return, to make the main loop call -// another function next (possibly even setting .result to pass a value to it), -// this is completely unnecessary. A few non-recursive calls won't blow the -// stack. It's only recursive parsing that we use the above strategy for. -// -// -// === .start_parse VS .start_datum === -// -// The difference between .start_parse and .start_datum is that the former will -// allow whitespace and comments at the beginning, whereas the latter expects a -// datum immediately without anything else in front of it. -// -// When calling the parser recursively, it may seem logical to always make it -// start with .start_datum, because we already cleared whitespace and comments -// out of the way. However, in some cases, we must use .start_parse instead. -// -// This is because of datum comments. When one appears, we start a recursive -// parser, but instead of making it return to a function that will consume the -// result, we make it return to the original starting point, so the result is -// ignored and the parser retries what it was originally doing. If we always -// used .start_datum, this would never allow whitespace after a datum comment, -// since we would be back at .start_datum after the comment is out of the way: -// -// (foo #;bar baz) ;must use .start_parse at the start of each element -// -// -// === List parsing strategy === -// -// When it comes to pairs and lists, we basically treat everything as a list, -// and a pair is just seen as the shortest possible improper list. This saves -// memory: If we implemented list parsing as pair parsing, we would be calling -// the parser recursively, deeper and deeper, for every list element. Though -// we're not limited by stack space thanks to the strategy described above, it -// would still waste memory and the time it takes to allocate memory. -// -// -// === Buffering === -// -// We use a small circular buffer just so we can backtrack. We read bytes into -// this buffer one by one, because we don't want to consume more bytes from the -// input than what we actually parse. Consider the following input stream: -// -// (a b c) (x y z) -// -// If we used proper buffering, like reading up to 4K bytes per read, then the -// whole stream would be consumed at once before it's parsed. Then, the parser -// would return the first datum, and the rest of the stream would be lost. The -// parser would need some way to reset the input stream's read head to the end -// of the first datum, but not all stream types may support this. -// -// For efficiency, call the parser on an input stream with implicit buffering. -// -// -// === Notation used in comments === -// -// Some comments throughout the file give you an example of where the parser -// currently might be in a stream. These illustrations use the pipe symbol, -// which looks like a cursor, to indicate the current position: -// -// (foo| bar baz) <- parser arrived at the end of the string foo -// -// (foo bar baz)| <- parser reached EOF (assuming no trailing spaces) -// - const builtin = @import("builtin"); const std = @import("std"); -const lib = @import("../lib.zig"); const value = @import("../value.zig"); -const List = std.ArrayListUnmanaged; +const Parser = @import("Parser.zig"); const Value = value.Value; -const cons = value.pair.cons; - const is_test = builtin.is_test; const is_debug = builtin.mode == .Debug; -const detailed_debug = false; - // In debug, we want to see if we leak, so very small numbers. -const init_stack_capacity = if (is_debug) 2 else 32; -const init_chars_capacity = if (is_debug) 2 else 2048; - -// zig fmt: off -const DOT = value.rune.pack("DOT"); -const COLON = value.rune.pack("COLON"); -const PIPE = value.rune.pack("PIPE"); -const JOIN = value.rune.pack("JOIN"); -const LABEL = value.rune.pack("LABEL"); -const HASH = value.rune.pack("HASH"); -const QUOTE = value.rune.pack("QUOTE"); -const GRAVE = value.rune.pack("GRAVE"); -const COMMA = value.rune.pack("COMMA"); -const SQUARE = value.rune.pack("SQUARE"); -const BRACE = value.rune.pack("BRACE"); -const VOID = value.rune.packForced(""); -const LSTAIL = value.rune.packForced("."); -// zig fmt: on - -const Context = struct { - // What to do next. - next: Fn = .parse_unit, - // For storing a context value, like accumulated list elements. - val: Value = undefined, - // For storing a context char, like list opening bracket. - char: u8 = undefined, -}; - -const ParseError = enum { - InvalidCharacter, - UnclosedString, - UnexpectedEof, - UnicodeError, - RuneTooLong, - OutOfMemory, - OutOfRange, - ReadError, -}; - -const State = struct { - input: std.io.AnyReader, - stack_alloc: std.mem.Allocator, - chars_alloc: std.mem.Allocator, - - context: Context = .{}, - stack: List(Context) = undefined, - - chars: List(u8) = undefined, - - // TODO: Implement bool flag for when skipping a unit - - result: Value = undefined, - unused_char: ?u8 = null, - - err_msg: []const u8 = undefined, - - fn init( - input: std.io.AnyReader, - stack_alloc: std.mem.Allocator, - chars_alloc: std.mem.Allocator, - ) !State { - var s: State = .{ - .input = input, - .stack_alloc = stack_alloc, - .chars_alloc = chars_alloc, - }; - s.stack = try .initCapacity(s.stack_alloc, init_stack_capacity); - s.chars = try .initCapacity(s.chars_alloc, init_chars_capacity); - return s; - } - - fn deinit(s: *State) void { - s.stack.deinit(s.stack_alloc); - s.chars.deinit(s.chars_alloc); - } - - fn err( - s: *State, - comptime e: ParseError, - comptime msg: []const u8, - ) error{ParseError} { - s.err_msg = @tagName(e) ++ " at: " ++ msg; - return error.ParseError; - } - - fn read(s: *State) !?u8 { - if (is_debug and s.unused_char != null) { - @panic("Called read() while there was an unused character!"); - } - const c = s.input.readByte() catch |e| switch (e) { - error.EndOfStream => return null, - else => return s.err(.ReadError, "???"), - }; - if (detailed_debug) { - std.debug.print("{c}", .{c}); - } - return c; - } +const def_init_stack_mem = (if (is_debug) 2 else 32) * @sizeOf(Parser.Context); +const def_init_chars_mem = (if (is_debug) 2 else 2048); - fn readNoEof(s: *State, comptime emsg: []const u8) !u8 { - return if (try s.read()) |c| c else s.err(.UnexpectedEof, emsg); - } - - fn getUnused(s: *State) ?u8 { - if (s.unused_char) |c| { - s.unused_char = null; - return c; - } - return null; - } - - fn addChar(s: *State, c: u8) !void { - try s.chars.append(s.chars_alloc, c); - } - - fn getBareString(s: *State) Value { - defer s.chars.clearRetainingCapacity(); - return if (s.chars.items.len <= 6) - value.sstr.pack(s.chars.items) - else - value.istr.intern(s.chars.items, false); - } +pub const DefaultStackSfa = std.heap.StackFallbackAllocator(def_init_stack_mem); +pub const DefaultCharsSfa = std.heap.StackFallbackAllocator(def_init_chars_mem); - fn getQuotedString(s: *State) Value { - defer s.chars.clearRetainingCapacity(); - return if (s.chars.items.len <= 6) - value.sstr.packQuoted(s.chars.items) - else - value.istr.intern(s.chars.items, true); - } - - fn getRune(s: *State) Value { - defer s.chars.clearRetainingCapacity(); - return value.rune.pack(s.chars.items); - } - - fn push(s: *State, next: Fn) !void { - try s.stack.append(s.stack_alloc, .{ .next = next }); - } - - fn pushContext(s: *State, next: Fn) !void { - try s.stack.append(s.stack_alloc, .{ - .next = next, - .val = s.context.val, - .char = s.context.char, - }); - } - - fn subr(s: *State, start: Fn, next: Fn) !void { - try s.pushContext(next); - s.context.next = start; - } - - fn jump(s: *State, next: Fn, val: ?Value) void { - if (val) |v| s.result = v; - s.context.next = next; - } - - fn abort(s: *State, next: Fn, unused_c: u8) void { - s.result = VOID; - s.unused_char = unused_c; - s.context.next = next; - } - - fn retval(s: *State, val: Value) void { - s.result = val; - if (s.stack.pop()) |c| { - s.context = c; - } else { - s.context.next = .done; - } - } -}; - -pub fn _parse(input: std.io.AnyReader) !Value { - var debug_alloc: std.heap.DebugAllocator(.{}) = undefined; +pub fn default( + fb_alloc: *std.mem.Allocator, + stack_sfa: *DefaultStackSfa, + chars_sfa: *DefaultCharsSfa, +) !Parser { if (!is_test and is_debug) { - debug_alloc = .init; + fb_alloc.* = std.heap.DebugAllocator(.{}).init; } - defer if (!is_test and is_debug) { - if (debug_alloc.deinit() == .leak) { - @panic("leak"); - } - }; const alloc = if (is_test) std.testing.allocator else if (is_debug) - debug_alloc.allocator() + fb_alloc.allocator() else std.heap.smp_allocator; - const stack_mem = init_stack_capacity * @sizeOf(Context); - const chars_mem = init_chars_capacity; - var stack_sfa = std.heap.stackFallback(stack_mem, alloc); - var chars_sfa = std.heap.stackFallback(chars_mem, alloc); - const stack_alloc = stack_sfa.get(); - const chars_alloc = chars_sfa.get(); - var s = State.init(input, stack_alloc, chars_alloc) catch @panic(""); - defer s.deinit(); - - while (s.context.next != .done) callNext(&s) catch |e| { - // _ = e; - // if (s.unused_char) |c| { - // std.debug.panic( - // "Parse error: {s}, unused_char: 0x{x}\n", - // .{ s.err_msg, c }, - // ); - // } else { - // std.debug.panic("Parse error: {s}\n", .{s.err_msg}); - // } - return e; - }; - if (s.unused_char) |c| { - if (c != ' ') { - std.debug.panic("Invalid trailing character: {c}\n", .{c}); - } - } - return s.result; -} - -pub fn parse(input: std.io.AnyReader) Value { - return _parse(input) catch @panic(""); -} - -const Fn = enum { - parse_unit, - return_context, - end_one_datum, - parse_join_datum, - end_join_datum, - join_data, - parse_hash_datum, - end_hash_datum, - parse_rune_datum, - end_rune_datum, - end_label_datum, - parse_list_element, - continue_list, - end_improper_list, - close_improper_list, - end_quote_expr, - done, -}; - -fn callNext(s: *State) !void { - if (detailed_debug) { - const stack = s.stack.items; - std.debug.print("\n\n{}:{} ctx:'{c}' unused:'{c}' \n", .{ - stack.len, - s.context.next, - s.context.char, - s.unused_char orelse '_', - }); - if (stack.len > 0) { - var i = stack.len; - while (i > 0) : (i -= 1) { - const prev = stack[i - 1]; - std.debug.print("{}:{} ctx:'{c}'\n", .{ - i - 1, - prev.next, - prev.char, - }); - } - } - } - try switch (s.context.next) { - .parse_unit => parseUnit(s), - .return_context => returnContext(s), - .end_one_datum => endOneDatum(s), - .parse_join_datum => parseJoinDatum(s), - .end_join_datum => endJoinDatum(s), - .join_data => joinData(s), - .parse_hash_datum => parseHashDatum(s), - .end_hash_datum => endHashDatum(s), - .parse_rune_datum => parseRuneDatum(s), - .end_rune_datum => endRuneDatum(s), - .end_label_datum => endLabelDatum(s), - .parse_list_element => parseListElement(s), - .continue_list => continueList(s), - .end_improper_list => endImproperList(s), - .close_improper_list => closeImproperList(s), - .end_quote_expr => endQuoteExpr(s), - .done => unreachable, - }; -} - -fn parseUnit(s: *State) !void { - var c1 = s.getUnused() orelse try s.read(); - while (c1) |c| : (c1 = try s.read()) { - switch (try checkBlanks(s, c)) { - .yes => {}, - .skip_unit => try s.push(.parse_unit), - .no => return parseDatum(s, c), - } - } - return s.retval(value.eof.eof); -} - -fn checkBlanks(s: *State, c: u8) !enum { yes, skip_unit, no } { - return switch (c) { - '\t'...'\r', ' ' => .yes, - ';' => switch (try s.read() orelse '\n') { - '\n' => .yes, - '~' => .skip_unit, - else => while (try s.read() != '\n') {} else .yes, - }, - else => .no, - }; -} - -fn parseDatum(s: *State, c: u8) !void { - if (c == '.') { - return parseDotString(s); - } - return parseOneDatum(s, c, .end_one_datum); -} - -fn parseDotString(s: *State) !void { - var n: u48 = 1; - while (try s.read()) |c| : (n += 1) { - switch (try checkBlanks(s, c)) { - .yes => return dotString(s, n, false), - .skip_unit => return dotString(s, n, true), - .no => switch (c) { - '.' => {}, - ')', ']', '}' => { - s.unused_char = c; - return dotString(s, n, false); - }, - else => return s.err(.InvalidCharacter, "dot string"), - }, - } - } - unreachable; -} - -fn dotString(s: *State, n: u48, skip_unit: bool) !void { - const result = if (n == 1) LSTAIL else r: { - const buf = try s.chars.addManyAsSlice(s.chars_alloc, n); - @memset(buf, '.'); - break :r s.getBareString(); - }; - if (skip_unit) { - s.context.val = result; - return s.subr(.parse_unit, .return_context); - } else { - return s.retval(result); - } -} - -fn endOneDatum(s: *State) !void { - if (s.result.eq(VOID)) { - return s.retval(VOID); - } - const d = s.result; - const c1 = s.getUnused() orelse try s.read(); - if (c1) |c| { - switch (try checkBlanks(s, c)) { - .yes => {}, - .skip_unit => return skipUnitAndReturn(s, d), - .no => return parseJoin(s, d, c), - } - } - s.unused_char = ' '; - return s.retval(d); -} - -fn skipUnitAndReturn(s: *State, d: Value) !void { - s.context.val = d; - return s.subr(.parse_unit, .return_context); -} - -fn returnContext(s: *State) !void { - s.unused_char = ' '; - return s.retval(s.context.val); -} - -fn parseJoin(s: *State, d: Value, c: u8) !void { - switch (c) { - '.', ':', '|' => { - s.context.char = c; - s.unused_char = try s.readNoEof("join datum"); - }, - else => { - s.context.char = 0; - s.unused_char = c; - }, - } - s.context.val = d; - return s.subr(.parse_join_datum, .join_data); -} - -fn parseJoinDatum(s: *State) !void { - return parseOneDatum(s, s.getUnused().?, .end_join_datum); -} - -fn endJoinDatum(s: *State) !void { - return s.retval(s.result); -} - -fn joinData(s: *State) !void { - const head = s.context.val; - const join = s.context.char; - const tail = s.result; - if (tail.eq(VOID)) { - if (join == 0) { - return s.retval(head); - } else { - return s.err(.InvalidCharacter, "join datum"); - } - } - const rune = switch (join) { - 0 => JOIN, - '.' => DOT, - ':' => COLON, - '|' => PIPE, - else => unreachable, - }; - const data = cons(head, tail); - return s.jump(.end_one_datum, cons(rune, data)); -} - -fn parseOneDatum(s: *State, c: u8, next: Fn) !void { - if (isBareChar(c)) { - return s.jump(next, try parseBareString(s, c)); - } - return parseCladDatum(s, c, next); -} - -fn parseCladDatum(s: *State, c: u8, next: Fn) !void { - if (c == '\\') { - return s.jump(next, try parseBareEscString(s)); - } - if (c == '"') { - return s.jump(next, try parseQuotedString(s)); - } - return switch (c) { - '#' => parseHashExpression(s, next), - '(', '[', '{' => parseList(s, c, next), - '\'', '`', ',' => parseQuoteExpr(s, c, next), - else => s.abort(next, c), - }; -} - -fn isBareChar(c: u8) bool { - return switch (c) { - // zig fmt: off - 'a'...'z' , 'A'...'Z' , '0'...'9' , '!' , '$' , '%' , '&' , '*' , - '+' , '-' , '/' , '<' , '=' , '>' , '?' , '@' , '^' , '_' , '~' => true, - // zig fmt: on - else => false, - }; -} - -fn isBareEsc(c: u8) bool { - return switch (c) { - 33...126 => true, - else => false, - }; -} - -fn parseBareString(s: *State, c: u8) !Value { - try s.addChar(c); - var is_num = false; - if (std.ascii.isDigit(c)) { - is_num = true; - } else if (c == '+' or c == '-') { - const c2 = try s.read() orelse return s.getBareString(); - if (std.ascii.isDigit(c2)) { - try s.addChar(c2); - is_num = true; - } else if (isBareChar(c2)) { - try s.addChar(c2); - } else if (c2 == '\\') { - try s.addChar(try parseBareEsc(s)); - } else { - s.unused_char = c2; - return s.getBareString(); - } - } - return parseBareStringRest(s, is_num); -} + stack_sfa.* = std.heap.stackFallback(def_init_stack_mem, alloc); + chars_sfa.* = std.heap.stackFallback(def_init_chars_mem, alloc); -fn parseBareEscString(s: *State) !Value { - try s.addChar(try parseBareEsc(s)); - return parseBareStringRest(s, false); + return Parser.init( + stack_sfa.get(), + chars_sfa.get(), + def_init_stack_mem, + def_init_chars_mem, + ); } -fn parseBareStringRest(s: *State, is_num: bool) !Value { - while (try s.read()) |c| { - if (isBareChar(c) or (is_num and c == '.')) { - try s.addChar(c); - } else if (c == '\\') { - try s.addChar(try parseBareEsc(s)); - } else { - s.unused_char = c; - break; +pub fn deinit(fb_alloc: *std.mem.Allocator) void { + if (!is_test and is_debug) { + if (fb_alloc.?.deinit() == .leak) { + @panic("parser leaked"); } } - return s.getBareString(); -} - -fn parseBareEsc(s: *State) !u8 { - const c = try s.readNoEof("bare escape"); - if (isBareEsc(c)) { - return c; - } else { - return s.err(.InvalidCharacter, "bare escape"); - } } -fn parseQuotedString(s: *State) !Value { - while (try s.read()) |c| { - if (c == '"') { - return s.getQuotedString(); - } - if (c != '\\') { - try s.addChar(c); +pub fn parse(input: std.io.AnyReader) Value { + var fb_alloc: std.mem.Allocator = undefined; + var stack_sfa: DefaultStackSfa = undefined; + var chars_sfa: DefaultCharsSfa = undefined; + var p = default(&fb_alloc, &stack_sfa, &chars_sfa) catch @panic("OOM"); + defer p.deinit(); + return _parse(input) catch { + if (p.unused_char) |c| { + std.debug.panic( + "Parse error: {s}, unused_char: 0x{x}\n", + .{ p.err_msg, c }, + ); } else { - try parseQuotedEsc(s); + std.debug.panic("Parse error: {s}\n", .{p.err_msg}); } - } - return error.UnclosedString; -} - -fn parseQuotedEsc(s: *State) !void { - const c = try s.readNoEof("quoted escape"); - if (c == 'u') { - return parseUniHexHandleErrors(s); - } - try s.addChar(switch (c) { - '\\', '"' => c, - '0' => 0, - 'a' => 7, - 'b' => 8, - 't' => 9, - 'n' => 10, - 'v' => 11, - 'f' => 12, - 'r' => 13, - 'e' => 27, - 'x' => try parseHexByte(s, "hex escape"), - else => return s.err(.InvalidCharacter, "quoted escape"), - }); -} - -fn parseUniHexHandleErrors(s: *State) !void { - return parseUniHex(s) catch |err| switch (err) { - error.Utf8CannotEncodeSurrogateHalf => s.err( - .UnicodeError, - "unicode escape", - ), - else => |e| e, }; } -fn parseUniHex(s: *State) !void { - const msg = "unicode escape"; - - if (try s.readNoEof(msg) != '{') { - return s.err(.InvalidCharacter, msg); - } - - const uc, const unused_c = try parseHex(s, u21, msg); - if (unused_c) |c| { - if (c != '}') { - return s.err(.InvalidCharacter, msg); - } - } else { - return s.err(.UnexpectedEof, msg); - } - - const n = try std.unicode.utf8CodepointSequenceLength(uc); - const buf = try s.chars.addManyAsSlice(s.chars_alloc, n); - _ = try std.unicode.utf8Encode(uc, buf); -} - -fn parseHashExpression(s: *State, next: Fn) !void { - const c = try s.readNoEof("hash expression"); - if (try checkBlanks(s, c) != .no) { - return s.err(.InvalidCharacter, "hash expression"); - } - if (std.ascii.isAlphabetic(c)) { - const r, const unused_c = try parseRune(s, c); - return parseRuneEnd(s, r, unused_c, next); - } - if (c == '%') { - const l, const unused_c = try parseLabel(s); - return parseLabelEnd(s, l, unused_c, next); - } - if (isBareChar(c)) { - // Reserved for future extensions to syntax sugar. - return s.err(.InvalidCharacter, "hash expression"); - } - // fast-path to avoid subr - if (c == '\\') { - return s.jump(next, cons(HASH, try parseBareEscString(s))); - } - if (c == '"') { - return s.jump(next, cons(HASH, try parseQuotedString(s))); - } - s.unused_char = c; - return s.subr(.parse_hash_datum, next); -} - -fn parseHashDatum(s: *State) !void { - return parseCladDatum(s, s.getUnused().?, .end_hash_datum); -} - -fn endHashDatum(s: *State) !void { - if (s.result.eq(VOID)) { - return s.err(.InvalidCharacter, "hash datum"); - } - return s.retval(cons(HASH, s.result)); -} - -fn parseRune(s: *State, c1: u8) !struct { Value, ?u8 } { - try s.addChar(c1); - var len: usize = 1; - while (try s.read()) |c| : (len += 1) { - if (len == 6 or !std.ascii.isAlphanumeric(c)) { - return .{ s.getRune(), c }; - } - try s.addChar(c); - } - return .{ s.getRune(), null }; -} - -fn parseRuneEnd(s: *State, r: Value, c1: ?u8, next: Fn) !void { - const c = c1 orelse return s.jump(next, r); - if (c == '\\') { - return s.jump(next, cons(r, try parseBareString(s, c))); - } - if (c == '"') { - return s.jump(next, cons(r, try parseQuotedString(s))); - } - s.unused_char = c; - switch (c) { - '#', '(', '[', '{', '\'', '`', ',' => { - try s.push(next); - return s.jump(.parse_rune_datum, r); - }, - else => return s.jump(next, r), - } -} - -fn parseRuneDatum(s: *State) !void { - s.context.val = s.result; - return parseCladDatum(s, s.getUnused().?, .end_rune_datum); -} - -fn endRuneDatum(s: *State) !void { - if (s.result.eq(VOID)) { - s.retval(s.context.val); - } - return s.retval(cons(s.context.val, s.result)); -} - -fn parseLabel(s: *State) !struct { Value, ?u8 } { - const label, const unused_c = try parseHex(s, u48, "datum label"); - return .{ value.fixnum.pack(label), unused_c }; -} - -fn parseLabelEnd(s: *State, l: Value, c1: ?u8, next: Fn) !void { - const c = c1 orelse return s.err(.UnexpectedEof, "datum label"); - if (c == '%') { - return s.jump(next, cons(LABEL, l)); - } - if (c == '=') { - try s.push(next); - s.context.val = l; - return s.subr(.parse_unit, .end_label_datum); - } - return s.err(.InvalidCharacter, "datum label"); -} - -fn endLabelDatum(s: *State) !void { - if (s.result.eq(VOID)) { - return s.err(.InvalidCharacter, "label datum"); - } - return s.retval(cons(LABEL, cons(s.context.val, s.result))); -} - -fn parseList(s: *State, open: u8, next: Fn) !void { - const head = switch (open) { - '(' => value.nil.nil, - '[' => cons(SQUARE, value.nil.nil), - '{' => cons(BRACE, value.nil.nil), - else => unreachable, - }; - const close: u8 = switch (open) { - '(' => ')', - '[' => ']', - '{' => '}', - else => unreachable, - }; - while (try s.read()) |c| { - if (c == close) { - return s.jump(next, head); - } - switch (try checkBlanks(s, c)) { - .yes => {}, - .skip_unit => { - try listParserSetup(s, head, close, next); - // Parse twice in a row, ignoring the first result. - return s.subr(.parse_unit, .parse_unit); - }, - .no => { - try listParserSetup(s, head, close, next); - s.unused_char = c; - return s.jump(.parse_list_element, null); - }, - } - } - return s.err(.UnexpectedEof, "list"); -} - -fn listParserSetup(s: *State, head: Value, close: u8, next: Fn) !void { - s.context.val = head; - s.context.char = close; - try s.push(next); - try s.pushContext(.continue_list); -} - -fn parseListElement(s: *State) !void { - return parseDatum(s, s.getUnused().?); -} - -fn continueList(s: *State) !void { - const close = s.context.char; - - if (s.result.eq(VOID)) { - const c = s.getUnused().?; - if (c == close) { - return endList(s); - } - return s.err(.InvalidCharacter, "list"); - } - - if (s.result.eq(LSTAIL)) { - return s.subr(.parse_unit, .end_improper_list); - } - - s.context.val = cons(s.result, s.context.val); - - var c1 = s.getUnused() orelse try s.read(); - while (c1) |c| : (c1 = try s.read()) { - if (c == close) { - return endList(s); - } - switch (try checkBlanks(s, c)) { - .yes => {}, - .skip_unit => { - try s.pushContext(.continue_list); - return s.subr(.parse_unit, .parse_unit); - }, - .no => { - s.unused_char = c; - return s.subr(.parse_list_element, .continue_list); - }, - } - } - return s.err(.UnexpectedEof, "list"); -} - -fn endList(s: *State) !void { - return s.retval(lib.list.reverse(s.context.val)); -} - -fn endImproperList(s: *State) !void { - if (s.result.eq(VOID)) { - return s.err(.InvalidCharacter, "list tail"); - } - s.context.val = lib.list.reverseWithTail(s.context.val, s.result); - return closeImproperList(s); -} - -fn closeImproperList(s: *State) !void { - const result = s.context.val; - const close = s.context.char; - var c1 = s.getUnused() orelse try s.read(); - while (c1) |c| : (c1 = try s.readNoEof("after list tail")) { - if (c == close) { - return s.retval(result); - } - switch (try checkBlanks(s, c)) { - .yes => {}, - .skip_unit => return s.subr(.parse_unit, .close_improper_list), - .no => return s.err(.InvalidCharacter, "after list tail"), - } - } - unreachable; -} - -fn parseQuoteExpr(s: *State, c1: u8, next: Fn) !void { - const q = switch (c1) { - '\'' => QUOTE, - '`' => GRAVE, - ',' => COMMA, - else => unreachable, - }; - - // fast-path to avoid subr - const c = try s.readNoEof("quote expression"); - if (isBareChar(c) or c == '\\') { - return s.jump(next, cons(q, try parseBareString(s, c))); - } - - try s.push(next); - s.context.val = q; - s.unused_char = c; - return s.subr(.parse_unit, .end_quote_expr); -} - -fn endQuoteExpr(s: *State) !void { - if (s.result.eq(VOID)) { - return s.err(.InvalidCharacter, "quote expression datum"); - } - return s.retval(cons(s.context.val, s.result)); -} - -// Helpers - -fn parseHex( - s: *State, - u_type: type, - comptime emsg: []const u8, -) !struct { u_type, ?u8 } { - var uc: u_type = undefined; - - const c1 = try s.readNoEof(emsg); - uc = try parseHexDigit(s, c1, emsg); - - while (try s.read()) |c| { - if (!std.ascii.isHex(c)) { - return .{ uc, c }; - } - const shl = std.math.shlExact; - uc = shl(u_type, uc, 4) catch return s.err(.OutOfRange, emsg); - uc |= try parseHexDigit(s, c, emsg); - } - return .{ uc, null }; -} - -fn parseHexByte(s: *State, comptime emsg: []const u8) !u8 { - const h1 = try s.readNoEof(emsg); - const h2 = try s.readNoEof(emsg); - const hi = try parseHexDigit(s, h1, emsg); - const lo = try parseHexDigit(s, h2, emsg); - return hi << 4 | lo; -} - -fn parseHexDigit(s: *State, c: u8, comptime emsg: []const u8) !u8 { - return switch (c) { - '0'...'9' => c - '0', - 'A'...'F' => c - 'A' + 10, - 'a'...'f' => c - 'a' + 10, - else => s.err(.InvalidCharacter, emsg), - }; +pub fn _parse(input: std.io.AnyReader) !Value { + var fb_alloc: std.mem.Allocator = undefined; + var stack_sfa: DefaultStackSfa = undefined; + var chars_sfa: DefaultCharsSfa = undefined; + var p = try default(&fb_alloc, &stack_sfa, &chars_sfa); + defer p.deinit(); + return p.run(input); } diff --git a/src/main.zig b/src/main.zig index 24c7959..c52d7e5 100644 --- a/src/main.zig +++ b/src/main.zig @@ -3,77 +3,17 @@ const std = @import("std"); const zisp = @import("libzisp"); pub fn main() !void { - const T = extern union { - bits: u64, - double: f64, - }; - var f1: *volatile f64 = undefined; - var f2: *volatile f64 = undefined; - var x: *volatile T = undefined; - - var _gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; - const gpa = _gpa.allocator(); - - f1 = try gpa.create(f64); - f2 = try gpa.create(f64); - x = try gpa.create(T); - - f1.* = 0.0; - f2.* = 0.0; - x.*.double = f1.* / f2.*; - std.debug.print(" 0/0: {x}\n", .{x.*.bits}); - - f1.* = -0.0; - f2.* = 0.0; - x.*.double = f1.* / f2.*; - std.debug.print("-0/0: {x}\n", .{x.*.bits}); - - for (0..1000000) |i| { - _ = i; - const p = zisp.value.pair.cons( - zisp.value.fixnum.pack(0), - zisp.value.fixnum.pack(1), - ); - std.mem.doNotOptimizeAway(p); - // std.debug.print("fx: {}\n", .{zisp.value.pair.car(p)}); - } - - const istr = zisp.value.istr.intern("foo", false); - std.debug.print("istr: {s}\n", .{zisp.value.istr.getHeader(istr).bytes()}); - - // // Prints to stderr (it's a shortcut based on `std.io.getStdErr()`) - // std.debug.print("All your {s} are belong to us.\n", .{"codebase"}); - - // // stdout is for the actual output of your application, for example if you - // // are implementing gzip, then only the compressed bytes should be sent to - // // stdout, not any debugging messages. - // const stdout_file = std.io.getStdOut().writer(); - // var bw = std.io.bufferedWriter(stdout_file); - // const stdout = bw.writer(); - - // try stdout.print("Run `zig build test` to run the tests.\n", .{}); - - // try bw.flush(); // Don't forget to flush! -} - -test "simple test" { - var list = std.ArrayList(i32).init(std.testing.allocator); - defer list.deinit(); // Try commenting this out and see if zig detects the memory leak! - try list.append(42); - try std.testing.expectEqual(@as(i32, 42), list.pop()); -} - -test "use other module" { - //try std.testing.expectEqual(@as(i32, 150), lib.add(100, 50)); -} - -test "fuzz example" { - const Context = struct { - fn testOne(context: @This(), input: []const u8) anyerror!void { - _ = context; - // Try passing `--fuzz` to `zig build test` and see if it manages to fail this test case! - try std.testing.expect(!std.mem.eql(u8, "canyoufindme", input)); + const reader = std.io.getStdIn().reader().any(); + const writer = std.io.getStdOut().writer().any(); + while (true) { + try writer.writeAll("> "); + const datum = zisp.io.parser.parse(reader); + if (datum.eq(zisp.value.eof.eof)) { + try writer.writeAll("\n"); + return; } - }; - try std.testing.fuzz(Context{}, Context.testOne, .{}); + try writer.writeAll("= "); + try zisp.io.unparser.unparse(writer, datum); + try writer.writeAll("\n"); + } } diff --git a/test-data/parser-torture.scm b/test-data/parser-torture.scm index b28eb5c..d475379 100644 --- a/test-data/parser-torture.scm +++ b/test-data/parser-torture.scm @@ -50870,7 +50870,7 @@ arbitrary limit." (let ((number\' (* number (expt 10 decimals)))) (if (or (= number\' (floor number\')) (>= decimals max-decimals)) - (let* ((fraction (- number' + (let* ((fraction (- number\' (* (floor number) (expt 10 decimals)))) (str (integer->string @@ -64342,7 +64342,7 @@ as its arguments." (newline)) (SECTION 2 1);; test that all symbol characters are supported. -'(+ - ... !.. $.+ %.- &.! *.: /:. :+. <-. =. >. ?. ~. _. ^.) +;'(+ - ... !.. $.+ %.- &.! *.: /:. :+. <-. =. >. ?. ~. _. ^.) (SECTION 3 4) (define disjoint-type-functions |
