// === Syntax === // // See docs/parser.md and spec/syntax.bnf to understand the syntax. // // // === Trampolining strategy === // // Instead of using recursion directly, the parser is written in a "trampoline" // style, which ensures that parse depth is not limited by stack size. // // All state and context is passed around via a struct pointer. The parser has // a main loop, which calls a function as dictated by parser.context.next, and // the function may update the state to have another function called next. // // If a function wants to call a recursive subroutine, it pushes some of the // current context onto a stack, including what function the subroutine should // return to, and then updates the state to instruct the main loop to call one // of the entry point subroutines. // // If a function wants to make the parser return, either from a subroutine or // from the main loop, it sets the .result field, and tries to pop the saved // context. If the context stack was empty, the main loop returns. // // // === Buffering === // // For efficiency, call the parser on an input stream with implicit buffering. // // The parser does not use its own buffer, beyond one character that may be // written back into the unread_char field, which is checked at the end to // ensure it's nothing other than a trailing blank or comment. // // This lack of buffering is to ensure that the parser never reads more bytes // from the input than what it needs to parse a datum. Consider the input: // // (a b c) (x y z) // // If we used proper buffering, like reading up to 4K bytes per read, then the // whole stream would be consumed at once before it's parsed. Then, the parser // would return the first datum, and the rest of the stream would be lost. The // parser would need some way to reset the input stream's read head to the end // of the first datum, but not all stream types may support this. // // Although in a scenario like this it would be wise to create a single Parser // instance to call multiple times (in which case it could continue using its // internal buffer from where it left), this may not always be practical. // // One could even have an input stream with Zisp s-expressions and other data // intertwined, in which case this lack of buffering is also crucial, since one // needs to alternate between calling the Zisp parser and some other parser on // the same input stream. // const builtin = @import("builtin"); const std = @import("std"); const lib = @import("../lib.zig"); const value = @import("../value.zig"); const List = std.ArrayListUnmanaged; const Value = value.Value; const Parser = @This(); const is_test = builtin.is_test; const is_debug = builtin.mode == .Debug; const detailed_debug = false; // zig fmt: off const DOT = value.rune.pack("DOT"); const COLON = value.rune.pack("COLON"); const JOIN = value.rune.pack("JOIN"); const LABEL = value.rune.pack("LABEL"); const HASH = value.rune.pack("HASH"); const QUOTE = value.rune.pack("QUOTE"); const GRAVE = value.rune.pack("GRAVE"); const COMMA = value.rune.pack("COMMA"); const SQUARE = value.rune.pack("SQUARE"); const BRACE = value.rune.pack("BRACE"); const VOID = value.rune.packForced(""); const LSTAIL = value.sstr.pack("."); // zig fmt: on // We could implement an optimization where we swap in a dummy cons when the // parser is handling a commented-out datum, but this would require changes to // the algorithm and doesn't seem very important, so it's not implemented. const Cons = *const fn (v1: Value, v2: Value) Value; fn dummyCons(v1: Value, v2: Value) Value { _ = v1; _ = v2; return value.undef; } const real_cons = &value.pair.cons; const fake_cons = &dummyCons; pub const ParseError = enum { InvalidCharacter, UnclosedString, UnexpectedEof, UnicodeError, RuneTooLong, OutOfMemory, OutOfRange, ReadError, }; pub const Context = struct { // What to do next. next: ?Fn = undefined, // For storing a context value, like accumulated list elements. val: Value = undefined, // For storing a context char, like list opening bracket. char: u8 = undefined, }; // parameters stack_alloc: std.mem.Allocator, chars_alloc: std.mem.Allocator, // state input: std.io.AnyReader = undefined, context: Context = .{}, stack: List(Context) = undefined, chars: List(u8) = undefined, cons: Cons = real_cons, result: Value = undefined, unread_char: ?u8 = null, err_msg: []const u8 = undefined, pub fn init( stack_alloc: std.mem.Allocator, chars_alloc: std.mem.Allocator, init_stack_mem: usize, init_chars_mem: usize, ) !Parser { var p: Parser = .{ .stack_alloc = stack_alloc, .chars_alloc = chars_alloc, }; const csize = @sizeOf(Context); p.stack = try .initCapacity(stack_alloc, init_stack_mem / csize); p.chars = try .initCapacity(chars_alloc, init_chars_mem); return p; } pub fn deinit(p: *Parser) void { p.stack.deinit(p.stack_alloc); p.chars.deinit(p.chars_alloc); } // // Input reading & unreading // fn read(p: *Parser) !?u8 { if (is_debug and p.unread_char != null) { @panic("Called read() while there was an unused character!"); } const c = p.input.readByte() catch |e| switch (e) { error.EndOfStream => return null, else => return p.err(.ReadError, "???"), }; if (detailed_debug) { std.debug.print("{c}", .{c}); } return c; } fn readNoEof(p: *Parser, comptime emsg: []const u8) !u8 { return if (try p.read()) |c| c else p.err(.UnexpectedEof, emsg); } fn unread(p: *Parser, c: u8) void { p.unread_char = c; } fn getUnread(p: *Parser) ?u8 { const c = p.unread_char orelse return null; p.unread_char = null; return c; } // // String accumulation & creation // fn addChar(p: *Parser, c: u8) !void { try p.chars.append(p.chars_alloc, c); } fn getBareString(p: *Parser) Value { defer p.chars.clearRetainingCapacity(); return if (p.chars.items.len <= 6) value.sstr.pack(p.chars.items) else value.istr.intern(p.chars.items, false); } fn getQuotedString(p: *Parser) Value { defer p.chars.clearRetainingCapacity(); return if (p.chars.items.len <= 6) value.sstr.packQuoted(p.chars.items) else value.istr.intern(p.chars.items, true); } fn getRune(p: *Parser) Value { defer p.chars.clearRetainingCapacity(); return value.rune.pack(p.chars.items); } // // Stack management & control flow // const Fn = enum { parseUnit, parseDatum, endUnit, returnContext, endFirstDatum, endJoinDatum, parseJoin, parseHashDatum, endHashDatum, parseRuneDatum, endRuneDatum, endLabelDatum, continueList, endImproperList, closeImproperList, endQuoteExpr, }; fn call(p: *Parser, f: Fn) !void { try switch (f) { .parseUnit => parseUnit(p), .parseDatum => parseDatum(p), .endUnit => endUnit(p), .returnContext => returnContext(p), .endFirstDatum => endFirstDatum(p), .endJoinDatum => endJoinDatum(p), .parseJoin => parseJoin(p), .parseHashDatum => parseHashDatum(p), .endHashDatum => endHashDatum(p), .parseRuneDatum => parseRuneDatum(p), .endRuneDatum => endRuneDatum(p), .endLabelDatum => endLabelDatum(p), .continueList => continueList(p), .endImproperList => endImproperList(p), .closeImproperList => closeImproperList(p), .endQuoteExpr => endQuoteExpr(p), }; } pub fn run(p: *Parser, input: std.io.AnyReader) !Value { p.input = input; p.context.next = .parseUnit; while (p.context.next) |next| { if (detailed_debug) printStack(p); try p.call(next); } if (p.unread_char) |_| { return p.err(.InvalidCharacter, "top-level"); } return p.result; } fn printStack(p: *Parser) void { const stack = p.stack.items; std.debug.print("\n\n{}:{any} ctx:'{c}' unread:'{c}' \n", .{ stack.len, p.context.next, p.context.char, p.unread_char orelse '_', }); if (stack.len > 0) { var i = stack.len; while (i > 0) : (i -= 1) { const prev = stack[i - 1]; std.debug.print("{}:{any} ctx:'{c}'\n", .{ i - 1, prev.next, prev.char, }); } } } fn err( p: *Parser, comptime e: ParseError, comptime msg: []const u8, ) error{ParseError} { p.err_msg = @tagName(e) ++ " at: " ++ msg; return error.ParseError; } fn push(p: *Parser, next: Fn) !void { try p.stack.append(p.stack_alloc, .{ .next = next }); } fn pushContext(p: *Parser, next: Fn) !void { try p.stack.append(p.stack_alloc, .{ .next = next, .val = p.context.val, .char = p.context.char, }); } fn ret(p: *Parser) void { if (p.stack.pop()) |c| { p.context = c; } else { p.context.next = null; } } fn subr(p: *Parser, start: Fn, next: Fn) !void { try p.pushContext(next); p.context.next = start; } fn jump(p: *Parser, next: Fn, val: ?Value) void { if (val) |v| p.result = v; p.context.next = next; } fn abort(p: *Parser, next: Fn, unread_c: u8) void { p.result = VOID; p.unread_char = unread_c; p.context.next = next; } fn retval(p: *Parser, val: Value) void { p.result = val; p.ret(); } // // Start parser functions // fn parseUnit(p: *Parser) !void { var c1 = p.getUnread() orelse try p.read(); while (c1) |c| : (c1 = try p.read()) { switch (try checkBlanks(p, c)) { .yes => {}, .skip_unit => try p.push(.parseUnit), .no => { p.unread(c); return p.subr(.parseDatum, .endUnit); }, } } return p.retval(value.eof.eof); } fn endUnit(p: *Parser) !void { const c = p.getUnread() orelse return p.ret(); switch (try checkBlanks(p, c)) { .yes => {}, .skip_unit => return skipUnitAndReturn(p), .no => p.unread(c), } return p.ret(); } fn skipUnitAndReturn(p: *Parser) !void { p.context.val = p.result; return p.subr(.parseUnit, .returnContext); } fn returnContext(p: *Parser) !void { return p.retval(p.context.val); } fn parseDatum(p: *Parser) !void { return parseOneDatum(p, p.getUnread().?, .endFirstDatum); } fn endFirstDatum(p: *Parser) !void { if (p.result.eq(VOID)) { return p.ret(); } return parseJoin(p); } fn parseJoin(p: *Parser) !void { const c = p.getUnread() orelse try p.read() orelse return p.ret(); switch (c) { '.', ':' => { p.context.char = c; p.unread(try p.readNoEof("join datum")); }, else => { p.context.char = 0; p.unread(c); }, } p.context.val = p.result; return p.subr(.parseDatum, .endJoinDatum); } fn endJoinDatum(p: *Parser) !void { const prev = p.context.val; const join = p.context.char; if (p.result.eq(VOID)) { if (join == 0) { return p.retval(prev); } else { return p.err(.InvalidCharacter, "join datum"); } } const rune = switch (join) { 0 => JOIN, '.' => DOT, ':' => COLON, else => unreachable, }; const joined = p.cons(rune, p.cons(prev, p.result)); return p.jump(.parseJoin, joined); } fn parseOneDatum(p: *Parser, c: u8, next: Fn) !void { if (isBareChar(c) or c == '.') { return p.jump(next, try parseBareString(p, c)); } return parseCladDatum(p, c, next); } fn parseBareString(p: *Parser, c: u8) !Value { const allow_dots = std.ascii.isDigit(c) or switch (c) { '.', '+', '-' => true, else => false, }; try p.addChar(c); return parseBareStringRest(p, allow_dots); } fn parseBareEscString(p: *Parser) !Value { try p.addChar(try parseBareEsc(p)); return parseBareStringRest(p, false); } fn parseBareStringRest(p: *Parser, allow_dots: bool) !Value { while (try p.read()) |c| { if (isBareChar(c) or (allow_dots and c == '.')) { try p.addChar(c); } else if (c == '\\') { try p.addChar(try parseBareEsc(p)); } else { p.unread(c); break; } } return p.getBareString(); } fn parseBareEsc(p: *Parser) !u8 { const c = try p.readNoEof("bare escape"); if (isBareEsc(c)) { return c; } else { return p.err(.InvalidCharacter, "bare escape"); } } fn parseCladDatum(p: *Parser, c: u8, next: Fn) !void { if (c == '\\') { return p.jump(next, try parseBareEscString(p)); } if (c == '"') { return p.jump(next, try parseQuotedString(p, '"')); } if (c == '|') { return p.jump(next, try parseQuotedString(p, '|')); } return switch (c) { '#' => parseHashExpression(p, next), '(', '[', '{' => parseList(p, c, next), '\'', '`', ',' => parseQuoteExpr(p, c, next), else => p.abort(next, c), }; } fn parseQuotedString(p: *Parser, close: u8) !Value { while (try p.read()) |c| { if (c == close) { return p.getQuotedString(); } if (c != '\\') { try p.addChar(c); } else { try parseQuotedEsc(p, close); } } return error.UnclosedString; } fn parseQuotedEsc(p: *Parser, close: u8) !void { const c = try p.readNoEof("quoted escape"); if (c == close) { return p.addChar(close); } if (c == 'u') { return parseUniHexHandleErrors(p); } try p.addChar(switch (c) { '\\' => c, '0' => 0, 'a' => 7, 'b' => 8, 't' => 9, 'n' => 10, 'v' => 11, 'f' => 12, 'r' => 13, 'e' => 27, 'x' => try parseHexByte(p, "hex escape"), else => return p.err(.InvalidCharacter, "quoted escape"), }); } fn parseUniHexHandleErrors(p: *Parser) !void { return parseUniHex(p) catch |e| switch (e) { error.Utf8CannotEncodeSurrogateHalf => p.err( .UnicodeError, "unicode escape", ), else => e, }; } fn parseUniHex(p: *Parser) !void { const msg = "unicode escape"; if (try p.readNoEof(msg) != '{') { return p.err(.InvalidCharacter, msg); } const uc = try parseHex(p, u21, msg); const c = p.getUnread() orelse return p.err(.UnexpectedEof, msg); if (c != '}') { return p.err(.InvalidCharacter, msg); } const n = try std.unicode.utf8CodepointSequenceLength(uc); const buf = try p.chars.addManyAsSlice(p.chars_alloc, n); _ = try std.unicode.utf8Encode(uc, buf); } fn parseHashExpression(p: *Parser, next: Fn) !void { const c = try p.readNoEof("hash expression"); if (try checkBlanks(p, c) != .no) { return p.err(.InvalidCharacter, "hash expression"); } if (std.ascii.isAlphabetic(c)) { const r = try parseRune(p, c); return parseRuneEnd(p, r, next); } if (c == '%') { const l = try parseLabel(p); return parseLabelEnd(p, l, next); } if (isBareChar(c)) { // Reserved for future extensions to syntax sugar. return p.err(.InvalidCharacter, "hash expression"); } // fast-path to avoid subr if (c == '\\') { return p.jump(next, p.cons(HASH, try parseBareEscString(p))); } if (c == '"') { return p.jump(next, p.cons(HASH, try parseQuotedString(p, '"'))); } if (c == '|') { return p.jump(next, p.cons(HASH, try parseQuotedString(p, '|'))); } p.unread(c); return p.subr(.parseHashDatum, next); } fn parseHashDatum(p: *Parser) !void { return parseCladDatum(p, p.getUnread().?, .endHashDatum); } fn endHashDatum(p: *Parser) !void { if (p.result.eq(VOID)) { return p.err(.InvalidCharacter, "hash datum"); } return p.retval(p.cons(HASH, p.result)); } fn parseRune(p: *Parser, c1: u8) !Value { try p.addChar(c1); var len: usize = 1; while (try p.read()) |c| : (len += 1) { if (len == 6 or !std.ascii.isAlphanumeric(c)) { p.unread(c); return p.getRune(); } try p.addChar(c); } return p.getRune(); } fn parseRuneEnd(p: *Parser, r: Value, next: Fn) !void { const c = p.getUnread() orelse return p.jump(next, r); if (c == '\\') { return p.jump(next, p.cons(r, try parseBareString(p, c))); } if (c == '"') { return p.jump(next, p.cons(r, try parseQuotedString(p, '"'))); } if (c == '|') { return p.jump(next, p.cons(r, try parseQuotedString(p, '|'))); } p.unread(c); switch (c) { '#', '(', '[', '{', '\'', '`', ',' => { try p.push(next); p.context.val = r; // Use jump to prevent recursion. return p.jump(.parseRuneDatum, null); }, else => return p.jump(next, r), } } fn parseRuneDatum(p: *Parser) !void { return parseCladDatum(p, p.getUnread().?, .endRuneDatum); } fn endRuneDatum(p: *Parser) !void { if (p.result.eq(VOID)) { p.retval(p.context.val); } return p.retval(p.cons(p.context.val, p.result)); } fn parseLabel(p: *Parser) !Value { const label = try parseHex(p, u48, "datum label"); return value.fixnum.pack(label); } fn parseLabelEnd(p: *Parser, l: Value, next: Fn) !void { const c = p.getUnread() orelse return p.err(.UnexpectedEof, "datum label"); if (c == '%') { return p.jump(next, p.cons(LABEL, l)); } if (c == '=') { try p.push(next); p.context.val = l; return p.subr(.parseDatum, .endLabelDatum); } return p.err(.InvalidCharacter, "datum label"); } fn endLabelDatum(p: *Parser) !void { if (p.result.eq(VOID)) { return p.err(.InvalidCharacter, "label datum"); } return p.retval(p.cons(LABEL, p.cons(p.context.val, p.result))); } fn parseList(p: *Parser, open: u8, next: Fn) !void { const head = switch (open) { '(' => value.nil.nil, '[' => p.cons(SQUARE, value.nil.nil), '{' => p.cons(BRACE, value.nil.nil), else => unreachable, }; const close: u8 = switch (open) { '(' => ')', '[' => ']', '{' => '}', else => unreachable, }; while (try p.read()) |c| { if (c == close) { return p.jump(next, head); } switch (try checkBlanks(p, c)) { .yes => {}, .skip_unit => { try listParserSetup(p, head, close, next); return p.subr(.parseUnit, .parseUnit); }, .no => { try listParserSetup(p, head, close, next); p.unread(c); return p.jump(.parseDatum, null); }, } } return p.err(.UnexpectedEof, "list"); } fn listParserSetup(p: *Parser, head: Value, close: u8, next: Fn) !void { try p.push(next); p.context.val = head; p.context.char = close; try p.pushContext(.continueList); } fn continueList(p: *Parser) !void { const close = p.context.char; if (p.result.eq(VOID)) { const c = p.getUnread().?; if (c == close) { return endList(p); } return p.err(.InvalidCharacter, "list"); } if (p.result.eq(LSTAIL)) { return p.subr(.parseUnit, .endImproperList); } p.context.val = p.cons(p.result, p.context.val); var c1 = p.getUnread() orelse try p.read(); while (c1) |c| : (c1 = try p.read()) { if (c == close) { return endList(p); } switch (try checkBlanks(p, c)) { .yes => {}, .skip_unit => { try p.pushContext(.continueList); return p.subr(.parseUnit, .parseUnit); }, .no => { p.unread(c); return p.subr(.parseDatum, .continueList); }, } } return p.err(.UnexpectedEof, "list"); } fn endList(p: *Parser) !void { return p.retval(lib.list.reverse(p.context.val)); } fn endImproperList(p: *Parser) !void { if (p.result.eq(VOID)) { return p.err(.InvalidCharacter, "list tail"); } p.context.val = lib.list.reverseWithTail(p.context.val, p.result); return closeImproperList(p); } fn closeImproperList(p: *Parser) !void { const result = p.context.val; const close = p.context.char; var c1 = p.getUnread() orelse try p.read(); while (c1) |c| : (c1 = try p.read()) { if (c == close) { return p.retval(result); } switch (try checkBlanks(p, c)) { .yes => {}, .skip_unit => return p.subr(.parseUnit, .closeImproperList), .no => return p.err(.InvalidCharacter, "after list tail"), } } return p.err(.UnexpectedEof, "after list tail"); } fn parseQuoteExpr(p: *Parser, c1: u8, next: Fn) !void { const q = switch (c1) { '\'' => QUOTE, '`' => GRAVE, ',' => COMMA, else => unreachable, }; // fast-path to avoid subr const c = try p.readNoEof("quote expression"); if (isBareChar(c) or c == '\\') { return p.jump(next, p.cons(q, try parseBareString(p, c))); } try p.push(next); p.context.val = q; p.unread(c); return p.subr(.parseDatum, .endQuoteExpr); } fn endQuoteExpr(p: *Parser) !void { if (p.result.eq(VOID)) { return p.err(.InvalidCharacter, "quote expression datum"); } return p.retval(p.cons(p.context.val, p.result)); } // Helpers fn checkBlanks(p: *Parser, c: u8) !enum { yes, skip_unit, no } { switch (c) { '\t'...'\r', ' ' => return .yes, ';' => { const c2 = try p.read() orelse return .yes; if (c2 == '\n') return .yes; if (c2 == '~') return .skip_unit; while (try p.read()) |c3| if (c3 == '\n') break; return .yes; }, else => return .no, } } fn isBareChar(c: u8) bool { return switch (c) { // zig fmt: off 'a'...'z' , 'A'...'Z' , '0'...'9' , '!' , '$' , '%' , '&' , '*' , '+' , '-' , '/' , '<' , '=' , '>' , '?' , '@' , '^' , '_' , '~' => true, // zig fmt: on else => false, }; } fn isBareEsc(c: u8) bool { return switch (c) { 33...126 => true, else => false, }; } fn parseHex(p: *Parser, T: type, comptime emsg: []const u8) !T { var uc: T = undefined; const c1 = try p.readNoEof(emsg); uc = try parseHexDigit(p, c1, emsg); while (try p.read()) |c| { if (!std.ascii.isHex(c)) { p.unread(c); return uc; } const shl = std.math.shlExact; uc = shl(T, uc, 4) catch return p.err(.OutOfRange, emsg); uc |= try parseHexDigit(p, c, emsg); } return uc; } fn parseHexByte(p: *Parser, comptime emsg: []const u8) !u8 { const h1 = try p.readNoEof(emsg); const h2 = try p.readNoEof(emsg); const hi = try parseHexDigit(p, h1, emsg); const lo = try parseHexDigit(p, h2, emsg); return hi << 4 | lo; } fn parseHexDigit(p: *Parser, c: u8, comptime emsg: []const u8) !u8 { return switch (c) { '0'...'9' => c - '0', 'A'...'F' => c - 'A' + 10, 'a'...'f' => c - 'a' + 10, else => p.err(.InvalidCharacter, emsg), }; }