diff options
Diffstat (limited to 'src/zisp/io/Parser.zig')
| -rw-r--r-- | src/zisp/io/Parser.zig | 847 |
1 files changed, 847 insertions, 0 deletions
diff --git a/src/zisp/io/Parser.zig b/src/zisp/io/Parser.zig new file mode 100644 index 0000000..08bf8ba --- /dev/null +++ b/src/zisp/io/Parser.zig @@ -0,0 +1,847 @@ +// === Syntax === +// +// See docs/parser.md and spec/syntax.bnf to understand the syntax. +// +// +// === Trampolining strategy === +// +// Instead of using recursion directly, the parser is written in a "trampoline" +// style, which ensures that parse depth is not limited by stack size. +// +// All state and context is passed around via a struct pointer. The parser has +// a main loop, which calls a function as dictated by parser.context.next, and +// the function may update the state to have another function called next. +// +// If a function wants to call a recursive subroutine, it pushes some of the +// current context onto a stack, including what function the subroutine should +// return to, and then updates the state to instruct the main loop to call one +// of the entry point subroutines. +// +// If a function wants to make the parser return, either from a subroutine or +// from the main loop, it sets the .result field, and tries to pop the saved +// context. If the context stack was empty, the main loop returns. +// +// +// === Buffering === +// +// For efficiency, call the parser on an input stream with implicit buffering. +// +// The parser does not use its own buffer, beyond one character that may be +// written back into the unread_char field, which is checked at the end to +// ensure it's nothing other than a trailing blank or comment. +// +// This lack of buffering is to ensure that the parser never reads more bytes +// from the input than what it needs to parse a datum. Consider the input: +// +// (a b c) (x y z) +// +// If we used proper buffering, like reading up to 4K bytes per read, then the +// whole stream would be consumed at once before it's parsed. Then, the parser +// would return the first datum, and the rest of the stream would be lost. The +// parser would need some way to reset the input stream's read head to the end +// of the first datum, but not all stream types may support this. +// +// Although in a scenario like this it would be wise to create a single Parser +// instance to call multiple times (in which case it could continue using its +// internal buffer from where it left), this may not always be practical. +// +// One could even have an input stream with Zisp s-expressions and other data +// intertwined, in which case this lack of buffering is also crucial, since one +// needs to alternate between calling the Zisp parser and some other parser on +// the same input stream. +// + +const builtin = @import("builtin"); +const std = @import("std"); + +const lib = @import("../lib.zig"); +const value = @import("../value.zig"); + +const List = std.ArrayListUnmanaged; + +const Value = value.Value; + +const Parser = @This(); + +const is_test = builtin.is_test; +const is_debug = builtin.mode == .Debug; +const detailed_debug = false; + +// zig fmt: off +const DOT = value.rune.pack("DOT"); +const COLON = value.rune.pack("COLON"); +const JOIN = value.rune.pack("JOIN"); +const LABEL = value.rune.pack("LABEL"); +const HASH = value.rune.pack("HASH"); +const QUOTE = value.rune.pack("QUOTE"); +const GRAVE = value.rune.pack("GRAVE"); +const COMMA = value.rune.pack("COMMA"); +const SQUARE = value.rune.pack("SQUARE"); +const BRACE = value.rune.pack("BRACE"); +const VOID = value.rune.packForced(""); +const LSTAIL = value.sstr.pack("."); +// zig fmt: on + +// We could implement an optimization where we swap in a dummy cons when the +// parser is handling a commented-out datum, but this would require changes to +// the algorithm and doesn't seem very important, so it's not implemented. + +const Cons = *const fn (v1: Value, v2: Value) Value; + +fn dummyCons(v1: Value, v2: Value) Value { + _ = v1; + _ = v2; + return value.undef; +} + +const real_cons = &value.pair.cons; +const fake_cons = &dummyCons; + +pub const Error = enum { + InvalidCharacter, + UnclosedString, + UnexpectedEof, + UnicodeError, + OutOfRange, + ReadError, +}; + +pub const Context = struct { + // What to do next. + next: ?Fn = undefined, + // For storing a context value, like accumulated list elements. + val: Value = undefined, + // For storing a context char, like list opening bracket. + char: u8 = undefined, +}; + +pub const Alloc = struct { + stack: std.mem.Allocator, + chars: std.mem.Allocator, +}; + +alloc: Alloc, +input: std.io.AnyReader = undefined, +context: Context = .{}, +stack: List(Context) = undefined, +chars: List(u8) = undefined, +cons: Cons = real_cons, +result: Value = undefined, +unread_char: ?u8 = null, +err_msg: []const u8 = undefined, + +pub fn init( + alloc: Alloc, + init_stack_capacity: usize, + init_chars_capacity: usize, +) !Parser { + var p: Parser = .{ .alloc = alloc }; + p.stack = try .initCapacity(alloc.stack, init_stack_capacity); + p.chars = try .initCapacity(alloc.chars, init_chars_capacity); + return p; +} + +pub fn deinit(p: *Parser) void { + p.stack.deinit(p.alloc.stack); + p.chars.deinit(p.alloc.chars); +} + +// +// Input reading & unreading +// + +fn read(p: *Parser) !?u8 { + if (is_debug and p.unread_char != null) { + std.debug.panic( + "Called read() while there was an unused character! '{c}'", + .{p.unread_char.?}, + ); + } + const c = p.input.readByte() catch |e| switch (e) { + error.EndOfStream => return null, + else => return p.err(.ReadError, "???"), + }; + if (detailed_debug) { + std.debug.print("{c}", .{c}); + } + return c; +} + +fn readNoEof(p: *Parser, comptime emsg: []const u8) !u8 { + return if (try p.read()) |c| c else p.err(.UnexpectedEof, emsg); +} + +fn unread(p: *Parser, c: u8) void { + p.unread_char = c; +} + +fn getUnread(p: *Parser) ?u8 { + const c = p.unread_char orelse return null; + p.unread_char = null; + return c; +} + +// +// String accumulation & creation +// + +fn addChar(p: *Parser, c: u8) !void { + try p.chars.append(p.alloc.chars, c); +} + +fn getBareString(p: *Parser) Value { + defer p.chars.clearRetainingCapacity(); + return if (p.chars.items.len <= 6) + value.sstr.pack(p.chars.items) + else + value.istr.intern(p.chars.items, false); +} + +fn getQuotedString(p: *Parser) Value { + defer p.chars.clearRetainingCapacity(); + return if (p.chars.items.len <= 6) + value.sstr.packQuoted(p.chars.items) + else + value.istr.intern(p.chars.items, true); +} + +fn getRune(p: *Parser) Value { + defer p.chars.clearRetainingCapacity(); + return value.rune.pack(p.chars.items); +} + +// +// Stack management & control flow +// + +const Fn = enum { + parseUnit, + parseDatum, + endUnit, + returnContext, + endFirstDatum, + endJoinDatum, + parseJoin, + parseHashDatum, + endHashDatum, + parseRuneDatum, + endRuneDatum, + endLabelDatum, + continueList, + endImproperList, + closeImproperList, + endQuoteExpr, +}; + +fn call(p: *Parser, f: Fn) !void { + try switch (f) { + .parseUnit => parseUnit(p), + .parseDatum => parseDatum(p), + .endUnit => endUnit(p), + .returnContext => returnContext(p), + .endFirstDatum => endFirstDatum(p), + .endJoinDatum => endJoinDatum(p), + .parseJoin => parseJoin(p), + .parseHashDatum => parseHashDatum(p), + .endHashDatum => endHashDatum(p), + .parseRuneDatum => parseRuneDatum(p), + .endRuneDatum => endRuneDatum(p), + .endLabelDatum => endLabelDatum(p), + .continueList => continueList(p), + .endImproperList => endImproperList(p), + .closeImproperList => closeImproperList(p), + .endQuoteExpr => endQuoteExpr(p), + }; +} + +pub fn run(p: *Parser, input: std.io.AnyReader) !Value { + p.input = input; + p.context.next = .parseUnit; + while (p.context.next) |next| { + if (detailed_debug) printStack(p); + try p.call(next); + } + if (p.unread_char) |_| { + return p.err(.InvalidCharacter, "top-level"); + } + return p.result; +} + +fn printStack(p: *Parser) void { + const stack = p.stack.items; + std.debug.print("\n\n{}:{any} ctx:'{c}' unread:'{c}' \n", .{ + stack.len, + p.context.next, + p.context.char, + p.unread_char orelse '_', + }); + if (stack.len > 0) { + var i = stack.len; + while (i > 0) : (i -= 1) { + const prev = stack[i - 1]; + std.debug.print("{}:{any} ctx:'{c}'\n", .{ + i - 1, + prev.next, + prev.char, + }); + } + } +} + +fn err( + p: *Parser, + comptime e: Error, + comptime msg: []const u8, +) error{ParseError} { + p.err_msg = @tagName(e) ++ " at: " ++ msg; + return error.ParseError; +} + +fn push(p: *Parser, next: Fn) !void { + try p.stack.append(p.alloc.stack, .{ .next = next }); +} + +fn pushContext(p: *Parser, next: Fn) !void { + try p.stack.append(p.alloc.stack, .{ + .next = next, + .val = p.context.val, + .char = p.context.char, + }); +} + +fn ret(p: *Parser) void { + if (p.stack.pop()) |c| { + p.context = c; + } else { + p.context.next = null; + } +} + +fn subr(p: *Parser, start: Fn, next: Fn) !void { + try p.pushContext(next); + p.context.next = start; +} + +fn jump(p: *Parser, next: Fn, val: ?Value) void { + if (val) |v| p.result = v; + p.context.next = next; +} + +fn abort(p: *Parser, next: Fn, unread_c: u8) void { + p.result = VOID; + p.unread_char = unread_c; + p.context.next = next; +} + +fn retval(p: *Parser, val: Value) void { + p.result = val; + p.ret(); +} + +// +// Start parser functions +// + +fn parseUnit(p: *Parser) !void { + var c1 = p.getUnread() orelse try p.read(); + while (c1) |c| : (c1 = try p.read()) { + switch (try checkBlanks(p, c)) { + .yes => {}, + .skip_unit => try p.push(.parseUnit), + .no => { + p.unread(c); + return p.subr(.parseDatum, .endUnit); + }, + } + } + return p.retval(value.eof.eof); +} + +fn endUnit(p: *Parser) !void { + const c = p.getUnread() orelse return p.ret(); + switch (try checkBlanks(p, c)) { + .yes => {}, + .skip_unit => return skipUnitAndReturn(p), + .no => p.unread(c), + } + return p.ret(); +} + +fn skipUnitAndReturn(p: *Parser) !void { + p.context.val = p.result; + return p.subr(.parseUnit, .returnContext); +} + +fn returnContext(p: *Parser) !void { + return p.retval(p.context.val); +} + +fn parseDatum(p: *Parser) !void { + return parseOneDatum(p, p.getUnread().?, .endFirstDatum); +} + +fn endFirstDatum(p: *Parser) !void { + if (p.result.eq(VOID)) { + return p.ret(); + } + return parseJoin(p); +} + +fn parseJoin(p: *Parser) !void { + const c = p.getUnread() orelse try p.read() orelse return p.ret(); + switch (c) { + '.', ':' => { + p.context.char = c; + p.unread(try p.readNoEof("join datum")); + }, + else => { + p.context.char = 0; + p.unread(c); + }, + } + p.context.val = p.result; + return p.subr(.parseDatum, .endJoinDatum); +} + +fn endJoinDatum(p: *Parser) !void { + const prev = p.context.val; + const join = p.context.char; + if (p.result.eq(VOID)) { + if (join == 0) { + return p.retval(prev); + } else { + return p.err(.InvalidCharacter, "join datum"); + } + } + const rune = switch (join) { + 0 => JOIN, + '.' => DOT, + ':' => COLON, + else => unreachable, + }; + const joined = p.cons(rune, p.cons(prev, p.result)); + return p.jump(.parseJoin, joined); +} + +fn parseOneDatum(p: *Parser, c: u8, next: Fn) !void { + if (isBareChar(c) or c == '.') { + return p.jump(next, try parseBareString(p, c)); + } + return parseCladDatum(p, c, next); +} + +fn parseBareString(p: *Parser, c: u8) !Value { + const allow_dots = std.ascii.isDigit(c) or switch (c) { + '.', '+', '-' => true, + else => false, + }; + try p.addChar(c); + return parseBareStringRest(p, allow_dots); +} + +fn parseBareEscString(p: *Parser) !Value { + try p.addChar(try parseBareEsc(p)); + return parseBareStringRest(p, false); +} + +fn parseBareStringRest(p: *Parser, allow_dots: bool) !Value { + while (try p.read()) |c| { + if (isBareChar(c) or (allow_dots and c == '.')) { + try p.addChar(c); + } else if (c == '\\') { + try p.addChar(try parseBareEsc(p)); + } else { + p.unread(c); + break; + } + } + return p.getBareString(); +} + +fn parseBareEsc(p: *Parser) !u8 { + const c = try p.readNoEof("bare escape"); + if (isBareEsc(c)) { + return c; + } else { + return p.err(.InvalidCharacter, "bare escape"); + } +} + +fn parseCladDatum(p: *Parser, c: u8, next: Fn) !void { + if (c == '\\') { + return p.jump(next, try parseBareEscString(p)); + } + if (c == '"') { + return p.jump(next, try parseQuotedString(p, '"')); + } + if (c == '|') { + return p.jump(next, try parseQuotedString(p, '|')); + } + return switch (c) { + '#' => parseHashExpression(p, next), + '(', '[', '{' => parseList(p, c, next), + '\'', '`', ',' => parseQuoteExpr(p, c, next), + else => p.abort(next, c), + }; +} + +fn parseQuotedString(p: *Parser, close: u8) !Value { + while (try p.read()) |c| { + if (c == close) { + return p.getQuotedString(); + } + if (c != '\\') { + try p.addChar(c); + } else { + try parseQuotedEsc(p, close); + } + } + return error.UnclosedString; +} + +fn parseQuotedEsc(p: *Parser, close: u8) !void { + const c = try p.readNoEof("quoted escape"); + if (c == close) { + return p.addChar(close); + } + if (c == 'u') { + return parseUniHexHandleErrors(p); + } + try p.addChar(switch (c) { + '\\' => c, + '0' => 0, + 'a' => 7, + 'b' => 8, + 't' => 9, + 'n' => 10, + 'v' => 11, + 'f' => 12, + 'r' => 13, + 'e' => 27, + 'x' => try parseHexByte(p, "hex escape"), + else => return p.err(.InvalidCharacter, "quoted escape"), + }); +} + +fn parseUniHexHandleErrors(p: *Parser) !void { + return parseUniHex(p) catch |e| switch (e) { + error.Utf8CannotEncodeSurrogateHalf => p.err( + .UnicodeError, + "unicode escape", + ), + else => e, + }; +} + +fn parseUniHex(p: *Parser) !void { + const msg = "unicode escape"; + + if (try p.readNoEof(msg) != '{') { + return p.err(.InvalidCharacter, msg); + } + + const uc = try parseHex(p, u21, msg); + const c = p.getUnread() orelse return p.err(.UnexpectedEof, msg); + if (c != '}') { + return p.err(.InvalidCharacter, msg); + } + + const n = try std.unicode.utf8CodepointSequenceLength(uc); + const buf = try p.chars.addManyAsSlice(p.alloc.chars, n); + _ = try std.unicode.utf8Encode(uc, buf); +} + +fn parseHashExpression(p: *Parser, next: Fn) !void { + const c = try p.readNoEof("hash expression"); + if (try checkBlanks(p, c) != .no) { + return p.err(.InvalidCharacter, "hash expression"); + } + if (std.ascii.isAlphabetic(c)) { + const r = try parseRune(p, c); + return parseRuneEnd(p, r, next); + } + if (c == '%') { + const l = try parseLabel(p); + return parseLabelEnd(p, l, next); + } + p.unread(c); + return p.subr(.parseHashDatum, next); +} + +fn parseHashDatum(p: *Parser) !void { + return parseCladDatum(p, p.getUnread().?, .endHashDatum); +} + +fn endHashDatum(p: *Parser) !void { + if (p.result.eq(VOID)) { + return p.err(.InvalidCharacter, "hash datum"); + } + return p.retval(p.cons(HASH, p.result)); +} + +fn parseRune(p: *Parser, c1: u8) !Value { + try p.addChar(c1); + var len: usize = 1; + while (try p.read()) |c| : (len += 1) { + if (len == 6 or !std.ascii.isAlphanumeric(c)) { + p.unread(c); + return p.getRune(); + } + try p.addChar(c); + } + return p.getRune(); +} + +fn parseRuneEnd(p: *Parser, r: Value, next: Fn) !void { + const c = p.getUnread() orelse return p.jump(next, r); + if (c == '\\') { + return p.jump(next, p.cons(r, try parseBareString(p, c))); + } + if (c == '"') { + return p.jump(next, p.cons(r, try parseQuotedString(p, '"'))); + } + if (c == '|') { + return p.jump(next, p.cons(r, try parseQuotedString(p, '|'))); + } + p.unread(c); + switch (c) { + '#', '(', '[', '{', '\'', '`', ',' => { + try p.push(next); + p.context.val = r; + // Use jump to prevent recursion. + return p.jump(.parseRuneDatum, null); + }, + else => return p.jump(next, r), + } +} + +fn parseRuneDatum(p: *Parser) !void { + return parseCladDatum(p, p.getUnread().?, .endRuneDatum); +} + +fn endRuneDatum(p: *Parser) !void { + if (p.result.eq(VOID)) { + p.retval(p.context.val); + } + return p.retval(p.cons(p.context.val, p.result)); +} + +fn parseLabel(p: *Parser) !Value { + const label = try parseHex(p, u48, "datum label"); + return value.fixnum.pack(label); +} + +fn parseLabelEnd(p: *Parser, l: Value, next: Fn) !void { + const c = p.getUnread() orelse return p.err(.UnexpectedEof, "datum label"); + if (c == '%') { + return p.jump(next, p.cons(LABEL, l)); + } + if (c == '=') { + try p.push(next); + p.context.val = l; + return p.subr(.parseDatum, .endLabelDatum); + } + return p.err(.InvalidCharacter, "datum label"); +} + +fn endLabelDatum(p: *Parser) !void { + if (p.result.eq(VOID)) { + return p.err(.InvalidCharacter, "label datum"); + } + return p.retval(p.cons(LABEL, p.cons(p.context.val, p.result))); +} + +fn parseList(p: *Parser, open: u8, next: Fn) !void { + const head = switch (open) { + '(' => value.nil.nil, + '[' => p.cons(SQUARE, value.nil.nil), + '{' => p.cons(BRACE, value.nil.nil), + else => unreachable, + }; + const close: u8 = switch (open) { + '(' => ')', + '[' => ']', + '{' => '}', + else => unreachable, + }; + while (try p.read()) |c| { + if (c == close) { + return p.jump(next, head); + } + switch (try checkBlanks(p, c)) { + .yes => {}, + .skip_unit => { + try listParserSetup(p, head, close, next); + return p.subr(.parseUnit, .parseUnit); + }, + .no => { + try listParserSetup(p, head, close, next); + p.unread(c); + return p.jump(.parseDatum, null); + }, + } + } + return p.err(.UnexpectedEof, "list"); +} + +fn listParserSetup(p: *Parser, head: Value, close: u8, next: Fn) !void { + try p.push(next); + p.context.val = head; + p.context.char = close; + try p.pushContext(.continueList); +} + +fn continueList(p: *Parser) !void { + const close = p.context.char; + + if (p.result.eq(VOID)) { + const c = p.getUnread().?; + if (c == close) { + return endList(p); + } + return p.err(.InvalidCharacter, "list"); + } + + if (p.result.eq(LSTAIL)) { + return p.subr(.parseUnit, .endImproperList); + } + + p.context.val = p.cons(p.result, p.context.val); + + var c1 = p.getUnread() orelse try p.read(); + while (c1) |c| : (c1 = try p.read()) { + if (c == close) { + return endList(p); + } + switch (try checkBlanks(p, c)) { + .yes => {}, + .skip_unit => { + try p.pushContext(.continueList); + return p.subr(.parseUnit, .parseUnit); + }, + .no => { + p.unread(c); + return p.subr(.parseDatum, .continueList); + }, + } + } + return p.err(.UnexpectedEof, "list"); +} + +fn endList(p: *Parser) !void { + return p.retval(lib.list.reverse(p.context.val)); +} + +fn endImproperList(p: *Parser) !void { + if (p.result.eq(VOID)) { + return p.err(.InvalidCharacter, "list tail"); + } + p.context.val = lib.list.reverseWithTail(p.context.val, p.result); + return closeImproperList(p); +} + +fn closeImproperList(p: *Parser) !void { + const result = p.context.val; + const close = p.context.char; + var c1 = p.getUnread() orelse try p.read(); + while (c1) |c| : (c1 = try p.read()) { + if (c == close) { + return p.retval(result); + } + switch (try checkBlanks(p, c)) { + .yes => {}, + .skip_unit => return p.subr(.parseUnit, .closeImproperList), + .no => return p.err(.InvalidCharacter, "after list tail"), + } + } + return p.err(.UnexpectedEof, "after list tail"); +} + +fn parseQuoteExpr(p: *Parser, c1: u8, next: Fn) !void { + const q = switch (c1) { + '\'' => QUOTE, + '`' => GRAVE, + ',' => COMMA, + else => unreachable, + }; + + try p.push(next); + p.context.val = q; + p.unread(try p.readNoEof("quote expression")); + return p.subr(.parseDatum, .endQuoteExpr); +} + +fn endQuoteExpr(p: *Parser) !void { + if (p.result.eq(VOID)) { + return p.err(.InvalidCharacter, "quote expression datum"); + } + return p.retval(p.cons(p.context.val, p.result)); +} + +// Helpers + +fn checkBlanks(p: *Parser, c: u8) !enum { yes, skip_unit, no } { + switch (c) { + '\t'...'\r', ' ' => return .yes, + ';' => { + const c2 = try p.read() orelse return .yes; + if (c2 == '\n') return .yes; + if (c2 == '~') return .skip_unit; + while (try p.read()) |c3| if (c3 == '\n') break; + return .yes; + }, + else => return .no, + } +} + +fn isBareChar(c: u8) bool { + return switch (c) { + // zig fmt: off + 'a'...'z' , 'A'...'Z' , '0'...'9' , '!' , '$' , '%' , '&' , '*' , + '+' , '-' , '/' , '<' , '=' , '>' , '?' , '@' , '^' , '_' , '~' => true, + // zig fmt: on + else => false, + }; +} + +fn isBareEsc(c: u8) bool { + return switch (c) { + 33...126 => true, + else => false, + }; +} + +fn parseHex(p: *Parser, T: type, comptime emsg: []const u8) !T { + var uc: T = undefined; + + const c1 = try p.readNoEof(emsg); + uc = try parseHexDigit(p, c1, emsg); + + while (try p.read()) |c| { + if (!std.ascii.isHex(c)) { + p.unread(c); + return uc; + } + const shl = std.math.shlExact; + uc = shl(T, uc, 4) catch return p.err(.OutOfRange, emsg); + uc |= try parseHexDigit(p, c, emsg); + } + return uc; +} + +fn parseHexByte(p: *Parser, comptime emsg: []const u8) !u8 { + const h1 = try p.readNoEof(emsg); + const h2 = try p.readNoEof(emsg); + const hi = try parseHexDigit(p, h1, emsg); + const lo = try parseHexDigit(p, h2, emsg); + return hi << 4 | lo; +} + +fn parseHexDigit(p: *Parser, c: u8, comptime emsg: []const u8) !u8 { + return switch (c) { + '0'...'9' => c - '0', + 'A'...'F' => c - 'A' + 10, + 'a'...'f' => c - 'a' + 10, + else => p.err(.InvalidCharacter, emsg), + }; +} |
