diff options
Diffstat (limited to 'src/libzisp/io/Parser.zig')
| -rw-r--r-- | src/libzisp/io/Parser.zig | 870 |
1 files changed, 0 insertions, 870 deletions
diff --git a/src/libzisp/io/Parser.zig b/src/libzisp/io/Parser.zig deleted file mode 100644 index 6684104..0000000 --- a/src/libzisp/io/Parser.zig +++ /dev/null @@ -1,870 +0,0 @@ -// === Syntax === -// -// See docs/parser.md and spec/syntax.bnf to understand the syntax. -// -// -// === Trampolining strategy === -// -// Instead of using recursion directly, the parser is written in a "trampoline" -// style, which ensures that parse depth is not limited by stack size. -// -// All state and context is passed around via a struct pointer. The parser has -// a main loop, which calls a function as dictated by parser.context.next, and -// the function may update the state to have another function called next. -// -// If a function wants to call a recursive subroutine, it pushes some of the -// current context onto a stack, including what function the subroutine should -// return to, and then updates the state to instruct the main loop to call one -// of the entry point subroutines. -// -// If a function wants to make the parser return, either from a subroutine or -// from the main loop, it sets the .result field, and tries to pop the saved -// context. If the context stack was empty, the main loop returns. -// -// -// === Buffering === -// -// For efficiency, call the parser on an input stream with implicit buffering. -// -// The parser does not use its own buffer, beyond one character that may be -// written back into the unread_char field, which is checked at the end to -// ensure it's nothing other than a trailing blank or comment. -// -// This lack of buffering is to ensure that the parser never reads more bytes -// from the input than what it needs to parse a datum. Consider the input: -// -// (a b c) (x y z) -// -// If we used proper buffering, like reading up to 4K bytes per read, then the -// whole stream would be consumed at once before it's parsed. Then, the parser -// would return the first datum, and the rest of the stream would be lost. The -// parser would need some way to reset the input stream's read head to the end -// of the first datum, but not all stream types may support this. -// -// Although in a scenario like this it would be wise to create a single Parser -// instance to call multiple times (in which case it could continue using its -// internal buffer from where it left), this may not always be practical. -// -// One could even have an input stream with Zisp s-expressions and other data -// intertwined, in which case this lack of buffering is also crucial, since one -// needs to alternate between calling the Zisp parser and some other parser on -// the same input stream. -// - -const builtin = @import("builtin"); -const std = @import("std"); - -const lib = @import("../lib.zig"); -const value = @import("../value.zig"); - -const List = std.ArrayListUnmanaged; - -const Value = value.Value; - -const Parser = @This(); - -const is_test = builtin.is_test; -const is_debug = builtin.mode == .Debug; -const detailed_debug = false; - -// zig fmt: off -const DOT = value.rune.pack("DOT"); -const COLON = value.rune.pack("COLON"); -const JOIN = value.rune.pack("JOIN"); -const LABEL = value.rune.pack("LABEL"); -const HASH = value.rune.pack("HASH"); -const QUOTE = value.rune.pack("QUOTE"); -const GRAVE = value.rune.pack("GRAVE"); -const COMMA = value.rune.pack("COMMA"); -const SQUARE = value.rune.pack("SQUARE"); -const BRACE = value.rune.pack("BRACE"); -const VOID = value.rune.packForced(""); -const LSTAIL = value.sstr.pack("."); -// zig fmt: on - -// We could implement an optimization where we swap in a dummy cons when the -// parser is handling a commented-out datum, but this would require changes to -// the algorithm and doesn't seem very important, so it's not implemented. - -const Cons = *const fn (v1: Value, v2: Value) Value; - -fn dummyCons(v1: Value, v2: Value) Value { - _ = v1; - _ = v2; - return value.undef; -} - -const real_cons = &value.pair.cons; -const fake_cons = &dummyCons; - -pub const ParseError = enum { - InvalidCharacter, - UnclosedString, - UnexpectedEof, - UnicodeError, - RuneTooLong, - OutOfMemory, - OutOfRange, - ReadError, -}; - -pub const Context = struct { - // What to do next. - next: ?Fn = undefined, - // For storing a context value, like accumulated list elements. - val: Value = undefined, - // For storing a context char, like list opening bracket. - char: u8 = undefined, -}; - -// parameters -stack_alloc: std.mem.Allocator, -chars_alloc: std.mem.Allocator, - -// state -input: std.io.AnyReader = undefined, -context: Context = .{}, -stack: List(Context) = undefined, -chars: List(u8) = undefined, -cons: Cons = real_cons, -result: Value = undefined, -unread_char: ?u8 = null, -err_msg: []const u8 = undefined, - -pub fn init( - stack_alloc: std.mem.Allocator, - chars_alloc: std.mem.Allocator, - init_stack_mem: usize, - init_chars_mem: usize, -) !Parser { - var p: Parser = .{ - .stack_alloc = stack_alloc, - .chars_alloc = chars_alloc, - }; - const csize = @sizeOf(Context); - p.stack = try .initCapacity(stack_alloc, init_stack_mem / csize); - p.chars = try .initCapacity(chars_alloc, init_chars_mem); - return p; -} - -pub fn deinit(p: *Parser) void { - p.stack.deinit(p.stack_alloc); - p.chars.deinit(p.chars_alloc); -} - -// -// Input reading & unreading -// - -fn read(p: *Parser) !?u8 { - if (is_debug and p.unread_char != null) { - @panic("Called read() while there was an unused character!"); - } - const c = p.input.readByte() catch |e| switch (e) { - error.EndOfStream => return null, - else => return p.err(.ReadError, "???"), - }; - if (detailed_debug) { - std.debug.print("{c}", .{c}); - } - return c; -} - -fn readNoEof(p: *Parser, comptime emsg: []const u8) !u8 { - return if (try p.read()) |c| c else p.err(.UnexpectedEof, emsg); -} - -fn unread(p: *Parser, c: u8) void { - p.unread_char = c; -} - -fn getUnread(p: *Parser) ?u8 { - const c = p.unread_char orelse return null; - p.unread_char = null; - return c; -} - -// -// String accumulation & creation -// - -fn addChar(p: *Parser, c: u8) !void { - try p.chars.append(p.chars_alloc, c); -} - -fn getBareString(p: *Parser) Value { - defer p.chars.clearRetainingCapacity(); - return if (p.chars.items.len <= 6) - value.sstr.pack(p.chars.items) - else - value.istr.intern(p.chars.items, false); -} - -fn getQuotedString(p: *Parser) Value { - defer p.chars.clearRetainingCapacity(); - return if (p.chars.items.len <= 6) - value.sstr.packQuoted(p.chars.items) - else - value.istr.intern(p.chars.items, true); -} - -fn getRune(p: *Parser) Value { - defer p.chars.clearRetainingCapacity(); - return value.rune.pack(p.chars.items); -} - -// -// Stack management & control flow -// - -const Fn = enum { - parseUnit, - parseDatum, - endUnit, - returnContext, - endFirstDatum, - endJoinDatum, - parseJoin, - parseHashDatum, - endHashDatum, - parseRuneDatum, - endRuneDatum, - endLabelDatum, - continueList, - endImproperList, - closeImproperList, - endQuoteExpr, -}; - -fn call(p: *Parser, f: Fn) !void { - try switch (f) { - .parseUnit => parseUnit(p), - .parseDatum => parseDatum(p), - .endUnit => endUnit(p), - .returnContext => returnContext(p), - .endFirstDatum => endFirstDatum(p), - .endJoinDatum => endJoinDatum(p), - .parseJoin => parseJoin(p), - .parseHashDatum => parseHashDatum(p), - .endHashDatum => endHashDatum(p), - .parseRuneDatum => parseRuneDatum(p), - .endRuneDatum => endRuneDatum(p), - .endLabelDatum => endLabelDatum(p), - .continueList => continueList(p), - .endImproperList => endImproperList(p), - .closeImproperList => closeImproperList(p), - .endQuoteExpr => endQuoteExpr(p), - }; -} - -pub fn run(p: *Parser, input: std.io.AnyReader) !Value { - p.input = input; - p.context.next = .parseUnit; - while (p.context.next) |next| { - if (detailed_debug) printStack(p); - try p.call(next); - } - if (p.unread_char) |_| { - return p.err(.InvalidCharacter, "top-level"); - } - return p.result; -} - -fn printStack(p: *Parser) void { - const stack = p.stack.items; - std.debug.print("\n\n{}:{any} ctx:'{c}' unread:'{c}' \n", .{ - stack.len, - p.context.next, - p.context.char, - p.unread_char orelse '_', - }); - if (stack.len > 0) { - var i = stack.len; - while (i > 0) : (i -= 1) { - const prev = stack[i - 1]; - std.debug.print("{}:{any} ctx:'{c}'\n", .{ - i - 1, - prev.next, - prev.char, - }); - } - } -} - -fn err( - p: *Parser, - comptime e: ParseError, - comptime msg: []const u8, -) error{ParseError} { - p.err_msg = @tagName(e) ++ " at: " ++ msg; - return error.ParseError; -} - -fn push(p: *Parser, next: Fn) !void { - try p.stack.append(p.stack_alloc, .{ .next = next }); -} - -fn pushContext(p: *Parser, next: Fn) !void { - try p.stack.append(p.stack_alloc, .{ - .next = next, - .val = p.context.val, - .char = p.context.char, - }); -} - -fn ret(p: *Parser) void { - if (p.stack.pop()) |c| { - p.context = c; - } else { - p.context.next = null; - } -} - -fn subr(p: *Parser, start: Fn, next: Fn) !void { - try p.pushContext(next); - p.context.next = start; -} - -fn jump(p: *Parser, next: Fn, val: ?Value) void { - if (val) |v| p.result = v; - p.context.next = next; -} - -fn abort(p: *Parser, next: Fn, unread_c: u8) void { - p.result = VOID; - p.unread_char = unread_c; - p.context.next = next; -} - -fn retval(p: *Parser, val: Value) void { - p.result = val; - p.ret(); -} - -// -// Start parser functions -// - -fn parseUnit(p: *Parser) !void { - var c1 = p.getUnread() orelse try p.read(); - while (c1) |c| : (c1 = try p.read()) { - switch (try checkBlanks(p, c)) { - .yes => {}, - .skip_unit => try p.push(.parseUnit), - .no => { - p.unread(c); - return p.subr(.parseDatum, .endUnit); - }, - } - } - return p.retval(value.eof.eof); -} - -fn endUnit(p: *Parser) !void { - const c = p.getUnread() orelse return p.ret(); - switch (try checkBlanks(p, c)) { - .yes => {}, - .skip_unit => return skipUnitAndReturn(p), - .no => p.unread(c), - } - return p.ret(); -} - -fn skipUnitAndReturn(p: *Parser) !void { - p.context.val = p.result; - return p.subr(.parseUnit, .returnContext); -} - -fn returnContext(p: *Parser) !void { - return p.retval(p.context.val); -} - -fn parseDatum(p: *Parser) !void { - return parseOneDatum(p, p.getUnread().?, .endFirstDatum); -} - -fn endFirstDatum(p: *Parser) !void { - if (p.result.eq(VOID)) { - return p.ret(); - } - return parseJoin(p); -} - -fn parseJoin(p: *Parser) !void { - const c = p.getUnread() orelse try p.read() orelse return p.ret(); - switch (c) { - '.', ':' => { - p.context.char = c; - p.unread(try p.readNoEof("join datum")); - }, - else => { - p.context.char = 0; - p.unread(c); - }, - } - p.context.val = p.result; - return p.subr(.parseDatum, .endJoinDatum); -} - -fn endJoinDatum(p: *Parser) !void { - const prev = p.context.val; - const join = p.context.char; - if (p.result.eq(VOID)) { - if (join == 0) { - return p.retval(prev); - } else { - return p.err(.InvalidCharacter, "join datum"); - } - } - const rune = switch (join) { - 0 => JOIN, - '.' => DOT, - ':' => COLON, - else => unreachable, - }; - const joined = p.cons(rune, p.cons(prev, p.result)); - return p.jump(.parseJoin, joined); -} - -fn parseOneDatum(p: *Parser, c: u8, next: Fn) !void { - if (isBareChar(c) or c == '.') { - return p.jump(next, try parseBareString(p, c)); - } - return parseCladDatum(p, c, next); -} - -fn parseBareString(p: *Parser, c: u8) !Value { - const allow_dots = std.ascii.isDigit(c) or switch (c) { - '.', '+', '-' => true, - else => false, - }; - try p.addChar(c); - return parseBareStringRest(p, allow_dots); -} - -fn parseBareEscString(p: *Parser) !Value { - try p.addChar(try parseBareEsc(p)); - return parseBareStringRest(p, false); -} - -fn parseBareStringRest(p: *Parser, allow_dots: bool) !Value { - while (try p.read()) |c| { - if (isBareChar(c) or (allow_dots and c == '.')) { - try p.addChar(c); - } else if (c == '\\') { - try p.addChar(try parseBareEsc(p)); - } else { - p.unread(c); - break; - } - } - return p.getBareString(); -} - -fn parseBareEsc(p: *Parser) !u8 { - const c = try p.readNoEof("bare escape"); - if (isBareEsc(c)) { - return c; - } else { - return p.err(.InvalidCharacter, "bare escape"); - } -} - -fn parseCladDatum(p: *Parser, c: u8, next: Fn) !void { - if (c == '\\') { - return p.jump(next, try parseBareEscString(p)); - } - if (c == '"') { - return p.jump(next, try parseQuotedString(p, '"')); - } - if (c == '|') { - return p.jump(next, try parseQuotedString(p, '|')); - } - return switch (c) { - '#' => parseHashExpression(p, next), - '(', '[', '{' => parseList(p, c, next), - '\'', '`', ',' => parseQuoteExpr(p, c, next), - else => p.abort(next, c), - }; -} - -fn parseQuotedString(p: *Parser, close: u8) !Value { - while (try p.read()) |c| { - if (c == close) { - return p.getQuotedString(); - } - if (c != '\\') { - try p.addChar(c); - } else { - try parseQuotedEsc(p, close); - } - } - return error.UnclosedString; -} - -fn parseQuotedEsc(p: *Parser, close: u8) !void { - const c = try p.readNoEof("quoted escape"); - if (c == close) { - return p.addChar(close); - } - if (c == 'u') { - return parseUniHexHandleErrors(p); - } - try p.addChar(switch (c) { - '\\' => c, - '0' => 0, - 'a' => 7, - 'b' => 8, - 't' => 9, - 'n' => 10, - 'v' => 11, - 'f' => 12, - 'r' => 13, - 'e' => 27, - 'x' => try parseHexByte(p, "hex escape"), - else => return p.err(.InvalidCharacter, "quoted escape"), - }); -} - -fn parseUniHexHandleErrors(p: *Parser) !void { - return parseUniHex(p) catch |e| switch (e) { - error.Utf8CannotEncodeSurrogateHalf => p.err( - .UnicodeError, - "unicode escape", - ), - else => e, - }; -} - -fn parseUniHex(p: *Parser) !void { - const msg = "unicode escape"; - - if (try p.readNoEof(msg) != '{') { - return p.err(.InvalidCharacter, msg); - } - - const uc = try parseHex(p, u21, msg); - const c = p.getUnread() orelse return p.err(.UnexpectedEof, msg); - if (c != '}') { - return p.err(.InvalidCharacter, msg); - } - - const n = try std.unicode.utf8CodepointSequenceLength(uc); - const buf = try p.chars.addManyAsSlice(p.chars_alloc, n); - _ = try std.unicode.utf8Encode(uc, buf); -} - -fn parseHashExpression(p: *Parser, next: Fn) !void { - const c = try p.readNoEof("hash expression"); - if (try checkBlanks(p, c) != .no) { - return p.err(.InvalidCharacter, "hash expression"); - } - if (std.ascii.isAlphabetic(c)) { - const r = try parseRune(p, c); - return parseRuneEnd(p, r, next); - } - if (c == '%') { - const l = try parseLabel(p); - return parseLabelEnd(p, l, next); - } - if (isBareChar(c)) { - // Reserved for future extensions to syntax sugar. - return p.err(.InvalidCharacter, "hash expression"); - } - // fast-path to avoid subr - if (c == '\\') { - return p.jump(next, p.cons(HASH, try parseBareEscString(p))); - } - if (c == '"') { - return p.jump(next, p.cons(HASH, try parseQuotedString(p, '"'))); - } - if (c == '|') { - return p.jump(next, p.cons(HASH, try parseQuotedString(p, '|'))); - } - p.unread(c); - return p.subr(.parseHashDatum, next); -} - -fn parseHashDatum(p: *Parser) !void { - return parseCladDatum(p, p.getUnread().?, .endHashDatum); -} - -fn endHashDatum(p: *Parser) !void { - if (p.result.eq(VOID)) { - return p.err(.InvalidCharacter, "hash datum"); - } - return p.retval(p.cons(HASH, p.result)); -} - -fn parseRune(p: *Parser, c1: u8) !Value { - try p.addChar(c1); - var len: usize = 1; - while (try p.read()) |c| : (len += 1) { - if (len == 6 or !std.ascii.isAlphanumeric(c)) { - p.unread(c); - return p.getRune(); - } - try p.addChar(c); - } - return p.getRune(); -} - -fn parseRuneEnd(p: *Parser, r: Value, next: Fn) !void { - const c = p.getUnread() orelse return p.jump(next, r); - if (c == '\\') { - return p.jump(next, p.cons(r, try parseBareString(p, c))); - } - if (c == '"') { - return p.jump(next, p.cons(r, try parseQuotedString(p, '"'))); - } - if (c == '|') { - return p.jump(next, p.cons(r, try parseQuotedString(p, '|'))); - } - p.unread(c); - switch (c) { - '#', '(', '[', '{', '\'', '`', ',' => { - try p.push(next); - p.context.val = r; - // Use jump to prevent recursion. - return p.jump(.parseRuneDatum, null); - }, - else => return p.jump(next, r), - } -} - -fn parseRuneDatum(p: *Parser) !void { - return parseCladDatum(p, p.getUnread().?, .endRuneDatum); -} - -fn endRuneDatum(p: *Parser) !void { - if (p.result.eq(VOID)) { - p.retval(p.context.val); - } - return p.retval(p.cons(p.context.val, p.result)); -} - -fn parseLabel(p: *Parser) !Value { - const label = try parseHex(p, u48, "datum label"); - return value.fixnum.pack(label); -} - -fn parseLabelEnd(p: *Parser, l: Value, next: Fn) !void { - const c = p.getUnread() orelse return p.err(.UnexpectedEof, "datum label"); - if (c == '%') { - return p.jump(next, p.cons(LABEL, l)); - } - if (c == '=') { - try p.push(next); - p.context.val = l; - return p.subr(.parseDatum, .endLabelDatum); - } - return p.err(.InvalidCharacter, "datum label"); -} - -fn endLabelDatum(p: *Parser) !void { - if (p.result.eq(VOID)) { - return p.err(.InvalidCharacter, "label datum"); - } - return p.retval(p.cons(LABEL, p.cons(p.context.val, p.result))); -} - -fn parseList(p: *Parser, open: u8, next: Fn) !void { - const head = switch (open) { - '(' => value.nil.nil, - '[' => p.cons(SQUARE, value.nil.nil), - '{' => p.cons(BRACE, value.nil.nil), - else => unreachable, - }; - const close: u8 = switch (open) { - '(' => ')', - '[' => ']', - '{' => '}', - else => unreachable, - }; - while (try p.read()) |c| { - if (c == close) { - return p.jump(next, head); - } - switch (try checkBlanks(p, c)) { - .yes => {}, - .skip_unit => { - try listParserSetup(p, head, close, next); - return p.subr(.parseUnit, .parseUnit); - }, - .no => { - try listParserSetup(p, head, close, next); - p.unread(c); - return p.jump(.parseDatum, null); - }, - } - } - return p.err(.UnexpectedEof, "list"); -} - -fn listParserSetup(p: *Parser, head: Value, close: u8, next: Fn) !void { - try p.push(next); - p.context.val = head; - p.context.char = close; - try p.pushContext(.continueList); -} - -fn continueList(p: *Parser) !void { - const close = p.context.char; - - if (p.result.eq(VOID)) { - const c = p.getUnread().?; - if (c == close) { - return endList(p); - } - return p.err(.InvalidCharacter, "list"); - } - - if (p.result.eq(LSTAIL)) { - return p.subr(.parseUnit, .endImproperList); - } - - p.context.val = p.cons(p.result, p.context.val); - - var c1 = p.getUnread() orelse try p.read(); - while (c1) |c| : (c1 = try p.read()) { - if (c == close) { - return endList(p); - } - switch (try checkBlanks(p, c)) { - .yes => {}, - .skip_unit => { - try p.pushContext(.continueList); - return p.subr(.parseUnit, .parseUnit); - }, - .no => { - p.unread(c); - return p.subr(.parseDatum, .continueList); - }, - } - } - return p.err(.UnexpectedEof, "list"); -} - -fn endList(p: *Parser) !void { - return p.retval(lib.list.reverse(p.context.val)); -} - -fn endImproperList(p: *Parser) !void { - if (p.result.eq(VOID)) { - return p.err(.InvalidCharacter, "list tail"); - } - p.context.val = lib.list.reverseWithTail(p.context.val, p.result); - return closeImproperList(p); -} - -fn closeImproperList(p: *Parser) !void { - const result = p.context.val; - const close = p.context.char; - var c1 = p.getUnread() orelse try p.read(); - while (c1) |c| : (c1 = try p.read()) { - if (c == close) { - return p.retval(result); - } - switch (try checkBlanks(p, c)) { - .yes => {}, - .skip_unit => return p.subr(.parseUnit, .closeImproperList), - .no => return p.err(.InvalidCharacter, "after list tail"), - } - } - return p.err(.UnexpectedEof, "after list tail"); -} - -fn parseQuoteExpr(p: *Parser, c1: u8, next: Fn) !void { - const q = switch (c1) { - '\'' => QUOTE, - '`' => GRAVE, - ',' => COMMA, - else => unreachable, - }; - - // fast-path to avoid subr - const c = try p.readNoEof("quote expression"); - if (isBareChar(c) or c == '\\') { - return p.jump(next, p.cons(q, try parseBareString(p, c))); - } - - try p.push(next); - p.context.val = q; - p.unread(c); - return p.subr(.parseDatum, .endQuoteExpr); -} - -fn endQuoteExpr(p: *Parser) !void { - if (p.result.eq(VOID)) { - return p.err(.InvalidCharacter, "quote expression datum"); - } - return p.retval(p.cons(p.context.val, p.result)); -} - -// Helpers - -fn checkBlanks(p: *Parser, c: u8) !enum { yes, skip_unit, no } { - switch (c) { - '\t'...'\r', ' ' => return .yes, - ';' => { - const c2 = try p.read() orelse return .yes; - if (c2 == '\n') return .yes; - if (c2 == '~') return .skip_unit; - while (try p.read()) |c3| if (c3 == '\n') break; - return .yes; - }, - else => return .no, - } -} - -fn isBareChar(c: u8) bool { - return switch (c) { - // zig fmt: off - 'a'...'z' , 'A'...'Z' , '0'...'9' , '!' , '$' , '%' , '&' , '*' , - '+' , '-' , '/' , '<' , '=' , '>' , '?' , '@' , '^' , '_' , '~' => true, - // zig fmt: on - else => false, - }; -} - -fn isBareEsc(c: u8) bool { - return switch (c) { - 33...126 => true, - else => false, - }; -} - -fn parseHex(p: *Parser, T: type, comptime emsg: []const u8) !T { - var uc: T = undefined; - - const c1 = try p.readNoEof(emsg); - uc = try parseHexDigit(p, c1, emsg); - - while (try p.read()) |c| { - if (!std.ascii.isHex(c)) { - p.unread(c); - return uc; - } - const shl = std.math.shlExact; - uc = shl(T, uc, 4) catch return p.err(.OutOfRange, emsg); - uc |= try parseHexDigit(p, c, emsg); - } - return uc; -} - -fn parseHexByte(p: *Parser, comptime emsg: []const u8) !u8 { - const h1 = try p.readNoEof(emsg); - const h2 = try p.readNoEof(emsg); - const hi = try parseHexDigit(p, h1, emsg); - const lo = try parseHexDigit(p, h2, emsg); - return hi << 4 | lo; -} - -fn parseHexDigit(p: *Parser, c: u8, comptime emsg: []const u8) !u8 { - return switch (c) { - '0'...'9' => c - '0', - 'A'...'F' => c - 'A' + 10, - 'a'...'f' => c - 'a' + 10, - else => p.err(.InvalidCharacter, emsg), - }; -} |
