// // === Parser for Code & Data === // // Zisp s-expressions are defined in terms of an extremely minimal set of data // types; only that which is necessary to build representations of more complex // expressions and data types: // // +---------+-------------+---------------+--------+---------+--------+ // | TYPE | Bare String | Quoted String | Rune | Pair | Nil | // +---------+-------------+---------------+--------+---------+--------+ // | EXAMPLE | foobar | "foo bar" | #name | (X . Y) | () | // +---------+-------------+---------------+--------+---------+--------+ // // Bare strings and quoted strings are polymorphic sub-types of the generic // string type. This lets the compiler distinguish between identifiers and // string literals. // // There are also non-negative integers, but they are only used for datum // labels; regular number literals are handled by the "decoder" (see below). // // The parser recognizes various "syntax sugar" and transforms it into uses of // the above data types. The most ubiquitous example is of course the list: // // (datum1 datum2 ...) -> (datum1 . (datum2 . (... . ()))) // // The following table summarizes the other supported transformations: // // #datum -> (#HASH . datum) #rune(...) -> (#rune ...) // // [...] -> (#SQUARE ...) dat1dat2 -> (#JOIN dat1 . dat2) // // {...} -> (#BRACE ...) dat1.dat2 -> (#DOT dat1 . dat2) // // 'datum -> (#QUOTE . datum) dat1:dat2 -> (#COLON dat1 . dat2) // // `datum -> (#GRAVE . datum) dat1|dat2 -> (#PIPE dat1 . dat2) // // ,datum -> (#COMMA . datum) #%n=datum -> (#LABEL hex . datum) // // #%n -> (#LABEL . hex) // // A separate process called "decoding" can transform these objects into other // data types. For example, (#HASH x y z) could become a vector, so that the // expression #(x y z) works just like in Scheme. // // Decoding also resolves datum labels, and goes over bare strings to find ones // that are actually a number literal. This lets us offload the complexity of // number parsing elsewhere, so the parser remains extremely simple. // // Further notes about the syntax sugar table and examples above: // // * The terms datum, dat1, and dat2 each refer to an arbitrary datum; ellipsis // means zero or more data; hex is a hexadecimal number of up to 12 digits. // // * The #datum form only applies to expressions that cannot be mistaken for a // rune, such as #(...) or #"..." or #'string etc.; a hash sign followed by a // bare string will parse as a rune instead (or raise an error if it contains // characters that are illegal in rune names). // // * A backslash causes the immediately following character to lose any special // meaning it would normally have, and be considered as part of a bare string // instead. (This does not apply to whitespace.) For example, the following // three character sequences are each a valid bare string: // // foo\(bar\) \]blah \#\'xyz // // This can be used to follow a hash with a bare string without it being // parsed as a rune: // // #\foo -> (#HASH . foo) // // * Though not represented in the table due to notational difficulty, the // format "#rune(...)" doesn't require a list in the second position; any // datum works, so long as there's no ambiguity; for example: // // #rune1#rune2 -> (#rune1 . #rune2) // // #rune"text" -> (#rune . "text") // // #rune'string -> (#rune #QUOTE . string) // // As a counter-example, following a rune immediately with a bare string is // not possible, since it's ambiguous: // // #abcdefgh ;Could be (#abcdef . gh) or (#abcde . fgh) or ... // // The parser will see this as an attempt to use an 8-letter rune name, and // raise an error, since rune names are limited to 6 characters. // // * Syntax sugar can combine arbitrarily; some examples follow: // // #{...} -> (#HASH #BRACE ...) // // #'foo -> (#HASH #QUOTE . foo) // // ##'[...] -> (#HASH #HASH #QUOTE #SQUARE ...) // // {x y}[i j] -> (#JOIN (#BRACE x y) #SQUARE i j) // // foo.bar.baz{x y} -> (#JOIN (#DOT (#DOT foo . bar) . baz) #BRACE x y) // // * While 'foo parses as (quote foo) in Scheme, it parses as (#QUOTE . foo) in // Zisp; the operand of #QUOTE is the entire cdr. The same principle is used // when parsing other sugar; some examples follow: // // Incorrect Correct // // #(x y z) -> (#HASH (x y z)) #(x y z) -> (#HASH x y z) // // [x y z] -> (#SQUARE (x y z)) [x y z] -> (#SQUARE x y z) // // #{x} -> (#HASH (#BRACE (x))) #{x} -> (#HASH #BRACE x) // // foo(x y) -> (#JOIN foo (x y)) foo(x y) -> (#JOIN foo x y) // // * Runes are case-sensitive, and the parser only emits runes using upper-case // letters when expressing syntax sugar. This way, there can be no accidental // clash with runes that appear verbatim in code, as long as only lower-case // letters are used for rune literals in code. // // // === Decoding === // // A separate process called "decoding" can transform simple data structures, // consisting of only the above types, into a richer set of Zisp data types. // // For example, the decoder may turn (#HASH ...) into a vector, as one would // expect a vector literal like #(...) to work in Scheme. // // Runes may be decoded in isolation as well, rather than transforming a list // whose head they appear in. This can implement #true and #false. // // The decoder may also perform arbitrary transforms on any type; for example, // it may turn bare strings into numbers wher appropriate. This can implement // number literals. // // The decoder recognizes (#QUOTE ...) to implement the traditional quoting // mechanism, but with a significant difference: // // Traditional quote is "unhygienic" in Scheme terms. An expression such as // '(foo bar) will always be read as (quote (foo bar)) regardless of what sort // of lexical context it appears in, so the semantics will depend on whatever // the identifier "quote" is bound to in that lexical context, meaning that the // expression may end up evaluating to something other than the list (foo bar). // // The Zisp decoder, which transforms not text to text, but objects to objects, // can turn (#QUOTE ...) into an abstract object which encapsulates the notion // of quoting, which the Zisp evaluator can recognize and act upon, ensuring // that an expression like '(foo bar) always turns into the list (foo bar). // // One way to think about this, in Scheme (R6RS / syntax-case) terms, is that // expressions like '(foo bar) turn directly into a *syntax object* when read, // rather than a regular list object. // // The decoder is, of course, configurable and extensible. The transformations // mentioned above would be performed only when it's told to decode data which // represents Zisp code. The decoder may be given a different configuration, // telling it to decode, for example, data which represents a different kind of // domain-specific data, such as application settings, build system commands, // complex data records with non-standard data types, and so on. // // // === Trampolining strategy === // // Instead of using recursion directly, the parser is written in a "trampoline" // style, which ensures that parsing depth is not limited by the stack size. // // All state and context is passed around via a struct pointer. The parser has // a main loop, which calls a function as dictated by state.context.next, and // the function may update the state to have another function called next. // // If a function wants to call the parser recursively, it pushes some of the // current context onto a stack, including what function the recursive parser // should "return" to, and then updates the state to instruct the main loop to // call one of the starting functions. // // If a function wants to make the parser return, either from a recursive parse // or from the main loop, it sets the .result field, and tries to pop the saved // context. If the context stack was empty, the main loop returns. // // While it's possible to just set .next and return, to make the main loop call // another function next (possibly even setting .result to pass a value to it), // this is completely unnecessary. A few non-recursive calls won't blow the // stack. It's only recursive parsing that we use the above strategy for. // // // === .start_parse VS .start_datum === // // The difference between .start_parse and .start_datum is that the former will // allow whitespace and comments at the beginning, whereas the latter expects a // datum immediately without anything else in front of it. // // When calling the parser recursively, it may seem logical to always make it // start with .start_datum, because we already cleared whitespace and comments // out of the way. However, in some cases, we must use .start_parse instead. // // This is because of datum comments. When one appears, we start a recursive // parser, but instead of making it return to a function that will consume the // result, we make it return to the original starting point, so the result is // ignored and the parser retries what it was originally doing. If we always // used .start_datum, this would never allow whitespace after a datum comment, // since we would be back at .start_datum after the comment is out of the way: // // (foo #;bar baz) ;must use .start_parse at the start of each element // // // === List parsing strategy === // // When it comes to pairs and lists, we basically treat everything as a list, // and a pair is just seen as the shortest possible improper list. This saves // memory: If we implemented list parsing as pair parsing, we would be calling // the parser recursively, deeper and deeper, for every list element. Though // we're not limited by stack space thanks to the strategy described above, it // would still waste memory and the time it takes to allocate memory. // // // === Buffering === // // We use a small circular buffer just so we can backtrack. We read bytes into // this buffer one by one, because we don't want to consume more bytes from the // input than what we actually parse. Consider the following input stream: // // (a b c) (x y z) // // If we used proper buffering, like reading up to 4K bytes per read, then the // whole stream would be consumed at once before it's parsed. Then, the parser // would return the first datum, and the rest of the stream would be lost. The // parser would need some way to reset the input stream's read head to the end // of the first datum, but not all stream types may support this. // // For efficiency, call the parser on an input stream with implicit buffering. // // // === Notation used in comments === // // Some comments throughout the file give you an example of where the parser // currently might be in a stream. These illustrations use the pipe symbol, // which looks like a cursor, to indicate the current position: // // (foo| bar baz) <- parser arrived at the end of the string foo // // (foo bar baz)| <- parser reached EOF (assuming no trailing spaces) // const builtin = @import("builtin"); const std = @import("std"); const lib = @import("../lib.zig"); const value = @import("../value.zig"); const List = std.ArrayListUnmanaged; const Value = value.Value; const cons = value.pair.cons; const is_test = builtin.is_test; const is_debug = builtin.mode == .Debug; const detailed_debug = false; // In debug, we want to see if we leak, so very small numbers. const init_stack_capacity = if (is_debug) 32 else 32; const init_chars_capacity = if (is_debug) 512 else 512; // zig fmt: off const DOT = value.rune.pack("DOT"); const COLON = value.rune.pack("COLON"); const PIPE = value.rune.pack("PIPE"); const JOIN = value.rune.pack("JOIN"); const LABEL = value.rune.pack("LABEL"); const HASH = value.rune.pack("HASH"); const QUOTE = value.rune.pack("QUOTE"); const GRAVE = value.rune.pack("GRAVE"); const COMMA = value.rune.pack("COMMA"); const SQUARE = value.rune.pack("SQUARE"); const BRACE = value.rune.pack("BRACE"); // zig fmt: on const S_DOT = value.sstr.pack("."); const Context = struct { // What to do next. next: Fn = .parse_unit, // For storing a context value, like accumulated list elements. val: Value = undefined, // For storing a context char, like list opening bracket. char: u8 = undefined, }; const ParseError = error{ InvalidCharacter, UnclosedString, UnexpectedEof, UnicodeError, RuneTooLong, OutOfMemory, OutOfRange, ReadError, }; const State = struct { input: std.io.AnyReader, stack_alloc: std.mem.Allocator, chars_alloc: std.mem.Allocator, context: Context = .{}, stack: List(Context) = undefined, chars: List(u8) = undefined, // TODO: Implement bool flag for when skipping a unit result: Value = undefined, unused_char: ?u8 = null, err_code: anyerror = undefined, err_msg: []const u8 = undefined, fn init( input: std.io.AnyReader, stack_alloc: std.mem.Allocator, chars_alloc: std.mem.Allocator, ) !State { var s: State = .{ .input = input, .stack_alloc = stack_alloc, .chars_alloc = chars_alloc, }; s.stack = try .initCapacity(s.stack_alloc, init_stack_capacity); s.chars = try .initCapacity(s.chars_alloc, init_chars_capacity); return s; } fn deinit(s: *State) void { s.stack.deinit(s.stack_alloc); s.chars.deinit(s.chars_alloc); } fn err(s: *State, e: ParseError, msg: []const u8) ParseError { s.err_msg = msg; return e; } fn read(s: *State) !?u8 { if (is_debug and s.unused_char != null) { @panic("Called read() while there was an unused character!"); } const c = s.input.readByte() catch |e| switch (e) { error.EndOfStream => return null, else => { s.err_code = e; return error.ReadError; }, }; if (detailed_debug) { std.debug.print("{c}", .{c}); } return c; } fn readNoEof(s: *State, emsg: []const u8) !u8 { return if (try s.read()) |c| c else s.err(error.UnexpectedEof, emsg); } fn getUnused(s: *State) ?u8 { if (s.unused_char) |c| { s.unused_char = null; return c; } return null; } fn skipLine(s: *State) !void { while (try s.read()) |c| if (c == '\n') break; } fn addChar(s: *State, c: u8) !void { try s.chars.append(s.chars_alloc, c); } fn getBareString(s: *State) Value { defer s.chars.clearRetainingCapacity(); return if (s.chars.items.len <= 6) value.sstr.pack(s.chars.items) else value.istr.intern(s.chars.items, false); } fn getQuotedString(s: *State) Value { defer s.chars.clearRetainingCapacity(); return if (s.chars.items.len <= 6) value.sstr.packQuoted(s.chars.items) else value.istr.intern(s.chars.items, true); } fn getRune(s: *State) Value { defer s.chars.clearRetainingCapacity(); return value.rune.pack(s.chars.items); } fn push(s: *State, next: Fn) !void { try s.stack.append(s.stack_alloc, .{ .next = next }); } fn pushContext(s: *State, next: Fn) !void { try s.stack.append(s.stack_alloc, .{ .next = next, .val = s.context.val, .char = s.context.char, }); } fn subr(s: *State, start: Fn, next: Fn) !void { try s.pushContext(next); s.context.next = start; } fn jump(s: *State, next: Fn, val: ?Value) void { if (val) |v| s.result = v; s.context.next = next; } fn abort(s: *State, next: Fn, unused_c: u8) void { s.result = value.undef; s.unused_char = unused_c; s.context.next = next; } fn retval(s: *State, val: Value) void { s.result = val; if (s.stack.pop()) |c| { s.context = c; } else { s.context.next = .done; } } }; pub fn parse(input: std.io.AnyReader) Value { var debug_alloc: std.heap.DebugAllocator(.{}) = undefined; if (!is_test and is_debug) { debug_alloc = .init; } defer if (!is_test and is_debug) { if (debug_alloc.deinit() == .leak) { @panic("leak"); } }; const alloc = if (is_test) std.testing.allocator else if (is_debug) debug_alloc.allocator() else std.heap.smp_allocator; const stack_mem = init_stack_capacity * @sizeOf(Context); const chars_mem = init_chars_capacity; var stack_sfa = std.heap.stackFallback(stack_mem, alloc); var chars_sfa = std.heap.stackFallback(chars_mem, alloc); const stack_alloc = stack_sfa.get(); const chars_alloc = chars_sfa.get(); var s = State.init(input, stack_alloc, chars_alloc) catch @panic(""); defer s.deinit(); while (s.context.next != .done) callNext(&s) catch { if (s.unused_char) |c| { std.debug.panic( "Parse error: {} at: {s}, char: {c}\n", .{ s.err_code, s.err_msg, c }, ); } else { std.debug.panic( "Parse error: {} at: {s}\n", .{ s.err_code, s.err_msg }, ); } }; if (s.unused_char) |c| { std.debug.panic("Invalid character: {c}\n", .{c}); } return s.result; } const Fn = enum { parse_unit, return_context, end_one_datum, parse_join_datum, end_join_datum, join_data, parse_hash_datum, end_hash_datum, parse_rune_datum, end_rune_datum, end_label_datum, parse_list_element, continue_list, end_improper_list, close_improper_list, end_quote_expr, done, }; fn callNext(s: *State) !void { if (detailed_debug) { const stack = s.stack.items; std.debug.print("\n\n{}:{} ctx:'{c}' unused:'{c}' \n", .{ stack.len, s.context.next, s.context.char, s.unused_char orelse '_', }); if (stack.len > 0) { var i = stack.len; while (i > 0) : (i -= 1) { const prev = stack[i - 1]; std.debug.print("{}:{} ctx:'{c}'\n", .{ i - 1, prev.next, prev.char, }); } } } try switch (s.context.next) { .parse_unit => parseUnit(s), .return_context => returnContext(s), .end_one_datum => endOneDatum(s), .parse_join_datum => parseJoinDatum(s), .end_join_datum => endJoinDatum(s), .join_data => joinData(s), .parse_hash_datum => parseHashDatum(s), .end_hash_datum => endHashDatum(s), .parse_rune_datum => parseRuneDatum(s), .end_rune_datum => endRuneDatum(s), .end_label_datum => endLabelDatum(s), .parse_list_element => parseListElement(s), .continue_list => continueList(s), .end_improper_list => endImproperList(s), .close_improper_list => closeImproperList(s), .end_quote_expr => endQuoteExpr(s), .done => unreachable, }; } fn parseUnit(s: *State) !void { var c1 = s.getUnused() orelse try s.read(); while (c1) |c| : (c1 = try s.read()) { switch (try checkBlank(s, c)) { .yes => {}, .skip_unit => try s.push(.parse_unit), .skip_line => try s.skipLine(), .no => return parseDatum(s, c), } } return s.retval(value.eof.eof); } fn checkBlank(s: *State, c: u8) !enum { yes, skip_unit, skip_line, no } { return switch (c) { '\t'...'\r', ' ' => .yes, ';' => switch (try s.read() orelse '\n') { '\n' => .yes, '~' => .skip_unit, else => .skip_line, }, else => .no, }; } fn parseDatum(s: *State, c: u8) !void { return parseOneDatum(s, c, .end_one_datum); } fn endOneDatum(s: *State) !void { if (s.result.eq(value.undef)) { return s.retval(value.undef); } const d = s.result; const c1 = s.getUnused() orelse try s.read(); if (c1) |c| { switch (try checkBlank(s, c)) { .yes => {}, .skip_unit => return skipUnitAndReturn(s, d), .skip_line => try s.skipLine(), .no => return parseJoin(s, d, c), } } return s.retval(d); } fn skipUnitAndReturn(s: *State, d: Value) !void { s.context.val = d; return s.subr(.parse_unit, .return_context); } fn returnContext(s: *State) !void { return s.retval(s.context.val); } fn parseJoin(s: *State, d: Value, c: u8) !void { s.context.val = d; s.context.char = c; switch (c) { '.', ':', '|' => { s.context.char = c; s.unused_char = try s.readNoEof("join datum"); }, else => { s.context.char = 0; s.unused_char = c; }, } return s.subr(.parse_join_datum, .join_data); } fn parseJoinDatum(s: *State) !void { return parseOneDatum(s, s.getUnused().?, .end_join_datum); } fn endJoinDatum(s: *State) !void { return s.retval(s.result); } fn joinData(s: *State) !void { const head = s.context.val; const join = s.context.char; const tail = s.result; if (tail.eq(value.undef)) { if (join == 0) { return s.retval(head); } else { return s.err(error.InvalidCharacter, "join datum"); } } const rune = switch (join) { 0 => JOIN, '.' => DOT, ':' => COLON, '|' => PIPE, else => unreachable, }; const data = cons(head, tail); return s.jump(.end_one_datum, cons(rune, data)); } fn parseOneDatum(s: *State, c: u8, next: Fn) !void { if (isBareChar(c)) { const d, s.unused_char = try parseBareString(s, c); return s.jump(next, d); } return parseCladDatum(s, c, next); } fn parseCladDatum(s: *State, c: u8, next: Fn) !void { if (c == '\\') { const bs, s.unused_char = try parseBareEscString(s); return s.jump(next, bs); } if (c == '"') { const qs = try parseQuotedString(s); return s.jump(next, qs); } return switch (c) { '#' => parseHashExpression(s, next), '(', '[', '{' => parseList(s, c, next), '\'', '`', ',' => parseQuoteExpr(s, c, next), else => s.abort(next, c), }; } fn isBareChar(c: u8) bool { return switch (c) { // zig fmt: off 'a'...'z' , 'A'...'Z' , '0'...'9', '!' , '$' , '%' , '&' , '*' , '+', '-' , '/' , '<' , '=' , '>' , '?', '@' , '^' , '_' , '~' , '.' => true, // zig fmt: on else => false, }; } fn isBareEsc(c: u8) bool { return switch (c) { 33...126 => true, else => false, }; } fn parseBareString(s: *State, c: u8) !struct { Value, ?u8 } { try s.addChar(c); return parseBareStringRest(s); } fn parseBareEscString(s: *State) !struct { Value, ?u8 } { try s.addChar(try parseBareEsc(s)); return parseBareStringRest(s); } fn parseBareStringRest(s: *State) !struct { Value, ?u8 } { while (try s.read()) |c| { if (isBareChar(c)) { try s.addChar(c); } else if (c == '\\') { try s.addChar(try parseBareEsc(s)); } else { return .{ s.getBareString(), c }; } } return .{ s.getBareString(), null }; } fn parseBareEsc(s: *State) !u8 { const c = try s.readNoEof("bare escape"); if (isBareEsc(c)) { return c; } else { return s.err(error.InvalidCharacter, "bare escape"); } } fn parseQuotedString(s: *State) !Value { while (try s.read()) |c| { if (c == '"') { return s.getQuotedString(); } if (c != '\\') { try s.addChar(c); } else { try parseQuotedEsc(s); } } return error.UnclosedString; } fn parseQuotedEsc(s: *State) !void { const c = try s.readNoEof("quoted escape"); if (c == 'u') { return parseUniHexHandleErrors(s); } try s.addChar(switch (c) { '\\', '"' => c, '0' => 0, 'a' => 7, 'b' => 8, 't' => 9, 'n' => 10, 'v' => 11, 'f' => 12, 'r' => 13, 'e' => 27, 'x' => try parseHexByte(s, "hex escape"), else => return s.err(error.InvalidCharacter, "quoted escape"), }); } fn parseUniHexHandleErrors(s: *State) !void { return parseUniHex(s) catch |err| switch (err) { error.Utf8CannotEncodeSurrogateHalf => e: { s.err_code = err; s.err_msg = "unicode escape"; break :e error.UnicodeError; }, else => |e| e, }; } fn parseUniHex(s: *State) !void { const msg = "unicode escape"; if (try s.readNoEof(msg) != '{') { return s.err(error.InvalidCharacter, msg); } const uc, const unused_c = try parseHex(s, u21, msg); if (unused_c) |c| { if (c != '}') { return s.err(error.InvalidCharacter, msg); } } else { return s.err(error.UnexpectedEof, msg); } const n = try std.unicode.utf8CodepointSequenceLength(uc); const buf = try s.chars.addManyAsSlice(s.chars_alloc, n); _ = try std.unicode.utf8Encode(uc, buf); } fn parseHashExpression(s: *State, next: Fn) !void { const c = try s.readNoEof("hash expression"); if (try checkBlank(s, c) != .no) { return s.err(error.InvalidCharacter, "hash expression"); } if (std.ascii.isAlphabetic(c)) { const r, const unused_c = try parseRune(s, c); return parseRuneEnd(s, r, unused_c, next); } if (c == '%') { const l, const unused_c = try parseLabel(s); return parseLabelEnd(s, l, unused_c, next); } if (isBareChar(c)) { // Reserved for future extensions to syntax sugar. return s.err(error.InvalidCharacter, "hash expression"); } // fast-path to avoid subr if (c == '\\') { const bs, s.unused_char = try parseBareEscString(s); return s.jump(next, cons(HASH, bs)); } if (c == '"') { const qs = try parseQuotedString(s); return s.jump(next, cons(HASH, qs)); } s.unused_char = c; return s.subr(.parse_hash_datum, next); } fn parseHashDatum(s: *State) !void { return parseCladDatum(s, s.getUnused().?, .end_hash_datum); } fn endHashDatum(s: *State) !void { if (s.result.eq(value.undef)) { return s.err(error.InvalidCharacter, "hash datum"); } return s.retval(cons(HASH, s.result)); } fn parseRune(s: *State, c1: u8) !struct { Value, ?u8 } { try s.addChar(c1); var len: usize = 1; while (try s.read()) |c| : (len += 1) { if (len == 6 or !std.ascii.isAlphanumeric(c)) { return .{ s.getRune(), c }; } try s.addChar(c); } return .{ s.getRune(), null }; } fn parseRuneEnd(s: *State, r: Value, c1: ?u8, next: Fn) !void { const c = c1 orelse return s.jump(next, r); if (c == '\\') { const bs, s.unused_char = try parseBareString(s, c); return s.jump(next, cons(r, bs)); } if (c == '"') { const qs = try parseQuotedString(s); return s.jump(next, cons(r, qs)); } s.unused_char = c; switch (c) { '#', '(', '[', '{', '\'', '`', ',' => { try s.push(next); return s.jump(.parse_rune_datum, r); }, else => return s.jump(next, r), } } fn parseRuneDatum(s: *State) !void { s.context.val = s.result; return parseCladDatum(s, s.getUnused().?, .end_rune_datum); } fn endRuneDatum(s: *State) !void { const r = s.context.val; const d = s.result; if (d.eq(value.undef)) { s.retval(r); } return s.retval(cons(r, d)); } fn parseLabel(s: *State) !struct { Value, ?u8 } { const label, const unused_c = try parseHex(s, u48, "datum label"); return .{ value.fixnum.pack(label), unused_c }; } fn parseLabelEnd(s: *State, l: Value, c1: ?u8, next: Fn) !void { const c = c1 orelse return s.err(error.UnexpectedEof, "datum label"); if (c == '%') { return s.jump(next, cons(LABEL, l)); } if (c == '=') { try s.push(next); s.context.val = l; return s.subr(.parse_unit, .end_label_datum); } return s.err(error.InvalidCharacter, "datum label"); } fn endLabelDatum(s: *State) !void { const l = s.context.val; const d = s.result; if (d.eq(value.undef)) { return s.err(error.InvalidCharacter, "label datum"); } return s.retval(cons(LABEL, cons(l, d))); } fn parseList(s: *State, open: u8, next: Fn) !void { const head = switch (open) { '(' => value.nil.nil, '[' => cons(SQUARE, value.nil.nil), '{' => cons(BRACE, value.nil.nil), else => unreachable, }; const close: u8 = switch (open) { '(' => ')', '[' => ']', '{' => '}', else => unreachable, }; while (try s.read()) |c| { if (c == close) { return s.jump(next, head); } switch (try checkBlank(s, c)) { .yes => {}, .skip_unit => { try listParserSetup(s, head, close, next); // Parse twice in a row, ignoring the first result. return s.subr(.parse_unit, .parse_unit); }, .skip_line => try s.skipLine(), .no => { try listParserSetup(s, head, close, next); s.unused_char = c; return s.jump(.parse_list_element, null); }, } } return s.err(error.UnexpectedEof, "list"); } fn listParserSetup(s: *State, head: Value, close: u8, next: Fn) !void { s.context.val = head; s.context.char = close; try s.push(next); try s.pushContext(.continue_list); } fn parseListElement(s: *State) !void { return parseDatum(s, s.getUnused().?); } fn continueList(s: *State) !void { const close = s.context.char; if (s.result.eq(value.undef)) { const c = s.getUnused().?; if (c == close) { return endList(s); } return s.err(error.InvalidCharacter, "list"); } if (s.result.eq(S_DOT)) { return s.subr(.parse_unit, .end_improper_list); } s.context.val = cons(s.result, s.context.val); var c1 = s.getUnused() orelse try s.read(); while (c1) |c| : (c1 = try s.read()) { if (c == close) { return endList(s); } switch (try checkBlank(s, c)) { .yes => {}, .skip_unit => { try s.pushContext(.continue_list); return s.subr(.parse_unit, .parse_unit); }, .skip_line => try s.skipLine(), .no => { s.unused_char = c; return s.subr(.parse_list_element, .continue_list); }, } } return s.err(error.UnexpectedEof, "list"); } fn endList(s: *State) !void { return s.retval(lib.list.reverse(s.context.val)); } fn endImproperList(s: *State) !void { const tail = s.result; if (tail.eq(value.undef)) { return s.err(error.InvalidCharacter, "list tail"); } s.context.val = lib.list.reverseWithTail(s.context.val, tail); return closeImproperList(s); } fn closeImproperList(s: *State) !void { const result = s.context.val; const close = s.context.char; var c1 = s.getUnused() orelse try s.read(); while (c1) |c| : (c1 = try s.readNoEof("after list tail")) { if (c == close) { return s.retval(result); } switch (try checkBlank(s, c)) { .yes => {}, .skip_unit => return s.subr(.parse_unit, .close_improper_list), .skip_line => try s.skipLine(), .no => return s.err(error.InvalidCharacter, "after list tail"), } } unreachable; } fn parseQuoteExpr(s: *State, c1: u8, next: Fn) !void { const q = switch (c1) { '\'' => QUOTE, '`' => GRAVE, ',' => COMMA, else => unreachable, }; // fast-path to avoid subr const c = try s.readNoEof("quote expression"); if (isBareChar(c) or c == '\\') { const bs, s.unused_char = try parseBareString(s, c); return s.jump(next, cons(q, bs)); } s.context.val = q; s.unused_char = c; return s.subr(.parse_list_element, .end_quote_expr); } fn endQuoteExpr(s: *State) !void { if (s.result.eq(value.undef)) { return s.err(error.InvalidCharacter, "quote expression datum"); } const q = s.context.val; const d = s.result; return s.retval(cons(q, d)); } // Helpers fn parseHex( s: *State, u_type: type, emsg: []const u8, ) !struct { u_type, ?u8 } { var uc: u_type = undefined; const c1 = try s.readNoEof(emsg); uc = try parseHexDigit(s, c1, emsg); while (try s.read()) |c| { if (!std.ascii.isHex(c)) { return .{ uc, c }; } const shl = std.math.shlExact; uc = shl(u_type, uc, 4) catch return s.err(error.OutOfRange, emsg); uc |= try parseHexDigit(s, c, emsg); } return .{ uc, null }; } fn parseHexByte(s: *State, emsg: []const u8) !u8 { const h1 = try s.readNoEof(emsg); const h2 = try s.readNoEof(emsg); const hi = try parseHexDigit(s, h1, emsg); const lo = try parseHexDigit(s, h2, emsg); return hi << 4 | lo; } fn parseHexDigit(s: *State, c: u8, emsg: []const u8) !u8 { return switch (c) { '0'...'9' => c - '0', 'A'...'F' => c - 'A' + 10, 'a'...'f' => c - 'a' + 10, else => s.err(error.InvalidCharacter, emsg), }; }