// // === Parser for Code & Data === // // Zisp s-expressions come in two flavors: code (sugar) and data (no sugar). // // However, code expressions are parsed into the same data types which the data // expressions can represent, so homoiconicity is preserved. // // The "sugar" used in code expressions is merely shorthand for more complex // data expressions, which could have been written by hand. // // Data expressions have a very simple format, and are only able to express a // minimal set of data types: // // string -> foo , "foo bar" ;symbols and strings are the same data type // // rune -> #foo ;limited to 6 ASCII letters (a - z, A - Z) // // pair -> (DATUM . DATUM) ;the only composite data type supported // // nil -> () ;we prefer the term nil over null // // The list short-hand syntax may be considered the only "syntax sugar" that is // supported by the data parser: // // (DATUM DATUM DATUM) -> (DATUM . (DATUM . (DATUM . ()))) // // We may use terms like "code parser" and "data parser" out of convenience, // although there may only be a single parser that implements both formats by // switching between modes. // // When the code parser encounters syntax sugar, it always transforms it into a // list starting with a rune, like in the following examples: // // #(...) -> (#HASH ...) // // [...] -> (#SQUARE ...) // // 'foo -> (#QUOTE . foo) // // These can combine arbitrarily: // // #{...} -> (#HASH #BRACE ...) // // #'foo -> (#HASH #QUOTE . foo) // // ##'[...] -> (#HASH #HASH #QUOTE #SQUARE ...) // // As a specialty, double-quoted strings are actually considered sugar by the // code parser, and are transformed as follows into data: // // "..." -> (#STRING . "...") // // (Otherwise, all string literals would be identifiers, or all identifiers // would be string literals, because Zisp doesn't differentiate strings and // symbols like traditional lisps. Also, note that although we could reuse // #QUOTE here, instead of using #STRING, this would make it impossible to // differentiate between the code expressions #'foo and #"foo".) // // Runes are case-sensitive, and the code parser only emits runes using // upper-case letters, so lower-case runes are free for user extensions. // // You may be wondering about numbers. As far as the parser is concerned, // numbers are strings. It's the decoder (see below) that will turn bare // strings (those not marked with #STRING) into numbers. // // Note that 'foo becomes (quote foo) in Scheme, but (#QUOTE . foo) in Zisp. // The operand of #QUOTE is the entire cdr. The same principle is used when // parsing other sugar: // // Incorrect Correct // // #(x y z) -> (#HASH (x y z)) #(x y z) -> (#HASH x y z) // // [x y z] -> (#SQUARE (x y z)) [x y z] -> (#SQUARE x y z) // // #{x} -> (#HASH (#BRACE (x))) #{x} -> (#HASH #BRACE x) // // // === Decoder === // // A separate process called "decoding" can transform simple data structures, // consisting of only the above types, into a richer set of Zisp data types. // // For example, the decoder may turn (#HASH ...) into a vector, as one would // expect a vector literal like #(...) to work in Scheme. // // Runes may be decoded in isolation as well, rather than transforming a list // whose head they appear in. This is how #true and #false are implemented. // // The decoder may also perform arbitrary transforms on any type; for example, // it may turn bare strings (those not marked with #STRING) into numbers when // it's decoding data representing code. This is how number literals are // implemented in Zisp. // // The decoder recognizes (#QUOTE ...) to implement the traditional quoting // mechanism, but in a better way: // // Traditional quote is "unhygienic" in Scheme terms. An expression such as // '(foo bar) will always be read as (quote (foo bar)) regardless of what sort // of lexical context it appears in, so the semantics will depend on whatever // the identifier "quote" is bound to in that lexical context, meaning that the // expression may end up evaluating to something other than the list (foo bar). // // The Zisp decoder, which transforms not text to text, but objects to objects, // can turn (#QUOTE ...) into an abstract object which encapsulates the notion // of quoting, which the Zisp evaluator can recognize and act upon, ensuring // that an expression like '(foo bar) always turns into the list (foo bar). // // One way to think about this, in Scheme (R6RS / syntax-case) terms, is that // expressions like '(foo bar) turn directly into a *syntax object* when read, // rather than a regular list object. // // The decoder is, of course, configurable and extensible. The transformations // mentioned above would be performed only when it's told to decode data which // represents Zisp code. The decoder may be given a different configuration, // telling it to decode, for example, data which represents a different kind of // domain-specific data, such as application settings, build system commands, // complex data records with non-standard data types, and so on. // // // === Implementation details === // // Instead of using recursion directly, the parser is written in something akin // to a manual continuation-passing style, which ensures that parsing depth is // not limited by the stack size. // // All state/context is passed around via a struct pointer. The parser has a // main loop which calls a function, passing it the state, and also expects a // state pointer in return. // // The state has a .next field, which indicates which function the main loop // should call next. It also has a .parent field, used as follows: // // If a function wants to call the parser recursively, it sets the .next field // of the current state to where the recursive call should return to, then it // creates a new state with a given starting point, sets its .parent field to // the current state, and returns the new state pointer. // // If a function wants to make the parser return, it sets the .retval field of // the current state, and sets .next to the .perform_return value. This makes // the main loop either return to the parent state (after copying the .retval // field from child to parent), or if there is no parent state, it returns the // value as the top-level result. // // Note: While it's possible to simply set .next and return the current state, // to have another function be called next (possibly even setting .retval to // pass a value to it), this is completely unnecessary. A few non-recursive // function calls will obviously not blow the stack. It's only recursive // parsing that we use these fields for. // // Note 2: When calling the parser recursively, it may seem sensible to always // set the .next of the new state to .start_datum, because you already cleared // incoming whitespace and comments from the stream. However, in some cases, // you must set it to .start_parse instead. This is due to datum comments. // After a datum comment is parsed, the parser will ignore it and restore the // previous state, to try again what it was doing. If the state was set to // .start_datum, this means no whitespace or comments would be tolerated after // the datum comment. // // Note 3: When it comes to pairs/lists, we mainly try to parse them as lists, // and a pair becomes a special-case of an improper list (two elements). This // has the advantage of saving memory: If we implemented list parsing as pair // parsing, we would be calling the parser recursively, deeper and deeper, for // every pair that the list is made up of. Although we're not limited by stack // space, thanks to the strategy described above, this would still waste memory // while parsing. // // // === Notation used in comments === // // Some comments throughout the file give you an example of where the parser // currently might be in a stream. These illustrations use the pipe symbol, // which looks like a cursor, to indicate the current position: // // (foo| bar baz) <- parser arrived at the end of the string foo // // (foo bar baz)| <- parser reached EOF (assuming no trailing spaces) // const std = @import("std"); const gc = @import("../gc.zig"); const list = @import("../list.zig"); const value = @import("../value.zig"); const Value = value.Value; const State = struct { alloc: std.mem.Allocator, input: []const u8, pos: usize = 0, mode: enum { code, data } = .code, next: Fn = .start_parse, parent: ?*State = null, // Used to store various context, but most notably the stack of list // elements parsed so far, so just initialize it to nil. context: Value = value.nil.nil, opening_bracket: u8 = 0, retval: Value = value.eof.eof, fn eof(self: *State) bool { return self.pos >= self.input.len; } fn peek(self: *State) u8 { return self.input[self.pos]; } fn skip(self: *State) void { self.pos += 1; } fn getc(self: *State) u8 { const c = self.peek(); self.skip(); return c; } // Consumes whitespace and line comments. fn consumeBlanks(self: *State) void { while (!self.eof()) { if (self.isWhitespace()) { self.skip(); } else if (self.peek() == ';') { self.skip(); self.consumeLineComment(); } else { return; } } } fn consumeLineComment(self: *State) void { while (!self.eof()) { if (self.getc() == '\n') { return; } } } fn isWhitespace(self: *State) bool { return switch (self.peek()) { '\t', '\n', ' ' => true, else => false, }; } fn isFinalNull(self: *State) bool { return self.peek() == 0 and self.pos == self.input.len - 1; } fn recurParse(self: *State, start_from: Fn, return_to: Fn) *State { const sub = self.alloc.create(State) catch @panic("OOM"); sub.* = .{ .alloc = self.alloc, .input = self.input }; sub.pos = self.pos; sub.mode = self.mode; sub.next = start_from; sub.parent = self; self.next = return_to; return sub; } fn returnDatum(self: *State, val: Value) *State { self.retval = val; self.next = .perform_return; return self; } fn performReturn(self: *State) ?*State { if (self.parent) |parent| { parent.retval = self.retval; parent.pos = self.pos; parent.alloc.destroy(self); return parent; } else { return null; } } }; // Probably best *not* to use function pointers here, but rather dispatch to // functions manually based on enum value. This should help the optimizer. const Fn = enum { start_parse, start_datum, end_hash_datum, end_rune_datum, end_quote, continue_list, finalize_improper_list, end_improper_list, perform_return, }; pub fn parse(input: []const u8) Value { var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; var top = State{ .alloc = gpa.allocator(), .input = input }; var s = ⊤ while (true) s = switch (s.next) { .start_parse => startParse(s), .start_datum => startDatum(s), .end_hash_datum => endHashDatum(s), .end_rune_datum => endRuneDatum(s), .end_quote => endQuote(s), .continue_list => continueList(s), .finalize_improper_list => finalizeImproperList(s), .end_improper_list => endImproperList(s), .perform_return => s.performReturn() orelse return s.retval, }; } fn startParse(s: *State) *State { s.consumeBlanks(); if (s.eof()) { return s.returnDatum(value.eof.eof); } return switch (s.peek()) { // whitespace already consumed 0...32, 127...255 => err(s, "invalid character"), ')', ']', '}' => err(s, "unexpected closing bracket"), else => startDatum(s), }; } // This is called when we *immediately* expect a datum and nothing else; for // example, no whitespace or comments, because they've already been consumed. fn startDatum(s: *State) *State { if (s.eof()) { return err(s, "expected datum, got EOF"); } if (s.isWhitespace()) { return err(s, "expected datum, got whitespace"); } return switch (s.peek()) { // whitespace checked above 0...32, 127...255 => err(s, "invalid character"), ')', ']', '}' => err(s, "unexpected closing bracket"), ';' => err(s, "expected datum, got semicolon"), '#' => handleHash(s), '"' => startQuotedString(s), '\'', '`', ',' => startQuote(s), '(', '[', '{' => startList(s), // Periods are only allowed in the middle of a string, or to express // improper lists, because the following look too much like typos: // // (foo. bar) (foo .bar) (123. 456) (123 .456) // '.' => err(s, "misplaced period"), else => startBareString(s), }; } fn handleHash(s: *State) *State { s.skip(); // // We just consumed a hash. Possibilities include: // // #|foo ;rune // // #|;DATUM ;datum comment // // #|DATUM ;hash-datum (code mode only) // if (s.eof()) { return err(s, "EOF after hash"); } if (s.isWhitespace()) { return err(s, "whitespace after hash"); } // Is it a rune? #foo switch (s.peek()) { 'a'...'z', 'A'...'Z' => return handleRune(s), else => {}, } // Is it a datum comment? #;DATUM if (s.peek() == ';') { s.skip(); // Don't change s.next in this case. Just let the parser try to redo // what it was doing as soon as the commented-out datum has been read. return s.recurParse(.start_datum, s.next); } // Otherwise, it must be a hash-datum. #DATUM // But data mode doesn't allow that. if (s.mode == .data) { return err(s, "use of hash-datum sequence not allowed in data mode"); } return s.recurParse(.start_datum, .end_hash_datum); } fn handleRune(s: *State) *State { const rune = readRune(s) orelse return err(s, "rune too long"); // // Now we're at the end of the rune, but it could be a rune-datum: // // #foo|(...) // if (isEndOfRune(s)) { // Nope, just a stand-alone rune. return s.returnDatum(rune); } // Otherwise, it's followed by a datum, like: #foo(...) // Which is only allowed in code mode. if (s.mode == .data) { return err(s, "invalid use of hash in data mode"); } s.context = rune; return s.recurParse(.start_datum, .end_rune_datum); } fn readRune(s: *State) ?Value { var buf: [6]u8 = undefined; var i: u8 = 0; while (!s.eof()) : (i += 1) switch (s.peek()) { 'a'...'z', 'A'...'Z' => { if (i == buf.len) { return null; } buf[i] = s.getc(); }, else => break, }; // 'i' can't be 0 since this function is only called if at least one ASCII // letter was seen after the hash. std.debug.assert(i != 0); return value.rune.pack(buf[0..i]); } fn isEndOfRune(s: *State) bool { return s.eof() or switch (s.peek()) { '\t', '\n', ' ', ')', ']', '}' => true, else => false, }; } fn endRuneDatum(s: *State) *State { return s.returnDatum(value.pair.cons(s.context, s.retval)); } fn endHashDatum(s: *State) *State { const rune = value.rune.pack("HASH"); return s.returnDatum(value.pair.cons(rune, s.retval)); } fn startQuotedString(s: *State) *State { // We're at |"..." so consume the opening quote before we start reading. s.skip(); const str = readQuotedString(s) catch return err(s, "unclosed string"); if (s.mode == .code) { // "foo bar" => (#STRING . "foo bar") const rune = value.rune.pack("STRING"); const pair = value.pair.cons(rune, str); return s.returnDatum(pair); } else { return s.returnDatum(str); } } // RQS = Read Quoted String const RqsError = enum { Unclosed }; fn readQuotedString(s: *State) !Value { return try readQuotedSstr(s) orelse readQuotedLongString(s); } fn readQuotedSstr(s: *State) !?Value { // We will reset to this position if we fail. const start_pos = s.pos; var buf: [6]u8 = undefined; var i: u8 = 0; while (!s.eof()) { const c = s.getc(); if (c == '"') { // ok, return what we accumulated return value.sstr.pack(buf[0..i]); } if (i == 6) { // failed; reset and bail out s.pos = start_pos; return null; } // ok, save this byte and go on buf[i] = c; i += 1; } return error.Unclosed; } fn readQuotedLongString(s: *State) Value { return err(s, "NOT YET IMPLEMENTED"); } fn startBareString(s: *State) *State { // We're at |foo so start reading directly. return readBareSstr(s) orelse readBareLongString(s); } fn readBareSstr(s: *State) ?*State { // We will reset to this position if we fail. const start_pos = s.pos; var buf: [6]u8 = undefined; var i: u8 = 0; while (!s.eof()) : (i += 1) { if (isBareStringEnd(s)) { break; } if (i == buf.len) { // failed; reset and bail out s.pos = start_pos; return null; } buf[i] = s.getc(); } return s.returnDatum(value.sstr.pack(buf[0..i])); } fn isBareStringEnd(s: *State) bool { // We will ignore illegal characters here, because they aren't consumed by // this function; whatever code comes next must handle them. return s.eof() or switch (s.peek()) { 0...32, 127...255 => true, '(', ')', '[', ']', '{', '}', ';', '#', '"', '\'', '`', ',' => true, else => false, }; } fn readBareLongString(s: *State) *State { return err(s, "NOT YET IMPLEMENTED"); } fn startQuote(s: *State) *State { // We're at one of: |'... |`... |,... s.context = value.rune.pack(switch (s.getc()) { '\'' => "QUOTE", '`' => "GRAVE", ',' => "COMMA", else => unreachable, }); return s.recurParse(.start_datum, .end_quote); } fn endQuote(s: *State) *State { return s.returnDatum(value.pair.cons(s.context, s.retval)); } // List processing is, unsurprisingly, the most complicated, and it's made even // more complicated by the possibility of datum comments in strange places... fn startList(s: *State) *State { const open = s.getc(); if (s.mode == .data and open != '(') { return err(s, "invalid opening bracket in data mode"); } s.consumeBlanks(); if (s.eof()) { return err(s, "unexpected EOF while parsing list"); } s.opening_bracket = open; return if (isEndOfList(s)) endList(s) else s.recurParse(.start_parse, .continue_list); } fn isEndOfList(s: *State) bool { return switch (s.peek()) { ')', ']', '}' => true, else => false, }; } fn continueList(s: *State) *State { s.context = value.pair.cons(s.retval, s.context); s.consumeBlanks(); if (s.eof()) { return err(s, "unexpected EOF while parsing list"); } if (isEndOfList(s)) { s.context = list.reverse(s.context); return endList(s); } if (s.peek() == '.') { s.skip(); // Scheme allows (foo .(bar)) but we don't. Mind your spaces! if (!s.isWhitespace()) { return err(s, "misplaced period"); } return s.recurParse(.start_parse, .finalize_improper_list); } return s.recurParse(.start_parse, .continue_list); } fn finalizeImproperList(s: *State) *State { s.context = list.reverseWithTail(s.context, s.retval); return endImproperList(s); } fn endImproperList(s: *State) *State { s.consumeBlanks(); if (s.eof()) { return err(s, "unexpected EOF while parsing list"); } if (isEndOfList(s)) { return endList(s); } if (s.getc() == '#') { if (s.eof()) { return err(s, "unexpected EOF after hash while parsing list"); } if (s.getc() == ';') { return s.recurParse(.start_datum, .end_improper_list); } } return err(s, "malformed list / extra datum at end of improper list"); } fn endList(s: *State) *State { const open = s.opening_bracket; const char = s.getc(); // Check for proper ending: (foo bar baz) if (open == '(' and char == ')') { return s.returnDatum(s.context); } if (open == '[' and char == ']') { const rune = value.rune.pack("SQUARE"); return s.returnDatum(value.pair.cons(rune, s.context)); } if (open == '{' and char == '}') { const rune = value.rune.pack("BRACE"); return s.returnDatum(value.pair.cons(rune, s.context)); } return err(s, "wrong closing bracket for list"); } fn err(s: *State, msg: []const u8) noreturn { std.debug.print("{s}\n", .{msg}); std.debug.print("pos: {}\n", .{s.pos}); @panic("parse error"); }