summaryrefslogtreecommitdiff
path: root/src/libzisp/io/parser.zig
diff options
context:
space:
mode:
authorTaylan Kammer <taylan.kammer@gmail.com>2025-02-22 15:04:05 +0100
committerTaylan Kammer <taylan.kammer@gmail.com>2025-02-22 15:04:05 +0100
commitc922361115c8ee398ec4e26bb0af8cca4dcb9667 (patch)
tree6b093f2ad960bf073d295458faf6707e5123c1e4 /src/libzisp/io/parser.zig
parent2d52864265e06a4d863dba84e1d20c8d52d8c5dd (diff)
update
Diffstat (limited to 'src/libzisp/io/parser.zig')
-rw-r--r--src/libzisp/io/parser.zig683
1 files changed, 683 insertions, 0 deletions
diff --git a/src/libzisp/io/parser.zig b/src/libzisp/io/parser.zig
new file mode 100644
index 0000000..71c6946
--- /dev/null
+++ b/src/libzisp/io/parser.zig
@@ -0,0 +1,683 @@
+//
+// === Parser for Code & Data ===
+//
+// Zisp s-expressions come in two flavors: code (sugar) and data (no sugar).
+//
+// However, code expressions are parsed into the same data types which the data
+// expressions can represent, so homoiconicity is preserved.
+//
+// The "sugar" used in code expressions is merely shorthand for more complex
+// data expressions, which could have been written by hand.
+//
+// Data expressions have a very simple format, and are only able to express a
+// minimal set of data types:
+//
+// string -> foo , "foo bar" ;symbols and strings are the same data type
+//
+// rune -> #foo ;limited to 6 ASCII letters (a - z, A - Z)
+//
+// pair -> (DATUM . DATUM) ;the only composite data type supported
+//
+// nil -> () ;we prefer the term nil over null
+//
+// The list short-hand syntax may be considered the only "syntax sugar" that is
+// supported by the data parser:
+//
+// (DATUM DATUM DATUM) -> (DATUM . (DATUM . (DATUM . ())))
+//
+// We may use terms like "code parser" and "data parser" out of convenience,
+// although there may only be a single parser that implements both formats by
+// switching between modes.
+//
+// When the code parser encounters syntax sugar, it always transforms it into a
+// list starting with a rune, like in the following examples:
+//
+// #(...) -> (#HASH ...)
+//
+// [...] -> (#SQUARE ...)
+//
+// 'foo -> (#QUOTE . foo)
+//
+// These can combine arbitrarily:
+//
+// #{...} -> (#HASH #BRACE ...)
+//
+// #'foo -> (#HASH #QUOTE . foo)
+//
+// ##'[...] -> (#HASH #HASH #QUOTE #SQUARE ...)
+//
+// As a specialty, double-quoted strings are actually considered sugar by the
+// code parser, and are transformed as follows into data:
+//
+// "..." -> (#STRING . "...")
+//
+// (Otherwise, all string literals would be identifiers, or all identifiers
+// would be string literals, because Zisp doesn't differentiate strings and
+// symbols like traditional lisps. Also, note that although we could reuse
+// #QUOTE here, instead of using #STRING, this would make it impossible to
+// differentiate between the code expressions #'foo and #"foo".)
+//
+// Runes are case-sensitive, and the code parser only emits runes using
+// upper-case letters, so lower-case runes are free for user extensions.
+//
+// You may be wondering about numbers. As far as the parser is concerned,
+// numbers are strings. It's the decoder (see below) that will turn bare
+// strings (those not marked with #STRING) into numbers.
+//
+// Note that 'foo becomes (quote foo) in Scheme, but (#QUOTE . foo) in Zisp.
+// The operand of #QUOTE is the entire cdr. The same principle is used when
+// parsing other sugar:
+//
+// Incorrect Correct
+//
+// #(x y z) -> (#HASH (x y z)) #(x y z) -> (#HASH x y z)
+//
+// [x y z] -> (#SQUARE (x y z)) [x y z] -> (#SQUARE x y z)
+//
+// #{x} -> (#HASH (#BRACE (x))) #{x} -> (#HASH #BRACE x)
+//
+//
+// === Decoder ===
+//
+// A separate process called "decoding" can transform simple data structures,
+// consisting of only the above types, into a richer set of Zisp data types.
+//
+// For example, the decoder may turn (#HASH ...) into a vector, as one would
+// expect a vector literal like #(...) to work in Scheme.
+//
+// Runes may be decoded in isolation as well, rather than transforming a list
+// whose head they appear in. This is how #true and #false are implemented.
+//
+// The decoder may also perform arbitrary transforms on any type; for example,
+// it may turn bare strings (those not marked with #STRING) into numbers when
+// it's decoding data representing code. This is how number literals are
+// implemented in Zisp.
+//
+// The decoder recognizes (#QUOTE ...) to implement the traditional quoting
+// mechanism, but in a better way:
+//
+// Traditional quote is "unhygienic" in Scheme terms. An expression such as
+// '(foo bar) will always be read as (quote (foo bar)) regardless of what sort
+// of lexical context it appears in, so the semantics will depend on whatever
+// the identifier "quote" is bound to in that lexical context, meaning that the
+// expression may end up evaluating to something other than the list (foo bar).
+//
+// The Zisp decoder, which transforms not text to text, but objects to objects,
+// can turn (#QUOTE ...) into an abstract object which encapsulates the notion
+// of quoting, which the Zisp evaluator can recognize and act upon, ensuring
+// that an expression like '(foo bar) always turns into the list (foo bar).
+//
+// One way to think about this, in Scheme (R6RS / syntax-case) terms, is that
+// expressions like '(foo bar) turn directly into a *syntax object* when read,
+// rather than a regular list object.
+//
+// The decoder is, of course, configurable and extensible. The transformations
+// mentioned above would be performed only when it's told to decode data which
+// represents Zisp code. The decoder may be given a different configuration,
+// telling it to decode, for example, data which represents a different kind of
+// domain-specific data, such as application settings, build system commands,
+// complex data records with non-standard data types, and so on.
+//
+//
+// === Implementation details ===
+//
+// Instead of using recursion directly, the parser is written in something akin
+// to a manual continuation-passing style, which ensures that parsing depth is
+// not limited by the stack size.
+//
+// All state/context is passed around via a struct pointer. The parser has a
+// main loop which calls a function, passing it the state, and also expects a
+// state pointer in return.
+//
+// The state has a .next field, which indicates which function the main loop
+// should call next. It also has a .parent field, used as follows:
+//
+// If a function wants to call the parser recursively, it sets the .next field
+// of the current state to where the recursive call should return to, then it
+// creates a new state with a given starting point, sets its .parent field to
+// the current state, and returns the new state pointer.
+//
+// If a function wants to make the parser return, it sets the .retval field of
+// the current state, and sets .next to the .perform_return value. This makes
+// the main loop either return to the parent state (after copying the .retval
+// field from child to parent), or if there is no parent state, it returns the
+// value as the top-level result.
+//
+// Note: While it's possible to simply set .next and return the current state,
+// to have another function be called next (possibly even setting .retval to
+// pass a value to it), this is completely unnecessary. A few non-recursive
+// function calls will obviously not blow the stack. It's only recursive
+// parsing that we use these fields for.
+//
+// Note 2: When calling the parser recursively, it may seem sensible to always
+// set the .next of the new state to .start_datum, because you already cleared
+// incoming whitespace and comments from the stream. However, in some cases,
+// you must set it to .start_parse instead. This is due to datum comments.
+// After a datum comment is parsed, the parser will ignore it and restore the
+// previous state, to try again what it was doing. If the state was set to
+// .start_datum, this means no whitespace or comments would be tolerated after
+// the datum comment.
+//
+// Note 3: When it comes to pairs/lists, we mainly try to parse them as lists,
+// and a pair becomes a special-case of an improper list (two elements). This
+// has the advantage of saving memory: If we implemented list parsing as pair
+// parsing, we would be calling the parser recursively, deeper and deeper, for
+// every pair that the list is made up of. Although we're not limited by stack
+// space, thanks to the strategy described above, this would still waste memory
+// while parsing.
+//
+//
+// === Notation used in comments ===
+//
+// Some comments throughout the file give you an example of where the parser
+// currently might be in a stream. These illustrations use the pipe symbol,
+// which looks like a cursor, to indicate the current position:
+//
+// (foo| bar baz) <- parser arrived at the end of the string foo
+//
+// (foo bar baz)| <- parser reached EOF (assuming no trailing spaces)
+//
+
+const std = @import("std");
+
+const gc = @import("../gc.zig");
+const list = @import("../list.zig");
+const value = @import("../value.zig");
+
+const Value = value.Value;
+
+const State = struct {
+ alloc: std.mem.Allocator,
+
+ input: []const u8,
+ pos: usize = 0,
+
+ mode: enum { code, data } = .code,
+
+ next: Fn = .start_parse,
+
+ parent: ?*State = null,
+
+ // Used to store various context, but most notably the stack of list
+ // elements parsed so far, so just initialize it to nil.
+ context: Value = value.nil.nil,
+
+ opening_bracket: u8 = 0,
+
+ retval: Value = value.eof.eof,
+
+ fn eof(self: *State) bool {
+ return self.pos >= self.input.len;
+ }
+
+ fn peek(self: *State) u8 {
+ return self.input[self.pos];
+ }
+
+ fn skip(self: *State) void {
+ self.pos += 1;
+ }
+
+ fn getc(self: *State) u8 {
+ const c = self.peek();
+ self.skip();
+ return c;
+ }
+
+ // Consumes whitespace and line comments.
+ fn consumeBlanks(self: *State) void {
+ while (!self.eof()) {
+ if (self.isWhitespace()) {
+ self.skip();
+ } else if (self.peek() == ';') {
+ self.skip();
+ self.consumeLineComment();
+ } else {
+ return;
+ }
+ }
+ }
+
+ fn consumeLineComment(self: *State) void {
+ while (!self.eof()) {
+ if (self.getc() == '\n') {
+ return;
+ }
+ }
+ }
+
+ fn isWhitespace(self: *State) bool {
+ return switch (self.peek()) {
+ '\t', '\n', ' ' => true,
+ else => false,
+ };
+ }
+
+ fn isFinalNull(self: *State) bool {
+ return self.peek() == 0 and self.pos == self.input.len - 1;
+ }
+
+ fn recurParse(self: *State, start_from: Fn, return_to: Fn) *State {
+ const sub = self.alloc.create(State) catch @panic("OOM");
+ sub.* = .{ .alloc = self.alloc, .input = self.input };
+ sub.pos = self.pos;
+ sub.mode = self.mode;
+ sub.next = start_from;
+ sub.parent = self;
+ self.next = return_to;
+ return sub;
+ }
+
+ fn returnDatum(self: *State, val: Value) *State {
+ self.retval = val;
+ self.next = .perform_return;
+ return self;
+ }
+
+ fn performReturn(self: *State) ?*State {
+ if (self.parent) |parent| {
+ parent.retval = self.retval;
+ parent.pos = self.pos;
+ parent.alloc.destroy(self);
+ return parent;
+ } else {
+ return null;
+ }
+ }
+};
+
+// Probably best *not* to use function pointers here, but rather dispatch to
+// functions manually based on enum value. This should help the optimizer.
+
+const Fn = enum {
+ start_parse,
+ start_datum,
+ end_hash_datum,
+ end_rune_datum,
+ end_quote,
+ continue_list,
+ finalize_improper_list,
+ end_improper_list,
+ perform_return,
+};
+
+pub fn parse(input: []const u8) Value {
+ var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init;
+ var top = State{ .alloc = gpa.allocator(), .input = input };
+ var s = &top;
+ while (true) s = switch (s.next) {
+ .start_parse => startParse(s),
+ .start_datum => startDatum(s),
+ .end_hash_datum => endHashDatum(s),
+ .end_rune_datum => endRuneDatum(s),
+ .end_quote => endQuote(s),
+ .continue_list => continueList(s),
+ .finalize_improper_list => finalizeImproperList(s),
+ .end_improper_list => endImproperList(s),
+ .perform_return => s.performReturn() orelse return s.retval,
+ };
+}
+
+fn startParse(s: *State) *State {
+ s.consumeBlanks();
+ if (s.eof()) {
+ return s.returnDatum(value.eof.eof);
+ }
+ return switch (s.peek()) {
+ // whitespace already consumed
+ 0...32, 127...255 => err(s, "invalid character"),
+ ')', ']', '}' => err(s, "unexpected closing bracket"),
+ else => startDatum(s),
+ };
+}
+
+// This is called when we *immediately* expect a datum and nothing else; for
+// example, no whitespace or comments, because they've already been consumed.
+fn startDatum(s: *State) *State {
+ if (s.eof()) {
+ return err(s, "expected datum, got EOF");
+ }
+ if (s.isWhitespace()) {
+ return err(s, "expected datum, got whitespace");
+ }
+ return switch (s.peek()) {
+ // whitespace checked above
+ 0...32, 127...255 => err(s, "invalid character"),
+
+ ')', ']', '}' => err(s, "unexpected closing bracket"),
+
+ ';' => err(s, "expected datum, got semicolon"),
+
+ '#' => handleHash(s),
+
+ '"' => startQuotedString(s),
+
+ '\'', '`', ',' => startQuote(s),
+
+ '(', '[', '{' => startList(s),
+
+ // Periods are only allowed in the middle of a string, or to express
+ // improper lists, because the following look too much like typos:
+ //
+ // (foo. bar) (foo .bar) (123. 456) (123 .456)
+ //
+ '.' => err(s, "misplaced period"),
+
+ else => startBareString(s),
+ };
+}
+
+fn handleHash(s: *State) *State {
+ s.skip();
+ //
+ // We just consumed a hash. Possibilities include:
+ //
+ // #|foo ;rune
+ //
+ // #|;DATUM ;datum comment
+ //
+ // #|DATUM ;hash-datum (code mode only)
+ //
+
+ if (s.eof()) {
+ return err(s, "EOF after hash");
+ }
+ if (s.isWhitespace()) {
+ return err(s, "whitespace after hash");
+ }
+
+ // Is it a rune? #foo
+ switch (s.peek()) {
+ 'a'...'z', 'A'...'Z' => return handleRune(s),
+ else => {},
+ }
+
+ // Is it a datum comment? #;DATUM
+ if (s.peek() == ';') {
+ s.skip();
+ // Don't change s.next in this case. Just let the parser try to redo
+ // what it was doing as soon as the commented-out datum has been read.
+ return s.recurParse(.start_datum, s.next);
+ }
+
+ // Otherwise, it must be a hash-datum. #DATUM
+
+ // But data mode doesn't allow that.
+ if (s.mode == .data) {
+ return err(s, "use of hash-datum sequence not allowed in data mode");
+ }
+
+ return s.recurParse(.start_datum, .end_hash_datum);
+}
+
+fn handleRune(s: *State) *State {
+ const rune = readRune(s) orelse return err(s, "rune too long");
+ //
+ // Now we're at the end of the rune, but it could be a rune-datum:
+ //
+ // #foo|(...)
+ //
+
+ if (isEndOfRune(s)) {
+ // Nope, just a stand-alone rune.
+ return s.returnDatum(rune);
+ }
+
+ // Otherwise, it's followed by a datum, like: #foo(...)
+
+ // Which is only allowed in code mode.
+ if (s.mode == .data) {
+ return err(s, "invalid use of hash in data mode");
+ }
+
+ s.context = rune;
+ return s.recurParse(.start_datum, .end_rune_datum);
+}
+
+fn readRune(s: *State) ?Value {
+ var buf: [6]u8 = undefined;
+ var i: u8 = 0;
+ while (!s.eof()) : (i += 1) switch (s.peek()) {
+ 'a'...'z', 'A'...'Z' => {
+ if (i == buf.len) {
+ return null;
+ }
+ buf[i] = s.getc();
+ },
+ else => break,
+ };
+
+ // 'i' can't be 0 since this function is only called if at least one ASCII
+ // letter was seen after the hash.
+ std.debug.assert(i != 0);
+
+ return value.rune.pack(buf[0..i]);
+}
+
+fn isEndOfRune(s: *State) bool {
+ return s.eof() or switch (s.peek()) {
+ '\t', '\n', ' ', ')', ']', '}' => true,
+ else => false,
+ };
+}
+
+fn endRuneDatum(s: *State) *State {
+ return s.returnDatum(value.pair.cons(s.context, s.retval));
+}
+
+fn endHashDatum(s: *State) *State {
+ const rune = value.rune.pack("HASH");
+ return s.returnDatum(value.pair.cons(rune, s.retval));
+}
+
+fn startQuotedString(s: *State) *State {
+ // We're at |"..." so consume the opening quote before we start reading.
+ s.skip();
+
+ const str = readQuotedString(s) catch return err(s, "unclosed string");
+ if (s.mode == .code) {
+ // "foo bar" => (#STRING . "foo bar")
+ const rune = value.rune.pack("STRING");
+ const pair = value.pair.cons(rune, str);
+ return s.returnDatum(pair);
+ } else {
+ return s.returnDatum(str);
+ }
+}
+
+// RQS = Read Quoted String
+const RqsError = enum { Unclosed };
+
+fn readQuotedString(s: *State) !Value {
+ return try readQuotedSstr(s) orelse readQuotedLongString(s);
+}
+
+fn readQuotedSstr(s: *State) !?Value {
+ // We will reset to this position if we fail.
+ const start_pos = s.pos;
+
+ var buf: [6]u8 = undefined;
+ var i: u8 = 0;
+ while (!s.eof()) {
+ const c = s.getc();
+ if (c == '"') {
+ // ok, return what we accumulated
+ return value.sstr.pack(buf[0..i]);
+ }
+ if (i == 6) {
+ // failed; reset and bail out
+ s.pos = start_pos;
+ return null;
+ }
+ // ok, save this byte and go on
+ buf[i] = c;
+ i += 1;
+ }
+ return error.Unclosed;
+}
+
+fn readQuotedLongString(s: *State) Value {
+ return err(s, "NOT YET IMPLEMENTED");
+}
+
+fn startBareString(s: *State) *State {
+ // We're at |foo so start reading directly.
+ return readBareSstr(s) orelse readBareLongString(s);
+}
+
+fn readBareSstr(s: *State) ?*State {
+ // We will reset to this position if we fail.
+ const start_pos = s.pos;
+
+ var buf: [6]u8 = undefined;
+ var i: u8 = 0;
+ while (!s.eof()) : (i += 1) {
+ if (isBareStringEnd(s)) {
+ break;
+ }
+ if (i == buf.len) {
+ // failed; reset and bail out
+ s.pos = start_pos;
+ return null;
+ }
+ buf[i] = s.getc();
+ }
+
+ return s.returnDatum(value.sstr.pack(buf[0..i]));
+}
+
+fn isBareStringEnd(s: *State) bool {
+ // We will ignore illegal characters here, because they aren't consumed by
+ // this function; whatever code comes next must handle them.
+ return s.eof() or switch (s.peek()) {
+ 0...32, 127...255 => true,
+ '(', ')', '[', ']', '{', '}', ';', '#', '"', '\'', '`', ',' => true,
+ else => false,
+ };
+}
+
+fn readBareLongString(s: *State) *State {
+ return err(s, "NOT YET IMPLEMENTED");
+}
+
+fn startQuote(s: *State) *State {
+ // We're at one of: |'... |`... |,...
+ s.context = value.rune.pack(switch (s.getc()) {
+ '\'' => "QUOTE",
+ '`' => "GRAVE",
+ ',' => "COMMA",
+ else => unreachable,
+ });
+ return s.recurParse(.start_datum, .end_quote);
+}
+
+fn endQuote(s: *State) *State {
+ return s.returnDatum(value.pair.cons(s.context, s.retval));
+}
+
+// List processing is, unsurprisingly, the most complicated, and it's made even
+// more complicated by the possibility of datum comments in strange places...
+
+fn startList(s: *State) *State {
+ const open = s.getc();
+
+ if (s.mode == .data and open != '(') {
+ return err(s, "invalid opening bracket in data mode");
+ }
+
+ s.consumeBlanks();
+ if (s.eof()) {
+ return err(s, "unexpected EOF while parsing list");
+ }
+
+ s.opening_bracket = open;
+ return if (isEndOfList(s))
+ endList(s)
+ else
+ s.recurParse(.start_parse, .continue_list);
+}
+
+fn isEndOfList(s: *State) bool {
+ return switch (s.peek()) {
+ ')', ']', '}' => true,
+ else => false,
+ };
+}
+
+fn continueList(s: *State) *State {
+ s.context = value.pair.cons(s.retval, s.context);
+
+ s.consumeBlanks();
+ if (s.eof()) {
+ return err(s, "unexpected EOF while parsing list");
+ }
+
+ if (isEndOfList(s)) {
+ s.context = list.reverse(s.context);
+ return endList(s);
+ }
+
+ if (s.peek() == '.') {
+ s.skip();
+ // Scheme allows (foo .(bar)) but we don't. Mind your spaces!
+ if (!s.isWhitespace()) {
+ return err(s, "misplaced period");
+ }
+ return s.recurParse(.start_parse, .finalize_improper_list);
+ }
+
+ return s.recurParse(.start_parse, .continue_list);
+}
+
+fn finalizeImproperList(s: *State) *State {
+ s.context = list.reverseWithTail(s.context, s.retval);
+ return endImproperList(s);
+}
+
+fn endImproperList(s: *State) *State {
+ s.consumeBlanks();
+ if (s.eof()) {
+ return err(s, "unexpected EOF while parsing list");
+ }
+
+ if (isEndOfList(s)) {
+ return endList(s);
+ }
+
+ if (s.getc() == '#') {
+ if (s.eof()) {
+ return err(s, "unexpected EOF after hash while parsing list");
+ }
+ if (s.getc() == ';') {
+ return s.recurParse(.start_datum, .end_improper_list);
+ }
+ }
+
+ return err(s, "malformed list / extra datum at end of improper list");
+}
+
+fn endList(s: *State) *State {
+ const open = s.opening_bracket;
+ const char = s.getc();
+
+ // Check for proper ending: (foo bar baz)
+ if (open == '(' and char == ')') {
+ return s.returnDatum(s.context);
+ }
+ if (open == '[' and char == ']') {
+ const rune = value.rune.pack("SQUARE");
+ return s.returnDatum(value.pair.cons(rune, s.context));
+ }
+ if (open == '{' and char == '}') {
+ const rune = value.rune.pack("BRACE");
+ return s.returnDatum(value.pair.cons(rune, s.context));
+ }
+
+ return err(s, "wrong closing bracket for list");
+}
+
+fn err(s: *State, msg: []const u8) noreturn {
+ std.debug.print("{s}\n", .{msg});
+ std.debug.print("pos: {}\n", .{s.pos});
+ @panic("parse error");
+}