summaryrefslogtreecommitdiff
path: root/src/libzisp/io
diff options
context:
space:
mode:
authorTaylan Kammer <taylan.kammer@gmail.com>2025-02-23 21:24:39 +0100
committerTaylan Kammer <taylan.kammer@gmail.com>2025-02-23 21:24:39 +0100
commitfb500ae9be9291cbba0f4f767381fd2d16e85517 (patch)
treeb660dab086a72ba1e950fa8e16323363804dcf45 /src/libzisp/io
parentd8790e81131b9883ab058beb854347d888453d2f (diff)
Improve parser.
Diffstat (limited to 'src/libzisp/io')
-rw-r--r--src/libzisp/io/parser.zig213
1 files changed, 126 insertions, 87 deletions
diff --git a/src/libzisp/io/parser.zig b/src/libzisp/io/parser.zig
index 5162c2f..27afc69 100644
--- a/src/libzisp/io/parser.zig
+++ b/src/libzisp/io/parser.zig
@@ -125,48 +125,68 @@
//
// Instead of using recursion directly, the parser is written in something akin
// to a manual continuation-passing style, which ensures that parsing depth is
-// not limited by the stack size.
+// not limited by the stack size. This is also called "trampolining."
//
// All state/context is passed around via a struct pointer. The parser has a
// main loop which calls a function, passing it the state, and also expects a
// state pointer in return.
//
-// The state has a .next field, which indicates which function the main loop
-// should call next. It also has a .parent field, used as follows:
+// The state has a .next field, that indicates which function the main loop
+// should call next. It also has a .parent field, potentially linking to a
+// previous state. These two fields are used as follows:
//
// If a function wants to call the parser recursively, it sets the .next field
-// of the current state to where the recursive call should return to, then it
-// creates a new state with a given starting point, sets its .parent field to
-// the current state, and returns the new state pointer.
-//
-// If a function wants to make the parser return, it sets the .retval field of
-// the current state, and sets .next to the .perform_return value. This makes
-// the main loop either return to the parent state (after copying the .retval
-// field from child to parent), or if there is no parent state, it returns the
-// value as the top-level result.
-//
-// Note: While it's possible to simply set .next and return the current state,
-// to have another function be called next (possibly even setting .retval to
+// of the *current* state to where the recursive call should return to. Then,
+// it creates a new state, sets the .next field of that one to where it should
+// start (either .start_parse or .start_datum), sets .parent to the old state,
+// and returns the new one.
+//
+// So, the main loop will now do what the newly returned state is instructing.
+//
+// If a function wants to make the parser return from this recursive parsing
+// routine, it sets the .retval field of the current state, and sets .next to
+// the special .perform_return instruction. This makes the main loop copy the
+// value in .retval into the .retval field of the parent state, and reinstates
+// it, continuing with whatever was put in its .next field, which should be a
+// function that will consume the return value.
+//
+// If .parent is null, the main loop returns .retval as the final result.
+//
+// Some further notes:
+//
+// While it's possible to just set .next and return the current state, to make
+// the main loop call another function next (possibly even setting .retval to
// pass a value to it), this is completely unnecessary. A few non-recursive
-// function calls will obviously not blow the stack. It's only recursive
-// parsing that we use these fields for.
-//
-// Note 2: When calling the parser recursively, it may seem sensible to always
-// set the .next of the new state to .start_datum, because you already cleared
-// incoming whitespace and comments from the stream. However, in some cases,
-// you must set it to .start_parse instead. This is due to datum comments.
-// After a datum comment is parsed, the parser will ignore it and restore the
-// previous state, to try again what it was doing. If the state was set to
-// .start_datum, this means no whitespace or comments would be tolerated after
-// the datum comment.
-//
-// Note 3: When it comes to pairs/lists, we mainly try to parse them as lists,
-// and a pair becomes a special-case of an improper list (two elements). This
-// has the advantage of saving memory: If we implemented list parsing as pair
-// parsing, we would be calling the parser recursively, deeper and deeper, for
-// every pair that the list is made up of. Although we're not limited by stack
-// space (thanks to the strategy described above) this would still waste memory
-// while parsing.
+// function calls won't blow the stack. It's only recursive parsing that we
+// need the above strategy for.
+//
+// The difference between .start_parse and .start_datum is that the former will
+// allow whitespace and comments at the beginning, whereas the latter expects a
+// datum immediately without anything else in front of it.
+//
+// When calling the parser recursively, it may seem logical to always make it
+// start with .start_datum, because we already cleared whitespace and comments
+// out of the way. However, in some cases, we must use .start_parse instead.
+// This is because of datum comments. When one appears, we start a recursive
+// parser, but instead of making it return to a function that will consume the
+// result, we make it return to the original state's starting point. This way,
+// the parser "retries" what it was originally doing, with the comment out of
+// the way. If we always used .start_datum for recursive parsing, this would
+// never allow whitespace *after* a datum comment, because we would be back at
+// .start_datum as soon as we reach the end of the datum comment. Whitespace
+// after a datum comment is sometimes allowed and sometimes not, so there's no
+// generic solution that's always correct:
+//
+// (foo #;bar baz) ;; whitespace after #;bar allowed
+//
+// #'#;(+ 1 2)(+ 3 4) ;; whitespace after #;(+ 1 2) not allowed
+//
+// When it comes to pairs and lists, we basically treat everything as a list,
+// and a pair is just seen as the shortest possible improper list. This saves
+// memory: If we implemented list parsing as pair parsing, we would be calling
+// the parser recursively, deeper and deeper, for every list element. Though
+// we're not limited by stack space thanks to the strategy described above, it
+// would still waste memory by creating tons of new state objects.
//
//
// === Notation used in comments ===
@@ -189,93 +209,110 @@ const Value = value.Value;
pub const Mode = enum { code, data };
-const State = struct {
+const TopState = struct {
alloc: std.mem.Allocator,
input: []const u8,
pos: usize = 0,
mode: Mode = undefined,
+};
+
+const State = struct {
+ top: *TopState,
+
next: Fn = .start_parse,
parent: ?*State = null,
+ retval: Value = undefined,
+
+ // To store accumulated context, such as list elements.
context: Value = undefined,
+
+ // To remember what kind of list we're in: () [] {}
opening_bracket: u8 = undefined,
- retval: Value = undefined,
- fn eof(self: *State) bool {
- return self.pos >= self.input.len;
+ fn mode(s: *State) Mode {
+ return s.top.mode;
+ }
+
+ fn eof(s: *State) bool {
+ return s.top.pos >= s.top.input.len;
}
- fn peek(self: *State) u8 {
- return self.input[self.pos];
+ fn peek(s: *State) u8 {
+ return s.top.input[s.top.pos];
}
- fn skip(self: *State) void {
- self.pos += 1;
+ fn skip(s: *State) void {
+ s.top.pos += 1;
}
- fn getc(self: *State) u8 {
- const c = self.peek();
- self.skip();
+ fn getc(s: *State) u8 {
+ const c = s.peek();
+ s.skip();
return c;
}
+ fn pos(s: *State) usize {
+ return s.top.pos;
+ }
+
+ fn resetPos(s: *State, p: usize) void {
+ s.top.pos = p;
+ }
+
// Consumes whitespace and line comments.
- fn consumeBlanks(self: *State) void {
- while (!self.eof()) {
- if (self.isWhitespace()) {
- self.skip();
- } else if (self.peek() == ';') {
- self.skip();
- self.consumeLineComment();
+ fn consumeBlanks(s: *State) void {
+ while (!s.eof()) {
+ if (s.isWhitespace()) {
+ s.skip();
+ } else if (s.peek() == ';') {
+ s.skip();
+ s.consumeLineComment();
} else {
return;
}
}
}
- fn consumeLineComment(self: *State) void {
- while (!self.eof()) {
- if (self.getc() == '\n') {
+ fn consumeLineComment(s: *State) void {
+ while (!s.eof()) {
+ if (s.getc() == '\n') {
return;
}
}
}
- fn isWhitespace(self: *State) bool {
- return switch (self.peek()) {
+ fn isWhitespace(s: *State) bool {
+ return switch (s.peek()) {
'\t', '\n', ' ' => true,
else => false,
};
}
- fn isFinalNull(self: *State) bool {
- return self.peek() == 0 and self.pos == self.input.len - 1;
+ fn isFinalNull(s: *State) bool {
+ return s.peek() == 0 and s.top.pos == s.top.input.len - 1;
}
- fn recurParse(self: *State, start_from: Fn, return_to: Fn) *State {
- const newState = self.alloc.create(State) catch @panic("OOM");
+ fn recurParse(s: *State, start_from: Fn, return_to: Fn) *State {
+ const newState = s.top.alloc.create(State) catch @panic("OOM");
newState.* = .{
- .alloc = self.alloc,
- .input = self.input,
- .pos = self.pos,
- .mode = self.mode,
+ .top = s.top,
.next = start_from,
- .parent = self,
+ .parent = s,
};
- self.next = return_to;
+ s.next = return_to;
return newState;
}
- fn returnDatum(self: *State, val: Value) *State {
- self.retval = val;
- self.next = .perform_return;
- return self;
+ fn returnDatum(s: *State, val: Value) *State {
+ s.retval = val;
+ s.next = .perform_return;
+ return s;
}
- fn performReturn(self: *State) ?*State {
- if (self.parent) |parent| {
- parent.retval = self.retval;
- parent.pos = self.pos;
- parent.alloc.destroy(self);
+ fn performReturn(s: *State) ?*State {
+ if (s.parent) |parent| {
+ parent.retval = s.retval;
+ s.top.alloc.destroy(s);
return parent;
} else {
return null;
@@ -304,8 +341,10 @@ pub fn parseCode(input: []const u8) Value {
pub fn parse(input: []const u8, mode: Mode) Value {
var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init;
- var top = State{ .alloc = gpa.allocator(), .input = input, .mode = mode };
- var s = &top;
+ const alloc = gpa.allocator();
+ var top = TopState{ .alloc = alloc, .input = input, .mode = mode };
+ var s0 = State{ .top = &top };
+ var s = &s0;
while (true) s = switch (s.next) {
.start_parse => startParse(s),
.start_datum => startDatum(s),
@@ -404,7 +443,7 @@ fn handleHash(s: *State) *State {
// Otherwise, it must be a hash-datum. #DATUM
// But data mode doesn't allow that.
- if (s.mode == .data) {
+ if (s.mode() == .data) {
return err(s, "use of hash-datum sequence not allowed in data mode");
}
@@ -427,7 +466,7 @@ fn handleRune(s: *State) *State {
// Otherwise, it's followed by a datum, like: #foo(...)
// Which is only allowed in code mode.
- if (s.mode == .data) {
+ if (s.mode() == .data) {
return err(s, "invalid use of hash in data mode");
}
@@ -476,7 +515,7 @@ fn startQuotedString(s: *State) *State {
s.skip();
const str = readQuotedString(s) catch return err(s, "unclosed string");
- if (s.mode == .code) {
+ if (s.mode() == .code) {
// "foo bar" => (#STRING . "foo bar")
const rune = value.rune.pack("STRING");
const pair = value.pair.cons(rune, str);
@@ -495,7 +534,7 @@ fn readQuotedString(s: *State) !Value {
fn readQuotedSstr(s: *State) !?Value {
// We will reset to this position if we fail.
- const start_pos = s.pos;
+ const start_pos = s.pos();
var buf: [6]u8 = undefined;
var i: u8 = 0;
@@ -507,7 +546,7 @@ fn readQuotedSstr(s: *State) !?Value {
}
if (i == 6) {
// failed; reset and bail out
- s.pos = start_pos;
+ s.resetPos(start_pos);
return null;
}
// ok, save this byte and go on
@@ -528,7 +567,7 @@ fn startBareString(s: *State) *State {
fn readBareSstr(s: *State) ?*State {
// We will reset to this position if we fail.
- const start_pos = s.pos;
+ const start_pos = s.pos();
var buf: [6]u8 = undefined;
var i: u8 = 0;
@@ -538,7 +577,7 @@ fn readBareSstr(s: *State) ?*State {
}
if (i == buf.len) {
// failed; reset and bail out
- s.pos = start_pos;
+ s.resetPos(start_pos);
return null;
}
buf[i] = s.getc();
@@ -586,7 +625,7 @@ fn endQuote(s: *State) *State {
fn startList(s: *State) *State {
const open = s.getc();
- if (s.mode == .data and open != '(') {
+ if (s.mode() == .data and open != '(') {
return err(s, "invalid opening bracket in data mode");
}
@@ -688,6 +727,6 @@ fn endImproperList(s: *State) *State {
fn err(s: *State, msg: []const u8) noreturn {
std.debug.print("{s}\n", .{msg});
- std.debug.print("pos: {}\n", .{s.pos});
+ std.debug.print("pos: {}\n", .{s.pos()});
@panic("parse error");
}