summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTaylan Kammer <taylan.kammer@gmail.com>2025-03-18 21:38:53 +0100
committerTaylan Kammer <taylan.kammer@gmail.com>2025-03-18 21:38:53 +0100
commit6273f9fd55625ac088b0c753c43fb4661b0011e2 (patch)
tree8ba5447aec17e4c3004075c200e7ae74cc988733
parent4b20042c145a5c8ca18b04b35b2bb8c3f94aced5 (diff)
Fix parser.
-rw-r--r--src/libzisp/io/parser.zig99
1 files changed, 63 insertions, 36 deletions
diff --git a/src/libzisp/io/parser.zig b/src/libzisp/io/parser.zig
index 9a1bc17..b79b679 100644
--- a/src/libzisp/io/parser.zig
+++ b/src/libzisp/io/parser.zig
@@ -44,9 +44,9 @@
// * The terms datum, dat1, and dat2 each refer to an arbitrary datum; ellipsis
// means zero or more data; n is a non-negative integer.
//
-// * The #datum form applies only to expressions that cannot be mistaken for a
+// * The #datum form only applies to expressions that cannot be mistaken for a
// rune, such as for example: #(...) or #"..." or #'string etc.; following a
-// hash sign with a plain string would otherwise be parsed as a rune.
+// hash sign with a plain string will parse as a rune instead.
//
// * Though not represented in the table due to notational difficulty, the
// format "#rune(...)" doesn't require a list in the second position; any
@@ -119,7 +119,7 @@
//
// The decoder may also perform arbitrary transforms on any type; for example,
// it may turn bare strings (those not flagged as double-quoted) into numbers
-// when appropriate. This is how number literals are implemented.
+// when appropriate. This can implement number literals.
//
// The decoder recognizes (#QUOTE ...) to implement the traditional quoting
// mechanism, but with a significant difference:
@@ -147,7 +147,7 @@
// complex data records with non-standard data types, and so on.
//
//
-// === Implementation details ===
+// === Trampolining strategy ===
//
// Instead of using recursion directly, the parser is written in a "trampoline"
// style, which ensures that parsing depth is not limited by the stack size.
@@ -165,12 +165,13 @@
// or from the main loop, it sets the .retval field, and tries to pop the saved
// context. If the context stack was empty, the main loop returns.
//
-// Some further notes:
-//
// While it's possible to just set .next and return, to make the main loop call
// another function next (possibly even setting .retval to pass a value to it),
// this is completely unnecessary. A few non-recursive calls won't blow the
-// stack. It's only recursive parsing that we need the above strategy for.
+// stack. It's only recursive parsing that we use the above strategy for.
+//
+//
+// === .start_parse VS .start_datum ===
//
// The difference between .start_parse and .start_datum is that the former will
// allow whitespace and comments at the beginning, whereas the latter expects a
@@ -179,6 +180,7 @@
// When calling the parser recursively, it may seem logical to always make it
// start with .start_datum, because we already cleared whitespace and comments
// out of the way. However, in some cases, we must use .start_parse instead.
+//
// This is because of datum comments. When one appears, we start a recursive
// parser, but instead of making it return to a function that will consume the
// result, we make it return to the original starting point, so the result is
@@ -188,6 +190,9 @@
//
// (foo #;bar baz) ;must use .start_parse at the start of each element
//
+//
+// === List parsing strategy ===
+//
// When it comes to pairs and lists, we basically treat everything as a list,
// and a pair is just seen as the shortest possible improper list. This saves
// memory: If we implemented list parsing as pair parsing, we would be calling
@@ -196,6 +201,23 @@
// would still waste memory and the time it takes to allocate memory.
//
//
+// === Buffering strategy ===
+//
+// We use a small circular buffer just so we can backtrack. We read bytes into
+// this buffer one by one, because we don't want to consume more bytes from the
+// input than what we actually parse. Consider the following input stream:
+//
+// (a b c) (x y z)
+//
+// If we used proper buffering, like reading up to 4K bytes per read, then the
+// whole stream would be consumed at once before it's parsed. Then, the parser
+// would return the first datum, and the rest of the stream would be lost. The
+// parser would need some way to reset the input stream's read head to the end
+// of the first datum, but not all stream types may support this.
+//
+// For efficiency, call the parser on an input stream with implicit buffering.
+//
+//
// === Notation used in comments ===
//
// Some comments throughout the file give you an example of where the parser
@@ -240,19 +262,21 @@ const Context = struct {
char: u8 = undefined,
};
-// Buffer size is 4096, so index type is u12 (0..4095).
-const BUF_SIZE = 4096;
-const IDX_TYPE = u12;
+// Make sure these are in sync, as we will use +% to increment the position.
+// Size 16 is enough because the largest amount by which we backtrack is short
+// strings / runes, which are limited to 6 bytes.
+const BUF_SIZE = 16;
+const POS_TYPE = u4;
const debug_mode = @import("builtin").mode == .Debug;
const State = struct {
input: std.io.AnyReader,
- is_eof: bool = false,
+ counter: usize = 0,
- buf: [BUF_SIZE]u8 = .{0} ** BUF_SIZE,
- pos: IDX_TYPE = 0,
- write_pos: IDX_TYPE = 0,
+ buf: [BUF_SIZE]u8 = undefined,
+ pos: POS_TYPE = 0,
+ write_pos: POS_TYPE = 0,
context: Context = .{},
stack: std.ArrayList(Context),
@@ -287,43 +311,41 @@ const State = struct {
}
}
- fn fillBuffer(s: *State) !void {
+ fn readNext(s: *State) !void {
if (s.pos != s.write_pos) {
return;
}
- // Make sure *not* to overwrite the entire buffer with newly read data,
- // because we use it as a circular buffer so as to be able to reset to
- // previous points in it. We overwrite at most BUF_SIZE / 2 bytes, so
- // we can go back by up to BUF_SIZE / 2 bytes.
- const avail = BUF_SIZE - @as(u64, s.write_pos);
- const write_max = std.math.clamp(avail, 0, BUF_SIZE / 2);
- const write_to = s.write_pos + write_max;
- const count = try s.input.read(s.buf[s.write_pos..write_to]);
- s.is_eof = count == 0;
- s.write_pos +%= @intCast(count);
+ s.buf[s.pos] = try s.input.readByte();
+ s.write_pos +%= 1;
}
fn eof(s: *State) bool {
if (debug_mode) {
s.checked_eof = true;
}
- fillBuffer(s) catch @panic("reader error");
- return s.is_eof;
+ readNext(s) catch |e| switch (e) {
+ error.EndOfStream => return true,
+ else => @panic("read error"),
+ };
+ return false;
}
fn peek(s: *State) u8 {
+ if (debug_mode) {
+ if (!s.checked_eof) {
+ @panic("Didn't check EOF before calling peek()!");
+ }
+ }
return s.buf[s.pos];
}
fn skip(s: *State) void {
if (debug_mode) {
- if (!s.checked_eof) {
- @panic("Didn't check EOF before calling skip()!");
- }
s.checked_eof = false;
}
// std.debug.print("{c}\n", .{s.buf[s.pos]});
s.pos +%= 1;
+ s.counter += 1;
}
fn getc(s: *State) u8 {
@@ -455,8 +477,6 @@ fn startDatum(s: *State) void {
'(', '[', '{' => startList(s),
- '.' => err(s, "misplaced period"),
-
else => startBareString(s),
}
}
@@ -751,13 +771,20 @@ fn continueList(s: *State) void {
return endList(s);
}
+ // Check if there's an improper-list ending.
if (s.peek() == '.') {
+ const pos = s.pos;
s.skip();
+ if (s.eof()) {
+ return err(s, "unexpected EOF while parsing list");
+ }
// Scheme allows (foo .(bar)) but we don't. Mind your spaces!
- if (!s.isWhitespace()) {
- return err(s, "misplaced period");
+ if (s.isWhitespace()) {
+ s.skip();
+ return s.recurParse(.start_parse, .finish_improper_list);
}
- return s.recurParse(.start_parse, .finish_improper_list);
+ // Nope, reset.
+ s.pos = pos;
}
s.recurParse(.start_parse, .continue_list);
@@ -796,6 +823,6 @@ fn endImproperList(s: *State) void {
fn err(s: *State, msg: []const u8) noreturn {
std.debug.print("{s}\n", .{msg});
- std.debug.print("pos: {}\n", .{s.pos});
+ std.debug.print("pos: {}\n", .{s.counter});
@panic("parse error");
}