Fix parser.

author: Taylan Kammer <taylan.kammer@gmail.com> 2025-03-18 21:38:53 +0100
committer: Taylan Kammer <taylan.kammer@gmail.com> 2025-03-18 21:38:53 +0100
commit: 6273f9fd55625ac088b0c753c43fb4661b0011e2 (patch)
tree: 8ba5447aec17e4c3004075c200e7ae74cc988733
parent: 4b20042c145a5c8ca18b04b35b2bb8c3f94aced5 (diff)
1 files changed, 63 insertions, 36 deletions
diff --git a/src/libzisp/io/parser.zig b/src/libzisp/io/parser.zig
index 9a1bc17..b79b679 100644
--- a/src/libzisp/io/parser.zig
+++ b/src/libzisp/io/parser.zig
@@ -44,9 +44,9 @@
 // * The terms datum, dat1, and dat2 each refer to an arbitrary datum; ellipsis
 //   means zero or more data; n is a non-negative integer.
 //
-// * The #datum form applies only to expressions that cannot be mistaken for a
+// * The #datum form only applies to expressions that cannot be mistaken for a
 //   rune, such as for example: #(...) or #"..." or #'string etc.; following a
-//   hash sign with a plain string would otherwise be parsed as a rune.
+//   hash sign with a plain string will parse as a rune instead.
 //
 // * Though not represented in the table due to notational difficulty, the
 //   format "#rune(...)" doesn't require a list in the second position; any
@@ -119,7 +119,7 @@
 //
 // The decoder may also perform arbitrary transforms on any type; for example,
 // it may turn bare strings (those not flagged as double-quoted) into numbers
-// when appropriate.  This is how number literals are implemented.
+// when appropriate.  This can implement number literals.
 //
 // The decoder recognizes (#QUOTE ...) to implement the traditional quoting
 // mechanism, but with a significant difference:
@@ -147,7 +147,7 @@
 // complex data records with non-standard data types, and so on.
 //
 //
-// === Implementation details ===
+// === Trampolining strategy ===
 //
 // Instead of using recursion directly, the parser is written in a "trampoline"
 // style, which ensures that parsing depth is not limited by the stack size.
@@ -165,12 +165,13 @@
 // or from the main loop, it sets the .retval field, and tries to pop the saved
 // context.  If the context stack was empty, the main loop returns.
 //
-// Some further notes:
-//
 // While it's possible to just set .next and return, to make the main loop call
 // another function next (possibly even setting .retval to pass a value to it),
 // this is completely unnecessary.  A few non-recursive calls won't blow the
-// stack.  It's only recursive parsing that we need the above strategy for.
+// stack.  It's only recursive parsing that we use the above strategy for.
+//
+//
+// === .start_parse VS .start_datum ===
 //
 // The difference between .start_parse and .start_datum is that the former will
 // allow whitespace and comments at the beginning, whereas the latter expects a
@@ -179,6 +180,7 @@
 // When calling the parser recursively, it may seem logical to always make it
 // start with .start_datum, because we already cleared whitespace and comments
 // out of the way.  However, in some cases, we must use .start_parse instead.
+//
 // This is because of datum comments.  When one appears, we start a recursive
 // parser, but instead of making it return to a function that will consume the
 // result, we make it return to the original starting point, so the result is
@@ -188,6 +190,9 @@
 //
 //   (foo #;bar baz)  ;must use .start_parse at the start of each element
 //
+//
+// === List parsing strategy ===
+//
 // When it comes to pairs and lists, we basically treat everything as a list,
 // and a pair is just seen as the shortest possible improper list.  This saves
 // memory: If we implemented list parsing as pair parsing, we would be calling
@@ -196,6 +201,23 @@
 // would still waste memory and the time it takes to allocate memory.
 //
 //
+// === Buffering strategy ===
+//
+// We use a small circular buffer just so we can backtrack.  We read bytes into
+// this buffer one by one, because we don't want to consume more bytes from the
+// input than what we actually parse.  Consider the following input stream:
+//
+//   (a b c) (x y z)
+//
+// If we used proper buffering, like reading up to 4K bytes per read, then the
+// whole stream would be consumed at once before it's parsed.  Then, the parser
+// would return the first datum, and the rest of the stream would be lost.  The
+// parser would need some way to reset the input stream's read head to the end
+// of the first datum, but not all stream types may support this.
+//
+// For efficiency, call the parser on an input stream with implicit buffering.
+//
+//
 // === Notation used in comments ===
 //
 // Some comments throughout the file give you an example of where the parser
@@ -240,19 +262,21 @@ const Context = struct {
     char: u8 = undefined,
 };
 
-// Buffer size is 4096, so index type is u12 (0..4095).
-const BUF_SIZE = 4096;
-const IDX_TYPE = u12;
+// Make sure these are in sync, as we will use +% to increment the position.
+// Size 16 is enough because the largest amount by which we backtrack is short
+// strings / runes, which are limited to 6 bytes.
+const BUF_SIZE = 16;
+const POS_TYPE = u4;
 
 const debug_mode = @import("builtin").mode == .Debug;
 
 const State = struct {
     input: std.io.AnyReader,
-    is_eof: bool = false,
+    counter: usize = 0,
 
-    buf: [BUF_SIZE]u8 = .{0} ** BUF_SIZE,
-    pos: IDX_TYPE = 0,
-    write_pos: IDX_TYPE = 0,
+    buf: [BUF_SIZE]u8 = undefined,
+    pos: POS_TYPE = 0,
+    write_pos: POS_TYPE = 0,
 
     context: Context = .{},
     stack: std.ArrayList(Context),
@@ -287,43 +311,41 @@ const State = struct {
         }
     }
 
-    fn fillBuffer(s: *State) !void {
+    fn readNext(s: *State) !void {
         if (s.pos != s.write_pos) {
             return;
         }
-        // Make sure *not* to overwrite the entire buffer with newly read data,
-        // because we use it as a circular buffer so as to be able to reset to
-        // previous points in it.  We overwrite at most BUF_SIZE / 2 bytes, so
-        // we can go back by up to BUF_SIZE / 2 bytes.
-        const avail = BUF_SIZE - @as(u64, s.write_pos);
-        const write_max = std.math.clamp(avail, 0, BUF_SIZE / 2);
-        const write_to = s.write_pos + write_max;
-        const count = try s.input.read(s.buf[s.write_pos..write_to]);
-        s.is_eof = count == 0;
-        s.write_pos +%= @intCast(count);
+        s.buf[s.pos] = try s.input.readByte();
+        s.write_pos +%= 1;
     }
 
     fn eof(s: *State) bool {
         if (debug_mode) {
             s.checked_eof = true;
         }
-        fillBuffer(s) catch @panic("reader error");
-        return s.is_eof;
+        readNext(s) catch |e| switch (e) {
+            error.EndOfStream => return true,
+            else => @panic("read error"),
+        };
+        return false;
     }
 
     fn peek(s: *State) u8 {
+        if (debug_mode) {
+            if (!s.checked_eof) {
+                @panic("Didn't check EOF before calling peek()!");
+            }
+        }
         return s.buf[s.pos];
     }
 
     fn skip(s: *State) void {
         if (debug_mode) {
-            if (!s.checked_eof) {
-                @panic("Didn't check EOF before calling skip()!");
-            }
             s.checked_eof = false;
         }
         // std.debug.print("{c}\n", .{s.buf[s.pos]});
         s.pos +%= 1;
+        s.counter += 1;
     }
 
     fn getc(s: *State) u8 {
@@ -455,8 +477,6 @@ fn startDatum(s: *State) void {
 
         '(', '[', '{' => startList(s),
 
-        '.' => err(s, "misplaced period"),
-
         else => startBareString(s),
     }
 }
@@ -751,13 +771,20 @@ fn continueList(s: *State) void {
         return endList(s);
     }
 
+    // Check if there's an improper-list ending.
     if (s.peek() == '.') {
+        const pos = s.pos;
         s.skip();
+        if (s.eof()) {
+            return err(s, "unexpected EOF while parsing list");
+        }
         // Scheme allows (foo .(bar)) but we don't.  Mind your spaces!
-        if (!s.isWhitespace()) {
-            return err(s, "misplaced period");
+        if (s.isWhitespace()) {
+            s.skip();
+            return s.recurParse(.start_parse, .finish_improper_list);
         }
-        return s.recurParse(.start_parse, .finish_improper_list);
+        // Nope, reset.
+        s.pos = pos;
     }
 
     s.recurParse(.start_parse, .continue_list);
@@ -796,6 +823,6 @@ fn endImproperList(s: *State) void {
 
 fn err(s: *State, msg: []const u8) noreturn {
     std.debug.print("{s}\n", .{msg});
-    std.debug.print("pos: {}\n", .{s.pos});
+    std.debug.print("pos: {}\n", .{s.counter});
     @panic("parse error");
 }
author	Taylan Kammer <taylan.kammer@gmail.com>	2025-03-18 21:38:53 +0100
committer	Taylan Kammer <taylan.kammer@gmail.com>	2025-03-18 21:38:53 +0100
commit	6273f9fd55625ac088b0c753c43fb4661b0011e2 (patch)
tree	8ba5447aec17e4c3004075c200e7ae74cc988733
parent	4b20042c145a5c8ca18b04b35b2bb8c3f94aced5 (diff)