diff options
| author | Taylan Kammer <taylan.kammer@gmail.com> | 2025-03-18 21:38:53 +0100 |
|---|---|---|
| committer | Taylan Kammer <taylan.kammer@gmail.com> | 2025-03-18 21:38:53 +0100 |
| commit | 6273f9fd55625ac088b0c753c43fb4661b0011e2 (patch) | |
| tree | 8ba5447aec17e4c3004075c200e7ae74cc988733 | |
| parent | 4b20042c145a5c8ca18b04b35b2bb8c3f94aced5 (diff) | |
Fix parser.
| -rw-r--r-- | src/libzisp/io/parser.zig | 99 |
1 files changed, 63 insertions, 36 deletions
diff --git a/src/libzisp/io/parser.zig b/src/libzisp/io/parser.zig index 9a1bc17..b79b679 100644 --- a/src/libzisp/io/parser.zig +++ b/src/libzisp/io/parser.zig @@ -44,9 +44,9 @@ // * The terms datum, dat1, and dat2 each refer to an arbitrary datum; ellipsis // means zero or more data; n is a non-negative integer. // -// * The #datum form applies only to expressions that cannot be mistaken for a +// * The #datum form only applies to expressions that cannot be mistaken for a // rune, such as for example: #(...) or #"..." or #'string etc.; following a -// hash sign with a plain string would otherwise be parsed as a rune. +// hash sign with a plain string will parse as a rune instead. // // * Though not represented in the table due to notational difficulty, the // format "#rune(...)" doesn't require a list in the second position; any @@ -119,7 +119,7 @@ // // The decoder may also perform arbitrary transforms on any type; for example, // it may turn bare strings (those not flagged as double-quoted) into numbers -// when appropriate. This is how number literals are implemented. +// when appropriate. This can implement number literals. // // The decoder recognizes (#QUOTE ...) to implement the traditional quoting // mechanism, but with a significant difference: @@ -147,7 +147,7 @@ // complex data records with non-standard data types, and so on. // // -// === Implementation details === +// === Trampolining strategy === // // Instead of using recursion directly, the parser is written in a "trampoline" // style, which ensures that parsing depth is not limited by the stack size. @@ -165,12 +165,13 @@ // or from the main loop, it sets the .retval field, and tries to pop the saved // context. If the context stack was empty, the main loop returns. // -// Some further notes: -// // While it's possible to just set .next and return, to make the main loop call // another function next (possibly even setting .retval to pass a value to it), // this is completely unnecessary. A few non-recursive calls won't blow the -// stack. It's only recursive parsing that we need the above strategy for. +// stack. It's only recursive parsing that we use the above strategy for. +// +// +// === .start_parse VS .start_datum === // // The difference between .start_parse and .start_datum is that the former will // allow whitespace and comments at the beginning, whereas the latter expects a @@ -179,6 +180,7 @@ // When calling the parser recursively, it may seem logical to always make it // start with .start_datum, because we already cleared whitespace and comments // out of the way. However, in some cases, we must use .start_parse instead. +// // This is because of datum comments. When one appears, we start a recursive // parser, but instead of making it return to a function that will consume the // result, we make it return to the original starting point, so the result is @@ -188,6 +190,9 @@ // // (foo #;bar baz) ;must use .start_parse at the start of each element // +// +// === List parsing strategy === +// // When it comes to pairs and lists, we basically treat everything as a list, // and a pair is just seen as the shortest possible improper list. This saves // memory: If we implemented list parsing as pair parsing, we would be calling @@ -196,6 +201,23 @@ // would still waste memory and the time it takes to allocate memory. // // +// === Buffering strategy === +// +// We use a small circular buffer just so we can backtrack. We read bytes into +// this buffer one by one, because we don't want to consume more bytes from the +// input than what we actually parse. Consider the following input stream: +// +// (a b c) (x y z) +// +// If we used proper buffering, like reading up to 4K bytes per read, then the +// whole stream would be consumed at once before it's parsed. Then, the parser +// would return the first datum, and the rest of the stream would be lost. The +// parser would need some way to reset the input stream's read head to the end +// of the first datum, but not all stream types may support this. +// +// For efficiency, call the parser on an input stream with implicit buffering. +// +// // === Notation used in comments === // // Some comments throughout the file give you an example of where the parser @@ -240,19 +262,21 @@ const Context = struct { char: u8 = undefined, }; -// Buffer size is 4096, so index type is u12 (0..4095). -const BUF_SIZE = 4096; -const IDX_TYPE = u12; +// Make sure these are in sync, as we will use +% to increment the position. +// Size 16 is enough because the largest amount by which we backtrack is short +// strings / runes, which are limited to 6 bytes. +const BUF_SIZE = 16; +const POS_TYPE = u4; const debug_mode = @import("builtin").mode == .Debug; const State = struct { input: std.io.AnyReader, - is_eof: bool = false, + counter: usize = 0, - buf: [BUF_SIZE]u8 = .{0} ** BUF_SIZE, - pos: IDX_TYPE = 0, - write_pos: IDX_TYPE = 0, + buf: [BUF_SIZE]u8 = undefined, + pos: POS_TYPE = 0, + write_pos: POS_TYPE = 0, context: Context = .{}, stack: std.ArrayList(Context), @@ -287,43 +311,41 @@ const State = struct { } } - fn fillBuffer(s: *State) !void { + fn readNext(s: *State) !void { if (s.pos != s.write_pos) { return; } - // Make sure *not* to overwrite the entire buffer with newly read data, - // because we use it as a circular buffer so as to be able to reset to - // previous points in it. We overwrite at most BUF_SIZE / 2 bytes, so - // we can go back by up to BUF_SIZE / 2 bytes. - const avail = BUF_SIZE - @as(u64, s.write_pos); - const write_max = std.math.clamp(avail, 0, BUF_SIZE / 2); - const write_to = s.write_pos + write_max; - const count = try s.input.read(s.buf[s.write_pos..write_to]); - s.is_eof = count == 0; - s.write_pos +%= @intCast(count); + s.buf[s.pos] = try s.input.readByte(); + s.write_pos +%= 1; } fn eof(s: *State) bool { if (debug_mode) { s.checked_eof = true; } - fillBuffer(s) catch @panic("reader error"); - return s.is_eof; + readNext(s) catch |e| switch (e) { + error.EndOfStream => return true, + else => @panic("read error"), + }; + return false; } fn peek(s: *State) u8 { + if (debug_mode) { + if (!s.checked_eof) { + @panic("Didn't check EOF before calling peek()!"); + } + } return s.buf[s.pos]; } fn skip(s: *State) void { if (debug_mode) { - if (!s.checked_eof) { - @panic("Didn't check EOF before calling skip()!"); - } s.checked_eof = false; } // std.debug.print("{c}\n", .{s.buf[s.pos]}); s.pos +%= 1; + s.counter += 1; } fn getc(s: *State) u8 { @@ -455,8 +477,6 @@ fn startDatum(s: *State) void { '(', '[', '{' => startList(s), - '.' => err(s, "misplaced period"), - else => startBareString(s), } } @@ -751,13 +771,20 @@ fn continueList(s: *State) void { return endList(s); } + // Check if there's an improper-list ending. if (s.peek() == '.') { + const pos = s.pos; s.skip(); + if (s.eof()) { + return err(s, "unexpected EOF while parsing list"); + } // Scheme allows (foo .(bar)) but we don't. Mind your spaces! - if (!s.isWhitespace()) { - return err(s, "misplaced period"); + if (s.isWhitespace()) { + s.skip(); + return s.recurParse(.start_parse, .finish_improper_list); } - return s.recurParse(.start_parse, .finish_improper_list); + // Nope, reset. + s.pos = pos; } s.recurParse(.start_parse, .continue_list); @@ -796,6 +823,6 @@ fn endImproperList(s: *State) void { fn err(s: *State, msg: []const u8) noreturn { std.debug.print("{s}\n", .{msg}); - std.debug.print("pos: {}\n", .{s.pos}); + std.debug.print("pos: {}\n", .{s.counter}); @panic("parse error"); } |
