summaryrefslogtreecommitdiff
path: root/src/libzisp/io/parser.zig
diff options
context:
space:
mode:
authorTaylan Kammer <taylan.kammer@gmail.com>2025-03-29 23:45:03 +0100
committerTaylan Kammer <taylan.kammer@gmail.com>2025-03-29 23:53:26 +0100
commitfc23b42c6e2183c8ca8b6c42dc4e90d8061f835d (patch)
treea88f496bad87f881fcecd247a489438bc0c79779 /src/libzisp/io/parser.zig
parent451aa92846b5fd5c8a0739336de3aa26d741d750 (diff)
new new parser
Diffstat (limited to 'src/libzisp/io/parser.zig')
-rw-r--r--src/libzisp/io/parser.zig1131
1 files changed, 44 insertions, 1087 deletions
diff --git a/src/libzisp/io/parser.zig b/src/libzisp/io/parser.zig
index 99bf943..bd45ae1 100644
--- a/src/libzisp/io/parser.zig
+++ b/src/libzisp/io/parser.zig
@@ -1,1123 +1,80 @@
-//
-// === Parser for Code & Data ===
-//
-// Zisp s-expressions are defined in terms of an extremely minimal set of data
-// types; only that which is necessary to build representations of more complex
-// expressions and data types:
-//
-// +---------+-------------+---------------+--------+---------+--------+
-// | TYPE | Bare String | Quoted String | Rune | Pair | Nil |
-// +---------+-------------+---------------+--------+---------+--------+
-// | EXAMPLE | foobar | "foo bar" | #name | (X . Y) | () |
-// +---------+-------------+---------------+--------+---------+--------+
-//
-// Bare strings and quoted strings are polymorphic sub-types of the generic
-// string type. This lets the compiler distinguish between identifiers and
-// string literals.
-//
-// There are also non-negative integers, but they are only used for datum
-// labels; regular number literals are handled by the "decoder" (see below).
-//
-// The parser recognizes various "syntax sugar" and transforms it into uses of
-// the above data types. The most ubiquitous example is of course the list:
-//
-// (datum1 datum2 ...) -> (datum1 . (datum2 . (... . ())))
-//
-// The following table summarizes the other supported transformations:
-//
-// #datum -> (#HASH . datum) #rune(...) -> (#rune ...)
-//
-// [...] -> (#SQUARE ...) dat1dat2 -> (#JOIN dat1 . dat2)
-//
-// {...} -> (#BRACE ...) dat1.dat2 -> (#DOT dat1 . dat2)
-//
-// 'datum -> (#QUOTE . datum) dat1:dat2 -> (#COLON dat1 . dat2)
-//
-// `datum -> (#GRAVE . datum) dat1|dat2 -> (#PIPE dat1 . dat2)
-//
-// ,datum -> (#COMMA . datum) #%n=datum -> (#LABEL hex . datum)
-//
-// #%n -> (#LABEL . hex)
-//
-// A separate process called "decoding" can transform these objects into other
-// data types. For example, (#HASH x y z) could become a vector, so that the
-// expression #(x y z) works just like in Scheme.
-//
-// Decoding also resolves datum labels, and goes over bare strings to find ones
-// that are actually a number literal. This lets us offload the complexity of
-// number parsing elsewhere, so the parser remains extremely simple.
-//
-// Further notes about the syntax sugar table and examples above:
-//
-// * The terms datum, dat1, and dat2 each refer to an arbitrary datum; ellipsis
-// means zero or more data; hex is a hexadecimal number of up to 12 digits.
-//
-// * The #datum form only applies to expressions that cannot be mistaken for a
-// rune, such as #(...) or #"..." or #'string etc.; a hash sign followed by a
-// bare string will parse as a rune instead (or raise an error if it contains
-// characters that are illegal in rune names).
-//
-// * A backslash causes the immediately following character to lose any special
-// meaning it would normally have, and be considered as part of a bare string
-// instead. (This does not apply to whitespace.) For example, the following
-// three character sequences are each a valid bare string:
-//
-// foo\(bar\) \]blah \#\'xyz
-//
-// This can be used to follow a hash with a bare string without it being
-// parsed as a rune:
-//
-// #\foo -> (#HASH . foo)
-//
-// * Though not represented in the table due to notational difficulty, the
-// format "#rune(...)" doesn't require a list in the second position; any
-// datum works, so long as there's no ambiguity; for example:
-//
-// #rune1#rune2 -> (#rune1 . #rune2)
-//
-// #rune"text" -> (#rune . "text")
-//
-// #rune'string -> (#rune #QUOTE . string)
-//
-// As a counter-example, following a rune immediately with a bare string is
-// not possible, since it's ambiguous:
-//
-// #abcdefgh ;Could be (#abcdef . gh) or (#abcde . fgh) or ...
-//
-// The parser will see this as an attempt to use an 8-letter rune name, and
-// raise an error, since rune names are limited to 6 characters.
-//
-// * Syntax sugar can combine arbitrarily; some examples follow:
-//
-// #{...} -> (#HASH #BRACE ...)
-//
-// #'foo -> (#HASH #QUOTE . foo)
-//
-// ##'[...] -> (#HASH #HASH #QUOTE #SQUARE ...)
-//
-// {x y}[i j] -> (#JOIN (#BRACE x y) #SQUARE i j)
-//
-// foo.bar.baz{x y} -> (#JOIN (#DOT (#DOT foo . bar) . baz) #BRACE x y)
-//
-// * While 'foo parses as (quote foo) in Scheme, it parses as (#QUOTE . foo) in
-// Zisp; the operand of #QUOTE is the entire cdr. The same principle is used
-// when parsing other sugar; some examples follow:
-//
-// Incorrect Correct
-//
-// #(x y z) -> (#HASH (x y z)) #(x y z) -> (#HASH x y z)
-//
-// [x y z] -> (#SQUARE (x y z)) [x y z] -> (#SQUARE x y z)
-//
-// #{x} -> (#HASH (#BRACE (x))) #{x} -> (#HASH #BRACE x)
-//
-// foo(x y) -> (#JOIN foo (x y)) foo(x y) -> (#JOIN foo x y)
-//
-// * Runes are case-sensitive, and the parser only emits runes using upper-case
-// letters when expressing syntax sugar. This way, there can be no accidental
-// clash with runes that appear verbatim in code, as long as only lower-case
-// letters are used for rune literals in code.
-//
-//
-// === Decoding ===
-//
-// A separate process called "decoding" can transform simple data structures,
-// consisting of only the above types, into a richer set of Zisp data types.
-//
-// For example, the decoder may turn (#HASH ...) into a vector, as one would
-// expect a vector literal like #(...) to work in Scheme.
-//
-// Runes may be decoded in isolation as well, rather than transforming a list
-// whose head they appear in. This can implement #true and #false.
-//
-// The decoder may also perform arbitrary transforms on any type; for example,
-// it may turn bare strings into numbers wher appropriate. This can implement
-// number literals.
-//
-// The decoder recognizes (#QUOTE ...) to implement the traditional quoting
-// mechanism, but with a significant difference:
-//
-// Traditional quote is "unhygienic" in Scheme terms. An expression such as
-// '(foo bar) will always be read as (quote (foo bar)) regardless of what sort
-// of lexical context it appears in, so the semantics will depend on whatever
-// the identifier "quote" is bound to in that lexical context, meaning that the
-// expression may end up evaluating to something other than the list (foo bar).
-//
-// The Zisp decoder, which transforms not text to text, but objects to objects,
-// can turn (#QUOTE ...) into an abstract object which encapsulates the notion
-// of quoting, which the Zisp evaluator can recognize and act upon, ensuring
-// that an expression like '(foo bar) always turns into the list (foo bar).
-//
-// One way to think about this, in Scheme (R6RS / syntax-case) terms, is that
-// expressions like '(foo bar) turn directly into a *syntax object* when read,
-// rather than a regular list object.
-//
-// The decoder is, of course, configurable and extensible. The transformations
-// mentioned above would be performed only when it's told to decode data which
-// represents Zisp code. The decoder may be given a different configuration,
-// telling it to decode, for example, data which represents a different kind of
-// domain-specific data, such as application settings, build system commands,
-// complex data records with non-standard data types, and so on.
-//
-//
-// === Trampolining strategy ===
-//
-// Instead of using recursion directly, the parser is written in a "trampoline"
-// style, which ensures that parsing depth is not limited by the stack size.
-//
-// All state and context is passed around via a struct pointer. The parser has
-// a main loop, which calls a function as dictated by state.context.next, and
-// the function may update the state to have another function called next.
-//
-// If a function wants to call the parser recursively, it pushes some of the
-// current context onto a stack, including what function the recursive parser
-// should "return" to, and then updates the state to instruct the main loop to
-// call one of the starting functions.
-//
-// If a function wants to make the parser return, either from a recursive parse
-// or from the main loop, it sets the .result field, and tries to pop the saved
-// context. If the context stack was empty, the main loop returns.
-//
-// While it's possible to just set .next and return, to make the main loop call
-// another function next (possibly even setting .result to pass a value to it),
-// this is completely unnecessary. A few non-recursive calls won't blow the
-// stack. It's only recursive parsing that we use the above strategy for.
-//
-//
-// === .start_parse VS .start_datum ===
-//
-// The difference between .start_parse and .start_datum is that the former will
-// allow whitespace and comments at the beginning, whereas the latter expects a
-// datum immediately without anything else in front of it.
-//
-// When calling the parser recursively, it may seem logical to always make it
-// start with .start_datum, because we already cleared whitespace and comments
-// out of the way. However, in some cases, we must use .start_parse instead.
-//
-// This is because of datum comments. When one appears, we start a recursive
-// parser, but instead of making it return to a function that will consume the
-// result, we make it return to the original starting point, so the result is
-// ignored and the parser retries what it was originally doing. If we always
-// used .start_datum, this would never allow whitespace after a datum comment,
-// since we would be back at .start_datum after the comment is out of the way:
-//
-// (foo #;bar baz) ;must use .start_parse at the start of each element
-//
-//
-// === List parsing strategy ===
-//
-// When it comes to pairs and lists, we basically treat everything as a list,
-// and a pair is just seen as the shortest possible improper list. This saves
-// memory: If we implemented list parsing as pair parsing, we would be calling
-// the parser recursively, deeper and deeper, for every list element. Though
-// we're not limited by stack space thanks to the strategy described above, it
-// would still waste memory and the time it takes to allocate memory.
-//
-//
-// === Buffering ===
-//
-// We use a small circular buffer just so we can backtrack. We read bytes into
-// this buffer one by one, because we don't want to consume more bytes from the
-// input than what we actually parse. Consider the following input stream:
-//
-// (a b c) (x y z)
-//
-// If we used proper buffering, like reading up to 4K bytes per read, then the
-// whole stream would be consumed at once before it's parsed. Then, the parser
-// would return the first datum, and the rest of the stream would be lost. The
-// parser would need some way to reset the input stream's read head to the end
-// of the first datum, but not all stream types may support this.
-//
-// For efficiency, call the parser on an input stream with implicit buffering.
-//
-//
-// === Notation used in comments ===
-//
-// Some comments throughout the file give you an example of where the parser
-// currently might be in a stream. These illustrations use the pipe symbol,
-// which looks like a cursor, to indicate the current position:
-//
-// (foo| bar baz) <- parser arrived at the end of the string foo
-//
-// (foo bar baz)| <- parser reached EOF (assuming no trailing spaces)
-//
-
const builtin = @import("builtin");
const std = @import("std");
-const lib = @import("../lib.zig");
const value = @import("../value.zig");
-const List = std.ArrayListUnmanaged;
+const Parser = @import("Parser.zig");
const Value = value.Value;
-const cons = value.pair.cons;
-
const is_test = builtin.is_test;
const is_debug = builtin.mode == .Debug;
-const detailed_debug = false;
-
// In debug, we want to see if we leak, so very small numbers.
-const init_stack_capacity = if (is_debug) 2 else 32;
-const init_chars_capacity = if (is_debug) 2 else 2048;
-
-// zig fmt: off
-const DOT = value.rune.pack("DOT");
-const COLON = value.rune.pack("COLON");
-const PIPE = value.rune.pack("PIPE");
-const JOIN = value.rune.pack("JOIN");
-const LABEL = value.rune.pack("LABEL");
-const HASH = value.rune.pack("HASH");
-const QUOTE = value.rune.pack("QUOTE");
-const GRAVE = value.rune.pack("GRAVE");
-const COMMA = value.rune.pack("COMMA");
-const SQUARE = value.rune.pack("SQUARE");
-const BRACE = value.rune.pack("BRACE");
-const VOID = value.rune.packForced("");
-const LSTAIL = value.rune.packForced(".");
-// zig fmt: on
-
-const Context = struct {
- // What to do next.
- next: Fn = .parse_unit,
- // For storing a context value, like accumulated list elements.
- val: Value = undefined,
- // For storing a context char, like list opening bracket.
- char: u8 = undefined,
-};
-
-const ParseError = enum {
- InvalidCharacter,
- UnclosedString,
- UnexpectedEof,
- UnicodeError,
- RuneTooLong,
- OutOfMemory,
- OutOfRange,
- ReadError,
-};
-
-const State = struct {
- input: std.io.AnyReader,
- stack_alloc: std.mem.Allocator,
- chars_alloc: std.mem.Allocator,
-
- context: Context = .{},
- stack: List(Context) = undefined,
-
- chars: List(u8) = undefined,
-
- // TODO: Implement bool flag for when skipping a unit
-
- result: Value = undefined,
- unused_char: ?u8 = null,
-
- err_msg: []const u8 = undefined,
-
- fn init(
- input: std.io.AnyReader,
- stack_alloc: std.mem.Allocator,
- chars_alloc: std.mem.Allocator,
- ) !State {
- var s: State = .{
- .input = input,
- .stack_alloc = stack_alloc,
- .chars_alloc = chars_alloc,
- };
- s.stack = try .initCapacity(s.stack_alloc, init_stack_capacity);
- s.chars = try .initCapacity(s.chars_alloc, init_chars_capacity);
- return s;
- }
-
- fn deinit(s: *State) void {
- s.stack.deinit(s.stack_alloc);
- s.chars.deinit(s.chars_alloc);
- }
-
- fn err(
- s: *State,
- comptime e: ParseError,
- comptime msg: []const u8,
- ) error{ParseError} {
- s.err_msg = @tagName(e) ++ " at: " ++ msg;
- return error.ParseError;
- }
-
- fn read(s: *State) !?u8 {
- if (is_debug and s.unused_char != null) {
- @panic("Called read() while there was an unused character!");
- }
- const c = s.input.readByte() catch |e| switch (e) {
- error.EndOfStream => return null,
- else => return s.err(.ReadError, "???"),
- };
- if (detailed_debug) {
- std.debug.print("{c}", .{c});
- }
- return c;
- }
+const def_init_stack_mem = (if (is_debug) 2 else 32) * @sizeOf(Parser.Context);
+const def_init_chars_mem = (if (is_debug) 2 else 2048);
- fn readNoEof(s: *State, comptime emsg: []const u8) !u8 {
- return if (try s.read()) |c| c else s.err(.UnexpectedEof, emsg);
- }
-
- fn getUnused(s: *State) ?u8 {
- if (s.unused_char) |c| {
- s.unused_char = null;
- return c;
- }
- return null;
- }
-
- fn addChar(s: *State, c: u8) !void {
- try s.chars.append(s.chars_alloc, c);
- }
-
- fn getBareString(s: *State) Value {
- defer s.chars.clearRetainingCapacity();
- return if (s.chars.items.len <= 6)
- value.sstr.pack(s.chars.items)
- else
- value.istr.intern(s.chars.items, false);
- }
+pub const DefaultStackSfa = std.heap.StackFallbackAllocator(def_init_stack_mem);
+pub const DefaultCharsSfa = std.heap.StackFallbackAllocator(def_init_chars_mem);
- fn getQuotedString(s: *State) Value {
- defer s.chars.clearRetainingCapacity();
- return if (s.chars.items.len <= 6)
- value.sstr.packQuoted(s.chars.items)
- else
- value.istr.intern(s.chars.items, true);
- }
-
- fn getRune(s: *State) Value {
- defer s.chars.clearRetainingCapacity();
- return value.rune.pack(s.chars.items);
- }
-
- fn push(s: *State, next: Fn) !void {
- try s.stack.append(s.stack_alloc, .{ .next = next });
- }
-
- fn pushContext(s: *State, next: Fn) !void {
- try s.stack.append(s.stack_alloc, .{
- .next = next,
- .val = s.context.val,
- .char = s.context.char,
- });
- }
-
- fn subr(s: *State, start: Fn, next: Fn) !void {
- try s.pushContext(next);
- s.context.next = start;
- }
-
- fn jump(s: *State, next: Fn, val: ?Value) void {
- if (val) |v| s.result = v;
- s.context.next = next;
- }
-
- fn abort(s: *State, next: Fn, unused_c: u8) void {
- s.result = VOID;
- s.unused_char = unused_c;
- s.context.next = next;
- }
-
- fn retval(s: *State, val: Value) void {
- s.result = val;
- if (s.stack.pop()) |c| {
- s.context = c;
- } else {
- s.context.next = .done;
- }
- }
-};
-
-pub fn _parse(input: std.io.AnyReader) !Value {
- var debug_alloc: std.heap.DebugAllocator(.{}) = undefined;
+pub fn default(
+ fb_alloc: *std.mem.Allocator,
+ stack_sfa: *DefaultStackSfa,
+ chars_sfa: *DefaultCharsSfa,
+) !Parser {
if (!is_test and is_debug) {
- debug_alloc = .init;
+ fb_alloc.* = std.heap.DebugAllocator(.{}).init;
}
- defer if (!is_test and is_debug) {
- if (debug_alloc.deinit() == .leak) {
- @panic("leak");
- }
- };
const alloc = if (is_test)
std.testing.allocator
else if (is_debug)
- debug_alloc.allocator()
+ fb_alloc.allocator()
else
std.heap.smp_allocator;
- const stack_mem = init_stack_capacity * @sizeOf(Context);
- const chars_mem = init_chars_capacity;
- var stack_sfa = std.heap.stackFallback(stack_mem, alloc);
- var chars_sfa = std.heap.stackFallback(chars_mem, alloc);
- const stack_alloc = stack_sfa.get();
- const chars_alloc = chars_sfa.get();
- var s = State.init(input, stack_alloc, chars_alloc) catch @panic("");
- defer s.deinit();
-
- while (s.context.next != .done) callNext(&s) catch |e| {
- // _ = e;
- // if (s.unused_char) |c| {
- // std.debug.panic(
- // "Parse error: {s}, unused_char: 0x{x}\n",
- // .{ s.err_msg, c },
- // );
- // } else {
- // std.debug.panic("Parse error: {s}\n", .{s.err_msg});
- // }
- return e;
- };
- if (s.unused_char) |c| {
- if (c != ' ') {
- std.debug.panic("Invalid trailing character: {c}\n", .{c});
- }
- }
- return s.result;
-}
-
-pub fn parse(input: std.io.AnyReader) Value {
- return _parse(input) catch @panic("");
-}
-
-const Fn = enum {
- parse_unit,
- return_context,
- end_one_datum,
- parse_join_datum,
- end_join_datum,
- join_data,
- parse_hash_datum,
- end_hash_datum,
- parse_rune_datum,
- end_rune_datum,
- end_label_datum,
- parse_list_element,
- continue_list,
- end_improper_list,
- close_improper_list,
- end_quote_expr,
- done,
-};
-
-fn callNext(s: *State) !void {
- if (detailed_debug) {
- const stack = s.stack.items;
- std.debug.print("\n\n{}:{} ctx:'{c}' unused:'{c}' \n", .{
- stack.len,
- s.context.next,
- s.context.char,
- s.unused_char orelse '_',
- });
- if (stack.len > 0) {
- var i = stack.len;
- while (i > 0) : (i -= 1) {
- const prev = stack[i - 1];
- std.debug.print("{}:{} ctx:'{c}'\n", .{
- i - 1,
- prev.next,
- prev.char,
- });
- }
- }
- }
- try switch (s.context.next) {
- .parse_unit => parseUnit(s),
- .return_context => returnContext(s),
- .end_one_datum => endOneDatum(s),
- .parse_join_datum => parseJoinDatum(s),
- .end_join_datum => endJoinDatum(s),
- .join_data => joinData(s),
- .parse_hash_datum => parseHashDatum(s),
- .end_hash_datum => endHashDatum(s),
- .parse_rune_datum => parseRuneDatum(s),
- .end_rune_datum => endRuneDatum(s),
- .end_label_datum => endLabelDatum(s),
- .parse_list_element => parseListElement(s),
- .continue_list => continueList(s),
- .end_improper_list => endImproperList(s),
- .close_improper_list => closeImproperList(s),
- .end_quote_expr => endQuoteExpr(s),
- .done => unreachable,
- };
-}
-
-fn parseUnit(s: *State) !void {
- var c1 = s.getUnused() orelse try s.read();
- while (c1) |c| : (c1 = try s.read()) {
- switch (try checkBlanks(s, c)) {
- .yes => {},
- .skip_unit => try s.push(.parse_unit),
- .no => return parseDatum(s, c),
- }
- }
- return s.retval(value.eof.eof);
-}
-
-fn checkBlanks(s: *State, c: u8) !enum { yes, skip_unit, no } {
- return switch (c) {
- '\t'...'\r', ' ' => .yes,
- ';' => switch (try s.read() orelse '\n') {
- '\n' => .yes,
- '~' => .skip_unit,
- else => while (try s.read() != '\n') {} else .yes,
- },
- else => .no,
- };
-}
-
-fn parseDatum(s: *State, c: u8) !void {
- if (c == '.') {
- return parseDotString(s);
- }
- return parseOneDatum(s, c, .end_one_datum);
-}
-
-fn parseDotString(s: *State) !void {
- var n: u48 = 1;
- while (try s.read()) |c| : (n += 1) {
- switch (try checkBlanks(s, c)) {
- .yes => return dotString(s, n, false),
- .skip_unit => return dotString(s, n, true),
- .no => switch (c) {
- '.' => {},
- ')', ']', '}' => {
- s.unused_char = c;
- return dotString(s, n, false);
- },
- else => return s.err(.InvalidCharacter, "dot string"),
- },
- }
- }
- unreachable;
-}
-
-fn dotString(s: *State, n: u48, skip_unit: bool) !void {
- const result = if (n == 1) LSTAIL else r: {
- const buf = try s.chars.addManyAsSlice(s.chars_alloc, n);
- @memset(buf, '.');
- break :r s.getBareString();
- };
- if (skip_unit) {
- s.context.val = result;
- return s.subr(.parse_unit, .return_context);
- } else {
- return s.retval(result);
- }
-}
-
-fn endOneDatum(s: *State) !void {
- if (s.result.eq(VOID)) {
- return s.retval(VOID);
- }
- const d = s.result;
- const c1 = s.getUnused() orelse try s.read();
- if (c1) |c| {
- switch (try checkBlanks(s, c)) {
- .yes => {},
- .skip_unit => return skipUnitAndReturn(s, d),
- .no => return parseJoin(s, d, c),
- }
- }
- s.unused_char = ' ';
- return s.retval(d);
-}
-
-fn skipUnitAndReturn(s: *State, d: Value) !void {
- s.context.val = d;
- return s.subr(.parse_unit, .return_context);
-}
-
-fn returnContext(s: *State) !void {
- s.unused_char = ' ';
- return s.retval(s.context.val);
-}
-
-fn parseJoin(s: *State, d: Value, c: u8) !void {
- switch (c) {
- '.', ':', '|' => {
- s.context.char = c;
- s.unused_char = try s.readNoEof("join datum");
- },
- else => {
- s.context.char = 0;
- s.unused_char = c;
- },
- }
- s.context.val = d;
- return s.subr(.parse_join_datum, .join_data);
-}
-
-fn parseJoinDatum(s: *State) !void {
- return parseOneDatum(s, s.getUnused().?, .end_join_datum);
-}
-
-fn endJoinDatum(s: *State) !void {
- return s.retval(s.result);
-}
-
-fn joinData(s: *State) !void {
- const head = s.context.val;
- const join = s.context.char;
- const tail = s.result;
- if (tail.eq(VOID)) {
- if (join == 0) {
- return s.retval(head);
- } else {
- return s.err(.InvalidCharacter, "join datum");
- }
- }
- const rune = switch (join) {
- 0 => JOIN,
- '.' => DOT,
- ':' => COLON,
- '|' => PIPE,
- else => unreachable,
- };
- const data = cons(head, tail);
- return s.jump(.end_one_datum, cons(rune, data));
-}
-
-fn parseOneDatum(s: *State, c: u8, next: Fn) !void {
- if (isBareChar(c)) {
- return s.jump(next, try parseBareString(s, c));
- }
- return parseCladDatum(s, c, next);
-}
-
-fn parseCladDatum(s: *State, c: u8, next: Fn) !void {
- if (c == '\\') {
- return s.jump(next, try parseBareEscString(s));
- }
- if (c == '"') {
- return s.jump(next, try parseQuotedString(s));
- }
- return switch (c) {
- '#' => parseHashExpression(s, next),
- '(', '[', '{' => parseList(s, c, next),
- '\'', '`', ',' => parseQuoteExpr(s, c, next),
- else => s.abort(next, c),
- };
-}
-
-fn isBareChar(c: u8) bool {
- return switch (c) {
- // zig fmt: off
- 'a'...'z' , 'A'...'Z' , '0'...'9' , '!' , '$' , '%' , '&' , '*' ,
- '+' , '-' , '/' , '<' , '=' , '>' , '?' , '@' , '^' , '_' , '~' => true,
- // zig fmt: on
- else => false,
- };
-}
-
-fn isBareEsc(c: u8) bool {
- return switch (c) {
- 33...126 => true,
- else => false,
- };
-}
-
-fn parseBareString(s: *State, c: u8) !Value {
- try s.addChar(c);
- var is_num = false;
- if (std.ascii.isDigit(c)) {
- is_num = true;
- } else if (c == '+' or c == '-') {
- const c2 = try s.read() orelse return s.getBareString();
- if (std.ascii.isDigit(c2)) {
- try s.addChar(c2);
- is_num = true;
- } else if (isBareChar(c2)) {
- try s.addChar(c2);
- } else if (c2 == '\\') {
- try s.addChar(try parseBareEsc(s));
- } else {
- s.unused_char = c2;
- return s.getBareString();
- }
- }
- return parseBareStringRest(s, is_num);
-}
+ stack_sfa.* = std.heap.stackFallback(def_init_stack_mem, alloc);
+ chars_sfa.* = std.heap.stackFallback(def_init_chars_mem, alloc);
-fn parseBareEscString(s: *State) !Value {
- try s.addChar(try parseBareEsc(s));
- return parseBareStringRest(s, false);
+ return Parser.init(
+ stack_sfa.get(),
+ chars_sfa.get(),
+ def_init_stack_mem,
+ def_init_chars_mem,
+ );
}
-fn parseBareStringRest(s: *State, is_num: bool) !Value {
- while (try s.read()) |c| {
- if (isBareChar(c) or (is_num and c == '.')) {
- try s.addChar(c);
- } else if (c == '\\') {
- try s.addChar(try parseBareEsc(s));
- } else {
- s.unused_char = c;
- break;
+pub fn deinit(fb_alloc: *std.mem.Allocator) void {
+ if (!is_test and is_debug) {
+ if (fb_alloc.?.deinit() == .leak) {
+ @panic("parser leaked");
}
}
- return s.getBareString();
-}
-
-fn parseBareEsc(s: *State) !u8 {
- const c = try s.readNoEof("bare escape");
- if (isBareEsc(c)) {
- return c;
- } else {
- return s.err(.InvalidCharacter, "bare escape");
- }
}
-fn parseQuotedString(s: *State) !Value {
- while (try s.read()) |c| {
- if (c == '"') {
- return s.getQuotedString();
- }
- if (c != '\\') {
- try s.addChar(c);
+pub fn parse(input: std.io.AnyReader) Value {
+ var fb_alloc: std.mem.Allocator = undefined;
+ var stack_sfa: DefaultStackSfa = undefined;
+ var chars_sfa: DefaultCharsSfa = undefined;
+ var p = default(&fb_alloc, &stack_sfa, &chars_sfa) catch @panic("OOM");
+ defer p.deinit();
+ return _parse(input) catch {
+ if (p.unused_char) |c| {
+ std.debug.panic(
+ "Parse error: {s}, unused_char: 0x{x}\n",
+ .{ p.err_msg, c },
+ );
} else {
- try parseQuotedEsc(s);
+ std.debug.panic("Parse error: {s}\n", .{p.err_msg});
}
- }
- return error.UnclosedString;
-}
-
-fn parseQuotedEsc(s: *State) !void {
- const c = try s.readNoEof("quoted escape");
- if (c == 'u') {
- return parseUniHexHandleErrors(s);
- }
- try s.addChar(switch (c) {
- '\\', '"' => c,
- '0' => 0,
- 'a' => 7,
- 'b' => 8,
- 't' => 9,
- 'n' => 10,
- 'v' => 11,
- 'f' => 12,
- 'r' => 13,
- 'e' => 27,
- 'x' => try parseHexByte(s, "hex escape"),
- else => return s.err(.InvalidCharacter, "quoted escape"),
- });
-}
-
-fn parseUniHexHandleErrors(s: *State) !void {
- return parseUniHex(s) catch |err| switch (err) {
- error.Utf8CannotEncodeSurrogateHalf => s.err(
- .UnicodeError,
- "unicode escape",
- ),
- else => |e| e,
};
}
-fn parseUniHex(s: *State) !void {
- const msg = "unicode escape";
-
- if (try s.readNoEof(msg) != '{') {
- return s.err(.InvalidCharacter, msg);
- }
-
- const uc, const unused_c = try parseHex(s, u21, msg);
- if (unused_c) |c| {
- if (c != '}') {
- return s.err(.InvalidCharacter, msg);
- }
- } else {
- return s.err(.UnexpectedEof, msg);
- }
-
- const n = try std.unicode.utf8CodepointSequenceLength(uc);
- const buf = try s.chars.addManyAsSlice(s.chars_alloc, n);
- _ = try std.unicode.utf8Encode(uc, buf);
-}
-
-fn parseHashExpression(s: *State, next: Fn) !void {
- const c = try s.readNoEof("hash expression");
- if (try checkBlanks(s, c) != .no) {
- return s.err(.InvalidCharacter, "hash expression");
- }
- if (std.ascii.isAlphabetic(c)) {
- const r, const unused_c = try parseRune(s, c);
- return parseRuneEnd(s, r, unused_c, next);
- }
- if (c == '%') {
- const l, const unused_c = try parseLabel(s);
- return parseLabelEnd(s, l, unused_c, next);
- }
- if (isBareChar(c)) {
- // Reserved for future extensions to syntax sugar.
- return s.err(.InvalidCharacter, "hash expression");
- }
- // fast-path to avoid subr
- if (c == '\\') {
- return s.jump(next, cons(HASH, try parseBareEscString(s)));
- }
- if (c == '"') {
- return s.jump(next, cons(HASH, try parseQuotedString(s)));
- }
- s.unused_char = c;
- return s.subr(.parse_hash_datum, next);
-}
-
-fn parseHashDatum(s: *State) !void {
- return parseCladDatum(s, s.getUnused().?, .end_hash_datum);
-}
-
-fn endHashDatum(s: *State) !void {
- if (s.result.eq(VOID)) {
- return s.err(.InvalidCharacter, "hash datum");
- }
- return s.retval(cons(HASH, s.result));
-}
-
-fn parseRune(s: *State, c1: u8) !struct { Value, ?u8 } {
- try s.addChar(c1);
- var len: usize = 1;
- while (try s.read()) |c| : (len += 1) {
- if (len == 6 or !std.ascii.isAlphanumeric(c)) {
- return .{ s.getRune(), c };
- }
- try s.addChar(c);
- }
- return .{ s.getRune(), null };
-}
-
-fn parseRuneEnd(s: *State, r: Value, c1: ?u8, next: Fn) !void {
- const c = c1 orelse return s.jump(next, r);
- if (c == '\\') {
- return s.jump(next, cons(r, try parseBareString(s, c)));
- }
- if (c == '"') {
- return s.jump(next, cons(r, try parseQuotedString(s)));
- }
- s.unused_char = c;
- switch (c) {
- '#', '(', '[', '{', '\'', '`', ',' => {
- try s.push(next);
- return s.jump(.parse_rune_datum, r);
- },
- else => return s.jump(next, r),
- }
-}
-
-fn parseRuneDatum(s: *State) !void {
- s.context.val = s.result;
- return parseCladDatum(s, s.getUnused().?, .end_rune_datum);
-}
-
-fn endRuneDatum(s: *State) !void {
- if (s.result.eq(VOID)) {
- s.retval(s.context.val);
- }
- return s.retval(cons(s.context.val, s.result));
-}
-
-fn parseLabel(s: *State) !struct { Value, ?u8 } {
- const label, const unused_c = try parseHex(s, u48, "datum label");
- return .{ value.fixnum.pack(label), unused_c };
-}
-
-fn parseLabelEnd(s: *State, l: Value, c1: ?u8, next: Fn) !void {
- const c = c1 orelse return s.err(.UnexpectedEof, "datum label");
- if (c == '%') {
- return s.jump(next, cons(LABEL, l));
- }
- if (c == '=') {
- try s.push(next);
- s.context.val = l;
- return s.subr(.parse_unit, .end_label_datum);
- }
- return s.err(.InvalidCharacter, "datum label");
-}
-
-fn endLabelDatum(s: *State) !void {
- if (s.result.eq(VOID)) {
- return s.err(.InvalidCharacter, "label datum");
- }
- return s.retval(cons(LABEL, cons(s.context.val, s.result)));
-}
-
-fn parseList(s: *State, open: u8, next: Fn) !void {
- const head = switch (open) {
- '(' => value.nil.nil,
- '[' => cons(SQUARE, value.nil.nil),
- '{' => cons(BRACE, value.nil.nil),
- else => unreachable,
- };
- const close: u8 = switch (open) {
- '(' => ')',
- '[' => ']',
- '{' => '}',
- else => unreachable,
- };
- while (try s.read()) |c| {
- if (c == close) {
- return s.jump(next, head);
- }
- switch (try checkBlanks(s, c)) {
- .yes => {},
- .skip_unit => {
- try listParserSetup(s, head, close, next);
- // Parse twice in a row, ignoring the first result.
- return s.subr(.parse_unit, .parse_unit);
- },
- .no => {
- try listParserSetup(s, head, close, next);
- s.unused_char = c;
- return s.jump(.parse_list_element, null);
- },
- }
- }
- return s.err(.UnexpectedEof, "list");
-}
-
-fn listParserSetup(s: *State, head: Value, close: u8, next: Fn) !void {
- s.context.val = head;
- s.context.char = close;
- try s.push(next);
- try s.pushContext(.continue_list);
-}
-
-fn parseListElement(s: *State) !void {
- return parseDatum(s, s.getUnused().?);
-}
-
-fn continueList(s: *State) !void {
- const close = s.context.char;
-
- if (s.result.eq(VOID)) {
- const c = s.getUnused().?;
- if (c == close) {
- return endList(s);
- }
- return s.err(.InvalidCharacter, "list");
- }
-
- if (s.result.eq(LSTAIL)) {
- return s.subr(.parse_unit, .end_improper_list);
- }
-
- s.context.val = cons(s.result, s.context.val);
-
- var c1 = s.getUnused() orelse try s.read();
- while (c1) |c| : (c1 = try s.read()) {
- if (c == close) {
- return endList(s);
- }
- switch (try checkBlanks(s, c)) {
- .yes => {},
- .skip_unit => {
- try s.pushContext(.continue_list);
- return s.subr(.parse_unit, .parse_unit);
- },
- .no => {
- s.unused_char = c;
- return s.subr(.parse_list_element, .continue_list);
- },
- }
- }
- return s.err(.UnexpectedEof, "list");
-}
-
-fn endList(s: *State) !void {
- return s.retval(lib.list.reverse(s.context.val));
-}
-
-fn endImproperList(s: *State) !void {
- if (s.result.eq(VOID)) {
- return s.err(.InvalidCharacter, "list tail");
- }
- s.context.val = lib.list.reverseWithTail(s.context.val, s.result);
- return closeImproperList(s);
-}
-
-fn closeImproperList(s: *State) !void {
- const result = s.context.val;
- const close = s.context.char;
- var c1 = s.getUnused() orelse try s.read();
- while (c1) |c| : (c1 = try s.readNoEof("after list tail")) {
- if (c == close) {
- return s.retval(result);
- }
- switch (try checkBlanks(s, c)) {
- .yes => {},
- .skip_unit => return s.subr(.parse_unit, .close_improper_list),
- .no => return s.err(.InvalidCharacter, "after list tail"),
- }
- }
- unreachable;
-}
-
-fn parseQuoteExpr(s: *State, c1: u8, next: Fn) !void {
- const q = switch (c1) {
- '\'' => QUOTE,
- '`' => GRAVE,
- ',' => COMMA,
- else => unreachable,
- };
-
- // fast-path to avoid subr
- const c = try s.readNoEof("quote expression");
- if (isBareChar(c) or c == '\\') {
- return s.jump(next, cons(q, try parseBareString(s, c)));
- }
-
- try s.push(next);
- s.context.val = q;
- s.unused_char = c;
- return s.subr(.parse_unit, .end_quote_expr);
-}
-
-fn endQuoteExpr(s: *State) !void {
- if (s.result.eq(VOID)) {
- return s.err(.InvalidCharacter, "quote expression datum");
- }
- return s.retval(cons(s.context.val, s.result));
-}
-
-// Helpers
-
-fn parseHex(
- s: *State,
- u_type: type,
- comptime emsg: []const u8,
-) !struct { u_type, ?u8 } {
- var uc: u_type = undefined;
-
- const c1 = try s.readNoEof(emsg);
- uc = try parseHexDigit(s, c1, emsg);
-
- while (try s.read()) |c| {
- if (!std.ascii.isHex(c)) {
- return .{ uc, c };
- }
- const shl = std.math.shlExact;
- uc = shl(u_type, uc, 4) catch return s.err(.OutOfRange, emsg);
- uc |= try parseHexDigit(s, c, emsg);
- }
- return .{ uc, null };
-}
-
-fn parseHexByte(s: *State, comptime emsg: []const u8) !u8 {
- const h1 = try s.readNoEof(emsg);
- const h2 = try s.readNoEof(emsg);
- const hi = try parseHexDigit(s, h1, emsg);
- const lo = try parseHexDigit(s, h2, emsg);
- return hi << 4 | lo;
-}
-
-fn parseHexDigit(s: *State, c: u8, comptime emsg: []const u8) !u8 {
- return switch (c) {
- '0'...'9' => c - '0',
- 'A'...'F' => c - 'A' + 10,
- 'a'...'f' => c - 'a' + 10,
- else => s.err(.InvalidCharacter, emsg),
- };
+pub fn _parse(input: std.io.AnyReader) !Value {
+ var fb_alloc: std.mem.Allocator = undefined;
+ var stack_sfa: DefaultStackSfa = undefined;
+ var chars_sfa: DefaultCharsSfa = undefined;
+ var p = try default(&fb_alloc, &stack_sfa, &chars_sfa);
+ defer p.deinit();
+ return p.run(input);
}