diff options
| author | Taylan Kammer <taylan.kammer@gmail.com> | 2025-02-20 18:29:59 +0100 |
|---|---|---|
| committer | Taylan Kammer <taylan.kammer@gmail.com> | 2025-02-20 18:29:59 +0100 |
| commit | ab0edc69d17cd32397a4f7a059120fd6152bb138 (patch) | |
| tree | 505bc85978714cb0b6f4e5d8c9058bbcf1fe1bab /src/libzisp/read.zig | |
| parent | 3b713ef3e872bda3da9e5a67a9bfd5c6701cb665 (diff) | |
update
Diffstat (limited to 'src/libzisp/read.zig')
| -rw-r--r-- | src/libzisp/read.zig | 658 |
1 files changed, 0 insertions, 658 deletions
diff --git a/src/libzisp/read.zig b/src/libzisp/read.zig deleted file mode 100644 index 74ae51b..0000000 --- a/src/libzisp/read.zig +++ /dev/null @@ -1,658 +0,0 @@ -// -// === Parser for Code & Data === -// -// Zisp s-expressions come in two flavors: code (sugar) and data (no sugar). -// -// Code expressions are first parsed into the same data types which the data -// expressions can express; it's merely surface-level syntax sugar. -// -// Data expressions don't support any syntax sugar and have a simple format, -// only being able to represent the following data types: -// -// string -> foo , "foo bar" -// -// number -> 123 , -1.23 , +123 , +nan.0 , -inf.0 , ... -// -// rune -> #foo ;limited to 6 ASCII letters (a - z, A - Z) -// -// list -> (...) ;the usual: actually pairs; may be improper -// -// null -> () ;also called nil around here -// -// We may use terms like "code parser" and "data parser" out of convenience, -// although there may only be a single parser that implements both formats by -// switching between modes. -// -// When the code parser encounters syntax sugar, it always transforms it into a -// list starting with a rune, like in the following examples: -// -// #(...) -> (#hash ...) -// -// [...] -> (#square ...) -// -// 'foo -> (#quote . foo) -// -// These can combine: -// -// #{...} -> (#hash #brace ...) -// -// #'foo -> (#hash #quote . foo) -// -// ##'[...] -> (#hash #hash #quote #square ...) -// -// As a specialty, double-quoted strings are actually considered sugar by the -// code parser, and are transformed as follows into data: -// -// "..." -> (#string "...") -// -// (Otherwise, all string literals would be identifiers, or all identifiers -// would be string literals, because Zisp doesn't differentiate strings and -// symbols like traditional lisps.) -// -// -// === Decoder === -// -// A separate process called "decoding" can transform simple data structures, -// consisting of only the above types, into a richer set of Zisp data types. -// -// For example, the decoder may turn (#hash ...) into a vector. Some runes may -// be decoded in isolation rather than as part of a list, which is how #true and #false are implemented. -// -//It also -// interprets common rune invocations like (#quote ...) to implement the -// traditional quoting mechanism, but in a better way: -// -// Traditional quote is "unhygienic" in Scheme terms. An expressoin such as -// '(foo bar) will always be read as (quote (foo bar)) regardless of what sort -// of lexical context it appears in, so the semantics will depend on whatever -// the identifier "quote" is bound to in that lexical context. -// -// The Zisp decoder, which transforms not text to text, but objects to objects, -// can turn (#quote ...) into an abstract object which encapsulates the notion -// of quoting, which the Zisp evaluator can recognize and act upon. -// -// One way to think about this, in Scheme (R6RS / syntax-case) terms, is that -// expressions like '(foo bar) turn directly into a *syntax object* when read, -// rather than a regular list object. -// -// -// === Implementation details === -// -// Instead of using recursion directly, the parser is written in something akin -// to a manual continuation-passing style, which ensures that parsing depth is -// not limited by the stack size. -// -// All state/context is passed around via a struct pointer. The parser has a -// main loop which calls a function, passes it the state, and expects to get a -// new state pointer in return, which tells which function the main loop should -// call next, based on the .next field of the state. -// -// When a called function wants to call the parser recursively, it sets the -// .next field to an enumeration value that indicates where the parser should -// return to after it's done with the sub-parsing, and then constructs a new -// state struct, saving a pointer to the original in a .parent field. -// -// Making the parser "return" is a matter of setting the .retval field, and -// setting the .next field to the value .finish, to indicate to the main loop -// that it should either pass control back to the parent, or finish parsing. -// -// -// === Notation used in comments === -// -// Some comments throughout the file give you an example of where the parser -// currently is in a stream. These illustrations use the pipe symbol, which -// looks like a cursor, to indicate the current position of the parser: -// -// (foo| bar baz) <- parser arrived at the end of the string foo -// -// (foo bar baz)| <- parser reached EOF (assuming no trailing spaces) -// - -const std = @import("std"); - -const gc = @import("gc.zig"); -const list = @import("list.zig"); -const value = @import("value.zig"); - -const Value = value.Value; - -const State = struct { - alloc: std.mem.Allocator, - - input: []const u8, - pos: usize = 0, - - mode: enum { code, data } = .code, - - next: Next = .start_parsing, - - parent: ?*State = null, - - datum_rune: Value = value.boole.f, - list_tail: Value = value.nil.nil, - opening_bracket: enum { paren, square, brace } = .paren, - opening_quote: enum { apos, tick, comma } = .apos, - - retval: Value = value.eof.eof, - - fn eof(self: *State) bool { - return self.pos >= self.input.len; - } - - fn peek(self: *State) u8 { - return self.input[self.pos]; - } - - fn skip(self: *State) void { - self.pos += 1; - } - - fn getc(self: *State) u8 { - const c = self.peek(); - self.skip(); - return c; - } - - // Consumes whitespace and line comments. - fn consumeBlanks(self: *State) void { - while (!self.eof()) { - if (self.isWhitespace()) { - self.skip(); - } else if (self.peek() == ';') { - self.skip(); - self.consumeLineComment(); - } else { - return; - } - } - } - - fn consumeLineComment(self: *State) void { - while (!self.eof()) { - if (self.getc() == '\n') { - return; - } - } - } - - fn isWhitespace(self: *State) bool { - return switch (self.peek()) { - '\t', '\n', ' ' => true, - else => false, - }; - } - - // Checks for: Whitespace, closing brackets, and EOF. - // - // This can tell us that we're in a position such as: - // - // (foo| bar) - // - // (foo bar|) - // - // foo| - // - // We could also accept semicolon, so the following works, like in Scheme: - // - // (foo;comment - // bar) - // - // But IMO this should be an error. It's too easy to misread, and might - // just be a typo: the semicolon may have been meant to be a colon. - // - fn isEndDelimiter(self: *State) bool { - return switch (self.peek()) { - '\t', '\n', ' ', ')', ']', '}' => true, - else => false, - }; - } - - fn isFinalNull(self: *State) bool { - return self.peek() == 0 and self.pos == self.input.len - 1; - } - - fn newSubstate(self: *State, next: Next) *State { - const sub = self.alloc.create(State) catch @panic("OOM"); - sub.* = .{ .alloc = self.alloc, .input = self.input }; - sub.pos = self.pos; - sub.mode = self.mode; - sub.next = next; - sub.parent = self; - return sub; - } - - fn setReturn(self: *State, val: Value) *State { - self.retval = val; - self.next = .finish; - return self; - } - - fn finish(self: *State) ?*State { - if (self.parent) |parent| { - parent.retval = self.retval; - parent.pos = self.pos; - parent.alloc.destroy(self); - return parent; - } else { - return null; - } - } -}; - -// Probably best *not* to use function pointers here, but rather dispatch to -// functions manually based on enum value. This should help the optimizer. - -const Next = enum { - start_parsing, - start_datum, - end_hash_datum, - end_rune_datum, - end_quote, - continue_list, - end_improper_list, - finish, -}; - -pub fn read(input: []const u8) Value { - var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; - var top = State{ .alloc = gpa.allocator(), .input = input }; - var s = ⊤ - while (true) { - s = switch (s.next) { - .start_parsing => startParsing(s), - .start_datum => startDatum(s), - .end_hash_datum => endHashDatum(s), - .end_rune_datum => endRuneDatum(s), - .end_quote => endQuote(s), - .continue_list => continueList(s), - .end_improper_list => endImproperList(s), - .finish => s.finish() orelse break, - }; - std.debug.print("next: {}\n", .{s.next}); - } - if (s.eof() or s.isFinalNull()) { - return s.retval; - } else { - // Should never happen. - err(s, "READER BUG: unconsumed input"); - } -} - -fn startParsing(s: *State) *State { - s.consumeBlanks(); - if (s.eof()) { - return s.setReturn(value.eof.eof); - } - return switch (s.peek()) { - // whitespace already consumed - 0...31, 127...255 => err(s, "invalid character"), - ')', ']', '}' => err(s, "unexpected closing bracket"), - else => startDatum(s), - }; -} - -// This is called when we *immediately* expect a datum and nothing else; for -// example, no whitespace or comments, because they've already been consumed. -fn startDatum(s: *State) *State { - if (s.isWhitespace()) { - return err(s, "expected datum, got whitespace"); - } - if (s.eof()) { - return err(s, "expected datum, got EOF"); - } - return switch (s.getc()) { - // whitespace checked above - 0...31, 127...255 => err(s, "invalid character"), - - ')', ']', '}' => err(s, "unexpected closing bracket"), - - ';' => err(s, "expected datum, got semicolon"), - - '#' => handleHash(s), - - '"' => startQuotedString(s), - - '\'', '`', ',' => |c| startQuote(s, c), - - '(', '[', '{' => |c| startList(s, c), - - '+', '-' => |c| handlePlusMinus(s, c), - - '0'...'9' => |c| handleDigit(s, c), - - // Periods only allowed between digits, and to express improper lists. - // Things like the following look too much like it could be a typo: - // - // (foo .5) (foo .bar) - // - '.' => err(s, "misplaced period"), - - else => startBareString(s), - }; -} - -fn handleHash(s: *State) *State { - // - // We just consumed a hash. Possibilities include: - // - // #|foo ;rune - // - // #|;DATUM ;datum comment - // - // #|DATUM ;hash-datum (code mode only) - // - - if (s.isWhitespace()) { - return err(s, "whitespace after hash"); - } - if (s.eof()) { - return err(s, "EOF after hash"); - } - - // Is it a rune? #foo - switch (s.peek()) { - 'A'...'Z', 'a'...'z' => return handleRune(s), - else => {}, - } - - // Is it a datum comment? #;DATUM - if (s.peek() == ';') { - s.skip(); - // Don't change s.next in this case. Just let the parser try to redo - // what it was doing as soon as the commented-out datum has been read. - return s.newSubstate(.start_datum); - } - - // Otherwise, it must be a hash-datum. #DATUM - - // But data mode doesn't allow that. - if (s.mode == .data) { - return err(s, "invalid use of hash in data mode"); - } - - s.next = .end_hash_datum; - return s.newSubstate(.start_datum); -} - -fn handleRune(s: *State) *State { - // - // We are now here, knowing that at least one ASCII letter follows: - // - // #|foo - // - - var buf: [6]u8 = undefined; - var i: u8 = 0; - while (!s.eof()) : (i += 1) switch (s.peek()) { - 'a'...'z', 'A'...'Z' => { - if (i == buf.len) { - return err(s, "rune too long"); - } - buf[i] = s.getc(); - }, - else => break, - }; - - // Note: 'i' can't be 0 since this function is only called if at least one - // ASCII letter is found after the hash. - - const rune = value.rune.pack(buf[0..i]); - - // - // Now we're at the end of the rune, but it could be a rune-datum: - // - // #foo|(...) - // - - if (s.isEndDelimiter()) { - return s.setReturn(rune); - } - - // Otherwise, it's followed by a datum, like: #foo(...) - - // Which is only allowed in code mode. - if (s.mode == .data) { - return err(s, "invalid use of hash in data mode"); - } - - s.datum_rune = rune; - s.next = .end_rune_datum; - return s.newSubstate(.start_datum); -} - -fn endRuneDatum(s: *State) *State { - return s.setReturn(value.pair.cons( - s.datum_rune, - s.retval, - )); -} - -fn endHashDatum(s: *State) *State { - return s.setReturn(value.pair.cons( - value.rune.pack("hash"), - s.retval, - )); -} - -fn startQuotedString(s: *State) *State { - // We are now here: - // - // "|..." - // - const str = readQuotedString(s) catch return err(s, "unclosed string"); - if (s.mode == .code) { - // "foo bar" => (#string . "foo bar") - const rune = value.rune.pack("string"); - const pair = value.pair.cons(rune, str); - return s.setReturn(pair); - } else { - return s.setReturn(str); - } -} - -const StringReadError = enum { UnclosedString }; - -fn readQuotedString(s: *State) error{UnclosedString}!Value { - return try readQuotedSstr(s) orelse readQuotedLongString(s); -} - -fn readQuotedSstr(s: *State) error{UnclosedString}!?Value { - // We will reset to this position if we fail. - const start_pos = s.pos; - - var buf: [6]u8 = undefined; - var i: u8 = 0; - while (!s.eof()) { - const c = s.getc(); - if (c == '"') { - // ok, return what we accumulated - return value.sstr.pack(buf[0..i]); - } - if (i == 6) { - // failed; reset and bail out - s.pos = start_pos; - return null; - } - // ok, save this byte and go on - buf[i] = c; - i += 1; - } - return error.UnclosedString; -} - -fn readQuotedLongString(s: *State) Value { - _ = s; - @panic("not implemented"); -} - -fn startQuote(s: *State, c: u8) *State { - // Allowed in Scheme, but why? Not in Zisp. - if (s.isWhitespace()) { - return err(s, "whitespace after apostrophe"); - } - s.opening_quote = switch (c) { - '\'' => .apos, - '`' => .tick, - ',' => .comma, - else => unreachable, - }; - const sub = s.newSubstate(.start_datum); - sub.mode = .data; - s.next = .end_quote; - return sub; -} - -fn endQuote(s: *State) *State { - const name = switch (s.opening_quote) { - .apos => "apos", - .tick => "tick", - .comma => "comma", - }; - return s.setReturn(value.pair.cons( - value.rune.pack(name), - s.retval, - )); -} - -fn startList(s: *State, open: u8) *State { - if (s.mode == .data and open != '(') { - return err(s, "invalid opening bracket in data mode"); - } - - s.consumeBlanks(); - - // Check for empty lists: (), [], {} - if (open == '(' and s.peek() == ')') { - s.skip(); - return s.setReturn(value.nil.nil); - } - if (open == '[' and s.peek() == ']') { - s.skip(); - return s.setReturn(value.pair.cons( - value.rune.pack("square"), - value.nil.nil, - )); - } - if (open == '{' and s.peek() == '}') { - s.skip(); - return s.setReturn(value.pair.cons( - value.rune.pack("brace"), - value.nil.nil, - )); - } - - s.opening_bracket = switch (open) { - '(' => .paren, - '[' => .square, - '{' => .brace, - else => unreachable, - }; - s.next = .continue_list; - return s.newSubstate(.start_datum); -} - -fn continueList(s: *State) *State { - s.consumeBlanks(); - - if (s.eof()) { - return err(s, "unexpected EOF while parsing list"); - } - - const tail = value.pair.cons(s.retval, s.list_tail); - - const open = s.opening_bracket; - const char = s.peek(); - - // Check for proper ending: (foo bar baz) - if (open == .paren and char == ')') { - s.skip(); - return s.setReturn(list.reverse(tail)); - } - if (open == .square and char == ']') { - s.skip(); - return s.setReturn(value.pair.cons( - value.rune.pack("square"), - list.reverse(tail), - )); - } - if (open == .brace and char == '}') { - s.skip(); - return s.setReturn(value.pair.cons( - value.rune.pack("brace"), - list.reverse(tail), - )); - } - - s.list_tail = tail; - - // Check for improper ending: (foo bar . baz) - if (char == '.') { - s.skip(); - - // We should now be at (... foo .| bar) and whitespace must follow. - // Scheme allows (foo .(bar)) but we don't. Mind your spaces! - if (!s.isWhitespace()) { - return err(s, "invalid use of period"); - } - - s.consumeBlanks(); - - s.next = .end_improper_list; - return s.newSubstate(.start_datum); - } - - // OK, next element. - return s.newSubstate(.start_datum); -} - -fn endImproperList(s: *State) *State { - s.consumeBlanks(); - - if (s.eof()) { - return err(s, "unexpected EOF"); - } - - const open = s.opening_bracket; - const char = s.getc(); - - const body = s.list_tail; - const tail = s.retval; - - if (open == .paren and char == ')') { - return s.setReturn(list.reverseWithTail(body, tail)); - } - if (open == .square and char == ']') { - return s.setReturn(value.pair.cons( - value.rune.pack("square"), - list.reverseWithTail(body, tail), - )); - } - if (open == .brace and char == '}') { - return s.setReturn(value.pair.cons( - value.rune.pack("brace"), - list.reverseWithTail(body, tail), - )); - } - - return err(s, "malformed list or extra datum at end of improper list"); -} - -fn handlePlusMinus(s: *State, c: u8) *State { - _ = c; - return s; -} - -fn handleDigit(s: *State, c: u8) *State { - _ = c; - return s; -} - -fn startBareString(s: *State) *State { - return s; -} - -fn err(s: *State, msg: []const u8) noreturn { - std.debug.print("{s}\n", .{msg}); - std.debug.print("pos: {}\n", .{s.pos}); - @panic("reader error"); -} |
