update

author: Taylan Kammer <taylan.kammer@gmail.com> 2025-02-20 18:29:59 +0100
committer: Taylan Kammer <taylan.kammer@gmail.com> 2025-02-20 18:29:59 +0100
commit: ab0edc69d17cd32397a4f7a059120fd6152bb138 (patch)
tree: 505bc85978714cb0b6f4e5d8c9058bbcf1fe1bab /src/libzisp/read.zig
parent: 3b713ef3e872bda3da9e5a67a9bfd5c6701cb665 (diff)
1 files changed, 0 insertions, 658 deletions
diff --git a/src/libzisp/read.zig b/src/libzisp/read.zig
deleted file mode 100644
index 74ae51b..0000000
--- a/src/libzisp/read.zig
+++ /dev/null
@@ -1,658 +0,0 @@
-//
-// === Parser for Code & Data ===
-//
-// Zisp s-expressions come in two flavors: code (sugar) and data (no sugar).
-//
-// Code expressions are first parsed into the same data types which the data
-// expressions can express; it's merely surface-level syntax sugar.
-//
-// Data expressions don't support any syntax sugar and have a simple format,
-// only being able to represent the following data types:
-//
-//   string -> foo , "foo bar"
-//
-//   number -> 123 , -1.23 , +123 , +nan.0 , -inf.0 , ...
-//
-//   rune   -> #foo     ;limited to 6 ASCII letters (a - z, A - Z)
-//
-//   list   -> (...)    ;the usual: actually pairs; may be improper
-//
-//   null   -> ()       ;also called nil around here
-//
-// We may use terms like "code parser" and "data parser" out of convenience,
-// although there may only be a single parser that implements both formats by
-// switching between modes.
-//
-// When the code parser encounters syntax sugar, it always transforms it into a
-// list starting with a rune, like in the following examples:
-//
-//   #(...)   -> (#hash ...)
-//
-//   [...]    -> (#square ...)
-//
-//   'foo     -> (#quote . foo)
-//
-// These can combine:
-//
-//   #{...}   -> (#hash #brace ...)
-//
-//   #'foo    -> (#hash #quote . foo)
-//
-//   ##'[...] -> (#hash #hash #quote #square ...)
-//
-// As a specialty, double-quoted strings are actually considered sugar by the
-// code parser, and are transformed as follows into data:
-//
-//   "..."    -> (#string "...")
-//
-// (Otherwise, all string literals would be identifiers, or all identifiers
-// would be string literals, because Zisp doesn't differentiate strings and
-// symbols like traditional lisps.)
-//
-//
-// === Decoder ===
-//
-// A separate process called "decoding" can transform simple data structures,
-// consisting of only the above types, into a richer set of Zisp data types.
-//
-// For example, the decoder may turn (#hash ...) into a vector.  Some runes may
-// be decoded in isolation rather than as part of a list, which is how #true and #false are implemented.
-//
-//It also
-// interprets common rune invocations like (#quote ...) to implement the
-// traditional quoting mechanism, but in a better way:
-//
-// Traditional quote is "unhygienic" in Scheme terms.  An expressoin such as
-// '(foo bar) will always be read as (quote (foo bar)) regardless of what sort
-// of lexical context it appears in, so the semantics will depend on whatever
-// the identifier "quote" is bound to in that lexical context.
-//
-// The Zisp decoder, which transforms not text to text, but objects to objects,
-// can turn (#quote ...) into an abstract object which encapsulates the notion
-// of quoting, which the Zisp evaluator can recognize and act upon.
-//
-// One way to think about this, in Scheme (R6RS / syntax-case) terms, is that
-// expressions like '(foo bar) turn directly into a *syntax object* when read,
-// rather than a regular list object.
-//
-//
-// === Implementation details ===
-//
-// Instead of using recursion directly, the parser is written in something akin
-// to a manual continuation-passing style, which ensures that parsing depth is
-// not limited by the stack size.
-//
-// All state/context is passed around via a struct pointer.  The parser has a
-// main loop which calls a function, passes it the state, and expects to get a
-// new state pointer in return, which tells which function the main loop should
-// call next, based on the .next field of the state.
-//
-// When a called function wants to call the parser recursively, it sets the
-// .next field to an enumeration value that indicates where the parser should
-// return to after it's done with the sub-parsing, and then constructs a new
-// state struct, saving a pointer to the original in a .parent field.
-//
-// Making the parser "return" is a matter of setting the .retval field, and
-// setting the .next field to the value .finish, to indicate to the main loop
-// that it should either pass control back to the parent, or finish parsing.
-//
-//
-// === Notation used in comments ===
-//
-// Some comments throughout the file give you an example of where the parser
-// currently is in a stream.  These illustrations use the pipe symbol, which
-// looks like a cursor, to indicate the current position of the parser:
-//
-//   (foo| bar baz)   <- parser arrived at the end of the string foo
-//
-//   (foo bar baz)|   <- parser reached EOF (assuming no trailing spaces)
-//
-
-const std = @import("std");
-
-const gc = @import("gc.zig");
-const list = @import("list.zig");
-const value = @import("value.zig");
-
-const Value = value.Value;
-
-const State = struct {
-    alloc: std.mem.Allocator,
-
-    input: []const u8,
-    pos: usize = 0,
-
-    mode: enum { code, data } = .code,
-
-    next: Next = .start_parsing,
-
-    parent: ?*State = null,
-
-    datum_rune: Value = value.boole.f,
-    list_tail: Value = value.nil.nil,
-    opening_bracket: enum { paren, square, brace } = .paren,
-    opening_quote: enum { apos, tick, comma } = .apos,
-
-    retval: Value = value.eof.eof,
-
-    fn eof(self: *State) bool {
-        return self.pos >= self.input.len;
-    }
-
-    fn peek(self: *State) u8 {
-        return self.input[self.pos];
-    }
-
-    fn skip(self: *State) void {
-        self.pos += 1;
-    }
-
-    fn getc(self: *State) u8 {
-        const c = self.peek();
-        self.skip();
-        return c;
-    }
-
-    // Consumes whitespace and line comments.
-    fn consumeBlanks(self: *State) void {
-        while (!self.eof()) {
-            if (self.isWhitespace()) {
-                self.skip();
-            } else if (self.peek() == ';') {
-                self.skip();
-                self.consumeLineComment();
-            } else {
-                return;
-            }
-        }
-    }
-
-    fn consumeLineComment(self: *State) void {
-        while (!self.eof()) {
-            if (self.getc() == '\n') {
-                return;
-            }
-        }
-    }
-
-    fn isWhitespace(self: *State) bool {
-        return switch (self.peek()) {
-            '\t', '\n', ' ' => true,
-            else => false,
-        };
-    }
-
-    // Checks for: Whitespace, closing brackets, and EOF.
-    //
-    // This can tell us that we're in a position such as:
-    //
-    //   (foo| bar)
-    //
-    //   (foo bar|)
-    //
-    //   foo|
-    //
-    // We could also accept semicolon, so the following works, like in Scheme:
-    //
-    //   (foo;comment
-    //    bar)
-    //
-    // But IMO this should be an error.  It's too easy to misread, and might
-    // just be a typo: the semicolon may have been meant to be a colon.
-    //
-    fn isEndDelimiter(self: *State) bool {
-        return switch (self.peek()) {
-            '\t', '\n', ' ', ')', ']', '}' => true,
-            else => false,
-        };
-    }
-
-    fn isFinalNull(self: *State) bool {
-        return self.peek() == 0 and self.pos == self.input.len - 1;
-    }
-
-    fn newSubstate(self: *State, next: Next) *State {
-        const sub = self.alloc.create(State) catch @panic("OOM");
-        sub.* = .{ .alloc = self.alloc, .input = self.input };
-        sub.pos = self.pos;
-        sub.mode = self.mode;
-        sub.next = next;
-        sub.parent = self;
-        return sub;
-    }
-
-    fn setReturn(self: *State, val: Value) *State {
-        self.retval = val;
-        self.next = .finish;
-        return self;
-    }
-
-    fn finish(self: *State) ?*State {
-        if (self.parent) |parent| {
-            parent.retval = self.retval;
-            parent.pos = self.pos;
-            parent.alloc.destroy(self);
-            return parent;
-        } else {
-            return null;
-        }
-    }
-};
-
-// Probably best *not* to use function pointers here, but rather dispatch to
-// functions manually based on enum value.  This should help the optimizer.
-
-const Next = enum {
-    start_parsing,
-    start_datum,
-    end_hash_datum,
-    end_rune_datum,
-    end_quote,
-    continue_list,
-    end_improper_list,
-    finish,
-};
-
-pub fn read(input: []const u8) Value {
-    var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init;
-    var top = State{ .alloc = gpa.allocator(), .input = input };
-    var s = &top;
-    while (true) {
-        s = switch (s.next) {
-            .start_parsing => startParsing(s),
-            .start_datum => startDatum(s),
-            .end_hash_datum => endHashDatum(s),
-            .end_rune_datum => endRuneDatum(s),
-            .end_quote => endQuote(s),
-            .continue_list => continueList(s),
-            .end_improper_list => endImproperList(s),
-            .finish => s.finish() orelse break,
-        };
-        std.debug.print("next: {}\n", .{s.next});
-    }
-    if (s.eof() or s.isFinalNull()) {
-        return s.retval;
-    } else {
-        // Should never happen.
-        err(s, "READER BUG: unconsumed input");
-    }
-}
-
-fn startParsing(s: *State) *State {
-    s.consumeBlanks();
-    if (s.eof()) {
-        return s.setReturn(value.eof.eof);
-    }
-    return switch (s.peek()) {
-        // whitespace already consumed
-        0...31, 127...255 => err(s, "invalid character"),
-        ')', ']', '}' => err(s, "unexpected closing bracket"),
-        else => startDatum(s),
-    };
-}
-
-// This is called when we *immediately* expect a datum and nothing else; for
-// example, no whitespace or comments, because they've already been consumed.
-fn startDatum(s: *State) *State {
-    if (s.isWhitespace()) {
-        return err(s, "expected datum, got whitespace");
-    }
-    if (s.eof()) {
-        return err(s, "expected datum, got EOF");
-    }
-    return switch (s.getc()) {
-        // whitespace checked above
-        0...31, 127...255 => err(s, "invalid character"),
-
-        ')', ']', '}' => err(s, "unexpected closing bracket"),
-
-        ';' => err(s, "expected datum, got semicolon"),
-
-        '#' => handleHash(s),
-
-        '"' => startQuotedString(s),
-
-        '\'', '`', ',' => |c| startQuote(s, c),
-
-        '(', '[', '{' => |c| startList(s, c),
-
-        '+', '-' => |c| handlePlusMinus(s, c),
-
-        '0'...'9' => |c| handleDigit(s, c),
-
-        // Periods only allowed between digits, and to express improper lists.
-        // Things like the following look too much like it could be a typo:
-        //
-        //   (foo .5) (foo .bar)
-        //
-        '.' => err(s, "misplaced period"),
-
-        else => startBareString(s),
-    };
-}
-
-fn handleHash(s: *State) *State {
-    //
-    // We just consumed a hash.  Possibilities include:
-    //
-    //   #|foo       ;rune
-    //
-    //   #|;DATUM    ;datum comment
-    //
-    //   #|DATUM     ;hash-datum (code mode only)
-    //
-
-    if (s.isWhitespace()) {
-        return err(s, "whitespace after hash");
-    }
-    if (s.eof()) {
-        return err(s, "EOF after hash");
-    }
-
-    // Is it a rune?  #foo
-    switch (s.peek()) {
-        'A'...'Z', 'a'...'z' => return handleRune(s),
-        else => {},
-    }
-
-    // Is it a datum comment?  #;DATUM
-    if (s.peek() == ';') {
-        s.skip();
-        // Don't change s.next in this case.  Just let the parser try to redo
-        // what it was doing as soon as the commented-out datum has been read.
-        return s.newSubstate(.start_datum);
-    }
-
-    // Otherwise, it must be a hash-datum.  #DATUM
-
-    // But data mode doesn't allow that.
-    if (s.mode == .data) {
-        return err(s, "invalid use of hash in data mode");
-    }
-
-    s.next = .end_hash_datum;
-    return s.newSubstate(.start_datum);
-}
-
-fn handleRune(s: *State) *State {
-    //
-    // We are now here, knowing that at least one ASCII letter follows:
-    //
-    //   #|foo
-    //
-
-    var buf: [6]u8 = undefined;
-    var i: u8 = 0;
-    while (!s.eof()) : (i += 1) switch (s.peek()) {
-        'a'...'z', 'A'...'Z' => {
-            if (i == buf.len) {
-                return err(s, "rune too long");
-            }
-            buf[i] = s.getc();
-        },
-        else => break,
-    };
-
-    // Note: 'i' can't be 0 since this function is only called if at least one
-    // ASCII letter is found after the hash.
-
-    const rune = value.rune.pack(buf[0..i]);
-
-    //
-    // Now we're at the end of the rune, but it could be a rune-datum:
-    //
-    //   #foo|(...)
-    //
-
-    if (s.isEndDelimiter()) {
-        return s.setReturn(rune);
-    }
-
-    // Otherwise, it's followed by a datum, like: #foo(...)
-
-    // Which is only allowed in code mode.
-    if (s.mode == .data) {
-        return err(s, "invalid use of hash in data mode");
-    }
-
-    s.datum_rune = rune;
-    s.next = .end_rune_datum;
-    return s.newSubstate(.start_datum);
-}
-
-fn endRuneDatum(s: *State) *State {
-    return s.setReturn(value.pair.cons(
-        s.datum_rune,
-        s.retval,
-    ));
-}
-
-fn endHashDatum(s: *State) *State {
-    return s.setReturn(value.pair.cons(
-        value.rune.pack("hash"),
-        s.retval,
-    ));
-}
-
-fn startQuotedString(s: *State) *State {
-    // We are now here:
-    //
-    //   "|..."
-    //
-    const str = readQuotedString(s) catch return err(s, "unclosed string");
-    if (s.mode == .code) {
-        // "foo bar" => (#string . "foo bar")
-        const rune = value.rune.pack("string");
-        const pair = value.pair.cons(rune, str);
-        return s.setReturn(pair);
-    } else {
-        return s.setReturn(str);
-    }
-}
-
-const StringReadError = enum { UnclosedString };
-
-fn readQuotedString(s: *State) error{UnclosedString}!Value {
-    return try readQuotedSstr(s) orelse readQuotedLongString(s);
-}
-
-fn readQuotedSstr(s: *State) error{UnclosedString}!?Value {
-    // We will reset to this position if we fail.
-    const start_pos = s.pos;
-
-    var buf: [6]u8 = undefined;
-    var i: u8 = 0;
-    while (!s.eof()) {
-        const c = s.getc();
-        if (c == '"') {
-            // ok, return what we accumulated
-            return value.sstr.pack(buf[0..i]);
-        }
-        if (i == 6) {
-            // failed; reset and bail out
-            s.pos = start_pos;
-            return null;
-        }
-        // ok, save this byte and go on
-        buf[i] = c;
-        i += 1;
-    }
-    return error.UnclosedString;
-}
-
-fn readQuotedLongString(s: *State) Value {
-    _ = s;
-    @panic("not implemented");
-}
-
-fn startQuote(s: *State, c: u8) *State {
-    // Allowed in Scheme, but why?  Not in Zisp.
-    if (s.isWhitespace()) {
-        return err(s, "whitespace after apostrophe");
-    }
-    s.opening_quote = switch (c) {
-        '\'' => .apos,
-        '`' => .tick,
-        ',' => .comma,
-        else => unreachable,
-    };
-    const sub = s.newSubstate(.start_datum);
-    sub.mode = .data;
-    s.next = .end_quote;
-    return sub;
-}
-
-fn endQuote(s: *State) *State {
-    const name = switch (s.opening_quote) {
-        .apos => "apos",
-        .tick => "tick",
-        .comma => "comma",
-    };
-    return s.setReturn(value.pair.cons(
-        value.rune.pack(name),
-        s.retval,
-    ));
-}
-
-fn startList(s: *State, open: u8) *State {
-    if (s.mode == .data and open != '(') {
-        return err(s, "invalid opening bracket in data mode");
-    }
-
-    s.consumeBlanks();
-
-    // Check for empty lists: (), [], {}
-    if (open == '(' and s.peek() == ')') {
-        s.skip();
-        return s.setReturn(value.nil.nil);
-    }
-    if (open == '[' and s.peek() == ']') {
-        s.skip();
-        return s.setReturn(value.pair.cons(
-            value.rune.pack("square"),
-            value.nil.nil,
-        ));
-    }
-    if (open == '{' and s.peek() == '}') {
-        s.skip();
-        return s.setReturn(value.pair.cons(
-            value.rune.pack("brace"),
-            value.nil.nil,
-        ));
-    }
-
-    s.opening_bracket = switch (open) {
-        '(' => .paren,
-        '[' => .square,
-        '{' => .brace,
-        else => unreachable,
-    };
-    s.next = .continue_list;
-    return s.newSubstate(.start_datum);
-}
-
-fn continueList(s: *State) *State {
-    s.consumeBlanks();
-
-    if (s.eof()) {
-        return err(s, "unexpected EOF while parsing list");
-    }
-
-    const tail = value.pair.cons(s.retval, s.list_tail);
-
-    const open = s.opening_bracket;
-    const char = s.peek();
-
-    // Check for proper ending: (foo bar baz)
-    if (open == .paren and char == ')') {
-        s.skip();
-        return s.setReturn(list.reverse(tail));
-    }
-    if (open == .square and char == ']') {
-        s.skip();
-        return s.setReturn(value.pair.cons(
-            value.rune.pack("square"),
-            list.reverse(tail),
-        ));
-    }
-    if (open == .brace and char == '}') {
-        s.skip();
-        return s.setReturn(value.pair.cons(
-            value.rune.pack("brace"),
-            list.reverse(tail),
-        ));
-    }
-
-    s.list_tail = tail;
-
-    // Check for improper ending: (foo bar . baz)
-    if (char == '.') {
-        s.skip();
-
-        // We should now be at (... foo .| bar) and whitespace must follow.
-        // Scheme allows (foo .(bar)) but we don't.  Mind your spaces!
-        if (!s.isWhitespace()) {
-            return err(s, "invalid use of period");
-        }
-
-        s.consumeBlanks();
-
-        s.next = .end_improper_list;
-        return s.newSubstate(.start_datum);
-    }
-
-    // OK, next element.
-    return s.newSubstate(.start_datum);
-}
-
-fn endImproperList(s: *State) *State {
-    s.consumeBlanks();
-
-    if (s.eof()) {
-        return err(s, "unexpected EOF");
-    }
-
-    const open = s.opening_bracket;
-    const char = s.getc();
-
-    const body = s.list_tail;
-    const tail = s.retval;
-
-    if (open == .paren and char == ')') {
-        return s.setReturn(list.reverseWithTail(body, tail));
-    }
-    if (open == .square and char == ']') {
-        return s.setReturn(value.pair.cons(
-            value.rune.pack("square"),
-            list.reverseWithTail(body, tail),
-        ));
-    }
-    if (open == .brace and char == '}') {
-        return s.setReturn(value.pair.cons(
-            value.rune.pack("brace"),
-            list.reverseWithTail(body, tail),
-        ));
-    }
-
-    return err(s, "malformed list or extra datum at end of improper list");
-}
-
-fn handlePlusMinus(s: *State, c: u8) *State {
-    _ = c;
-    return s;
-}
-
-fn handleDigit(s: *State, c: u8) *State {
-    _ = c;
-    return s;
-}
-
-fn startBareString(s: *State) *State {
-    return s;
-}
-
-fn err(s: *State, msg: []const u8) noreturn {
-    std.debug.print("{s}\n", .{msg});
-    std.debug.print("pos: {}\n", .{s.pos});
-    @panic("reader error");
-}
author	Taylan Kammer <taylan.kammer@gmail.com>	2025-02-20 18:29:59 +0100
committer	Taylan Kammer <taylan.kammer@gmail.com>	2025-02-20 18:29:59 +0100
commit	ab0edc69d17cd32397a4f7a059120fd6152bb138 (patch)
tree	505bc85978714cb0b6f4e5d8c9058bbcf1fe1bab /src/libzisp/read.zig
parent	3b713ef3e872bda3da9e5a67a9bfd5c6701cb665 (diff)