update

author: Taylan Kammer <taylan.kammer@gmail.com> 2025-02-19 23:29:26 +0100
committer: Taylan Kammer <taylan.kammer@gmail.com> 2025-02-19 23:29:26 +0100
commit: 4e88891235664917a2db44b84c0bbeeb13dd71ad (patch)
tree: 7ed8ac2272ce92054fdf2f4e5e09b156dfc5a4d1 /src/libzisp
parent: 4d0db1a1065f18d879b3ff90da6ecb14e9e1ae31 (diff)
6 files changed, 442 insertions, 88 deletions
diff --git a/src/libzisp/read.zig b/src/libzisp/read.zig
index 9ef9891..812b7c7 100644
--- a/src/libzisp/read.zig
+++ b/src/libzisp/read.zig
@@ -5,101 +5,369 @@ const value = @import("value.zig");
 
 const Value = value.Value;
 
+const Next = enum {
+    start,
+    datum,
+    hash_end,
+    rune_datum_end,
+    quote_end,
+    list,
+    list_end,
+    done,
+};
+
 const State = struct {
     alloc: std.mem.Allocator,
+
     input: []const u8,
     pos: usize = 0,
 
-    next: enum {
-        start,
+    mode: enum { code, data } = .code,
 
-        list,
-        list_end,
+    next: Next = .start,
 
-        err,
+    parent: ?*State = null,
 
-        done,
-    } = .start,
+    last_rune: ?Value = null,
+    list_tail: ?Value = null,
 
     retval: Value = value.eof.eof,
 
-    parent: ?*State = null,
+    fn eof(self: *State) bool {
+        return self.pos >= self.input.len;
+    }
+
+    fn peek(self: *State) u8 {
+        return self.input[self.pos];
+    }
+
+    fn getc(self: *State) u8 {
+        const c = self.peek();
+        self.pos += 1;
+        return c;
+    }
+
+    fn isFinalNull(self: *State) bool {
+        return self.peek() == 0 and self.pos == self.input.len - 1;
+    }
+
+    fn newChild(self: *State, next: Next) *State {
+        const s = self.alloc.create(State) catch @panic("OOM");
+        s.* = State{ .alloc = self.alloc, .input = self.input };
+        s.pos = self.pos;
+        s.mode = self.mode;
+        s.next = next;
+        s.parent = self;
+        return s;
+    }
+
+    fn setReturn(self: *State, val: Value) *State {
+        self.retval = val;
+        self.next = .done;
+        return self;
+    }
+
+    fn finish(self: *State) ?*State {
+        if (self.parent) |p| {
+            p.retval = self.retval;
+            p.pos = self.pos;
+            p.alloc.destroy(self);
+            return p;
+        } else {
+            return null;
+        }
+    }
 };
 
 pub fn read(input: []const u8) Value {
     var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init;
     var top = State{ .alloc = gpa.allocator(), .input = input };
     var s = &top;
-    while (s.pos <= s.input.len) : (s.pos += 1) {
-        s = switch (s.next) {
-            .start => start(s),
-
-            .list => list(s),
-            .list_end => list(s),
-
-            .err => err(s),
-
-            .done => ret: {
-                if (s.parent) |parent| {
-                    s.alloc.destroy(s);
-                    break :ret parent;
-                } else {
-                    return s.retval;
-                }
-            },
-        };
+    while (true) s = switch (s.next) {
+        .start => start(s),
+        .datum => datum(s),
+        .hash_end => hashEnd(s),
+        .rune_datum_end => runeDatumEnd(s),
+        .quote_end => quoteEnd(s),
+        .list => list(s),
+        .list_end => list(s),
+        .done => s.finish() orelse break,
+    };
+    if (s.eof() or s.isFinalNull()) {
+        return s.retval;
+    } else {
+        @panic("unconsumed input");
     }
-    unreachable;
 }
 
 fn start(s: *State) *State {
-    switch (s.input[s.pos]) {
-        0...8 => s.next = .err,
-
-        '\t', '\n' => {},
+    while (true) {
+        if (s.eof()) {
+            s.next = .done;
+            return s;
+        }
+        const c = s.getc();
+        if (isWhitespace(c)) {
+            continue;
+        }
+        return switch (c) {
+            // whitespace checked above
+            0...31, 127...255 => err(s, "invalid character"),
+            ')', ']', '}' => err(s, "unexpected closing bracket"),
+            ';' => semi(s),
+            else => ret: {
+                // backtrack; let other function handle it
+                s.pos -= 1;
+                break :ret datum(s);
+            },
+        };
+    }
+}
 
-        11...31 => s.next = .err,
+fn semi(s: *State) *State {
+    while (true) {
+        if (s.eof()) {
+            s.next = .done;
+            return s;
+        }
+        const c = s.getc();
+        if (c == '\n') {
+            break;
+        }
+    }
+    return s;
+}
 
-        ' ' => {},
+fn datum(s: *State) *State {
+    const c = s.getc();
+    if (isWhitespace(c)) {
+        return err(s, "expected datum, got whitespace");
+    }
+    return switch (c) {
+        // whitespace checked above
+        0...31, 127...255 => err(s, "invalid character"),
+        ')', ']', '}' => err(s, "unexpected closing bracket"),
+        ';' => err(s, "expected datum, got semicolon"),
+        '"' => string(s),
+        '#' => hash(s),
+        '\'' => quote(s),
+        '(' => list(s),
+        '+' => plus(s),
+        ',' => comma(s),
+        '.' => dot(s),
+        '0'...'9' => number(s),
+        '[' => square(s),
+        '`' => backtick(s),
+        '{' => brace(s),
+        else => symbol(s),
+    };
+}
 
-        '!' => s.next = .err,
+fn isWhitespace(c: u8) bool {
+    return switch (c) {
+        '\t', '\n', ' ' => true,
+        else => false,
+    };
+}
 
-        '"' => quotedString(s),
+// Whitespace, semicolon, and closing brackets of any kind
+fn isEndDelimiter(c: u8) bool {
+    return switch (c) {
+        '\t', '\n', ' ', ';' => true,
+        ')', ']', '}' => true,
+        else => false,
+    };
+}
 
-        else => s.next = .err,
+fn string(s: *State) *State {
+    const str = readString(s) catch return err(s, "unclosed string");
+    if (s.mode == .code) {
+        // "foo bar" => (#string . "foo bar")
+        const rune = value.rune.pack("string");
+        const pair = value.pair.cons(rune, str);
+        return s.setReturn(pair);
+    } else {
+        return s.setReturn(str);
     }
-    return s;
 }
 
-fn quotedString(s: *State) void {
-    var buf: [6]u8 = .{0} ** 6;
-    const len = readString(&buf, s);
-    s.retval = value.pair.cons(
-        value.sstr.pack("quote"),
-        value.pair.cons(
-            value.sstr.pack(buf[0..len]),
-            value.nil.nil,
-        ),
-    );
-    s.next = .done;
+const StringReadError = enum { UnclosedString };
+
+fn readString(s: *State) error{UnclosedString}!Value {
+    return try tryReadSstr(s) orelse readLongString(s);
 }
 
-fn readString(buf: []u8, s: *State) usize {
-    s.pos += 1; // skip opening quote
-    for (s.input[s.pos..], 0..) |c, i| {
+fn tryReadSstr(s: *State) error{UnclosedString}!?Value {
+    // We will reset to this position if we fail.
+    const start_pos = s.pos;
+
+    var buf: [6]u8 = undefined;
+    var i: usize = 0;
+    while (!s.eof()) {
+        const c = s.getc();
         if (c == '"') {
-            s.pos += i;
-            return i;
+            // ok, return what we accumulated
+            return value.sstr.pack(buf[0..i]);
         }
+        if (i == 6) {
+            // failed; reset and bail out
+            s.pos = start_pos;
+            return null;
+        }
+        // ok, save this byte and go on
         buf[i] = c;
+        i += 1;
+    }
+    return error.UnclosedString;
+}
+
+fn readLongString(s: *State) Value {
+    _ = s;
+    @panic("not implemented");
+}
+
+fn hash(s: *State) *State {
+    if (isWhitespace(s.peek())) {
+        return err(s, "whitespace after hash sign");
     }
-    unreachable;
+
+    // is it a datum comment?
+    if (s.peek() == ';') {
+        // consume semicolon
+        _ = s.getc();
+        // Just ignore value and return to starting state after reading it.
+        s.next = .start;
+    } else {
+        s.next = .hash_end;
+    }
+
+    // No whitespace or anything; hash must be immediately followed by datum,
+    // including if it's a datum comment.  Note that if it's actually a rune
+    // we're reading, like #foo, we abuse our ability to reading an sstr here
+    // and later turn it into a rune instead, since they're the same length.
+    return s.newChild(.datum);
+}
+
+fn hashEnd(s: *State) *State {
+    // It's not actually a sstr but a rune, like: #foo or #foo(...)
+    if (value.sstr.check(s.retval)) {
+        return hashRuneEnd(s);
+    }
+
+    // Hash followed by an actual datum; becomes a (#hash ...) invocation:
+    //
+    //   #(...) -> (#hash . (...))
+    //
+    //   #"..." -> (#hash . "...")
+    //
+
+    // But data mode doesn't allow that.
+    if (s.mode == .data) {
+        return err(s, "invalid use of hash in data mode");
+    }
+
+    // Also, bare long strings are not OK here; too similar to a rune.
+    if (value.ptr.checkZisp(s.retval, .string)) {
+        return err(s, "long string after hash sign");
+    }
+
+    return s.setReturn(value.pair.cons(
+        value.rune.pack("hash"),
+        s.retval,
+    ));
+}
+
+// Note: Can only come here from hashEnd().
+fn hashRuneEnd(s: *State) *State {
+    // Convert the fake sstr that was meant to be a rune.
+    const rune = value.rune.make(s.retval);
+
+    // Maybe it's a stand-alone rune, like: #foo
+    if (isEndDelimiter(s.peek())) {
+        // Which is only allowed in data mode.
+        if (s.mode == .code) {
+            return err(s, "bare runes not allowed in code");
+        } else {
+            return s.setReturn(rune);
+        }
+    }
+
+    // Otherwise, it's followed by a datum, like: #foo(...)
+
+    // Which is only allowed in code mode.
+    if (s.mode == .data) {
+        return err(s, "invalid use of hash in data mode");
+    } else {
+        s.last_rune = rune;
+        s.next = .rune_datum_end;
+        return s.newChild(.datum);
+    }
+}
+
+fn runeDatumEnd(s: *State) *State {
+    if (s.last_rune) |rune| {
+        return s.setReturn(value.pair.cons(rune, s.retval));
+    } else {
+        unreachable;
+    }
+}
+
+fn quote(s: *State) *State {
+    // Allowed in Scheme, but why? Not in Zisp.
+    if (isWhitespace(s.peek())) {
+        return err(s, "whitespace after apostrophe");
+    }
+    s.next = .quote_end;
+    const c = s.newChild(.datum);
+    c.mode = .data;
+    return c;
+}
+
+fn quoteEnd(s: *State) *State {
+    return s.setReturn(value.pair.cons(
+        value.rune.pack("quote"),
+        s.retval,
+    ));
 }
 
 fn list(s: *State) *State {
     return s;
 }
 
-fn err(s: *State) *State {
+fn plus(s: *State) *State {
+    return s;
+}
+
+fn comma(s: *State) *State {
+    return s;
+}
+
+fn dot(s: *State) *State {
+    return s;
+}
+
+fn number(s: *State) *State {
+    return s;
+}
+
+fn square(s: *State) *State {
+    return s;
+}
+
+fn backtick(s: *State) *State {
+    return s;
+}
+
+fn brace(s: *State) *State {
+    return s;
+}
+
+fn symbol(s: *State) *State {
     return s;
 }
+
+fn err(s: *State, msg: []const u8) *State {
+    _ = s;
+    std.debug.print("{s}\n", .{msg});
+    @panic("reader error");
+}
diff --git a/src/libzisp/value.zig b/src/libzisp/value.zig
index dd9df3c..46ba9fa 100644
--- a/src/libzisp/value.zig
+++ b/src/libzisp/value.zig
@@ -65,9 +65,9 @@
 //
 //   001   :: Weak pointer to Zisp heap object
 //
-//   01.   :: Undefined
+//   01.   :: Undefined (may be used by GC to flag pointers for some reason?)
 //
-//   1..   :: Undefined
+//   1..   :: Foreign pointer (basically, a 50-bit fixnum of another type)
 //
 // This means pointers to the Zisp heap are 48 bits.  Of those, we only really
 // need 45, since 64-bit platforms are in practice limited to 48-bit addresses,
@@ -79,12 +79,15 @@
 // regular Zisp heap pointer can never be null.  Weak pointers, which can be
 // null, avoid stepping on that forbidden value thanks to bit 49 being set.
 //
+// Foreign pointers allow storing arbitrary pointers, or integers basically, of
+// up to 50 bits, without any further definition in Zisp of what they mean.
+//
 //
 // === Other values ===
 //
-// This 51-bit range is divided as follows, based on the initial bits:
+// This 51-bit range is divided as follows, based on the high bits:
 //
-//   000   :: Undefined
+//   000   :: Runes
 //
 //   001   :: Short string
 //
@@ -94,9 +97,12 @@
 //
 //   1..   :: Undefined
 //
-// Zisp strings are immutable and always encoded in UTF-8.  Any string fitting
-// into 6 bytes or less will be stored as an immediate value, not requiring any
-// heap allocation or interning.  (It's implicitly interned.)
+// Runes are symbols of 1 to 6 ASCII letters, used to implement reader syntax;
+// both built-in and extensions.
+//
+// Zisp strings are immutable.  Any string fitting into 6 bytes or less will be
+// stored as an immediate value, not requiring any heap allocation or interning.
+// It's implicitly interned, so to speak.  This includes the empty string.
 //
 // The null byte serves as a terminator and cannot appear in these strings; a
 // string that short but actually containing a null byte will need to be heap
@@ -115,15 +121,19 @@
 // 8 bits are allowed to be set, with the other 40 being reserved, so there's a
 // limit of 256 singleton values that can be defined.
 //
-// And on top of all that we still have a 48-bit and a 50-bit range left!
+// And on top of all that we still have a 50-bit range left!
 //
-// The forbidden value 4, Positive Infinity, is in the 48-bit undefined value
-// range starting with the 000 tag.
+// The forbidden value 4, Positive Infinity, would be the "empty string rune"
+// but that isn't allowed anyway, so all is fine.
 //
 
 // Here's the original article explaining the strategy:
 //
-// https://tkammer.de/zisp/notes/nan.html
+//   https://tkammer.de/zisp/notes/nan.html
+//
+// More about runes:
+//
+//   https://tkammer.de/zisp/notes/symbols.html
 //
 // Note: Packed structs are least-to-most significant, so the order of fields
 // must be reversed relative to a typical big-endian illustration of the bit
@@ -136,6 +146,7 @@ pub const fixnum = @import("value/fixnum.zig");
 
 pub const ptr = @import("value/ptr.zig");
 
+pub const rune = @import("value/rune.zig");
 pub const sstr = @import("value/sstr.zig");
 pub const char = @import("value/char.zig");
 pub const boole = @import("value/boole.zig");
@@ -144,7 +155,7 @@ pub const eof = @import("value/eof.zig");
 
 pub const pair = @import("value/pair.zig");
 
-/// To fill up the u11 exponent part of a NaN.
+// To fill up the u11 exponent part of a NaN.
 const FILL = 0x7ff;
 
 /// Represents a Zisp value/object.
@@ -214,6 +225,16 @@ pub const Value = packed union {
         _is_ifxnum: bool,
     },
 
+    /// For initializing and reading runes.
+    rune: packed struct {
+        // actually [6]u8 but packed struct cannot contain arrays
+        name: u48,
+        _tag: OtherTag = .rune,
+        _is_ptr: bool = false,
+        _: u11 = FILL,
+        _is_fixnum: bool = false,
+    },
+
     /// For initializing and reading short strings.
     sstr: packed struct {
         // actually [6]u8 but packed struct cannot contain arrays
@@ -244,7 +265,7 @@ pub const Value = packed union {
         _is_fixnum: bool = false,
     },
 
-    const OtherTag = enum(u3) { sstr = 1, char = 2, misc = 3 };
+    const OtherTag = enum(u3) { rune, sstr, char, misc };
 
     const Self = @This();
 
@@ -282,6 +303,7 @@ pub const Value = packed union {
         return self.isPacked() and !self.ieee.sign and !self.ieee.quiet;
     }
 
+    /// Checks for any "other" type of value.
     pub fn isOther(self: Self, tag: OtherTag) bool {
         return self._isOther() and self.other.tag == tag;
     }
diff --git a/src/libzisp/value/fixnum.zig b/src/libzisp/value/fixnum.zig
index 888dd3a..c705880 100644
--- a/src/libzisp/value/fixnum.zig
+++ b/src/libzisp/value/fixnum.zig
@@ -28,11 +28,11 @@ pub fn isValidRange(int: i64) bool {
 
 fn assertValidRange(int: i64) void {
     if (int < fixnum_min) {
-        std.debug.print("int too small for fixnum: {}", .{int});
+        std.debug.print("int too small for fixnum: {}\n", .{int});
         @panic("int too small for fixnum");
     }
     if (int > fixnum_max) {
-        std.debug.print("int too large for fixnum: {}", .{int});
+        std.debug.print("int too large for fixnum: {}\n", .{int});
         @panic("int too large for fixnum");
     }
 }
diff --git a/src/libzisp/value/ptr.zig b/src/libzisp/value/ptr.zig
index fe13af5..e1fadf2 100644
--- a/src/libzisp/value/ptr.zig
+++ b/src/libzisp/value/ptr.zig
@@ -31,27 +31,13 @@ pub fn assertForeign(v: Value) void {
     }
 }
 
-pub fn checkForeignRange(ptr: *anyopaque) bool {
-    const int = @intFromPtr(ptr);
-    return int <= std.math.maxInt(u50);
-}
-
-fn assertForeignRange(ptr: *anyopaque) void {
-    if (!checkForeignRange(ptr)) {
-        std.debug.print("foreign pointer out of range: {}\n", .{ptr});
-        @panic("foreign pointer out of range");
-    }
-}
-
-pub fn packForeign(ptr: *anyopaque) Value {
-    assertForeignRange(ptr);
-    const int: u50 = @intCast(@intFromPtr(ptr));
+pub fn packForeign(int: u50) Value {
     return .{ .fptr = .{ .value = int } };
 }
 
-pub fn unpackForeign(v: Value) *anyopaque {
+pub fn unpackForeign(v: Value) u50 {
     assertForeign(v);
-    return @ptrFromInt(v.fptr.value);
+    return v.fptr.value;
 }
 
 // Zisp Pointers
diff --git a/src/libzisp/value/rune.zig b/src/libzisp/value/rune.zig
new file mode 100644
index 0000000..ab251b4
--- /dev/null
+++ b/src/libzisp/value/rune.zig
@@ -0,0 +1,75 @@
+const std = @import("std");
+
+const value = @import("../value.zig");
+
+const Value = value.Value;
+
+// Zig API
+
+pub fn check(v: Value) bool {
+    return v.isOther(.rune);
+}
+
+pub fn assert(v: Value) void {
+    if (!check(v)) {
+        v.dump();
+        @panic("not rune");
+    }
+}
+
+pub fn isValidRune(s: []const u8) bool {
+    if (s.len == 0 or s.len > 6) {
+        return false;
+    }
+    for (s) |c| {
+        switch (c) {
+            'A'...'Z' => {},
+            'a'...'z' => {},
+            else => return false,
+        }
+    }
+    return true;
+}
+
+fn assertValidRune(s: []const u8) void {
+    if (!isValidRune(s)) {
+        std.debug.print("invalid rune: '{s}'\n", .{s});
+        @panic("invalid rune");
+    }
+}
+
+// See sstr.zig which uses equivalent code; probably good to keep in sync.
+
+pub fn pack(s: []const u8) Value {
+    assertValidRune(s);
+    var v = Value{ .rune = .{ .name = 0 } };
+    const dest: [*]u8 = @ptrCast(&v.rune.name);
+    @memcpy(dest, s);
+    return v;
+}
+
+pub fn unpack(v: Value) struct { [6]u8, u3 } {
+    const s: [6]u8 = @bitCast(v.rune.name);
+    inline for (0..6) |i| {
+        if (s[i] == 0) return .{ s, i };
+    }
+    return .{ s, 6 };
+}
+
+// Zisp API
+
+pub fn pred(v: Value) Value {
+    return value.boole.pack(check(v));
+}
+
+pub fn make(v: Value) Value {
+    const s, const l = value.sstr.unpack(v);
+    return pack(s[0..l]);
+}
+
+pub fn getName(v: Value) Value {
+    const s, const l = unpack(v);
+    return value.sstr.pack(s[0..l]);
+}
+
+// TODO: Registering decoders
diff --git a/src/libzisp/value/sstr.zig b/src/libzisp/value/sstr.zig
index 896b8d7..2be2647 100644
--- a/src/libzisp/value/sstr.zig
+++ b/src/libzisp/value/sstr.zig
@@ -31,7 +31,7 @@ pub fn isValidSstr(s: []const u8) bool {
 
 fn assertValidSstr(s: []const u8) void {
     if (!isValidSstr(s)) {
-        std.debug.print("invalid sstr: {s}", .{s});
+        std.debug.print("invalid sstr: {s}\n", .{s});
         @panic("invalid sstr");
     }
 }
@@ -40,6 +40,8 @@ fn assertValidSstr(s: []const u8) void {
 // shifting and bit masking, but memcpy always wins easily according to our
 // micro-benchmarks, under both ReleaseSafe and ReleaseFast.
 
+// Note: rune.zig uses equivalent code; probably good to keep in sync.
+
 pub fn pack(s: []const u8) Value {
     assertValidSstr(s);
     var v = Value{ .sstr = .{ .string = 0 } };
@@ -49,6 +51,7 @@ pub fn pack(s: []const u8) Value {
 }
 
 pub fn unpack(v: Value) struct { [6]u8, u3 } {
+    assert(v);
     const s: [6]u8 = @bitCast(v.sstr.string);
     inline for (0..6) |i| {
         if (s[i] == 0) return .{ s, i };
author	Taylan Kammer <taylan.kammer@gmail.com>	2025-02-19 23:29:26 +0100
committer	Taylan Kammer <taylan.kammer@gmail.com>	2025-02-19 23:29:26 +0100
commit	4e88891235664917a2db44b84c0bbeeb13dd71ad (patch)
tree	7ed8ac2272ce92054fdf2f4e5e09b156dfc5a4d1 /src/libzisp
parent	4d0db1a1065f18d879b3ff90da6ecb14e9e1ae31 (diff)