diff options
| author | Taylan Kammer <taylan.kammer@gmail.com> | 2025-02-19 23:29:26 +0100 |
|---|---|---|
| committer | Taylan Kammer <taylan.kammer@gmail.com> | 2025-02-19 23:29:26 +0100 |
| commit | 4e88891235664917a2db44b84c0bbeeb13dd71ad (patch) | |
| tree | 7ed8ac2272ce92054fdf2f4e5e09b156dfc5a4d1 /src/libzisp | |
| parent | 4d0db1a1065f18d879b3ff90da6ecb14e9e1ae31 (diff) | |
update
Diffstat (limited to 'src/libzisp')
| -rw-r--r-- | src/libzisp/read.zig | 378 | ||||
| -rw-r--r-- | src/libzisp/value.zig | 48 | ||||
| -rw-r--r-- | src/libzisp/value/fixnum.zig | 4 | ||||
| -rw-r--r-- | src/libzisp/value/ptr.zig | 20 | ||||
| -rw-r--r-- | src/libzisp/value/rune.zig | 75 | ||||
| -rw-r--r-- | src/libzisp/value/sstr.zig | 5 |
6 files changed, 442 insertions, 88 deletions
diff --git a/src/libzisp/read.zig b/src/libzisp/read.zig index 9ef9891..812b7c7 100644 --- a/src/libzisp/read.zig +++ b/src/libzisp/read.zig @@ -5,101 +5,369 @@ const value = @import("value.zig"); const Value = value.Value; +const Next = enum { + start, + datum, + hash_end, + rune_datum_end, + quote_end, + list, + list_end, + done, +}; + const State = struct { alloc: std.mem.Allocator, + input: []const u8, pos: usize = 0, - next: enum { - start, + mode: enum { code, data } = .code, - list, - list_end, + next: Next = .start, - err, + parent: ?*State = null, - done, - } = .start, + last_rune: ?Value = null, + list_tail: ?Value = null, retval: Value = value.eof.eof, - parent: ?*State = null, + fn eof(self: *State) bool { + return self.pos >= self.input.len; + } + + fn peek(self: *State) u8 { + return self.input[self.pos]; + } + + fn getc(self: *State) u8 { + const c = self.peek(); + self.pos += 1; + return c; + } + + fn isFinalNull(self: *State) bool { + return self.peek() == 0 and self.pos == self.input.len - 1; + } + + fn newChild(self: *State, next: Next) *State { + const s = self.alloc.create(State) catch @panic("OOM"); + s.* = State{ .alloc = self.alloc, .input = self.input }; + s.pos = self.pos; + s.mode = self.mode; + s.next = next; + s.parent = self; + return s; + } + + fn setReturn(self: *State, val: Value) *State { + self.retval = val; + self.next = .done; + return self; + } + + fn finish(self: *State) ?*State { + if (self.parent) |p| { + p.retval = self.retval; + p.pos = self.pos; + p.alloc.destroy(self); + return p; + } else { + return null; + } + } }; pub fn read(input: []const u8) Value { var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; var top = State{ .alloc = gpa.allocator(), .input = input }; var s = ⊤ - while (s.pos <= s.input.len) : (s.pos += 1) { - s = switch (s.next) { - .start => start(s), - - .list => list(s), - .list_end => list(s), - - .err => err(s), - - .done => ret: { - if (s.parent) |parent| { - s.alloc.destroy(s); - break :ret parent; - } else { - return s.retval; - } - }, - }; + while (true) s = switch (s.next) { + .start => start(s), + .datum => datum(s), + .hash_end => hashEnd(s), + .rune_datum_end => runeDatumEnd(s), + .quote_end => quoteEnd(s), + .list => list(s), + .list_end => list(s), + .done => s.finish() orelse break, + }; + if (s.eof() or s.isFinalNull()) { + return s.retval; + } else { + @panic("unconsumed input"); } - unreachable; } fn start(s: *State) *State { - switch (s.input[s.pos]) { - 0...8 => s.next = .err, - - '\t', '\n' => {}, + while (true) { + if (s.eof()) { + s.next = .done; + return s; + } + const c = s.getc(); + if (isWhitespace(c)) { + continue; + } + return switch (c) { + // whitespace checked above + 0...31, 127...255 => err(s, "invalid character"), + ')', ']', '}' => err(s, "unexpected closing bracket"), + ';' => semi(s), + else => ret: { + // backtrack; let other function handle it + s.pos -= 1; + break :ret datum(s); + }, + }; + } +} - 11...31 => s.next = .err, +fn semi(s: *State) *State { + while (true) { + if (s.eof()) { + s.next = .done; + return s; + } + const c = s.getc(); + if (c == '\n') { + break; + } + } + return s; +} - ' ' => {}, +fn datum(s: *State) *State { + const c = s.getc(); + if (isWhitespace(c)) { + return err(s, "expected datum, got whitespace"); + } + return switch (c) { + // whitespace checked above + 0...31, 127...255 => err(s, "invalid character"), + ')', ']', '}' => err(s, "unexpected closing bracket"), + ';' => err(s, "expected datum, got semicolon"), + '"' => string(s), + '#' => hash(s), + '\'' => quote(s), + '(' => list(s), + '+' => plus(s), + ',' => comma(s), + '.' => dot(s), + '0'...'9' => number(s), + '[' => square(s), + '`' => backtick(s), + '{' => brace(s), + else => symbol(s), + }; +} - '!' => s.next = .err, +fn isWhitespace(c: u8) bool { + return switch (c) { + '\t', '\n', ' ' => true, + else => false, + }; +} - '"' => quotedString(s), +// Whitespace, semicolon, and closing brackets of any kind +fn isEndDelimiter(c: u8) bool { + return switch (c) { + '\t', '\n', ' ', ';' => true, + ')', ']', '}' => true, + else => false, + }; +} - else => s.next = .err, +fn string(s: *State) *State { + const str = readString(s) catch return err(s, "unclosed string"); + if (s.mode == .code) { + // "foo bar" => (#string . "foo bar") + const rune = value.rune.pack("string"); + const pair = value.pair.cons(rune, str); + return s.setReturn(pair); + } else { + return s.setReturn(str); } - return s; } -fn quotedString(s: *State) void { - var buf: [6]u8 = .{0} ** 6; - const len = readString(&buf, s); - s.retval = value.pair.cons( - value.sstr.pack("quote"), - value.pair.cons( - value.sstr.pack(buf[0..len]), - value.nil.nil, - ), - ); - s.next = .done; +const StringReadError = enum { UnclosedString }; + +fn readString(s: *State) error{UnclosedString}!Value { + return try tryReadSstr(s) orelse readLongString(s); } -fn readString(buf: []u8, s: *State) usize { - s.pos += 1; // skip opening quote - for (s.input[s.pos..], 0..) |c, i| { +fn tryReadSstr(s: *State) error{UnclosedString}!?Value { + // We will reset to this position if we fail. + const start_pos = s.pos; + + var buf: [6]u8 = undefined; + var i: usize = 0; + while (!s.eof()) { + const c = s.getc(); if (c == '"') { - s.pos += i; - return i; + // ok, return what we accumulated + return value.sstr.pack(buf[0..i]); } + if (i == 6) { + // failed; reset and bail out + s.pos = start_pos; + return null; + } + // ok, save this byte and go on buf[i] = c; + i += 1; + } + return error.UnclosedString; +} + +fn readLongString(s: *State) Value { + _ = s; + @panic("not implemented"); +} + +fn hash(s: *State) *State { + if (isWhitespace(s.peek())) { + return err(s, "whitespace after hash sign"); } - unreachable; + + // is it a datum comment? + if (s.peek() == ';') { + // consume semicolon + _ = s.getc(); + // Just ignore value and return to starting state after reading it. + s.next = .start; + } else { + s.next = .hash_end; + } + + // No whitespace or anything; hash must be immediately followed by datum, + // including if it's a datum comment. Note that if it's actually a rune + // we're reading, like #foo, we abuse our ability to reading an sstr here + // and later turn it into a rune instead, since they're the same length. + return s.newChild(.datum); +} + +fn hashEnd(s: *State) *State { + // It's not actually a sstr but a rune, like: #foo or #foo(...) + if (value.sstr.check(s.retval)) { + return hashRuneEnd(s); + } + + // Hash followed by an actual datum; becomes a (#hash ...) invocation: + // + // #(...) -> (#hash . (...)) + // + // #"..." -> (#hash . "...") + // + + // But data mode doesn't allow that. + if (s.mode == .data) { + return err(s, "invalid use of hash in data mode"); + } + + // Also, bare long strings are not OK here; too similar to a rune. + if (value.ptr.checkZisp(s.retval, .string)) { + return err(s, "long string after hash sign"); + } + + return s.setReturn(value.pair.cons( + value.rune.pack("hash"), + s.retval, + )); +} + +// Note: Can only come here from hashEnd(). +fn hashRuneEnd(s: *State) *State { + // Convert the fake sstr that was meant to be a rune. + const rune = value.rune.make(s.retval); + + // Maybe it's a stand-alone rune, like: #foo + if (isEndDelimiter(s.peek())) { + // Which is only allowed in data mode. + if (s.mode == .code) { + return err(s, "bare runes not allowed in code"); + } else { + return s.setReturn(rune); + } + } + + // Otherwise, it's followed by a datum, like: #foo(...) + + // Which is only allowed in code mode. + if (s.mode == .data) { + return err(s, "invalid use of hash in data mode"); + } else { + s.last_rune = rune; + s.next = .rune_datum_end; + return s.newChild(.datum); + } +} + +fn runeDatumEnd(s: *State) *State { + if (s.last_rune) |rune| { + return s.setReturn(value.pair.cons(rune, s.retval)); + } else { + unreachable; + } +} + +fn quote(s: *State) *State { + // Allowed in Scheme, but why? Not in Zisp. + if (isWhitespace(s.peek())) { + return err(s, "whitespace after apostrophe"); + } + s.next = .quote_end; + const c = s.newChild(.datum); + c.mode = .data; + return c; +} + +fn quoteEnd(s: *State) *State { + return s.setReturn(value.pair.cons( + value.rune.pack("quote"), + s.retval, + )); } fn list(s: *State) *State { return s; } -fn err(s: *State) *State { +fn plus(s: *State) *State { + return s; +} + +fn comma(s: *State) *State { + return s; +} + +fn dot(s: *State) *State { + return s; +} + +fn number(s: *State) *State { + return s; +} + +fn square(s: *State) *State { + return s; +} + +fn backtick(s: *State) *State { + return s; +} + +fn brace(s: *State) *State { + return s; +} + +fn symbol(s: *State) *State { return s; } + +fn err(s: *State, msg: []const u8) *State { + _ = s; + std.debug.print("{s}\n", .{msg}); + @panic("reader error"); +} diff --git a/src/libzisp/value.zig b/src/libzisp/value.zig index dd9df3c..46ba9fa 100644 --- a/src/libzisp/value.zig +++ b/src/libzisp/value.zig @@ -65,9 +65,9 @@ // // 001 :: Weak pointer to Zisp heap object // -// 01. :: Undefined +// 01. :: Undefined (may be used by GC to flag pointers for some reason?) // -// 1.. :: Undefined +// 1.. :: Foreign pointer (basically, a 50-bit fixnum of another type) // // This means pointers to the Zisp heap are 48 bits. Of those, we only really // need 45, since 64-bit platforms are in practice limited to 48-bit addresses, @@ -79,12 +79,15 @@ // regular Zisp heap pointer can never be null. Weak pointers, which can be // null, avoid stepping on that forbidden value thanks to bit 49 being set. // +// Foreign pointers allow storing arbitrary pointers, or integers basically, of +// up to 50 bits, without any further definition in Zisp of what they mean. +// // // === Other values === // -// This 51-bit range is divided as follows, based on the initial bits: +// This 51-bit range is divided as follows, based on the high bits: // -// 000 :: Undefined +// 000 :: Runes // // 001 :: Short string // @@ -94,9 +97,12 @@ // // 1.. :: Undefined // -// Zisp strings are immutable and always encoded in UTF-8. Any string fitting -// into 6 bytes or less will be stored as an immediate value, not requiring any -// heap allocation or interning. (It's implicitly interned.) +// Runes are symbols of 1 to 6 ASCII letters, used to implement reader syntax; +// both built-in and extensions. +// +// Zisp strings are immutable. Any string fitting into 6 bytes or less will be +// stored as an immediate value, not requiring any heap allocation or interning. +// It's implicitly interned, so to speak. This includes the empty string. // // The null byte serves as a terminator and cannot appear in these strings; a // string that short but actually containing a null byte will need to be heap @@ -115,15 +121,19 @@ // 8 bits are allowed to be set, with the other 40 being reserved, so there's a // limit of 256 singleton values that can be defined. // -// And on top of all that we still have a 48-bit and a 50-bit range left! +// And on top of all that we still have a 50-bit range left! // -// The forbidden value 4, Positive Infinity, is in the 48-bit undefined value -// range starting with the 000 tag. +// The forbidden value 4, Positive Infinity, would be the "empty string rune" +// but that isn't allowed anyway, so all is fine. // // Here's the original article explaining the strategy: // -// https://tkammer.de/zisp/notes/nan.html +// https://tkammer.de/zisp/notes/nan.html +// +// More about runes: +// +// https://tkammer.de/zisp/notes/symbols.html // // Note: Packed structs are least-to-most significant, so the order of fields // must be reversed relative to a typical big-endian illustration of the bit @@ -136,6 +146,7 @@ pub const fixnum = @import("value/fixnum.zig"); pub const ptr = @import("value/ptr.zig"); +pub const rune = @import("value/rune.zig"); pub const sstr = @import("value/sstr.zig"); pub const char = @import("value/char.zig"); pub const boole = @import("value/boole.zig"); @@ -144,7 +155,7 @@ pub const eof = @import("value/eof.zig"); pub const pair = @import("value/pair.zig"); -/// To fill up the u11 exponent part of a NaN. +// To fill up the u11 exponent part of a NaN. const FILL = 0x7ff; /// Represents a Zisp value/object. @@ -214,6 +225,16 @@ pub const Value = packed union { _is_ifxnum: bool, }, + /// For initializing and reading runes. + rune: packed struct { + // actually [6]u8 but packed struct cannot contain arrays + name: u48, + _tag: OtherTag = .rune, + _is_ptr: bool = false, + _: u11 = FILL, + _is_fixnum: bool = false, + }, + /// For initializing and reading short strings. sstr: packed struct { // actually [6]u8 but packed struct cannot contain arrays @@ -244,7 +265,7 @@ pub const Value = packed union { _is_fixnum: bool = false, }, - const OtherTag = enum(u3) { sstr = 1, char = 2, misc = 3 }; + const OtherTag = enum(u3) { rune, sstr, char, misc }; const Self = @This(); @@ -282,6 +303,7 @@ pub const Value = packed union { return self.isPacked() and !self.ieee.sign and !self.ieee.quiet; } + /// Checks for any "other" type of value. pub fn isOther(self: Self, tag: OtherTag) bool { return self._isOther() and self.other.tag == tag; } diff --git a/src/libzisp/value/fixnum.zig b/src/libzisp/value/fixnum.zig index 888dd3a..c705880 100644 --- a/src/libzisp/value/fixnum.zig +++ b/src/libzisp/value/fixnum.zig @@ -28,11 +28,11 @@ pub fn isValidRange(int: i64) bool { fn assertValidRange(int: i64) void { if (int < fixnum_min) { - std.debug.print("int too small for fixnum: {}", .{int}); + std.debug.print("int too small for fixnum: {}\n", .{int}); @panic("int too small for fixnum"); } if (int > fixnum_max) { - std.debug.print("int too large for fixnum: {}", .{int}); + std.debug.print("int too large for fixnum: {}\n", .{int}); @panic("int too large for fixnum"); } } diff --git a/src/libzisp/value/ptr.zig b/src/libzisp/value/ptr.zig index fe13af5..e1fadf2 100644 --- a/src/libzisp/value/ptr.zig +++ b/src/libzisp/value/ptr.zig @@ -31,27 +31,13 @@ pub fn assertForeign(v: Value) void { } } -pub fn checkForeignRange(ptr: *anyopaque) bool { - const int = @intFromPtr(ptr); - return int <= std.math.maxInt(u50); -} - -fn assertForeignRange(ptr: *anyopaque) void { - if (!checkForeignRange(ptr)) { - std.debug.print("foreign pointer out of range: {}\n", .{ptr}); - @panic("foreign pointer out of range"); - } -} - -pub fn packForeign(ptr: *anyopaque) Value { - assertForeignRange(ptr); - const int: u50 = @intCast(@intFromPtr(ptr)); +pub fn packForeign(int: u50) Value { return .{ .fptr = .{ .value = int } }; } -pub fn unpackForeign(v: Value) *anyopaque { +pub fn unpackForeign(v: Value) u50 { assertForeign(v); - return @ptrFromInt(v.fptr.value); + return v.fptr.value; } // Zisp Pointers diff --git a/src/libzisp/value/rune.zig b/src/libzisp/value/rune.zig new file mode 100644 index 0000000..ab251b4 --- /dev/null +++ b/src/libzisp/value/rune.zig @@ -0,0 +1,75 @@ +const std = @import("std"); + +const value = @import("../value.zig"); + +const Value = value.Value; + +// Zig API + +pub fn check(v: Value) bool { + return v.isOther(.rune); +} + +pub fn assert(v: Value) void { + if (!check(v)) { + v.dump(); + @panic("not rune"); + } +} + +pub fn isValidRune(s: []const u8) bool { + if (s.len == 0 or s.len > 6) { + return false; + } + for (s) |c| { + switch (c) { + 'A'...'Z' => {}, + 'a'...'z' => {}, + else => return false, + } + } + return true; +} + +fn assertValidRune(s: []const u8) void { + if (!isValidRune(s)) { + std.debug.print("invalid rune: '{s}'\n", .{s}); + @panic("invalid rune"); + } +} + +// See sstr.zig which uses equivalent code; probably good to keep in sync. + +pub fn pack(s: []const u8) Value { + assertValidRune(s); + var v = Value{ .rune = .{ .name = 0 } }; + const dest: [*]u8 = @ptrCast(&v.rune.name); + @memcpy(dest, s); + return v; +} + +pub fn unpack(v: Value) struct { [6]u8, u3 } { + const s: [6]u8 = @bitCast(v.rune.name); + inline for (0..6) |i| { + if (s[i] == 0) return .{ s, i }; + } + return .{ s, 6 }; +} + +// Zisp API + +pub fn pred(v: Value) Value { + return value.boole.pack(check(v)); +} + +pub fn make(v: Value) Value { + const s, const l = value.sstr.unpack(v); + return pack(s[0..l]); +} + +pub fn getName(v: Value) Value { + const s, const l = unpack(v); + return value.sstr.pack(s[0..l]); +} + +// TODO: Registering decoders diff --git a/src/libzisp/value/sstr.zig b/src/libzisp/value/sstr.zig index 896b8d7..2be2647 100644 --- a/src/libzisp/value/sstr.zig +++ b/src/libzisp/value/sstr.zig @@ -31,7 +31,7 @@ pub fn isValidSstr(s: []const u8) bool { fn assertValidSstr(s: []const u8) void { if (!isValidSstr(s)) { - std.debug.print("invalid sstr: {s}", .{s}); + std.debug.print("invalid sstr: {s}\n", .{s}); @panic("invalid sstr"); } } @@ -40,6 +40,8 @@ fn assertValidSstr(s: []const u8) void { // shifting and bit masking, but memcpy always wins easily according to our // micro-benchmarks, under both ReleaseSafe and ReleaseFast. +// Note: rune.zig uses equivalent code; probably good to keep in sync. + pub fn pack(s: []const u8) Value { assertValidSstr(s); var v = Value{ .sstr = .{ .string = 0 } }; @@ -49,6 +51,7 @@ pub fn pack(s: []const u8) Value { } pub fn unpack(v: Value) struct { [6]u8, u3 } { + assert(v); const s: [6]u8 = @bitCast(v.sstr.string); inline for (0..6) |i| { if (s[i] == 0) return .{ s, i }; |
