diff options
| author | Taylan Kammer <taylan.kammer@gmail.com> | 2025-02-20 18:25:25 +0100 |
|---|---|---|
| committer | Taylan Kammer <taylan.kammer@gmail.com> | 2025-02-20 18:25:25 +0100 |
| commit | 3b713ef3e872bda3da9e5a67a9bfd5c6701cb665 (patch) | |
| tree | 65c98d427b2616129e0338e0fe6d0c8fb709d029 | |
| parent | 4e88891235664917a2db44b84c0bbeeb13dd71ad (diff) | |
update
| -rw-r--r-- | html/notes/format.md | 2 | ||||
| -rw-r--r-- | src/libzisp.zig | 2 | ||||
| -rw-r--r-- | src/libzisp/list.zig | 20 | ||||
| -rw-r--r-- | src/libzisp/read.zig | 709 |
4 files changed, 521 insertions, 212 deletions
diff --git a/html/notes/format.md b/html/notes/format.md index 39da84a..f757736 100644 --- a/html/notes/format.md +++ b/html/notes/format.md @@ -1,3 +1,5 @@ +# I hate 'display' + WIP WIP WIP (format "template" arg ...) ;sprintf diff --git a/src/libzisp.zig b/src/libzisp.zig index 174cd40..29b87dd 100644 --- a/src/libzisp.zig +++ b/src/libzisp.zig @@ -271,4 +271,6 @@ test "read2" { const f, const fl = value.sstr.unpack(value.pair.cdr(cdr)); try std.testing.expectEqualStrings("foo", f[0..fl]); + + _ = read.read("(\"foo\" '\"bar\" [#x \"baz\"])"); } diff --git a/src/libzisp/list.zig b/src/libzisp/list.zig new file mode 100644 index 0000000..a4ce7a8 --- /dev/null +++ b/src/libzisp/list.zig @@ -0,0 +1,20 @@ +const value = @import("value.zig"); + +const Value = value.Value; + +pub fn reverse(list: Value) Value { + return reverseWithTail(list, value.nil.nil); +} + +pub fn reverseWithTail(list: Value, tail: Value) Value { + var head = list; + var result = tail; + while (!value.nil.check(head)) { + value.pair.assert(head); + const car = value.pair.car(head); + const cdr = value.pair.cdr(head); + result = value.pair.cons(car, result); + head = cdr; + } + return result; +} diff --git a/src/libzisp/read.zig b/src/libzisp/read.zig index 812b7c7..74ae51b 100644 --- a/src/libzisp/read.zig +++ b/src/libzisp/read.zig @@ -1,21 +1,121 @@ +// +// === Parser for Code & Data === +// +// Zisp s-expressions come in two flavors: code (sugar) and data (no sugar). +// +// Code expressions are first parsed into the same data types which the data +// expressions can express; it's merely surface-level syntax sugar. +// +// Data expressions don't support any syntax sugar and have a simple format, +// only being able to represent the following data types: +// +// string -> foo , "foo bar" +// +// number -> 123 , -1.23 , +123 , +nan.0 , -inf.0 , ... +// +// rune -> #foo ;limited to 6 ASCII letters (a - z, A - Z) +// +// list -> (...) ;the usual: actually pairs; may be improper +// +// null -> () ;also called nil around here +// +// We may use terms like "code parser" and "data parser" out of convenience, +// although there may only be a single parser that implements both formats by +// switching between modes. +// +// When the code parser encounters syntax sugar, it always transforms it into a +// list starting with a rune, like in the following examples: +// +// #(...) -> (#hash ...) +// +// [...] -> (#square ...) +// +// 'foo -> (#quote . foo) +// +// These can combine: +// +// #{...} -> (#hash #brace ...) +// +// #'foo -> (#hash #quote . foo) +// +// ##'[...] -> (#hash #hash #quote #square ...) +// +// As a specialty, double-quoted strings are actually considered sugar by the +// code parser, and are transformed as follows into data: +// +// "..." -> (#string "...") +// +// (Otherwise, all string literals would be identifiers, or all identifiers +// would be string literals, because Zisp doesn't differentiate strings and +// symbols like traditional lisps.) +// +// +// === Decoder === +// +// A separate process called "decoding" can transform simple data structures, +// consisting of only the above types, into a richer set of Zisp data types. +// +// For example, the decoder may turn (#hash ...) into a vector. Some runes may +// be decoded in isolation rather than as part of a list, which is how #true and #false are implemented. +// +//It also +// interprets common rune invocations like (#quote ...) to implement the +// traditional quoting mechanism, but in a better way: +// +// Traditional quote is "unhygienic" in Scheme terms. An expressoin such as +// '(foo bar) will always be read as (quote (foo bar)) regardless of what sort +// of lexical context it appears in, so the semantics will depend on whatever +// the identifier "quote" is bound to in that lexical context. +// +// The Zisp decoder, which transforms not text to text, but objects to objects, +// can turn (#quote ...) into an abstract object which encapsulates the notion +// of quoting, which the Zisp evaluator can recognize and act upon. +// +// One way to think about this, in Scheme (R6RS / syntax-case) terms, is that +// expressions like '(foo bar) turn directly into a *syntax object* when read, +// rather than a regular list object. +// +// +// === Implementation details === +// +// Instead of using recursion directly, the parser is written in something akin +// to a manual continuation-passing style, which ensures that parsing depth is +// not limited by the stack size. +// +// All state/context is passed around via a struct pointer. The parser has a +// main loop which calls a function, passes it the state, and expects to get a +// new state pointer in return, which tells which function the main loop should +// call next, based on the .next field of the state. +// +// When a called function wants to call the parser recursively, it sets the +// .next field to an enumeration value that indicates where the parser should +// return to after it's done with the sub-parsing, and then constructs a new +// state struct, saving a pointer to the original in a .parent field. +// +// Making the parser "return" is a matter of setting the .retval field, and +// setting the .next field to the value .finish, to indicate to the main loop +// that it should either pass control back to the parent, or finish parsing. +// +// +// === Notation used in comments === +// +// Some comments throughout the file give you an example of where the parser +// currently is in a stream. These illustrations use the pipe symbol, which +// looks like a cursor, to indicate the current position of the parser: +// +// (foo| bar baz) <- parser arrived at the end of the string foo +// +// (foo bar baz)| <- parser reached EOF (assuming no trailing spaces) +// + const std = @import("std"); const gc = @import("gc.zig"); +const list = @import("list.zig"); const value = @import("value.zig"); const Value = value.Value; -const Next = enum { - start, - datum, - hash_end, - rune_datum_end, - quote_end, - list, - list_end, - done, -}; - const State = struct { alloc: std.mem.Allocator, @@ -24,12 +124,14 @@ const State = struct { mode: enum { code, data } = .code, - next: Next = .start, + next: Next = .start_parsing, parent: ?*State = null, - last_rune: ?Value = null, - list_tail: ?Value = null, + datum_rune: Value = value.boole.f, + list_tail: Value = value.nil.nil, + opening_bracket: enum { paren, square, brace } = .paren, + opening_quote: enum { apos, tick, comma } = .apos, retval: Value = value.eof.eof, @@ -41,146 +143,303 @@ const State = struct { return self.input[self.pos]; } + fn skip(self: *State) void { + self.pos += 1; + } + fn getc(self: *State) u8 { const c = self.peek(); - self.pos += 1; + self.skip(); return c; } + // Consumes whitespace and line comments. + fn consumeBlanks(self: *State) void { + while (!self.eof()) { + if (self.isWhitespace()) { + self.skip(); + } else if (self.peek() == ';') { + self.skip(); + self.consumeLineComment(); + } else { + return; + } + } + } + + fn consumeLineComment(self: *State) void { + while (!self.eof()) { + if (self.getc() == '\n') { + return; + } + } + } + + fn isWhitespace(self: *State) bool { + return switch (self.peek()) { + '\t', '\n', ' ' => true, + else => false, + }; + } + + // Checks for: Whitespace, closing brackets, and EOF. + // + // This can tell us that we're in a position such as: + // + // (foo| bar) + // + // (foo bar|) + // + // foo| + // + // We could also accept semicolon, so the following works, like in Scheme: + // + // (foo;comment + // bar) + // + // But IMO this should be an error. It's too easy to misread, and might + // just be a typo: the semicolon may have been meant to be a colon. + // + fn isEndDelimiter(self: *State) bool { + return switch (self.peek()) { + '\t', '\n', ' ', ')', ']', '}' => true, + else => false, + }; + } + fn isFinalNull(self: *State) bool { return self.peek() == 0 and self.pos == self.input.len - 1; } - fn newChild(self: *State, next: Next) *State { - const s = self.alloc.create(State) catch @panic("OOM"); - s.* = State{ .alloc = self.alloc, .input = self.input }; - s.pos = self.pos; - s.mode = self.mode; - s.next = next; - s.parent = self; - return s; + fn newSubstate(self: *State, next: Next) *State { + const sub = self.alloc.create(State) catch @panic("OOM"); + sub.* = .{ .alloc = self.alloc, .input = self.input }; + sub.pos = self.pos; + sub.mode = self.mode; + sub.next = next; + sub.parent = self; + return sub; } fn setReturn(self: *State, val: Value) *State { self.retval = val; - self.next = .done; + self.next = .finish; return self; } fn finish(self: *State) ?*State { - if (self.parent) |p| { - p.retval = self.retval; - p.pos = self.pos; - p.alloc.destroy(self); - return p; + if (self.parent) |parent| { + parent.retval = self.retval; + parent.pos = self.pos; + parent.alloc.destroy(self); + return parent; } else { return null; } } }; +// Probably best *not* to use function pointers here, but rather dispatch to +// functions manually based on enum value. This should help the optimizer. + +const Next = enum { + start_parsing, + start_datum, + end_hash_datum, + end_rune_datum, + end_quote, + continue_list, + end_improper_list, + finish, +}; + pub fn read(input: []const u8) Value { var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; var top = State{ .alloc = gpa.allocator(), .input = input }; var s = ⊤ - while (true) s = switch (s.next) { - .start => start(s), - .datum => datum(s), - .hash_end => hashEnd(s), - .rune_datum_end => runeDatumEnd(s), - .quote_end => quoteEnd(s), - .list => list(s), - .list_end => list(s), - .done => s.finish() orelse break, - }; + while (true) { + s = switch (s.next) { + .start_parsing => startParsing(s), + .start_datum => startDatum(s), + .end_hash_datum => endHashDatum(s), + .end_rune_datum => endRuneDatum(s), + .end_quote => endQuote(s), + .continue_list => continueList(s), + .end_improper_list => endImproperList(s), + .finish => s.finish() orelse break, + }; + std.debug.print("next: {}\n", .{s.next}); + } if (s.eof() or s.isFinalNull()) { return s.retval; } else { - @panic("unconsumed input"); + // Should never happen. + err(s, "READER BUG: unconsumed input"); } } -fn start(s: *State) *State { - while (true) { - if (s.eof()) { - s.next = .done; - return s; - } - const c = s.getc(); - if (isWhitespace(c)) { - continue; - } - return switch (c) { - // whitespace checked above - 0...31, 127...255 => err(s, "invalid character"), - ')', ']', '}' => err(s, "unexpected closing bracket"), - ';' => semi(s), - else => ret: { - // backtrack; let other function handle it - s.pos -= 1; - break :ret datum(s); - }, - }; +fn startParsing(s: *State) *State { + s.consumeBlanks(); + if (s.eof()) { + return s.setReturn(value.eof.eof); } + return switch (s.peek()) { + // whitespace already consumed + 0...31, 127...255 => err(s, "invalid character"), + ')', ']', '}' => err(s, "unexpected closing bracket"), + else => startDatum(s), + }; } -fn semi(s: *State) *State { - while (true) { - if (s.eof()) { - s.next = .done; - return s; - } - const c = s.getc(); - if (c == '\n') { - break; - } - } - return s; -} - -fn datum(s: *State) *State { - const c = s.getc(); - if (isWhitespace(c)) { +// This is called when we *immediately* expect a datum and nothing else; for +// example, no whitespace or comments, because they've already been consumed. +fn startDatum(s: *State) *State { + if (s.isWhitespace()) { return err(s, "expected datum, got whitespace"); } - return switch (c) { + if (s.eof()) { + return err(s, "expected datum, got EOF"); + } + return switch (s.getc()) { // whitespace checked above 0...31, 127...255 => err(s, "invalid character"), + ')', ']', '}' => err(s, "unexpected closing bracket"), + ';' => err(s, "expected datum, got semicolon"), - '"' => string(s), - '#' => hash(s), - '\'' => quote(s), - '(' => list(s), - '+' => plus(s), - ',' => comma(s), - '.' => dot(s), - '0'...'9' => number(s), - '[' => square(s), - '`' => backtick(s), - '{' => brace(s), - else => symbol(s), + + '#' => handleHash(s), + + '"' => startQuotedString(s), + + '\'', '`', ',' => |c| startQuote(s, c), + + '(', '[', '{' => |c| startList(s, c), + + '+', '-' => |c| handlePlusMinus(s, c), + + '0'...'9' => |c| handleDigit(s, c), + + // Periods only allowed between digits, and to express improper lists. + // Things like the following look too much like it could be a typo: + // + // (foo .5) (foo .bar) + // + '.' => err(s, "misplaced period"), + + else => startBareString(s), }; } -fn isWhitespace(c: u8) bool { - return switch (c) { - '\t', '\n', ' ' => true, - else => false, - }; +fn handleHash(s: *State) *State { + // + // We just consumed a hash. Possibilities include: + // + // #|foo ;rune + // + // #|;DATUM ;datum comment + // + // #|DATUM ;hash-datum (code mode only) + // + + if (s.isWhitespace()) { + return err(s, "whitespace after hash"); + } + if (s.eof()) { + return err(s, "EOF after hash"); + } + + // Is it a rune? #foo + switch (s.peek()) { + 'A'...'Z', 'a'...'z' => return handleRune(s), + else => {}, + } + + // Is it a datum comment? #;DATUM + if (s.peek() == ';') { + s.skip(); + // Don't change s.next in this case. Just let the parser try to redo + // what it was doing as soon as the commented-out datum has been read. + return s.newSubstate(.start_datum); + } + + // Otherwise, it must be a hash-datum. #DATUM + + // But data mode doesn't allow that. + if (s.mode == .data) { + return err(s, "invalid use of hash in data mode"); + } + + s.next = .end_hash_datum; + return s.newSubstate(.start_datum); } -// Whitespace, semicolon, and closing brackets of any kind -fn isEndDelimiter(c: u8) bool { - return switch (c) { - '\t', '\n', ' ', ';' => true, - ')', ']', '}' => true, - else => false, +fn handleRune(s: *State) *State { + // + // We are now here, knowing that at least one ASCII letter follows: + // + // #|foo + // + + var buf: [6]u8 = undefined; + var i: u8 = 0; + while (!s.eof()) : (i += 1) switch (s.peek()) { + 'a'...'z', 'A'...'Z' => { + if (i == buf.len) { + return err(s, "rune too long"); + } + buf[i] = s.getc(); + }, + else => break, }; + + // Note: 'i' can't be 0 since this function is only called if at least one + // ASCII letter is found after the hash. + + const rune = value.rune.pack(buf[0..i]); + + // + // Now we're at the end of the rune, but it could be a rune-datum: + // + // #foo|(...) + // + + if (s.isEndDelimiter()) { + return s.setReturn(rune); + } + + // Otherwise, it's followed by a datum, like: #foo(...) + + // Which is only allowed in code mode. + if (s.mode == .data) { + return err(s, "invalid use of hash in data mode"); + } + + s.datum_rune = rune; + s.next = .end_rune_datum; + return s.newSubstate(.start_datum); } -fn string(s: *State) *State { - const str = readString(s) catch return err(s, "unclosed string"); +fn endRuneDatum(s: *State) *State { + return s.setReturn(value.pair.cons( + s.datum_rune, + s.retval, + )); +} + +fn endHashDatum(s: *State) *State { + return s.setReturn(value.pair.cons( + value.rune.pack("hash"), + s.retval, + )); +} + +fn startQuotedString(s: *State) *State { + // We are now here: + // + // "|..." + // + const str = readQuotedString(s) catch return err(s, "unclosed string"); if (s.mode == .code) { // "foo bar" => (#string . "foo bar") const rune = value.rune.pack("string"); @@ -193,16 +452,16 @@ fn string(s: *State) *State { const StringReadError = enum { UnclosedString }; -fn readString(s: *State) error{UnclosedString}!Value { - return try tryReadSstr(s) orelse readLongString(s); +fn readQuotedString(s: *State) error{UnclosedString}!Value { + return try readQuotedSstr(s) orelse readQuotedLongString(s); } -fn tryReadSstr(s: *State) error{UnclosedString}!?Value { +fn readQuotedSstr(s: *State) error{UnclosedString}!?Value { // We will reset to this position if we fail. const start_pos = s.pos; var buf: [6]u8 = undefined; - var i: usize = 0; + var i: u8 = 0; while (!s.eof()) { const c = s.getc(); if (c == '"') { @@ -221,153 +480,179 @@ fn tryReadSstr(s: *State) error{UnclosedString}!?Value { return error.UnclosedString; } -fn readLongString(s: *State) Value { +fn readQuotedLongString(s: *State) Value { _ = s; @panic("not implemented"); } -fn hash(s: *State) *State { - if (isWhitespace(s.peek())) { - return err(s, "whitespace after hash sign"); - } - - // is it a datum comment? - if (s.peek() == ';') { - // consume semicolon - _ = s.getc(); - // Just ignore value and return to starting state after reading it. - s.next = .start; - } else { - s.next = .hash_end; +fn startQuote(s: *State, c: u8) *State { + // Allowed in Scheme, but why? Not in Zisp. + if (s.isWhitespace()) { + return err(s, "whitespace after apostrophe"); } + s.opening_quote = switch (c) { + '\'' => .apos, + '`' => .tick, + ',' => .comma, + else => unreachable, + }; + const sub = s.newSubstate(.start_datum); + sub.mode = .data; + s.next = .end_quote; + return sub; +} - // No whitespace or anything; hash must be immediately followed by datum, - // including if it's a datum comment. Note that if it's actually a rune - // we're reading, like #foo, we abuse our ability to reading an sstr here - // and later turn it into a rune instead, since they're the same length. - return s.newChild(.datum); +fn endQuote(s: *State) *State { + const name = switch (s.opening_quote) { + .apos => "apos", + .tick => "tick", + .comma => "comma", + }; + return s.setReturn(value.pair.cons( + value.rune.pack(name), + s.retval, + )); } -fn hashEnd(s: *State) *State { - // It's not actually a sstr but a rune, like: #foo or #foo(...) - if (value.sstr.check(s.retval)) { - return hashRuneEnd(s); +fn startList(s: *State, open: u8) *State { + if (s.mode == .data and open != '(') { + return err(s, "invalid opening bracket in data mode"); } - // Hash followed by an actual datum; becomes a (#hash ...) invocation: - // - // #(...) -> (#hash . (...)) - // - // #"..." -> (#hash . "...") - // + s.consumeBlanks(); - // But data mode doesn't allow that. - if (s.mode == .data) { - return err(s, "invalid use of hash in data mode"); + // Check for empty lists: (), [], {} + if (open == '(' and s.peek() == ')') { + s.skip(); + return s.setReturn(value.nil.nil); } - - // Also, bare long strings are not OK here; too similar to a rune. - if (value.ptr.checkZisp(s.retval, .string)) { - return err(s, "long string after hash sign"); + if (open == '[' and s.peek() == ']') { + s.skip(); + return s.setReturn(value.pair.cons( + value.rune.pack("square"), + value.nil.nil, + )); + } + if (open == '{' and s.peek() == '}') { + s.skip(); + return s.setReturn(value.pair.cons( + value.rune.pack("brace"), + value.nil.nil, + )); } - return s.setReturn(value.pair.cons( - value.rune.pack("hash"), - s.retval, - )); + s.opening_bracket = switch (open) { + '(' => .paren, + '[' => .square, + '{' => .brace, + else => unreachable, + }; + s.next = .continue_list; + return s.newSubstate(.start_datum); } -// Note: Can only come here from hashEnd(). -fn hashRuneEnd(s: *State) *State { - // Convert the fake sstr that was meant to be a rune. - const rune = value.rune.make(s.retval); +fn continueList(s: *State) *State { + s.consumeBlanks(); - // Maybe it's a stand-alone rune, like: #foo - if (isEndDelimiter(s.peek())) { - // Which is only allowed in data mode. - if (s.mode == .code) { - return err(s, "bare runes not allowed in code"); - } else { - return s.setReturn(rune); - } + if (s.eof()) { + return err(s, "unexpected EOF while parsing list"); } - // Otherwise, it's followed by a datum, like: #foo(...) + const tail = value.pair.cons(s.retval, s.list_tail); - // Which is only allowed in code mode. - if (s.mode == .data) { - return err(s, "invalid use of hash in data mode"); - } else { - s.last_rune = rune; - s.next = .rune_datum_end; - return s.newChild(.datum); - } -} + const open = s.opening_bracket; + const char = s.peek(); -fn runeDatumEnd(s: *State) *State { - if (s.last_rune) |rune| { - return s.setReturn(value.pair.cons(rune, s.retval)); - } else { - unreachable; + // Check for proper ending: (foo bar baz) + if (open == .paren and char == ')') { + s.skip(); + return s.setReturn(list.reverse(tail)); + } + if (open == .square and char == ']') { + s.skip(); + return s.setReturn(value.pair.cons( + value.rune.pack("square"), + list.reverse(tail), + )); + } + if (open == .brace and char == '}') { + s.skip(); + return s.setReturn(value.pair.cons( + value.rune.pack("brace"), + list.reverse(tail), + )); } -} -fn quote(s: *State) *State { - // Allowed in Scheme, but why? Not in Zisp. - if (isWhitespace(s.peek())) { - return err(s, "whitespace after apostrophe"); + s.list_tail = tail; + + // Check for improper ending: (foo bar . baz) + if (char == '.') { + s.skip(); + + // We should now be at (... foo .| bar) and whitespace must follow. + // Scheme allows (foo .(bar)) but we don't. Mind your spaces! + if (!s.isWhitespace()) { + return err(s, "invalid use of period"); + } + + s.consumeBlanks(); + + s.next = .end_improper_list; + return s.newSubstate(.start_datum); } - s.next = .quote_end; - const c = s.newChild(.datum); - c.mode = .data; - return c; -} -fn quoteEnd(s: *State) *State { - return s.setReturn(value.pair.cons( - value.rune.pack("quote"), - s.retval, - )); + // OK, next element. + return s.newSubstate(.start_datum); } -fn list(s: *State) *State { - return s; -} +fn endImproperList(s: *State) *State { + s.consumeBlanks(); -fn plus(s: *State) *State { - return s; -} + if (s.eof()) { + return err(s, "unexpected EOF"); + } -fn comma(s: *State) *State { - return s; -} + const open = s.opening_bracket; + const char = s.getc(); -fn dot(s: *State) *State { - return s; -} + const body = s.list_tail; + const tail = s.retval; -fn number(s: *State) *State { - return s; -} + if (open == .paren and char == ')') { + return s.setReturn(list.reverseWithTail(body, tail)); + } + if (open == .square and char == ']') { + return s.setReturn(value.pair.cons( + value.rune.pack("square"), + list.reverseWithTail(body, tail), + )); + } + if (open == .brace and char == '}') { + return s.setReturn(value.pair.cons( + value.rune.pack("brace"), + list.reverseWithTail(body, tail), + )); + } -fn square(s: *State) *State { - return s; + return err(s, "malformed list or extra datum at end of improper list"); } -fn backtick(s: *State) *State { +fn handlePlusMinus(s: *State, c: u8) *State { + _ = c; return s; } -fn brace(s: *State) *State { +fn handleDigit(s: *State, c: u8) *State { + _ = c; return s; } -fn symbol(s: *State) *State { +fn startBareString(s: *State) *State { return s; } -fn err(s: *State, msg: []const u8) *State { - _ = s; +fn err(s: *State, msg: []const u8) noreturn { std.debug.print("{s}\n", .{msg}); + std.debug.print("pos: {}\n", .{s.pos}); @panic("reader error"); } |
