diff options
| author | Taylan Kammer <taylan.kammer@gmail.com> | 2025-02-22 15:04:05 +0100 |
|---|---|---|
| committer | Taylan Kammer <taylan.kammer@gmail.com> | 2025-02-22 15:04:05 +0100 |
| commit | c922361115c8ee398ec4e26bb0af8cca4dcb9667 (patch) | |
| tree | 6b093f2ad960bf073d295458faf6707e5123c1e4 /src/libzisp/io | |
| parent | 2d52864265e06a4d863dba84e1d20c8d52d8c5dd (diff) | |
update
Diffstat (limited to 'src/libzisp/io')
| -rw-r--r-- | src/libzisp/io/decoder.zig | 1 | ||||
| -rw-r--r-- | src/libzisp/io/encoder.zig | 1 | ||||
| -rw-r--r-- | src/libzisp/io/parser.zig | 683 | ||||
| -rw-r--r-- | src/libzisp/io/reader.zig | 10 | ||||
| -rw-r--r-- | src/libzisp/io/writer.zig | 1 |
5 files changed, 696 insertions, 0 deletions
diff --git a/src/libzisp/io/decoder.zig b/src/libzisp/io/decoder.zig new file mode 100644 index 0000000..eb27e20 --- /dev/null +++ b/src/libzisp/io/decoder.zig @@ -0,0 +1 @@ +// wip diff --git a/src/libzisp/io/encoder.zig b/src/libzisp/io/encoder.zig new file mode 100644 index 0000000..eb27e20 --- /dev/null +++ b/src/libzisp/io/encoder.zig @@ -0,0 +1 @@ +// wip diff --git a/src/libzisp/io/parser.zig b/src/libzisp/io/parser.zig new file mode 100644 index 0000000..71c6946 --- /dev/null +++ b/src/libzisp/io/parser.zig @@ -0,0 +1,683 @@ +// +// === Parser for Code & Data === +// +// Zisp s-expressions come in two flavors: code (sugar) and data (no sugar). +// +// However, code expressions are parsed into the same data types which the data +// expressions can represent, so homoiconicity is preserved. +// +// The "sugar" used in code expressions is merely shorthand for more complex +// data expressions, which could have been written by hand. +// +// Data expressions have a very simple format, and are only able to express a +// minimal set of data types: +// +// string -> foo , "foo bar" ;symbols and strings are the same data type +// +// rune -> #foo ;limited to 6 ASCII letters (a - z, A - Z) +// +// pair -> (DATUM . DATUM) ;the only composite data type supported +// +// nil -> () ;we prefer the term nil over null +// +// The list short-hand syntax may be considered the only "syntax sugar" that is +// supported by the data parser: +// +// (DATUM DATUM DATUM) -> (DATUM . (DATUM . (DATUM . ()))) +// +// We may use terms like "code parser" and "data parser" out of convenience, +// although there may only be a single parser that implements both formats by +// switching between modes. +// +// When the code parser encounters syntax sugar, it always transforms it into a +// list starting with a rune, like in the following examples: +// +// #(...) -> (#HASH ...) +// +// [...] -> (#SQUARE ...) +// +// 'foo -> (#QUOTE . foo) +// +// These can combine arbitrarily: +// +// #{...} -> (#HASH #BRACE ...) +// +// #'foo -> (#HASH #QUOTE . foo) +// +// ##'[...] -> (#HASH #HASH #QUOTE #SQUARE ...) +// +// As a specialty, double-quoted strings are actually considered sugar by the +// code parser, and are transformed as follows into data: +// +// "..." -> (#STRING . "...") +// +// (Otherwise, all string literals would be identifiers, or all identifiers +// would be string literals, because Zisp doesn't differentiate strings and +// symbols like traditional lisps. Also, note that although we could reuse +// #QUOTE here, instead of using #STRING, this would make it impossible to +// differentiate between the code expressions #'foo and #"foo".) +// +// Runes are case-sensitive, and the code parser only emits runes using +// upper-case letters, so lower-case runes are free for user extensions. +// +// You may be wondering about numbers. As far as the parser is concerned, +// numbers are strings. It's the decoder (see below) that will turn bare +// strings (those not marked with #STRING) into numbers. +// +// Note that 'foo becomes (quote foo) in Scheme, but (#QUOTE . foo) in Zisp. +// The operand of #QUOTE is the entire cdr. The same principle is used when +// parsing other sugar: +// +// Incorrect Correct +// +// #(x y z) -> (#HASH (x y z)) #(x y z) -> (#HASH x y z) +// +// [x y z] -> (#SQUARE (x y z)) [x y z] -> (#SQUARE x y z) +// +// #{x} -> (#HASH (#BRACE (x))) #{x} -> (#HASH #BRACE x) +// +// +// === Decoder === +// +// A separate process called "decoding" can transform simple data structures, +// consisting of only the above types, into a richer set of Zisp data types. +// +// For example, the decoder may turn (#HASH ...) into a vector, as one would +// expect a vector literal like #(...) to work in Scheme. +// +// Runes may be decoded in isolation as well, rather than transforming a list +// whose head they appear in. This is how #true and #false are implemented. +// +// The decoder may also perform arbitrary transforms on any type; for example, +// it may turn bare strings (those not marked with #STRING) into numbers when +// it's decoding data representing code. This is how number literals are +// implemented in Zisp. +// +// The decoder recognizes (#QUOTE ...) to implement the traditional quoting +// mechanism, but in a better way: +// +// Traditional quote is "unhygienic" in Scheme terms. An expression such as +// '(foo bar) will always be read as (quote (foo bar)) regardless of what sort +// of lexical context it appears in, so the semantics will depend on whatever +// the identifier "quote" is bound to in that lexical context, meaning that the +// expression may end up evaluating to something other than the list (foo bar). +// +// The Zisp decoder, which transforms not text to text, but objects to objects, +// can turn (#QUOTE ...) into an abstract object which encapsulates the notion +// of quoting, which the Zisp evaluator can recognize and act upon, ensuring +// that an expression like '(foo bar) always turns into the list (foo bar). +// +// One way to think about this, in Scheme (R6RS / syntax-case) terms, is that +// expressions like '(foo bar) turn directly into a *syntax object* when read, +// rather than a regular list object. +// +// The decoder is, of course, configurable and extensible. The transformations +// mentioned above would be performed only when it's told to decode data which +// represents Zisp code. The decoder may be given a different configuration, +// telling it to decode, for example, data which represents a different kind of +// domain-specific data, such as application settings, build system commands, +// complex data records with non-standard data types, and so on. +// +// +// === Implementation details === +// +// Instead of using recursion directly, the parser is written in something akin +// to a manual continuation-passing style, which ensures that parsing depth is +// not limited by the stack size. +// +// All state/context is passed around via a struct pointer. The parser has a +// main loop which calls a function, passing it the state, and also expects a +// state pointer in return. +// +// The state has a .next field, which indicates which function the main loop +// should call next. It also has a .parent field, used as follows: +// +// If a function wants to call the parser recursively, it sets the .next field +// of the current state to where the recursive call should return to, then it +// creates a new state with a given starting point, sets its .parent field to +// the current state, and returns the new state pointer. +// +// If a function wants to make the parser return, it sets the .retval field of +// the current state, and sets .next to the .perform_return value. This makes +// the main loop either return to the parent state (after copying the .retval +// field from child to parent), or if there is no parent state, it returns the +// value as the top-level result. +// +// Note: While it's possible to simply set .next and return the current state, +// to have another function be called next (possibly even setting .retval to +// pass a value to it), this is completely unnecessary. A few non-recursive +// function calls will obviously not blow the stack. It's only recursive +// parsing that we use these fields for. +// +// Note 2: When calling the parser recursively, it may seem sensible to always +// set the .next of the new state to .start_datum, because you already cleared +// incoming whitespace and comments from the stream. However, in some cases, +// you must set it to .start_parse instead. This is due to datum comments. +// After a datum comment is parsed, the parser will ignore it and restore the +// previous state, to try again what it was doing. If the state was set to +// .start_datum, this means no whitespace or comments would be tolerated after +// the datum comment. +// +// Note 3: When it comes to pairs/lists, we mainly try to parse them as lists, +// and a pair becomes a special-case of an improper list (two elements). This +// has the advantage of saving memory: If we implemented list parsing as pair +// parsing, we would be calling the parser recursively, deeper and deeper, for +// every pair that the list is made up of. Although we're not limited by stack +// space, thanks to the strategy described above, this would still waste memory +// while parsing. +// +// +// === Notation used in comments === +// +// Some comments throughout the file give you an example of where the parser +// currently might be in a stream. These illustrations use the pipe symbol, +// which looks like a cursor, to indicate the current position: +// +// (foo| bar baz) <- parser arrived at the end of the string foo +// +// (foo bar baz)| <- parser reached EOF (assuming no trailing spaces) +// + +const std = @import("std"); + +const gc = @import("../gc.zig"); +const list = @import("../list.zig"); +const value = @import("../value.zig"); + +const Value = value.Value; + +const State = struct { + alloc: std.mem.Allocator, + + input: []const u8, + pos: usize = 0, + + mode: enum { code, data } = .code, + + next: Fn = .start_parse, + + parent: ?*State = null, + + // Used to store various context, but most notably the stack of list + // elements parsed so far, so just initialize it to nil. + context: Value = value.nil.nil, + + opening_bracket: u8 = 0, + + retval: Value = value.eof.eof, + + fn eof(self: *State) bool { + return self.pos >= self.input.len; + } + + fn peek(self: *State) u8 { + return self.input[self.pos]; + } + + fn skip(self: *State) void { + self.pos += 1; + } + + fn getc(self: *State) u8 { + const c = self.peek(); + self.skip(); + return c; + } + + // Consumes whitespace and line comments. + fn consumeBlanks(self: *State) void { + while (!self.eof()) { + if (self.isWhitespace()) { + self.skip(); + } else if (self.peek() == ';') { + self.skip(); + self.consumeLineComment(); + } else { + return; + } + } + } + + fn consumeLineComment(self: *State) void { + while (!self.eof()) { + if (self.getc() == '\n') { + return; + } + } + } + + fn isWhitespace(self: *State) bool { + return switch (self.peek()) { + '\t', '\n', ' ' => true, + else => false, + }; + } + + fn isFinalNull(self: *State) bool { + return self.peek() == 0 and self.pos == self.input.len - 1; + } + + fn recurParse(self: *State, start_from: Fn, return_to: Fn) *State { + const sub = self.alloc.create(State) catch @panic("OOM"); + sub.* = .{ .alloc = self.alloc, .input = self.input }; + sub.pos = self.pos; + sub.mode = self.mode; + sub.next = start_from; + sub.parent = self; + self.next = return_to; + return sub; + } + + fn returnDatum(self: *State, val: Value) *State { + self.retval = val; + self.next = .perform_return; + return self; + } + + fn performReturn(self: *State) ?*State { + if (self.parent) |parent| { + parent.retval = self.retval; + parent.pos = self.pos; + parent.alloc.destroy(self); + return parent; + } else { + return null; + } + } +}; + +// Probably best *not* to use function pointers here, but rather dispatch to +// functions manually based on enum value. This should help the optimizer. + +const Fn = enum { + start_parse, + start_datum, + end_hash_datum, + end_rune_datum, + end_quote, + continue_list, + finalize_improper_list, + end_improper_list, + perform_return, +}; + +pub fn parse(input: []const u8) Value { + var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; + var top = State{ .alloc = gpa.allocator(), .input = input }; + var s = ⊤ + while (true) s = switch (s.next) { + .start_parse => startParse(s), + .start_datum => startDatum(s), + .end_hash_datum => endHashDatum(s), + .end_rune_datum => endRuneDatum(s), + .end_quote => endQuote(s), + .continue_list => continueList(s), + .finalize_improper_list => finalizeImproperList(s), + .end_improper_list => endImproperList(s), + .perform_return => s.performReturn() orelse return s.retval, + }; +} + +fn startParse(s: *State) *State { + s.consumeBlanks(); + if (s.eof()) { + return s.returnDatum(value.eof.eof); + } + return switch (s.peek()) { + // whitespace already consumed + 0...32, 127...255 => err(s, "invalid character"), + ')', ']', '}' => err(s, "unexpected closing bracket"), + else => startDatum(s), + }; +} + +// This is called when we *immediately* expect a datum and nothing else; for +// example, no whitespace or comments, because they've already been consumed. +fn startDatum(s: *State) *State { + if (s.eof()) { + return err(s, "expected datum, got EOF"); + } + if (s.isWhitespace()) { + return err(s, "expected datum, got whitespace"); + } + return switch (s.peek()) { + // whitespace checked above + 0...32, 127...255 => err(s, "invalid character"), + + ')', ']', '}' => err(s, "unexpected closing bracket"), + + ';' => err(s, "expected datum, got semicolon"), + + '#' => handleHash(s), + + '"' => startQuotedString(s), + + '\'', '`', ',' => startQuote(s), + + '(', '[', '{' => startList(s), + + // Periods are only allowed in the middle of a string, or to express + // improper lists, because the following look too much like typos: + // + // (foo. bar) (foo .bar) (123. 456) (123 .456) + // + '.' => err(s, "misplaced period"), + + else => startBareString(s), + }; +} + +fn handleHash(s: *State) *State { + s.skip(); + // + // We just consumed a hash. Possibilities include: + // + // #|foo ;rune + // + // #|;DATUM ;datum comment + // + // #|DATUM ;hash-datum (code mode only) + // + + if (s.eof()) { + return err(s, "EOF after hash"); + } + if (s.isWhitespace()) { + return err(s, "whitespace after hash"); + } + + // Is it a rune? #foo + switch (s.peek()) { + 'a'...'z', 'A'...'Z' => return handleRune(s), + else => {}, + } + + // Is it a datum comment? #;DATUM + if (s.peek() == ';') { + s.skip(); + // Don't change s.next in this case. Just let the parser try to redo + // what it was doing as soon as the commented-out datum has been read. + return s.recurParse(.start_datum, s.next); + } + + // Otherwise, it must be a hash-datum. #DATUM + + // But data mode doesn't allow that. + if (s.mode == .data) { + return err(s, "use of hash-datum sequence not allowed in data mode"); + } + + return s.recurParse(.start_datum, .end_hash_datum); +} + +fn handleRune(s: *State) *State { + const rune = readRune(s) orelse return err(s, "rune too long"); + // + // Now we're at the end of the rune, but it could be a rune-datum: + // + // #foo|(...) + // + + if (isEndOfRune(s)) { + // Nope, just a stand-alone rune. + return s.returnDatum(rune); + } + + // Otherwise, it's followed by a datum, like: #foo(...) + + // Which is only allowed in code mode. + if (s.mode == .data) { + return err(s, "invalid use of hash in data mode"); + } + + s.context = rune; + return s.recurParse(.start_datum, .end_rune_datum); +} + +fn readRune(s: *State) ?Value { + var buf: [6]u8 = undefined; + var i: u8 = 0; + while (!s.eof()) : (i += 1) switch (s.peek()) { + 'a'...'z', 'A'...'Z' => { + if (i == buf.len) { + return null; + } + buf[i] = s.getc(); + }, + else => break, + }; + + // 'i' can't be 0 since this function is only called if at least one ASCII + // letter was seen after the hash. + std.debug.assert(i != 0); + + return value.rune.pack(buf[0..i]); +} + +fn isEndOfRune(s: *State) bool { + return s.eof() or switch (s.peek()) { + '\t', '\n', ' ', ')', ']', '}' => true, + else => false, + }; +} + +fn endRuneDatum(s: *State) *State { + return s.returnDatum(value.pair.cons(s.context, s.retval)); +} + +fn endHashDatum(s: *State) *State { + const rune = value.rune.pack("HASH"); + return s.returnDatum(value.pair.cons(rune, s.retval)); +} + +fn startQuotedString(s: *State) *State { + // We're at |"..." so consume the opening quote before we start reading. + s.skip(); + + const str = readQuotedString(s) catch return err(s, "unclosed string"); + if (s.mode == .code) { + // "foo bar" => (#STRING . "foo bar") + const rune = value.rune.pack("STRING"); + const pair = value.pair.cons(rune, str); + return s.returnDatum(pair); + } else { + return s.returnDatum(str); + } +} + +// RQS = Read Quoted String +const RqsError = enum { Unclosed }; + +fn readQuotedString(s: *State) !Value { + return try readQuotedSstr(s) orelse readQuotedLongString(s); +} + +fn readQuotedSstr(s: *State) !?Value { + // We will reset to this position if we fail. + const start_pos = s.pos; + + var buf: [6]u8 = undefined; + var i: u8 = 0; + while (!s.eof()) { + const c = s.getc(); + if (c == '"') { + // ok, return what we accumulated + return value.sstr.pack(buf[0..i]); + } + if (i == 6) { + // failed; reset and bail out + s.pos = start_pos; + return null; + } + // ok, save this byte and go on + buf[i] = c; + i += 1; + } + return error.Unclosed; +} + +fn readQuotedLongString(s: *State) Value { + return err(s, "NOT YET IMPLEMENTED"); +} + +fn startBareString(s: *State) *State { + // We're at |foo so start reading directly. + return readBareSstr(s) orelse readBareLongString(s); +} + +fn readBareSstr(s: *State) ?*State { + // We will reset to this position if we fail. + const start_pos = s.pos; + + var buf: [6]u8 = undefined; + var i: u8 = 0; + while (!s.eof()) : (i += 1) { + if (isBareStringEnd(s)) { + break; + } + if (i == buf.len) { + // failed; reset and bail out + s.pos = start_pos; + return null; + } + buf[i] = s.getc(); + } + + return s.returnDatum(value.sstr.pack(buf[0..i])); +} + +fn isBareStringEnd(s: *State) bool { + // We will ignore illegal characters here, because they aren't consumed by + // this function; whatever code comes next must handle them. + return s.eof() or switch (s.peek()) { + 0...32, 127...255 => true, + '(', ')', '[', ']', '{', '}', ';', '#', '"', '\'', '`', ',' => true, + else => false, + }; +} + +fn readBareLongString(s: *State) *State { + return err(s, "NOT YET IMPLEMENTED"); +} + +fn startQuote(s: *State) *State { + // We're at one of: |'... |`... |,... + s.context = value.rune.pack(switch (s.getc()) { + '\'' => "QUOTE", + '`' => "GRAVE", + ',' => "COMMA", + else => unreachable, + }); + return s.recurParse(.start_datum, .end_quote); +} + +fn endQuote(s: *State) *State { + return s.returnDatum(value.pair.cons(s.context, s.retval)); +} + +// List processing is, unsurprisingly, the most complicated, and it's made even +// more complicated by the possibility of datum comments in strange places... + +fn startList(s: *State) *State { + const open = s.getc(); + + if (s.mode == .data and open != '(') { + return err(s, "invalid opening bracket in data mode"); + } + + s.consumeBlanks(); + if (s.eof()) { + return err(s, "unexpected EOF while parsing list"); + } + + s.opening_bracket = open; + return if (isEndOfList(s)) + endList(s) + else + s.recurParse(.start_parse, .continue_list); +} + +fn isEndOfList(s: *State) bool { + return switch (s.peek()) { + ')', ']', '}' => true, + else => false, + }; +} + +fn continueList(s: *State) *State { + s.context = value.pair.cons(s.retval, s.context); + + s.consumeBlanks(); + if (s.eof()) { + return err(s, "unexpected EOF while parsing list"); + } + + if (isEndOfList(s)) { + s.context = list.reverse(s.context); + return endList(s); + } + + if (s.peek() == '.') { + s.skip(); + // Scheme allows (foo .(bar)) but we don't. Mind your spaces! + if (!s.isWhitespace()) { + return err(s, "misplaced period"); + } + return s.recurParse(.start_parse, .finalize_improper_list); + } + + return s.recurParse(.start_parse, .continue_list); +} + +fn finalizeImproperList(s: *State) *State { + s.context = list.reverseWithTail(s.context, s.retval); + return endImproperList(s); +} + +fn endImproperList(s: *State) *State { + s.consumeBlanks(); + if (s.eof()) { + return err(s, "unexpected EOF while parsing list"); + } + + if (isEndOfList(s)) { + return endList(s); + } + + if (s.getc() == '#') { + if (s.eof()) { + return err(s, "unexpected EOF after hash while parsing list"); + } + if (s.getc() == ';') { + return s.recurParse(.start_datum, .end_improper_list); + } + } + + return err(s, "malformed list / extra datum at end of improper list"); +} + +fn endList(s: *State) *State { + const open = s.opening_bracket; + const char = s.getc(); + + // Check for proper ending: (foo bar baz) + if (open == '(' and char == ')') { + return s.returnDatum(s.context); + } + if (open == '[' and char == ']') { + const rune = value.rune.pack("SQUARE"); + return s.returnDatum(value.pair.cons(rune, s.context)); + } + if (open == '{' and char == '}') { + const rune = value.rune.pack("BRACE"); + return s.returnDatum(value.pair.cons(rune, s.context)); + } + + return err(s, "wrong closing bracket for list"); +} + +fn err(s: *State, msg: []const u8) noreturn { + std.debug.print("{s}\n", .{msg}); + std.debug.print("pos: {}\n", .{s.pos}); + @panic("parse error"); +} diff --git a/src/libzisp/io/reader.zig b/src/libzisp/io/reader.zig new file mode 100644 index 0000000..d6de79d --- /dev/null +++ b/src/libzisp/io/reader.zig @@ -0,0 +1,10 @@ +// See parse.zig for details. + +const parser = @import("parser.zig"); +const decoder = @import("decoder.zig"); + +const Value = @import("../value.zig").Value; + +pub fn readCode(input: []const u8) Value { + return decoder.decode(parser.parse(input)); +} diff --git a/src/libzisp/io/writer.zig b/src/libzisp/io/writer.zig new file mode 100644 index 0000000..eb27e20 --- /dev/null +++ b/src/libzisp/io/writer.zig @@ -0,0 +1 @@ +// wip |
