update

author: Taylan Kammer <taylan.kammer@gmail.com> 2025-02-22 18:01:33 +0100
committer: Taylan Kammer <taylan.kammer@gmail.com> 2025-02-22 18:01:33 +0100
commit: b7fb551ae61d26c30e6078f1f617862430141ce3 (patch)
tree: 6ca7992432d11e8997def25671561454c7660c92
parent: c922361115c8ee398ec4e26bb0af8cca4dcb9667 (diff)
7 files changed, 94 insertions, 72 deletions
diff --git a/src/libzisp.zig b/src/libzisp.zig
index 8141994..400f9fb 100644
--- a/src/libzisp.zig
+++ b/src/libzisp.zig
@@ -6,11 +6,11 @@ const builtin = @import("builtin");
 const testing = std.testing;
 
 pub const gc = @import("libzisp/gc.zig");
+pub const io = @import("libzisp/io.zig");
+pub const lib = @import("libzisp/lib.zig");
 pub const value = @import("libzisp/value.zig");
-pub const parser = @import("libzisp/io/parser.zig");
 
 pub const Value = value.Value;
-pub const Bucket = gc.Bucket;
 
 test "double" {
     const d1: f64 = 0.123456789;
@@ -45,7 +45,7 @@ test "fixnum" {
 test "ptr" {
     const ptr = value.ptr;
 
-    const val: [*]Bucket = @ptrFromInt(256);
+    const val: [*]gc.Bucket = @ptrFromInt(256);
     const tag = ptr.Tag.string;
 
     const p = ptr.pack(val, tag);
@@ -251,7 +251,7 @@ test "pair" {
 }
 
 test "parse" {
-    const val = parser.parse("\"foo\"");
+    const val = io.parser.parseCode("\"foo\"");
     const r, const rl = value.rune.unpack(value.pair.car(val));
     const s, const sl = value.sstr.unpack(value.pair.cdr(val));
     try std.testing.expectEqualStrings("STRING", r[0..rl]);
@@ -259,7 +259,7 @@ test "parse" {
 }
 
 test "parse2" {
-    const val = parser.parse(
+    const val = io.parser.parseCode(
         \\ ;; Testing some crazy datum comments
         \\ ##;"bar"#;([x #"y"]{##`,'z})"foo"
         \\ ;; end
@@ -278,7 +278,9 @@ test "parse2" {
 }
 
 test "parse3" {
-    const val = parser.parse("(foo #;x #;(x y) #;x #bar [#x #\"baz\"] 'bat)");
+    const val = io.parser.parseCode(
+        \\(foo #;x #;(x y) #;x #bar [#x #"baz"] 'bat)
+    );
 
     const car = value.pair.car;
     const cdr = value.pair.cdr;
@@ -292,7 +294,7 @@ test "parse3" {
 }
 
 test "parse4" {
-    const val = parser.parse("(foo . #;x bar #;y)");
+    const val = io.parser.parseCode("(foo . #;x bar #;y)");
 
     const s, const sl = value.sstr.unpack(value.pair.car(val));
     try std.testing.expectEqualStrings("foo", s[0..sl]);
diff --git a/src/libzisp/io.zig b/src/libzisp/io.zig
new file mode 100644
index 0000000..3d6d384
--- /dev/null
+++ b/src/libzisp/io.zig
@@ -0,0 +1,8 @@
+pub const parser = @import("io/parser.zig");
+pub const unparser = @import("io/unparser.zig");
+
+pub const decoder = @import("io/decoder.zig");
+pub const encoder = @import("io/encoder.zig");
+
+pub const reader = @import("io/reader.zig");
+pub const writer = @import("io/writer.zig");
diff --git a/src/libzisp/io/parser.zig b/src/libzisp/io/parser.zig
index 71c6946..5162c2f 100644
--- a/src/libzisp/io/parser.zig
+++ b/src/libzisp/io/parser.zig
@@ -9,19 +9,21 @@
 // The "sugar" used in code expressions is merely shorthand for more complex
 // data expressions, which could have been written by hand.
 //
-// Data expressions have a very simple format, and are only able to express a
-// minimal set of data types:
+// Data expressions have a very simple format, and are only able to express the
+// bare minimum set of data types needed to represent more complex data:
 //
-//   string -> foo , "foo bar"   ;symbols and strings are the same data type
+//   type      format             comment
+//   ----      ------             -------
 //
-//   rune   -> #foo              ;limited to 6 ASCII letters (a - z, A - Z)
+//   string    foo , "foo bar"    symbols and strings are the same data type
 //
-//   pair   -> (DATUM . DATUM)   ;the only composite data type supported
+//   rune      #name              name is 1-6 ASCII letters (a - z, A - Z)
 //
-//   nil    -> ()                ;we prefer the term nil over null
+//   pair      (DATUM . DATUM)    the only composite data type supported
 //
-// The list short-hand syntax may be considered the only "syntax sugar" that is
-// supported by the data parser:
+//   nil       ()                 we prefer the term nil over null
+//
+// The list short-hand syntax is the only "syntax sugar" supported in data:
 //
 //   (DATUM DATUM DATUM)  ->  (DATUM . (DATUM . (DATUM . ())))
 //
@@ -62,7 +64,7 @@
 //
 // You may be wondering about numbers.  As far as the parser is concerned,
 // numbers are strings.  It's the decoder (see below) that will turn bare
-// strings (those not marked with #STRING) into numbers.
+// strings (those not marked with #STRING) into numbers where appropriate.
 //
 // Note that 'foo becomes (quote foo) in Scheme, but (#QUOTE . foo) in Zisp.
 // The operand of #QUOTE is the entire cdr.  The same principle is used when
@@ -94,7 +96,7 @@
 // implemented in Zisp.
 //
 // The decoder recognizes (#QUOTE ...) to implement the traditional quoting
-// mechanism, but in a better way:
+// mechanism, but with a significant difference:
 //
 // Traditional quote is "unhygienic" in Scheme terms.  An expression such as
 // '(foo bar) will always be read as (quote (foo bar)) regardless of what sort
@@ -163,7 +165,7 @@
 // has the advantage of saving memory: If we implemented list parsing as pair
 // parsing, we would be calling the parser recursively, deeper and deeper, for
 // every pair that the list is made up of.  Although we're not limited by stack
-// space, thanks to the strategy described above, this would still waste memory
+// space (thanks to the strategy described above), this would still waste memory
 // while parsing.
 //
 //
@@ -180,31 +182,23 @@
 
 const std = @import("std");
 
-const gc = @import("../gc.zig");
-const list = @import("../list.zig");
+const lib = @import("../lib.zig");
 const value = @import("../value.zig");
 
 const Value = value.Value;
 
+pub const Mode = enum { code, data };
+
 const State = struct {
     alloc: std.mem.Allocator,
-
     input: []const u8,
     pos: usize = 0,
-
-    mode: enum { code, data } = .code,
-
+    mode: Mode = undefined,
     next: Fn = .start_parse,
-
     parent: ?*State = null,
-
-    // Used to store various context, but most notably the stack of list
-    // elements parsed so far, so just initialize it to nil.
-    context: Value = value.nil.nil,
-
-    opening_bracket: u8 = 0,
-
-    retval: Value = value.eof.eof,
+    context: Value = undefined,
+    opening_bracket: u8 = undefined,
+    retval: Value = undefined,
 
     fn eof(self: *State) bool {
         return self.pos >= self.input.len;
@@ -258,14 +252,17 @@ const State = struct {
     }
 
     fn recurParse(self: *State, start_from: Fn, return_to: Fn) *State {
-        const sub = self.alloc.create(State) catch @panic("OOM");
-        sub.* = .{ .alloc = self.alloc, .input = self.input };
-        sub.pos = self.pos;
-        sub.mode = self.mode;
-        sub.next = start_from;
-        sub.parent = self;
+        const new_state = self.alloc.create(State) catch @panic("OOM");
+        new_state.* = .{
+            .alloc = self.alloc,
+            .input = self.input,
+            .pos = self.pos, // child continues from the parent's position
+            .mode = self.mode,
+            .next = start_from, // first step the child state will take
+            .parent = self, // link back to the state that spawned it
+        };
         self.next = return_to;
-        return sub;
+        return new_state;
     }
 
     fn returnDatum(self: *State, val: Value) *State {
@@ -296,14 +293,18 @@ const Fn = enum {
     end_rune_datum,
     end_quote,
     continue_list,
-    finalize_improper_list,
+    finish_improper_list,
     end_improper_list,
     perform_return,
 };
 
-pub fn parse(input: []const u8) Value {
+pub fn parseCode(input: []const u8) Value {
+    return parse(input, .code);
+}
+
+pub fn parse(input: []const u8, mode: Mode) Value {
     var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init;
-    var top = State{ .alloc = gpa.allocator(), .input = input };
+    var top = State{ .alloc = gpa.allocator(), .input = input, .mode = mode };
     var s = &top;
     while (true) s = switch (s.next) {
         .start_parse => startParse(s),
@@ -312,7 +313,7 @@ pub fn parse(input: []const u8) Value {
         .end_rune_datum => endRuneDatum(s),
         .end_quote => endQuote(s),
         .continue_list => continueList(s),
-        .finalize_improper_list => finalizeImproperList(s),
+        .finish_improper_list => finishImproperList(s),
         .end_improper_list => endImproperList(s),
         .perform_return => s.performReturn() orelse return s.retval,
     };
@@ -578,6 +579,10 @@ fn endQuote(s: *State) *State {
 // List processing is, unsurprisingly, the most complicated, and it's made even
 // more complicated by the possibility of datum comments in strange places...
 
+// Make sure to use .start_parse instead of .start_datum to handle elements, so
+// that an arbitrary number of datum comments, separated by blanks (whitespace
+// and line comments), are handled automatically.
+
 fn startList(s: *State) *State {
     const open = s.getc();
 
@@ -590,6 +595,7 @@ fn startList(s: *State) *State {
         return err(s, "unexpected EOF while parsing list");
     }
 
+    s.context = value.nil.nil;
     s.opening_bracket = open;
     return if (isEndOfList(s))
         endList(s)
@@ -604,7 +610,27 @@ fn isEndOfList(s: *State) bool {
     };
 }
 
+fn endList(s: *State) *State {
+    const open = s.opening_bracket;
+    const char = s.getc();
+
+    if (open == '(' and char == ')') {
+        return s.returnDatum(s.context);
+    }
+    if (open == '[' and char == ']') {
+        const rune = value.rune.pack("SQUARE");
+        return s.returnDatum(value.pair.cons(rune, s.context));
+    }
+    if (open == '{' and char == '}') {
+        const rune = value.rune.pack("BRACE");
+        return s.returnDatum(value.pair.cons(rune, s.context));
+    }
+
+    return err(s, "wrong closing bracket for list");
+}
+
 fn continueList(s: *State) *State {
+    // Note that this accumulates list elements in reverse.
     s.context = value.pair.cons(s.retval, s.context);
 
     s.consumeBlanks();
@@ -613,7 +639,7 @@ fn continueList(s: *State) *State {
     }
 
     if (isEndOfList(s)) {
-        s.context = list.reverse(s.context);
+        s.context = lib.list.reverse(s.context);
         return endList(s);
     }
 
@@ -623,21 +649,25 @@ fn continueList(s: *State) *State {
         if (!s.isWhitespace()) {
             return err(s, "misplaced period");
         }
-        return s.recurParse(.start_parse, .finalize_improper_list);
+        return s.recurParse(.start_parse, .finish_improper_list);
     }
 
     return s.recurParse(.start_parse, .continue_list);
 }
 
-fn finalizeImproperList(s: *State) *State {
-    s.context = list.reverseWithTail(s.context, s.retval);
+fn finishImproperList(s: *State) *State {
+    s.context = lib.list.reverseWithTail(s.context, s.retval);
     return endImproperList(s);
 }
 
+// Handling the end of an improper list is a bit awkward, because there may be
+// datum comments *after* the final cdr, where we don't actually want to parse
+// any further data.  So we keep looping here just looking for datum comments.
+
 fn endImproperList(s: *State) *State {
     s.consumeBlanks();
     if (s.eof()) {
-        return err(s, "unexpected EOF while parsing list");
+        return err(s, "unexpected EOF at end of improper list");
     }
 
     if (isEndOfList(s)) {
@@ -646,7 +676,7 @@ fn endImproperList(s: *State) *State {
 
     if (s.getc() == '#') {
         if (s.eof()) {
-            return err(s, "unexpected EOF after hash while parsing list");
+            return err(s, "unexpected hash and EOF at end of improper list");
         }
         if (s.getc() == ';') {
             return s.recurParse(.start_datum, .end_improper_list);
@@ -656,26 +686,6 @@ fn endImproperList(s: *State) *State {
     return err(s, "malformed list / extra datum at end of improper list");
 }
 
-fn endList(s: *State) *State {
-    const open = s.opening_bracket;
-    const char = s.getc();
-
-    // Check for proper ending: (foo bar baz)
-    if (open == '(' and char == ')') {
-        return s.returnDatum(s.context);
-    }
-    if (open == '[' and char == ']') {
-        const rune = value.rune.pack("SQUARE");
-        return s.returnDatum(value.pair.cons(rune, s.context));
-    }
-    if (open == '{' and char == '}') {
-        const rune = value.rune.pack("BRACE");
-        return s.returnDatum(value.pair.cons(rune, s.context));
-    }
-
-    return err(s, "wrong closing bracket for list");
-}
-
 fn err(s: *State, msg: []const u8) noreturn {
     std.debug.print("{s}\n", .{msg});
     std.debug.print("pos: {}\n", .{s.pos});
diff --git a/src/libzisp/io/reader.zig b/src/libzisp/io/reader.zig
index d6de79d..3465cb3 100644
--- a/src/libzisp/io/reader.zig
+++ b/src/libzisp/io/reader.zig
@@ -6,5 +6,5 @@ const decoder = @import("decoder.zig");
 const Value = @import("../value.zig").Value;
 
 pub fn readCode(input: []const u8) Value {
-    return decoder.decode(parser.parse(input));
+    return decoder.decode(parser.parse(input, .code));
 }
diff --git a/src/libzisp/io/unparser.zig b/src/libzisp/io/unparser.zig
new file mode 100644
index 0000000..eb27e20
--- /dev/null
+++ b/src/libzisp/io/unparser.zig
@@ -0,0 +1 @@
+// wip
diff --git a/src/libzisp/lib.zig b/src/libzisp/lib.zig
new file mode 100644
index 0000000..7752110
--- /dev/null
+++ b/src/libzisp/lib.zig
@@ -0,0 +1 @@
+pub const list = @import("lib/list.zig");
diff --git a/src/libzisp/list.zig b/src/libzisp/lib/list.zig
index a4ce7a8..9d6a6bc 100644
--- a/src/libzisp/list.zig
+++ b/src/libzisp/lib/list.zig
@@ -1,4 +1,4 @@
-const value = @import("value.zig");
+const value = @import("../value.zig");
 
 const Value = value.Value;
author	Taylan Kammer <taylan.kammer@gmail.com>	2025-02-22 18:01:33 +0100
committer	Taylan Kammer <taylan.kammer@gmail.com>	2025-02-22 18:01:33 +0100
commit	b7fb551ae61d26c30e6078f1f617862430141ce3 (patch)
tree	6ca7992432d11e8997def25671561454c7660c92
parent	c922361115c8ee398ec4e26bb0af8cca4dcb9667 (diff)