update

author: Taylan Kammer <taylan.kammer@gmail.com> 2025-02-22 15:04:05 +0100
committer: Taylan Kammer <taylan.kammer@gmail.com> 2025-02-22 15:04:05 +0100
commit: c922361115c8ee398ec4e26bb0af8cca4dcb9667 (patch)
tree: 6b093f2ad960bf073d295458faf6707e5123c1e4
parent: 2d52864265e06a4d863dba84e1d20c8d52d8c5dd (diff)
6 files changed, 115 insertions, 192 deletions
diff --git a/src/libzisp.zig b/src/libzisp.zig
index be8b8b7..8141994 100644
--- a/src/libzisp.zig
+++ b/src/libzisp.zig
@@ -6,8 +6,8 @@ const builtin = @import("builtin");
 const testing = std.testing;
 
 pub const gc = @import("libzisp/gc.zig");
-pub const parser = @import("libzisp/parser.zig");
 pub const value = @import("libzisp/value.zig");
+pub const parser = @import("libzisp/io/parser.zig");
 
 pub const Value = value.Value;
 pub const Bucket = gc.Bucket;
diff --git a/src/libzisp/io/decoder.zig b/src/libzisp/io/decoder.zig
new file mode 100644
index 0000000..eb27e20
--- /dev/null
+++ b/src/libzisp/io/decoder.zig
@@ -0,0 +1 @@
+// wip
diff --git a/src/libzisp/io/encoder.zig b/src/libzisp/io/encoder.zig
new file mode 100644
index 0000000..eb27e20
--- /dev/null
+++ b/src/libzisp/io/encoder.zig
@@ -0,0 +1 @@
+// wip
diff --git a/src/libzisp/parser.zig b/src/libzisp/io/parser.zig
index 1c58687..71c6946 100644
--- a/src/libzisp/parser.zig
+++ b/src/libzisp/io/parser.zig
@@ -12,15 +12,13 @@
 // Data expressions have a very simple format, and are only able to express a
 // minimal set of data types:
 //
-//   string -> foo , "foo bar"     ;symbols and strings are the same data type
+//   string -> foo , "foo bar"   ;symbols and strings are the same data type
 //
-//   number -> -1.2 , +nan.0 , ... ;this is the most complex type to represent
+//   rune   -> #foo              ;limited to 6 ASCII letters (a - z, A - Z)
 //
-//   rune   -> #foo                ;limited to 6 ASCII letters (a - z, A - Z)
+//   pair   -> (DATUM . DATUM)   ;the only composite data type supported
 //
-//   pair   -> (DATUM . DATUM)     ;the only composite data type supported
-//
-//   nil    -> ()                  ;we prefer the term nil over null
+//   nil    -> ()                ;we prefer the term nil over null
 //
 // The list short-hand syntax may be considered the only "syntax sugar" that is
 // supported by the data parser:
@@ -57,12 +55,15 @@
 // would be string literals, because Zisp doesn't differentiate strings and
 // symbols like traditional lisps.  Also, note that although we could reuse
 // #QUOTE here, instead of using #STRING, this would make it impossible to
-// differentiate between the expressions #'foo and #"foo" once parsing is
-// finished, even though we may want to give them different meanings.)
+// differentiate between the code expressions #'foo and #"foo".)
 //
 // Runes are case-sensitive, and the code parser only emits runes using
 // upper-case letters, so lower-case runes are free for user extensions.
 //
+// You may be wondering about numbers.  As far as the parser is concerned,
+// numbers are strings.  It's the decoder (see below) that will turn bare
+// strings (those not marked with #STRING) into numbers.
+//
 // Note that 'foo becomes (quote foo) in Scheme, but (#QUOTE . foo) in Zisp.
 // The operand of #QUOTE is the entire cdr.  The same principle is used when
 // parsing other sugar:
@@ -87,22 +88,36 @@
 // Runes may be decoded in isolation as well, rather than transforming a list
 // whose head they appear in.  This is how #true and #false are implemented.
 //
+// The decoder may also perform arbitrary transforms on any type; for example,
+// it may turn bare strings (those not marked with #STRING) into numbers when
+// it's decoding data representing code.  This is how number literals are
+// implemented in Zisp.
+//
 // The decoder recognizes (#QUOTE ...) to implement the traditional quoting
 // mechanism, but in a better way:
 //
 // Traditional quote is "unhygienic" in Scheme terms.  An expression such as
 // '(foo bar) will always be read as (quote (foo bar)) regardless of what sort
 // of lexical context it appears in, so the semantics will depend on whatever
-// the identifier "quote" is bound to in that lexical context.
+// the identifier "quote" is bound to in that lexical context, meaning that the
+// expression may end up evaluating to something other than the list (foo bar).
 //
 // The Zisp decoder, which transforms not text to text, but objects to objects,
 // can turn (#QUOTE ...) into an abstract object which encapsulates the notion
-// of quoting, which the Zisp evaluator can recognize and act upon.
+// of quoting, which the Zisp evaluator can recognize and act upon, ensuring
+// that an expression like '(foo bar) always turns into the list (foo bar).
 //
 // One way to think about this, in Scheme (R6RS / syntax-case) terms, is that
 // expressions like '(foo bar) turn directly into a *syntax object* when read,
 // rather than a regular list object.
 //
+// The decoder is, of course, configurable and extensible.  The transformations
+// mentioned above would be performed only when it's told to decode data which
+// represents Zisp code.  The decoder may be given a different configuration,
+// telling it to decode, for example, data which represents a different kind of
+// domain-specific data, such as application settings, build system commands,
+// complex data records with non-standard data types, and so on.
+//
 //
 // === Implementation details ===
 //
@@ -143,6 +158,14 @@
 // .start_datum, this means no whitespace or comments would be tolerated after
 // the datum comment.
 //
+// Note 3: When it comes to pairs/lists, we mainly try to parse them as lists,
+// and a pair becomes a special-case of an improper list (two elements).  This
+// has the advantage of saving memory: If we implemented list parsing as pair
+// parsing, we would be calling the parser recursively, deeper and deeper, for
+// every pair that the list is made up of.  Although we're not limited by stack
+// space, thanks to the strategy described above, this would still waste memory
+// while parsing.
+//
 //
 // === Notation used in comments ===
 //
@@ -157,9 +180,9 @@
 
 const std = @import("std");
 
-const gc = @import("gc.zig");
-const list = @import("list.zig");
-const value = @import("value.zig");
+const gc = @import("../gc.zig");
+const list = @import("../list.zig");
+const value = @import("../value.zig");
 
 const Value = value.Value;
 
@@ -175,10 +198,11 @@ const State = struct {
 
     parent: ?*State = null,
 
-    datum_rune: Value = value.boole.f,
-    list_stack: Value = value.nil.nil,
+    // Used to store various context, but most notably the stack of list
+    // elements parsed so far, so just initialize it to nil.
+    context: Value = value.nil.nil,
+
     opening_bracket: u8 = 0,
-    opening_quote: enum { quote, grave, comma } = .quote,
 
     retval: Value = value.eof.eof,
 
@@ -272,8 +296,8 @@ const Fn = enum {
     end_rune_datum,
     end_quote,
     continue_list,
+    finalize_improper_list,
     end_improper_list,
-    handle_trailing_datum_comments,
     perform_return,
 };
 
@@ -288,8 +312,8 @@ pub fn parse(input: []const u8) Value {
         .end_rune_datum => endRuneDatum(s),
         .end_quote => endQuote(s),
         .continue_list => continueList(s),
+        .finalize_improper_list => finalizeImproperList(s),
         .end_improper_list => endImproperList(s),
-        .handle_trailing_datum_comments => handleTrailingDatumComments(s),
         .perform_return => s.performReturn() orelse return s.retval,
     };
 }
@@ -301,7 +325,7 @@ fn startParse(s: *State) *State {
     }
     return switch (s.peek()) {
         // whitespace already consumed
-        0...31, 127...255 => err(s, "invalid character"),
+        0...32, 127...255 => err(s, "invalid character"),
         ')', ']', '}' => err(s, "unexpected closing bracket"),
         else => startDatum(s),
     };
@@ -318,7 +342,7 @@ fn startDatum(s: *State) *State {
     }
     return switch (s.peek()) {
         // whitespace checked above
-        0...31, 127...255 => err(s, "invalid character"),
+        0...32, 127...255 => err(s, "invalid character"),
 
         ')', ']', '}' => err(s, "unexpected closing bracket"),
 
@@ -332,14 +356,10 @@ fn startDatum(s: *State) *State {
 
         '(', '[', '{' => startList(s),
 
-        '+', '-' => handlePlusMinus(s),
-
-        '0'...'9' => handleDigit(s),
-
-        // Periods only allowed between digits, and to express improper lists.
-        // Things like the following look too much like it could be a typo:
+        // Periods are only allowed in the middle of a string, or to express
+        // improper lists, because the following look too much like typos:
         //
-        //   (foo .5) (foo .bar)
+        //   (foo. bar)  (foo .bar)  (123. 456)  (123 .456)
         //
         '.' => err(s, "misplaced period"),
 
@@ -368,7 +388,7 @@ fn handleHash(s: *State) *State {
 
     // Is it a rune?  #foo
     switch (s.peek()) {
-        'A'...'Z', 'a'...'z' => return handleRune(s),
+        'a'...'z', 'A'...'Z' => return handleRune(s),
         else => {},
     }
 
@@ -392,7 +412,6 @@ fn handleHash(s: *State) *State {
 
 fn handleRune(s: *State) *State {
     const rune = readRune(s) orelse return err(s, "rune too long");
-
     //
     // Now we're at the end of the rune, but it could be a rune-datum:
     //
@@ -411,7 +430,7 @@ fn handleRune(s: *State) *State {
         return err(s, "invalid use of hash in data mode");
     }
 
-    s.datum_rune = rune;
+    s.context = rune;
     return s.recurParse(.start_datum, .end_rune_datum);
 }
 
@@ -443,21 +462,16 @@ fn isEndOfRune(s: *State) bool {
 }
 
 fn endRuneDatum(s: *State) *State {
-    return s.returnDatum(value.pair.cons(
-        s.datum_rune,
-        s.retval,
-    ));
+    return s.returnDatum(value.pair.cons(s.context, s.retval));
 }
 
 fn endHashDatum(s: *State) *State {
-    return s.returnDatum(value.pair.cons(
-        value.rune.pack("HASH"),
-        s.retval,
-    ));
+    const rune = value.rune.pack("HASH");
+    return s.returnDatum(value.pair.cons(rune, s.retval));
 }
 
 fn startQuotedString(s: *State) *State {
-    // Consume opening quote.
+    // We're at |"..." so consume the opening quote before we start reading.
     s.skip();
 
     const str = readQuotedString(s) catch return err(s, "unclosed string");
@@ -507,6 +521,7 @@ fn readQuotedLongString(s: *State) Value {
 }
 
 fn startBareString(s: *State) *State {
+    // We're at |foo so start reading directly.
     return readBareSstr(s) orelse readBareLongString(s);
 }
 
@@ -546,27 +561,23 @@ fn readBareLongString(s: *State) *State {
 }
 
 fn startQuote(s: *State) *State {
-    s.opening_quote = switch (s.getc()) {
-        '\'' => .quote,
-        '`' => .grave,
-        ',' => .comma,
+    // We're at one of:  |'...  |`...  |,...
+    s.context = value.rune.pack(switch (s.getc()) {
+        '\'' => "QUOTE",
+        '`' => "GRAVE",
+        ',' => "COMMA",
         else => unreachable,
-    };
+    });
     return s.recurParse(.start_datum, .end_quote);
 }
 
 fn endQuote(s: *State) *State {
-    const name = switch (s.opening_quote) {
-        .quote => "QUOTE",
-        .grave => "GRAVE",
-        .comma => "COMMA",
-    };
-    return s.returnDatum(value.pair.cons(
-        value.rune.pack(name),
-        s.retval,
-    ));
+    return s.returnDatum(value.pair.cons(s.context, s.retval));
 }
 
+// List processing is, unsurprisingly, the most complicated, and it's made even
+// more complicated by the possibility of datum comments in strange places...
+
 fn startList(s: *State) *State {
     const open = s.getc();
 
@@ -575,195 +586,94 @@ fn startList(s: *State) *State {
     }
 
     s.consumeBlanks();
-
     if (s.eof()) {
         return err(s, "unexpected EOF while parsing list");
     }
 
-    const char = s.peek();
-
-    // Check for empty lists: (), [], {}
-    if (open == '(' and char == ')') {
-        s.skip();
-        return s.returnDatum(value.nil.nil);
-    }
-    if (open == '[' and char == ']') {
-        s.skip();
-        return s.returnDatum(value.pair.cons(
-            value.rune.pack("SQUARE"),
-            value.nil.nil,
-        ));
-    }
-    if (open == '{' and char == '}') {
-        s.skip();
-        return s.returnDatum(value.pair.cons(
-            value.rune.pack("BRACE"),
-            value.nil.nil,
-        ));
-    }
-
     s.opening_bracket = open;
+    return if (isEndOfList(s))
+        endList(s)
+    else
+        s.recurParse(.start_parse, .continue_list);
+}
 
-    // Now we're facing the first element of the list.
-    return s.recurParse(.start_parse, .continue_list);
+fn isEndOfList(s: *State) bool {
+    return switch (s.peek()) {
+        ')', ']', '}' => true,
+        else => false,
+    };
 }
 
 fn continueList(s: *State) *State {
-    //
-    // We now consumed a list element and are at its end.  Possibilities:
-    //
-    //   (... foo| )
-    //
-    //   (... foo| . bar)
-    //
-    //   (... foo| bar baz ...)
-    //
-    // (This may be the first element, or any other; doesn't matter.)
-    //
+    s.context = value.pair.cons(s.retval, s.context);
 
     s.consumeBlanks();
-
     if (s.eof()) {
         return err(s, "unexpected EOF while parsing list");
     }
 
-    const stack = value.pair.cons(s.retval, s.list_stack);
-
-    const open = s.opening_bracket;
-    const char = s.peek();
-
-    // Check for proper ending: (foo bar baz)
-    if (open == '(' and char == ')') {
-        s.skip();
-        return s.returnDatum(list.reverse(stack));
-    }
-    if (open == '[' and char == ']') {
-        s.skip();
-        return s.returnDatum(value.pair.cons(
-            value.rune.pack("SQUARE"),
-            list.reverse(stack),
-        ));
-    }
-    if (open == '{' and char == '}') {
-        s.skip();
-        return s.returnDatum(value.pair.cons(
-            value.rune.pack("BRACE"),
-            list.reverse(stack),
-        ));
+    if (isEndOfList(s)) {
+        s.context = list.reverse(s.context);
+        return endList(s);
     }
 
-    s.list_stack = stack;
-
-    // Check for improper ending: (foo bar . baz)
-    if (char == '.') {
+    if (s.peek() == '.') {
         s.skip();
-
-        // We should now be at (... foo .| bar) and whitespace must follow.
         // Scheme allows (foo .(bar)) but we don't.  Mind your spaces!
         if (!s.isWhitespace()) {
             return err(s, "misplaced period");
         }
-
-        return s.recurParse(.start_parse, .end_improper_list);
+        return s.recurParse(.start_parse, .finalize_improper_list);
     }
 
-    // OK, we're at the next element now and the list is going on.
     return s.recurParse(.start_parse, .continue_list);
 }
 
-fn endImproperList(s: *State) *State {
-    //
-    // We're at the very end of an improper list now:
-    //
-    //   (... foo . bar| )
-    //
-    // There's another sneaky possibility though: trailing datum comments!
-    //
-    //   (... foo . bar #;DATUM ... )
-    //
-    // These were being handled "automatically" while parsing regular list
-    // elements, but here we have to special-handle them; see below.
-    //
+fn finalizeImproperList(s: *State) *State {
+    s.context = list.reverseWithTail(s.context, s.retval);
+    return endImproperList(s);
+}
 
+fn endImproperList(s: *State) *State {
     s.consumeBlanks();
-
     if (s.eof()) {
         return err(s, "unexpected EOF while parsing list");
     }
 
-    const result = list.reverseWithTail(s.list_stack, s.retval);
-
-    const char = s.getc();
-    const open = s.opening_bracket;
-
-    if (open == '(' and char == ')') {
-        return s.returnDatum(result);
-    }
-    if (open == '[' and char == ']') {
-        const rune = value.rune.pack("SQUARE");
-        return s.returnDatum(value.pair.cons(rune, result));
-    }
-    if (open == '{' and char == '}') {
-        const rune = value.rune.pack("BRACE");
-        return s.returnDatum(value.pair.cons(rune, result));
+    if (isEndOfList(s)) {
+        return endList(s);
     }
 
-    // Handle trailing datum comments, but don't allow anything else!
-
-    if (char == '#' and s.peek() == ';') {
-        s.skip();
-
-        // We will "abuse" the list_stack field to save the result.
-        s.list_stack = result;
-        return s.recurParse(.start_datum, .handle_trailing_datum_comments);
+    if (s.getc() == '#') {
+        if (s.eof()) {
+            return err(s, "unexpected EOF after hash while parsing list");
+        }
+        if (s.getc() == ';') {
+            return s.recurParse(.start_datum, .end_improper_list);
+        }
     }
 
     return err(s, "malformed list / extra datum at end of improper list");
 }
 
-fn handleTrailingDatumComments(s: *State) *State {
-    s.consumeBlanks();
-
-    if (s.eof()) {
-        return err(s, "unexpected EOF while parsing list");
-    }
-
-    const result = s.list_stack;
-
-    const char = s.getc();
+fn endList(s: *State) *State {
     const open = s.opening_bracket;
+    const char = s.getc();
 
+    // Check for proper ending: (foo bar baz)
     if (open == '(' and char == ')') {
-        return s.returnDatum(result);
+        return s.returnDatum(s.context);
     }
     if (open == '[' and char == ']') {
         const rune = value.rune.pack("SQUARE");
-        return s.returnDatum(value.pair.cons(rune, result));
+        return s.returnDatum(value.pair.cons(rune, s.context));
     }
     if (open == '{' and char == '}') {
         const rune = value.rune.pack("BRACE");
-        return s.returnDatum(value.pair.cons(rune, result));
-    }
-
-    // Handle trailing datum comments, but don't allow anything else!
-
-    if (char == '#' and s.peek() == ';') {
-        s.skip();
-
-        // We will "abuse" the list_stack field to save the result.
-        s.list_stack = result;
-        return s.recurParse(.start_datum, .handle_trailing_datum_comments);
+        return s.returnDatum(value.pair.cons(rune, s.context));
     }
 
-    return err(s, "malformed list / extra datum at end of improper list");
-}
-
-fn handlePlusMinus(s: *State) *State {
-    return s;
-}
-
-fn handleDigit(s: *State) *State {
-    return s;
+    return err(s, "wrong closing bracket for list");
 }
 
 fn err(s: *State, msg: []const u8) noreturn {
diff --git a/src/libzisp/io/reader.zig b/src/libzisp/io/reader.zig
new file mode 100644
index 0000000..d6de79d
--- /dev/null
+++ b/src/libzisp/io/reader.zig
@@ -0,0 +1,10 @@
+// See parse.zig for details.
+
+const parser = @import("parser.zig");
+const decoder = @import("decoder.zig");
+
+const Value = @import("../value.zig").Value;
+
+pub fn readCode(input: []const u8) Value {
+    return decoder.decode(parser.parse(input));
+}
diff --git a/src/libzisp/io/writer.zig b/src/libzisp/io/writer.zig
new file mode 100644
index 0000000..eb27e20
--- /dev/null
+++ b/src/libzisp/io/writer.zig
@@ -0,0 +1 @@
+// wip
author	Taylan Kammer <taylan.kammer@gmail.com>	2025-02-22 15:04:05 +0100
committer	Taylan Kammer <taylan.kammer@gmail.com>	2025-02-22 15:04:05 +0100
commit	c922361115c8ee398ec4e26bb0af8cca4dcb9667 (patch)
tree	6b093f2ad960bf073d295458faf6707e5123c1e4
parent	2d52864265e06a4d863dba84e1d20c8d52d8c5dd (diff)