update

author: Taylan Kammer <taylan.kammer@gmail.com> 2025-02-25 23:10:35 +0100
committer: Taylan Kammer <taylan.kammer@gmail.com> 2025-02-25 23:10:35 +0100
commit: 34de389fe744018e808f2c8b301648d504ab610d (patch)
tree: 7806846fd4ce078f89ce173769b2d41c3d0362ed
parent: d08f735e8c8ca108a065d787a92f17b28f0409af (diff)
2 files changed, 57 insertions, 97 deletions
diff --git a/src/libzisp.zig b/src/libzisp.zig
index be3f683..79a54b4 100644
--- a/src/libzisp.zig
+++ b/src/libzisp.zig
@@ -247,7 +247,7 @@ test "pair" {
 }
 
 test "parse" {
-    const val = io.parser.parseCode("\"foo\"");
+    const val = io.parser.parse("\"foo\"");
 
     try std.testing.expect(value.sstr.check(val));
 
@@ -256,7 +256,7 @@ test "parse" {
 }
 
 test "parse2" {
-    const val = io.parser.parseCode(
+    const val = io.parser.parse(
         \\ ;; Testing some crazy datum comments
         \\ ##;"bar"#;([x #"y"]{##`,'z})"foo"
         \\ ;; end
@@ -273,7 +273,7 @@ test "parse2" {
 }
 
 test "parse3" {
-    const val = io.parser.parseCode(
+    const val = io.parser.parse(
         \\(foo #;x #;(x y) #;x #bar [#x #"baz"] 'bat)
     );
 
@@ -289,7 +289,7 @@ test "parse3" {
 }
 
 test "parse4" {
-    const val = io.parser.parseCode("(foo . #;x bar #;y)");
+    const val = io.parser.parse("(foo . #;x bar #;y)");
 
     const s = value.sstr.unpack(value.pair.car(val));
     try std.testing.expectEqualStrings("foo", s.slice());
@@ -301,6 +301,6 @@ test "parse4" {
 test "unparse" {
     try std.testing.expectEqualStrings(
         "#foo",
-        io.unparser.unparse(io.parser.parseCode("#foo")),
+        io.unparser.unparse(io.parser.parse("#foo")),
     );
 }
diff --git a/src/libzisp/io/parser.zig b/src/libzisp/io/parser.zig
index 45a752e..1e61385 100644
--- a/src/libzisp/io/parser.zig
+++ b/src/libzisp/io/parser.zig
@@ -1,21 +1,14 @@
 //
 // === Parser for Code & Data ===
 //
-// Zisp s-expressions come in two flavors: code (sugar) and data (no sugar).
+// Zisp s-expressions are defined in terms of an extremely minimal set of data
+// types; only that which is necessary to build representations of more complex
+// expressions and types:
 //
-// However, code expressions are parsed into the same data types which the data
-// expressions can represent, so homoiconicity is preserved.
+//   type      format/examples    comment
+//   ----      ---------------    -------
 //
-// The "sugar" used in code expressions is merely shorthand for more complex
-// data expressions, which could have been written by hand.
-//
-// Data expressions have a very simple format, and are only able to express the
-// bare minimum set of data types needed to represent more complex data:
-//
-//   type      format             comment
-//   ----      ------             -------
-//
-//   string    foo , "foo bar"    symbols and strings are the same data type
+//   string    foo , "foo bar"    quoted strings are flagged as such
 //
 //   rune      #name              name is: [a-zA-Z][a-zA-Z0-9]{0,5}
 //
@@ -23,52 +16,59 @@
 //
 //   nil       ()                 we prefer the term nil over null
 //
-// The list short-hand syntax is the only "syntax sugar" supported in data:
+// The parser recognizes various "syntax sugar" and transforms uses of it into
+// uses of the above types.
+//
+// The most ubiquitous example is of course the list syntax:
 //
-//   (DATUM DATUM DATUM)  ->  (DATUM . (DATUM . (DATUM . ())))
+//   (datum1 datum2 ...)  ->  (datum1 . (datum2 . (... . ())))
 //
-// We may use terms like "code parser" and "data parser" out of convenience,
-// although there may only be a single parser that implements both formats by
-// switching between modes.
+// The following table summarizes the other supported transformations:
 //
-// When the code parser encounters syntax sugar, it always transforms it into a
-// list starting with a rune.  The list of all such transformations follows.
+//   [...]   -> (#SQUARE ...)          #datum      -> (#HASH . datum)
 //
-//   #datum  -> (#HASH . datum)        #name(...)  -> (#name ...)
+//   {...}   -> (#BRACE ...)           #rune(...)  -> (#rune ...)
 //
-//   [...]   -> (#SQUARE ...)          dat1dat2    -> (#JOIN dat1 . dat2)
+//   #<...>  -> (#ANGLE ...)           dat1dat2    -> (#JOIN dat1 . dat2)
 //
-//   {...}   -> (#BRACE ...)           dat1.dat2   -> (#DOT dat1 . dat2)
+//   'datum  -> (#QUOTE . datum)       dat1.dat2   -> (#DOT dat1 . dat2)
 //
-//   'datum  -> (#QUOTE . datum)       #n#=datum   -> (#LABEL n . datum)
+//   `datum  -> (#GRAVE . datum)       #n#=datum   -> (#LABEL n . datum)
 //
-//   `datum  -> (#GRAVE . datum)       #n#         -> (#LABEL . n)
+//   ,datum  -> (#COMMA . datum)       #n#         -> (#LABEL . n)
 //
-//   ,datum  -> (#COMMA . datum)
+// Notes:
 //
-// (The "#datum" form refers to expressions that cannot be mistaken for a rune,
-// such as for example: #(...) or #"..." etc.)
+// * The terms datum, dat1, and dat2 each refer to an arbitrary datum; ellipsis
+//   means zero or more data; n is a non-negative integer.
 //
-// The terms "datum", "dat1", and "dat2" refer to an arbitrary datum; "name" is
-// a rune name; ellipsis mean zero or more data; "n" is a non-negative integer.
+// * The #datum form applies only to expressions that cannot be mistaken for a
+//   rune, such as for example: #(...) or #"..." or #'string etc.; following a
+//   hash sign with a plain string would otherwise be parsed as a rune.
 //
-// Though not represented in the table above due to notational difficulty, the
-// format "#name(...)" doesn't require a list in the second position; any datum
-// works, so long as there's no ambiguity:
+// * Though not represented in the table due to notational difficulty, the
+//   format "#rune(...)" doesn't require a list in the second position; any
+//   datum works, so long as there's no ambiguity; for example:
 //
-//   #name1#name2  -> (#name1 . #name2)
+//     #rune1#rune2  -> (#rune1 . #rune2)
 //
-//   #name"text"   -> (#name . "text")
+//     #rune"text"   -> (#rune . "text")
 //
-// As a counter-example, following a rune immediately with a bare string is not
-// possible, since it's ambiguous:
+//     #rune'string  -> (#rune #QUOTE . string)
 //
-//   #abcdefgh  ;Could be (#abcdef . gh) or (#abcde . fgh) or ...
+//   As a counter-example, following a rune immediately with a bare string is
+//   not possible, since it's ambiguous:
 //
-// The parser will see this as an attempt to use an 8-letter rune name, and
-// raise an error, since rune names are limited to 6 characters.
+//     #abcdefgh  ;Could be (#abcdef . gh) or (#abcde . fgh) or ...
 //
-// Syntax sugar can combine arbitrarily:
+//   The parser will see this as an attempt to use an 8-letter rune name, and
+//   raise an error, since rune names are limited to 6 characters.
+//
+// * The #<...> form is a special case; the less-than and greater-than symbols
+//   are not otherwise treated as brackets; e.g., <a b c d> is actually four
+//   strings: "<a", "b", "c", "d>".
+//
+// Syntax sugar can combine arbitrarily; some examples follow:
 //
 //   #{...}            -> (#HASH #BRACE ...)
 //
@@ -80,9 +80,9 @@
 //
 //   foo.bar.baz{x y}  -> (#JOIN (#DOT (#DOT foo . bar) . baz) #BRACE x y)
 //
-// Runes are case-sensitive, and the code parser only emits runes using
-// upper-case letters, so lower-case runes are free for user extensions.
-// Exceptions are runes used directly in code, like #true and #false.
+// Runes are case-sensitive, and the parser only emits runes using upper-case
+// letters when expressing syntax sugar, so there can be no accidental clash
+// with runes that appear verbatim in code.
 //
 // Although strings and symbols aren't disjoint types in Zisp, the parser flags
 // double-quoted string literals to allow distinguishing them from bare strings.
@@ -95,7 +95,7 @@
 //
 // Note that 'foo becomes (quote foo) in Scheme, but (#QUOTE . foo) in Zisp.
 // The operand of #QUOTE is the entire cdr.  The same principle is used when
-// parsing other sugar:
+// parsing other sugar; some examples follow:
 //
 //          Incorrect                              Correct
 //
@@ -117,13 +117,11 @@
 // expect a vector literal like #(...) to work in Scheme.
 //
 // Runes may be decoded in isolation as well, rather than transforming a list
-// whose head they appear in.  This can implement #true and #false.  (These
-// would be used verbatim in code, rather than emitted by the parser.)
+// whose head they appear in.  This can implement #true and #false.
 //
 // The decoder may also perform arbitrary transforms on any type; for example,
 // it may turn bare strings (those not flagged as double-quoted) into numbers
-// when it's decoding data representing code.  This is how number literals are
-// implemented in Zisp.
+// when appropriate.  This is how number literals are implemented.
 //
 // The decoder recognizes (#QUOTE ...) to implement the traditional quoting
 // mechanism, but with a significant difference:
@@ -238,13 +236,10 @@ const value = @import("../value.zig");
 const ShortString = value.ShortString;
 const Value = value.Value;
 
-pub const Mode = enum { code, data };
-
 const TopState = struct {
     alloc: std.mem.Allocator,
     input: []const u8,
     pos: usize = 0,
-    mode: Mode = undefined,
 };
 
 const State = struct {
@@ -260,10 +255,6 @@ const State = struct {
     // To remember what kind of list we're in: () [] {}
     opening_bracket: u8 = undefined,
 
-    fn mode(s: *State) Mode {
-        return s.top.mode;
-    }
-
     fn eof(s: *State) bool {
         return s.top.pos >= s.top.input.len;
     }
@@ -380,17 +371,13 @@ const Fn = enum {
     perform_return,
 };
 
-pub fn parseCode(input: []const u8) Value {
-    return parse(input, .code);
-}
-
-pub fn parse(input: []const u8, mode: Mode) Value {
+pub fn parse(input: []const u8) Value {
     var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init;
     defer if (gpa.deinit() == .leak) @panic("leak");
     const alloc = gpa.allocator();
     // var pool: std.heap.MemoryPool(State) = .init(alloc);
     // defer pool.deinit();
-    var top = TopState{ .alloc = alloc, .input = input, .mode = mode };
+    var top = TopState{ .alloc = alloc, .input = input };
     var s0 = State{ .top = &top };
     var s = &s0;
     while (true) s = switch (s.next) {
@@ -466,11 +453,6 @@ fn endDatum(s: *State, d: Value) *State {
         return s.returnDatum(d);
     }
 
-    // These are only allowed in code mode.
-    if (s.mode() == .data) {
-        return err(s, "invalid use of hash in data mode");
-    }
-
     s.context = d;
 
     if (s.peek() == '.') {
@@ -515,7 +497,7 @@ fn handleHash(s: *State) *State {
     //
     //   #|;DATUM    ;datum comment
     //
-    //   #|DATUM     ;hash-datum (code mode only)
+    //   #|DATUM     ;hash-datum
     //
 
     if (s.eof()) {
@@ -547,30 +529,12 @@ fn handleHash(s: *State) *State {
 
     // Otherwise, it must be a hash-datum.  #DATUM
 
-    // But data mode doesn't allow that.
-    if (s.mode() == .data) {
-        return err(s, "use of hash-datum sequence not allowed in data mode");
-    }
-
     return s.recurParse(.start_datum, .end_hash_datum);
 }
 
 fn handleRune(s: *State) *State {
-    const rune = readRune(s) orelse return err(s, "rune too long");
-    //
-    // Now we're at the end of the rune, but it could be a rune-datum:
-    //
-    //   #foo|(...)
-    //
-
-    if (isEndOfDatum(s)) {
-        // Nope, just a stand-alone rune.
-        return s.returnDatum(rune);
-    }
-
-    // Otherwise, it's followed by a datum, like: #foo(...)
-
-    return endDatum(s, rune);
+    const r = readRune(s) orelse return err(s, "rune too long");
+    return endDatum(s, r);
 }
 
 fn readRune(s: *State) ?Value {
@@ -719,10 +683,6 @@ fn endQuote(s: *State) *State {
 fn startList(s: *State) *State {
     const open = s.getc();
 
-    if (s.mode() == .data and open != '(') {
-        return err(s, "invalid opening bracket in data mode");
-    }
-
     s.consumeBlanks();
     if (s.eof()) {
         return err(s, "unexpected EOF while parsing list");
author	Taylan Kammer <taylan.kammer@gmail.com>	2025-02-25 23:10:35 +0100
committer	Taylan Kammer <taylan.kammer@gmail.com>	2025-02-25 23:10:35 +0100
commit	34de389fe744018e808f2c8b301648d504ab610d (patch)
tree	7806846fd4ce078f89ce173769b2d41c3d0362ed
parent	d08f735e8c8ca108a065d787a92f17b28f0409af (diff)