new parser

author: Taylan Kammer <taylan.kammer@gmail.com> 2025-03-27 21:18:09 +0100
committer: Taylan Kammer <taylan.kammer@gmail.com> 2025-03-27 21:18:09 +0100
commit: f2b18d64448ab09dd5e5e6a180d38d90d5aaf367 (patch)
tree: d83510fc892bf31feed56688af3ec350012cdeb9
parent: 91629014bfe50e1d45cffedd618ab28a063f7689 (diff)
12 files changed, 1195 insertions, 584 deletions
diff --git a/_tests/antlr.bnf b/_tests/antlr.bnf
new file mode 100644
index 0000000..7b0cf83
--- /dev/null
+++ b/_tests/antlr.bnf
@@ -0,0 +1,99 @@
+grammar ExprLexer;
+
+LF      : '\n' ;
+SP      : ' ' ;
+WS_CC   : [\t\r\f] ;
+SEMI    : ';' ;
+DOT     : '.' ;
+COLON   : ':' ;
+PIPE    : '|' ;
+TILDE   : '~' ;
+BSLASH  : '\\' ;
+DQUOTE  : '"' ;
+HASH    : '#' ;
+LPAREN  : '(' ;
+RPAREN  : ')' ;
+LSQBR   : '[' ;
+RSQBR   : ']' ;
+LCURLY  : '{' ;
+RCURLY  : '}' ;
+EQUAL   : '=' ;
+APOS    : '\'' ;
+GRAVE   : '`' ;
+COMMA   : ',' ;
+
+    : [\u0000-\u0009\u000b-\u00ff] ;
+
+BARE_CHAR     : [a-zA-Z0-9!$%&*+/<=>?@^_~-] | '\\' BARE_ESC ;
+
+BARE_ESC      : [\u0021-\u007e] ;
+
+QUOTED_CHAR   : [^"\\] | '\\' QUOTED_ESC ;
+
+QUOTED_ESC    : [\\"abefnrtv]
+              | 'x' HEX_DIGIT HEX_DIGIT
+              | 'u' '{' HEX_DIGIT+ '}'
+              ;
+
+
+HEX_DIGIT  : [0-9a-fA-F] ;
+ALPHA      : [a-zA-Z] ;
+ALNUM      : [a-zA-Z0-9] ;
+
+
+
+parse_unit    : blank* datum blank? ;
+
+
+blank         : LF | SP | WS_CC | comment ;
+
+datum         : datum_one ( join_char? datum )? ;
+
+
+comment       : ';' ( skip_datum | skip_line ) ;
+
+
+datum_one     : bare_string | fancy_datum ;
+
+join_char     : '.' | ':' | '|' ;
+
+
+skip_datum    : '~' parse_unit ;
+
+skip_line     : ANY_BUT_LF* LF? ;
+
+
+bare_string   : BARE_CHAR+ ;
+
+fancy_datum   : '\\' bare_esc_str
+              | '"' quoted_str '"'
+              | '#' hash_expr
+              | '(' list_body ')'
+              | '[' list_body ']'
+              | '{' list_body '}'
+              | quote_expr
+              ;
+
+
+bare_esc_str  : '\\' BARE_ESC BARE_CHAR* ;
+
+quoted_str    : QUOTED_CHAR* ;
+
+hash_expr     : rune fancy_datum?
+              | label ( '=' fancy_datum )?
+              | fancy_datum
+              ;
+
+list_body     : blank* ( list_head list_tail? )? ;
+
+quote_expr    : ( '\'' | '`' | ',' ) datum ;
+
+
+rune          : ALPHA ALNUM* ;
+
+label         : '%' HEX_DIGIT+ ;
+
+
+list_head     : datum blank* list_head? ;
+            
+list_tail     : blank+ '.' blank+ datum blank* ;
diff --git a/_tests/string b/_tests/string
new file mode 100644
index 0000000..c365d58
--- /dev/null
+++ b/_tests/string
@@ -0,0 +1 @@
+\0\a\b\t\n\v\f\r\e\e\r\f\v\n\t\b\a\0
+\ No newline at end of file
diff --git a/_tests/switchtable.zig b/_tests/switchtable.zig
new file mode 100644
index 0000000..722ecdd
--- /dev/null
+++ b/_tests/switchtable.zig
@@ -0,0 +1,93 @@
+const std = @import("std");
+
+const Reader = std.io.AnyReader;
+
+pub fn main() !u8 {
+    return f();
+}
+
+fn f() !u8 {
+    const file = try std.fs.cwd().openFile("string", .{});
+    defer file.close();
+
+    var br = std.io.bufferedReader(file.reader());
+    const r = br.reader().any();
+
+    var n: u8 = 0;
+    for (0..1_000_000) |i| {
+        _ = i;
+        while (r.readByte() catch null) |c| {
+            n +%= try f1(r, c);
+        }
+        br.start = 0;
+        br.end = 0;
+        try file.seekTo(0);
+    }
+    return n;
+}
+
+fn f1(r: Reader, c1: u8) !u8 {
+    if (c1 != '\\') return c1;
+    const c = try r.readByte();
+    if (c == 'u') return unknown1();
+    return switch (c) {
+        '\\', '"' => c,
+        '0' => 0,
+        'a' => 7,
+        'b' => 8,
+        't' => 9,
+        'n' => 10,
+        'v' => 11,
+        'f' => 12,
+        'r' => 13,
+        'e' => 27,
+        'x' => unknown2(),
+        else => unknown3(),
+    };
+}
+
+fn f2(r: Reader, c1: u8) !u8 {
+    if (c1 != '\\') return c1;
+    const c = try r.readByte();
+    if (c == 'u') return unknown1();
+    if (c == 'x') return unknown2();
+    if (c == '\\' or c == '"') return c;
+    const itable = .{ '0', 'a', 'b', 't', 'n', 'v', 'f', 'r', 'e' };
+    const ctable: []const u8 = &.{ 0, 7, 8, 9, 10, 11, 12, 13, 27 };
+    const i = std.mem.indexOfScalar(u8, &itable, c) orelse return unknown3();
+    return ctable[i];
+}
+
+fn f3(r: Reader, c1: u8) !u8 {
+    if (c1 != '\\') return c1;
+    const c = try r.readByte();
+    if (c == 'u') return unknown1();
+    if (c == 'x') return unknown2();
+    if (c == '\\' or c == '"') return c;
+    if (c == '0') return 0;
+    const table = comptime t: {
+        var table: [26]u8 = .{0} ** 26;
+        table['a' - 'a'] = 7;
+        table['b' - 'a'] = 8;
+        table['t' - 'a'] = 9;
+        table['n' - 'a'] = 10;
+        table['v' - 'a'] = 11;
+        table['f' - 'a'] = 12;
+        table['r' - 'a'] = 13;
+        table['e' - 'a'] = 27;
+        break :t table;
+    };
+    if (c < 'a') return unknown3();
+    const result = table[c - 'a'];
+    return if (result != 0) result else unknown3();
+}
+
+fn unknown1() u8 {
+    return 0;
+}
+fn unknown2() u8 {
+    return 0;
+}
+fn unknown3() u8 {
+    return 0;
+}
diff --git a/test.c b/_tests/test.c
index 0b0917e..0b0917e 100644
--- a/test.c
+++ b/_tests/test.c
diff --git a/test.scm b/_tests/test.scm
index d893c9f..d893c9f 100644
--- a/test.scm
+++ b/_tests/test.scm
diff --git a/_tests/test.zig b/_tests/test.zig
new file mode 100644
index 0000000..7b4a04c
--- /dev/null
+++ b/_tests/test.zig
@@ -0,0 +1,84 @@
+const std = @import("std");
+
+pub fn main() void {
+    // const y: [3]u64 = .{ 1, 2, 3 };
+    // const x: struct { u8, u64, u8 } = y;
+    // @import("std").debug.print("{}\n", .{x[0] + x[1] + x[2]});
+
+    std.debug.print("{}\n", .{@sizeOf(struct { a: u8, b: u64, c: u8, d: bool })});
+}
+
+// const x: ?u8 = 5;
+// if (x == null) {
+//     return 1;
+// } else |val| {
+//     return val;
+// }
+// var list = std.ArrayList(u8).init(std.heap.smp_allocator);
+// try parseUniHex("1f4a9", &list);
+// std.debug.print("{s}\n", .{list.items});
+
+// fn parseUniHex(
+//     str: []const u8,
+//     s: *std.ArrayList(u8),
+// ) !void {
+//     var uc: u21 = parseHexDigit1(str[0]);
+//     for (str[1..]) |c| {
+//         uc = try std.math.shlExact(u21, uc, 4);
+//         uc |= parseHexDigit1(c);
+//     }
+
+//     std.debug.print("{u}\n", .{uc});
+
+//     const n = try std.unicode.utf8CodepointSequenceLength(uc);
+//     const buf = try s.addManyAsSlice(n);
+//     _ = try std.unicode.utf8Encode(uc, buf);
+// }
+
+// fn parseHexByte1(h1: u8, h2: u8) u8 {
+//     const hi = parseHexDigit1(h1);
+//     const lo = parseHexDigit1(h2);
+//     return hi << 4 | lo;
+// }
+
+// fn parseHexDigit1(c: u8) u8 {
+//     return switch (c) {
+//         '0'...'9' => c - '0',
+//         'A'...'F' => c - 'A' + 10,
+//         'a'...'f' => c - 'a' + 10,
+//         else => @panic(""),
+//     };
+// }
+
+// fn parseHexByte2(h1: u8, h2: u8) u8 {
+//     const hi: u8 = parseHexDigit2(h1);
+//     const lo = parseHexDigit2(h2);
+//     return hi << 4 | lo;
+// }
+
+// fn parseHexDigit2(c: u8) u4 {
+//     return @intCast(switch (c) {
+//         '0'...'9' => c - '0',
+//         'A'...'F' => c - 'A' + 10,
+//         'a'...'f' => c - 'a' + 10,
+//         else => @panic(""),
+//     });
+// }
+
+// fn parseUniHex1(str: []const u8) !u21 {
+//     var uc: u21 = parseHexDigit1(str[0]);
+//     for (str[1..]) |c| {
+//         uc = try std.math.shlExact(u21, uc, 4);
+//         uc |= parseHexDigit1(c);
+//     }
+//     return uc;
+// }
+
+// fn parseUniHex2(str: []const u8) !u21 {
+//     var uc: u21 = parseHexDigit2(str[0]);
+//     for (str[1..]) |c| {
+//         uc = try std.math.shlExact(u21, uc, 4);
+//         uc |= parseHexDigit2(c);
+//     }
+//     return uc;
+// }
diff --git a/spec/parser.ebnf b/spec/parser.ebnf
new file mode 100644
index 0000000..9e02fba
--- /dev/null
+++ b/spec/parser.ebnf
@@ -0,0 +1,75 @@
+unit          : blank* ( datum blank? | EOF ) ;
+
+
+blank         : 9...13 | comment ;
+
+datum         : one_datum ( join_char? one_datum )* ;
+
+join_char     : '.' | ':' | '|' ;
+
+
+comment       : ';' ( skip_unit | skip_line ) ;
+
+skip_unit     : '~' unit ;
+
+skip_line     : ( ~LF )* LF? ;
+
+
+one_datum     : ( bare_str | clad_datum ) ;
+
+bare_str      : bare_str_elt+ ;
+
+clad_datum    : '\' bare_esc_str
+              | '"' quoted_str '"'
+              | '#' hash_expr
+              | '(' blank* list? ')'
+              | '[' blank* list? ']'
+              | '{' blank* list? '}'
+              | quote_expr
+              ;
+
+
+bare_str_elt  : bare_char | '\' bare_esc ;
+
+
+bare_esc_str  : bare_esc bare_str_elt* ;
+
+quoted_str    : ( quoted_char | '\' quoted_esc )* ;
+
+hash_expr     : rune clad_datum?
+              | '%' label ( '%' | '=' unit )
+              | clad_datum
+              ;
+
+list          : unit+ ( '.' blank+ unit )? blank* ;
+
+quote_expr    : ( "'" | "`" | "," ) datum ;
+
+
+bare_char     : letter | digit
+              | '!' | '$' | '%' | '&' | '*' | '+' | '-' | '/'
+              | '<' | '=' | '>' | '?' | '@' | '^' | '_' | '~'
+              ;
+
+bare_esc      : 33...126 ;
+
+
+quoted_char   : ~( '"' | '\' ) ;
+
+quoted_esc    : '\' | '"' | 'a' | 'b' | 'e'
+              | 'f' | 'n' | 'r' | 't' | 'v'
+              | 'x' hex_digit{2}
+              | 'u' '{' hex_digit+ '}'
+              ;
+
+
+rune          : letter ( letter | digit ){0,5} ;
+
+label         : hex_digit{1,12} ;
+
+
+letter        : 'a'...'z' | 'A'...'Z' ;
+
+digit         : '0'...'9' ;
+
+hex_digit     : '0'...'9' | 'a'...'f' | 'A'...'F' ;
diff --git a/spec/syntax.md b/spec/syntax.md
new file mode 100644
index 0000000..b85ed78
--- /dev/null
+++ b/spec/syntax.md
@@ -0,0 +1,34 @@
+# Zisp S-Expression Syntax
+
+We use a BNF notation with the following rules:
+
+* Concatenation of expressions is implicit: `foo bar` means `foo`
+  followed by `bar`.
+
+* Expressions may be followed by `?`, `*`, `+`, `{N}`, or `{N,M}`,
+  which have the meanings they have in regular expressions.
+
+* The syntax is defined in terms of bytes, not characters.  Terminals
+  `'c'` and `"c"` refer to the ASCII value of the given character `c`.
+  Numbers are in decimal and refer to a byte with the given value.
+
+* The `~` prefix means NOT.  It only applies to rules that match one
+  byte, and negates them.  For example, `~( 'a' | 'b' )` matches any
+  byte other than 97 and 98.
+
+* Ranges of terminal values are expressed as `x...y` (inclusive).
+
+* There is no ambiguity, backtracking, or look-ahead beyond the byte
+  currently being matched.  Rules match left to right, depth-first,
+  and greedy.  As soon as the input matches the first terminal of a
+  rule, it must match that rule to the end.
+
+The last rule means that the BNF is very simple to translate to code.
+
+The parser consumes one `unit` from an input stream every time it's
+called; it returns the `datum` therein, or EOF.
+
+```
+
+
+```
diff --git a/src/libzisp.zig b/src/libzisp.zig
index 90508be..e6c8ac5 100644
--- a/src/libzisp.zig
+++ b/src/libzisp.zig
@@ -13,6 +13,20 @@ pub const Hval = gc.Hval;
 pub const ShortString = value.ShortString;
 pub const Value = value.Value;
 
+fn benchmark(name: []const u8, iters: usize, func: fn () anyerror!void) !void {
+    var timer = try std.time.Timer.start();
+    for (0..iters) |i| {
+        _ = i;
+        try func();
+    }
+    const ns: f64 = @floatFromInt(timer.lap());
+    const secs = ns / 1_000_000_000;
+    std.debug.print(
+        "bench {s} x {}: {d:.3}s\n",
+        .{ name, iters, secs },
+    );
+}
+
 test "double" {
     const d1: f64 = 0.123456789;
     const d2: f64 = -0.987654321;
@@ -128,7 +142,7 @@ test "sstr" {
             .ReleaseSafe => 100_000_000,
             .ReleaseFast => 1_000_000_000,
         };
-        std.debug.print("Benchmarking with {} iters.\n", .{iters});
+        std.debug.print("Benchmarking sstr with {} iters.\n", .{iters});
         inline for (impls, 0..) |impl, i| {
             try benchmarkSstr(impl, i, iters);
         }
@@ -288,7 +302,7 @@ test "parse" {
 test "parse2" {
     const val = parseString(
         \\ ;; Testing some crazy datum comments
-        \\ #;"bar"#;([x #"y"]{##`,'z}) #"foo"
+        \\ ;~"bar"([x #"y"]{##`,'z}) #"foo"
         \\ ;; end
     );
 
@@ -302,90 +316,86 @@ test "parse2" {
     try std.testing.expectEqualStrings("foo", f.slice());
 }
 
-test "parse3" {
-    const val = parseString(
-        \\(foo #;x #;(x y) #;x #bar [#x #"baz"] 'bat)
-    );
-
-    const car = value.pair.car;
-    const cdr = value.pair.cdr;
-
-    const e1 = car(val);
-    const e2 = car(cdr(val));
-    const e3 = car(cdr(cdr(val)));
-    const e4 = car(cdr(cdr(cdr(val))));
-
-    try std.testing.expect(value.sstr.check(e1));
-    try std.testing.expect(value.rune.check(e2));
-    try std.testing.expect(value.pair.check(e3));
-    try std.testing.expect(value.pair.check(e4));
-}
-
-test "parse4" {
-    const val = parseString("(foo . #;x bar #;y)");
-
-    const s = value.sstr.unpack(value.pair.car(val));
-    try std.testing.expectEqualStrings("foo", s.slice());
-
-    const f = value.sstr.unpack(value.pair.cdr(val));
-    try std.testing.expectEqualStrings("bar", f.slice());
-}
-
-fn parseBench(path: []const u8) !void {
-    const iters = 1000;
-
-    var buf: [8196]u8 = undefined;
-    const file = try std.fs.cwd().openFile(path, .{});
-    defer file.close();
-    const count = try file.readAll(&buf);
-
-    var fbs = std.io.fixedBufferStream(buf[0..count]);
-    const reader = fbs.reader().any();
-
-    var timer = try std.time.Timer.start();
-    for (0..iters) |i| {
-        _ = i;
-        var v: Value = undefined;
-        while (true) {
-            v = io.parser.parse(reader);
-            if (value.eof.check(v)) {
-                break;
-            }
-        }
-        try fbs.seekTo(0);
-    }
-    const ns: f64 = @floatFromInt(timer.lap());
-    const secs = ns / 1_000_000_000;
-    std.debug.print(
-        "parse {s} x {}: {d:.3}s\n",
-        .{ path, iters, secs },
-    );
-}
-
-test "parse bench" {
-    try parseBench("test-data/parser-test-1.scm");
-    try parseBench("test-data/parser-test-2.scm");
-}
-
-test "unparse" {
-    var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init;
-    var out: std.ArrayList(u8) = .init(gpa.allocator());
-
-    const w = out.writer();
-    const v = parseString("#foo");
-    try io.unparser.unparse(w, v);
-    try std.testing.expectEqualStrings("#foo", try out.toOwnedSlice());
-}
-
-test "unparse2" {
-    var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init;
-    var out: std.ArrayList(u8) = .init(gpa.allocator());
-
-    const w = out.writer();
-    const v = parseString("#{foo bar['x]}");
-    try io.unparser.unparse(w, v);
-    try std.testing.expectEqualStrings(
-        "(#HASH #BRACE foo (#JOIN bar #SQUARE (#QUOTE . x)))",
-        try out.toOwnedSlice(),
-    );
-}
+// test "parse3" {
+//     const val = parseString(
+//         \\(foo ;~x ;~(x y) ;~x #bar [#x #"baz"] 'bat)
+//     );
+
+//     const car = value.pair.car;
+//     const cdr = value.pair.cdr;
+
+//     const e1 = car(val);
+//     const e2 = car(cdr(val));
+//     const e3 = car(cdr(cdr(val)));
+//     const e4 = car(cdr(cdr(cdr(val))));
+
+//     try std.testing.expect(value.sstr.check(e1));
+//     try std.testing.expect(value.rune.check(e2));
+//     try std.testing.expect(value.pair.check(e3));
+//     try std.testing.expect(value.pair.check(e4));
+// }
+
+// test "parse4" {
+//     const val = parseString("(foo . ;~x bar ;~y)");
+
+//     const s = value.sstr.unpack(value.pair.car(val));
+//     try std.testing.expectEqualStrings("foo", s.slice());
+
+//     const f = value.sstr.unpack(value.pair.cdr(val));
+//     try std.testing.expectEqualStrings("bar", f.slice());
+// }
+
+// fn parseBench(path: []const u8, iters: usize) !void {
+//     const file = try std.fs.cwd().openFile(path, .{});
+//     defer file.close();
+
+//     var timer = try std.time.Timer.start();
+//     for (0..iters) |i| {
+//         _ = i;
+//         var br = std.io.bufferedReader(file.reader());
+//         const reader = br.reader().any();
+//         var v: Value = undefined;
+//         while (true) {
+//             v = io.parser.parse(reader);
+//             if (value.eof.check(v)) {
+//                 break;
+//             }
+//         }
+//         try file.seekTo(0);
+//     }
+//     const ns: f64 = @floatFromInt(timer.lap());
+//     const secs = ns / 1_000_000_000;
+//     std.debug.print(
+//         "parse {s} x {}: {d:.3}s\n",
+//         .{ path, iters, secs },
+//     );
+// }
+
+// test "parse bench" {
+//     // try parseBench("test-data/parser-test-1.scm", 200);
+//     // try parseBench("test-data/parser-test-2.scm", 800);
+//     try parseBench("test-data/parser-torture.scm", 1);
+// }
+
+// test "unparse" {
+//     var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init;
+//     var out: std.ArrayList(u8) = .init(gpa.allocator());
+
+//     const w = out.writer();
+//     const v = parseString("#foo");
+//     try io.unparser.unparse(w, v);
+//     try std.testing.expectEqualStrings("#foo", try out.toOwnedSlice());
+// }
+
+// test "unparse2" {
+//     var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init;
+//     var out: std.ArrayList(u8) = .init(gpa.allocator());
+
+//     const w = out.writer();
+//     const v = parseString("#{foo bar['x]}");
+//     try io.unparser.unparse(w, v);
+//     try std.testing.expectEqualStrings(
+//         "(#HASH #BRACE foo (#JOIN bar #SQUARE (#QUOTE . x)))",
+//         try out.toOwnedSlice(),
+//     );
+// }
diff --git a/src/libzisp/io/parser.zig b/src/libzisp/io/parser.zig
index d02eda3..651d124 100644
--- a/src/libzisp/io/parser.zig
+++ b/src/libzisp/io/parser.zig
@@ -3,23 +3,23 @@
 //
 // Zisp s-expressions are defined in terms of an extremely minimal set of data
 // types; only that which is necessary to build representations of more complex
-// expressions and types:
+// expressions and data types:
 //
-//   type      format/examples    comment
-//   ----      ---------------    -------
+//   +---------+-------------+---------------+--------+---------+--------+
+//   | TYPE    | Bare String | Quoted String | Rune   | Pair    | Nil    |
+//   +---------+-------------+---------------+--------+---------+--------+
+//   | EXAMPLE | foobar      | "foo bar"     | #name  | (X . Y) | ()     |
+//   +---------+-------------+---------------+--------+---------+--------+
 //
-//   string    foo , "foo bar"    quoted strings are flagged as such
+// Bare strings and quoted strings are polymorphic sub-types of the generic
+// string type.  This lets the compiler distinguish between identifiers and
+// string literals.
 //
-//   rune      #name              name is: [a-zA-Z][a-zA-Z0-9]{0,5}
+// There are also non-negative integers, but they are only used for datum
+// labels; regular number literals are handled by the "decoder" (see below).
 //
-//   pair      (DATUM . DATUM)    the only composite data type supported
-//
-//   nil       ()                 we prefer the term nil over null
-//
-// The parser recognizes various "syntax sugar" and transforms uses of it into
-// uses of the above types.
-//
-// The most ubiquitous example is of course the list syntax:
+// The parser recognizes various "syntax sugar" and transforms it into uses of
+// the above data types.  The most ubiquitous example is of course the list:
 //
 //   (datum1 datum2 ...)  ->  (datum1 . (datum2 . (... . ())))
 //
@@ -35,18 +35,39 @@
 //
 //   `datum  -> (#GRAVE . datum)       dat1|dat2   -> (#PIPE dat1 . dat2)
 //
-//   ,datum  -> (#COMMA . datum)       #n#=datum   -> (#LABEL n . datum)
+//   ,datum  -> (#COMMA . datum)       #%n=datum   -> (#LABEL hex . datum)
+//
+//   #%n     -> (#LABEL . hex)
 //
-//   #n#     -> (#LABEL . n)
+// A separate process called "decoding" can transform these objects into other
+// data types.  For example, (#HASH x y z) could become a vector, so that the
+// expression #(x y z) works just like in Scheme.
 //
-// Notes:
+// Decoding also resolves datum labels, and goes over bare strings to find ones
+// that are actually a number literal.  This lets us offload the complexity of
+// number parsing elsewhere, so the parser remains extremely simple.
+//
+// Further notes about the syntax sugar table and examples above:
 //
 // * The terms datum, dat1, and dat2 each refer to an arbitrary datum; ellipsis
-//   means zero or more data; n is a non-negative integer.
+//   means zero or more data; hex is a hexadecimal number of up to 12 digits.
 //
 // * The #datum form only applies to expressions that cannot be mistaken for a
-//   rune, such as for example: #(...) or #"..." or #'string etc.; following a
-//   hash sign with a plain string will parse as a rune instead.
+//   rune, such as #(...) or #"..." or #'string etc.; a hash sign followed by a
+//   bare string will parse as a rune instead (or raise an error if it contains
+//   characters that are illegal in rune names).
+//
+// * A backslash causes the immediately following character to lose any special
+//   meaning it would normally have, and be considered as part of a bare string
+//   instead.  (This does not apply to whitespace.)  For example, the following
+//   three character sequences are each a valid bare string:
+//
+//     foo\(bar\)  \]blah  \#\'xyz
+//
+//   This can be used to follow a hash with a bare string without it being
+//   parsed as a rune:
+//
+//     #\foo  -> (#HASH . foo)
 //
 // * Though not represented in the table due to notational difficulty, the
 //   format "#rune(...)" doesn't require a list in the second position; any
@@ -66,47 +87,39 @@
 //   The parser will see this as an attempt to use an 8-letter rune name, and
 //   raise an error, since rune names are limited to 6 characters.
 //
-// Syntax sugar can combine arbitrarily; some examples follow:
-//
-//   #{...}            -> (#HASH #BRACE ...)
-//
-//   #'foo             -> (#HASH #QUOTE . foo)
+// * Syntax sugar can combine arbitrarily; some examples follow:
 //
-//   ##'[...]          -> (#HASH #HASH #QUOTE #SQUARE ...)
+//     #{...}            -> (#HASH #BRACE ...)
 //
-//   {x y}[i j]        -> (#JOIN (#BRACE x y) #SQUARE i j)
+//     #'foo             -> (#HASH #QUOTE . foo)
 //
-//   foo.bar.baz{x y}  -> (#JOIN (#DOT (#DOT foo . bar) . baz) #BRACE x y)
+//     ##'[...]          -> (#HASH #HASH #QUOTE #SQUARE ...)
 //
-// Runes are case-sensitive, and the parser only emits runes using upper-case
-// letters when expressing syntax sugar, so there can be no accidental clash
-// with runes that appear verbatim in code.
+//     {x y}[i j]        -> (#JOIN (#BRACE x y) #SQUARE i j)
 //
-// Although strings and symbols aren't disjoint types in Zisp, the parser flags
-// double-quoted string literals to allow distinguishing them from bare strings.
-// Otherwise, it would not be possible for the compiler to tell the difference
-// between an identifier and a string literal.
+//     foo.bar.baz{x y}  -> (#JOIN (#DOT (#DOT foo . bar) . baz) #BRACE x y)
 //
-// You may be wondering about numbers.  As far as the parser is concerned,
-// numbers are strings.  It's the decoder (see below) that will turn bare
-// strings into numbers where appropriate.
+// * While 'foo parses as (quote foo) in Scheme, it parses as (#QUOTE . foo) in
+//   Zisp; the operand of #QUOTE is the entire cdr.  The same principle is used
+//   when parsing other sugar; some examples follow:
 //
-// Note that 'foo becomes (quote foo) in Scheme, but (#QUOTE . foo) in Zisp.
-// The operand of #QUOTE is the entire cdr.  The same principle is used when
-// parsing other sugar; some examples follow:
+//            Incorrect                              Correct
 //
-//          Incorrect                              Correct
+//     #(x y z) -> (#HASH (x y z))            #(x y z) -> (#HASH x y z)
 //
-//   #(x y z) -> (#HASH (x y z))            #(x y z) -> (#HASH x y z)
+//     [x y z]  -> (#SQUARE (x y z))          [x y z]  -> (#SQUARE x y z)
 //
-//   [x y z]  -> (#SQUARE (x y z))          [x y z]  -> (#SQUARE x y z)
+//     #{x}     -> (#HASH (#BRACE (x)))       #{x}     -> (#HASH #BRACE x)
 //
-//   #{x}     -> (#HASH (#BRACE (x)))       #{x}     -> (#HASH #BRACE x)
+//     foo(x y) -> (#JOIN foo (x y))          foo(x y) -> (#JOIN foo x y)
 //
-//   foo(x y) -> (#JOIN foo (x y))          foo(x y) -> (#JOIN foo x y)
+// * Runes are case-sensitive, and the parser only emits runes using upper-case
+//   letters when expressing syntax sugar.  This way, there can be no accidental
+//   clash with runes that appear verbatim in code, as long as only lower-case
+//   letters are used for rune literals in code.
 //
 //
-// === Decoder ===
+// === Decoding ===
 //
 // A separate process called "decoding" can transform simple data structures,
 // consisting of only the above types, into a richer set of Zisp data types.
@@ -118,8 +131,8 @@
 // whose head they appear in.  This can implement #true and #false.
 //
 // The decoder may also perform arbitrary transforms on any type; for example,
-// it may turn bare strings (those not flagged as double-quoted) into numbers
-// when appropriate.  This can implement number literals.
+// it may turn bare strings into numbers wher appropriate.  This can implement
+// number literals.
 //
 // The decoder recognizes (#QUOTE ...) to implement the traditional quoting
 // mechanism, but with a significant difference:
@@ -162,11 +175,11 @@
 // call one of the starting functions.
 //
 // If a function wants to make the parser return, either from a recursive parse
-// or from the main loop, it sets the .retval field, and tries to pop the saved
+// or from the main loop, it sets the .result field, and tries to pop the saved
 // context.  If the context stack was empty, the main loop returns.
 //
 // While it's possible to just set .next and return, to make the main loop call
-// another function next (possibly even setting .retval to pass a value to it),
+// another function next (possibly even setting .result to pass a value to it),
 // this is completely unnecessary.  A few non-recursive calls won't blow the
 // stack.  It's only recursive parsing that we use the above strategy for.
 //
@@ -201,7 +214,7 @@
 // would still waste memory and the time it takes to allocate memory.
 //
 //
-// === Buffering strategy ===
+// === Buffering ===
 //
 // We use a small circular buffer just so we can backtrack.  We read bytes into
 // this buffer one by one, because we don't want to consume more bytes from the
@@ -235,11 +248,20 @@ const std = @import("std");
 const lib = @import("../lib.zig");
 const value = @import("../value.zig");
 
-const ShortString = value.ShortString;
+const List = std.ArrayListUnmanaged;
+
 const Value = value.Value;
 
 const cons = value.pair.cons;
-const intern = value.istr.intern;
+
+const is_test = builtin.is_test;
+const is_debug = builtin.mode == .Debug;
+
+const detailed_debug = true;
+
+// In debug, we want to see if we leak, so very small numbers.
+const init_stack_capacity = if (is_debug) 20 else 32;
+const init_chars_capacity = if (is_debug) 100 else 512;
 
 // zig fmt: off
 const DOT    = value.rune.pack("DOT");
@@ -257,166 +279,164 @@ const BRACE  = value.rune.pack("BRACE");
 
 const Context = struct {
     // What to do next.
-    next: Fn = .start_parse,
-    // For storing some context value, like accumulated list elements.
+    next: Fn = .parse_unit,
+    // For storing a context value, like accumulated list elements.
     val: Value = undefined,
-    // For storing some context char, like opening bracket.
+    // For storing a context char, like list opening bracket.
     char: u8 = undefined,
 };
 
-// Make sure these are in sync, as we will use +% to increment the position.
-// Size 16 is enough because the largest amount by which we backtrack is short
-// strings / runes, which are limited to 6 bytes.
-const BUF_SIZE = 16;
-const POS_TYPE = u4;
-
-const is_test = builtin.is_test;
-const is_debug = builtin.mode == .Debug;
+const ParseError = error{
+    InvalidCharacter,
+    UnclosedString,
+    UnexpectedEof,
+    UnicodeError,
+    RuneTooLong,
+    OutOfMemory,
+    OutOfRange,
+    ReadError,
+};
 
 const State = struct {
     input: std.io.AnyReader,
-    alloc: std.mem.Allocator,
+    stack_alloc: std.mem.Allocator,
+    chars_alloc: std.mem.Allocator,
 
-    counter: usize = 0,
+    context: Context = .{},
+    stack: List(Context) = undefined,
 
-    buf: [BUF_SIZE]u8 = undefined,
-    pos: POS_TYPE = 0,
-    write_pos: POS_TYPE = 0,
+    chars: List(u8) = undefined,
 
-    context: Context = .{},
-    stack: std.ArrayListUnmanaged(Context) = .{},
-    retval: Value = undefined,
+    // TODO: Implement bool flag for when skipping a unit
 
-    // For debugging.
-    checked_eof: bool = false,
+    result: Value = undefined,
+    unused_char: ?u8 = null,
 
-    fn init(input: std.io.AnyReader, alloc: std.mem.Allocator) State {
-        return .{ .input = input, .alloc = alloc };
+    err_code: anyerror = undefined,
+    err_msg: []const u8 = undefined,
+
+    fn init(
+        input: std.io.AnyReader,
+        stack_alloc: std.mem.Allocator,
+        chars_alloc: std.mem.Allocator,
+    ) !State {
+        var s: State = .{
+            .input = input,
+            .stack_alloc = stack_alloc,
+            .chars_alloc = chars_alloc,
+        };
+        s.stack = try .initCapacity(s.stack_alloc, init_stack_capacity);
+        s.chars = try .initCapacity(s.chars_alloc, init_chars_capacity);
+        return s;
     }
 
     fn deinit(s: *State) void {
-        s.stack.deinit(s.alloc);
+        s.stack.deinit(s.stack_alloc);
+        s.chars.deinit(s.chars_alloc);
     }
 
-    fn recurParse(s: *State, start: Fn, end: Fn) void {
-        s.stack.append(s.alloc, .{
-            .next = end,
-            .val = s.context.val,
-            .char = s.context.char,
-        }) catch @panic("OOM");
-        s.context.next = start;
+    fn err(s: *State, e: ParseError, msg: []const u8) ParseError {
+        s.err_msg = msg;
+        return e;
     }
 
-    fn returnDatum(s: *State, val: Value) void {
-        s.retval = val;
-        if (s.stack.pop()) |c| {
-            s.context = c;
-        } else {
-            s.context.next = .done;
+    fn read(s: *State) !?u8 {
+        if (is_debug and s.unused_char != null) {
+            @panic("Called read() while there was an unused character!");
         }
+        const c = s.input.readByte() catch |e| switch (e) {
+            error.EndOfStream => return null,
+            else => {
+                s.err_code = e;
+                return error.ReadError;
+            },
+        };
+        if (detailed_debug) {
+            std.debug.print("{c}", .{c});
+        }
+        return c;
     }
 
-    fn readNext(s: *State) !void {
-        if (s.pos != s.write_pos) {
-            return;
-        }
-        s.buf[s.pos] = try s.input.readByte();
-        s.write_pos +%= 1;
+    fn readNoEof(s: *State, emsg: []const u8) !u8 {
+        return if (try s.read()) |c| c else s.err(error.UnexpectedEof, emsg);
     }
 
-    fn eof(s: *State) bool {
-        if (is_debug) {
-            s.checked_eof = true;
+    fn getUnused(s: *State) ?u8 {
+        if (s.unused_char) |c| {
+            s.unused_char = null;
+            return c;
         }
-        readNext(s) catch |e| switch (e) {
-            error.EndOfStream => return true,
-            else => @panic("read error"),
-        };
-        return false;
+        return null;
     }
 
-    fn peek(s: *State) u8 {
-        if (is_debug) {
-            if (!s.checked_eof) {
-                @panic("Didn't check EOF before calling peek()!");
-            }
-        }
-        return s.buf[s.pos];
+    fn skipLine(s: *State) !void {
+        while (try s.read()) |c| if (c == '\n') break;
     }
 
-    fn skip(s: *State) void {
-        if (is_debug) {
-            s.checked_eof = false;
-        }
-        // std.debug.print("{c}\n", .{s.buf[s.pos]});
-        s.pos +%= 1;
-        s.counter += 1;
+    fn addChar(s: *State, c: u8) !void {
+        try s.chars.append(s.chars_alloc, c);
     }
 
-    fn getc(s: *State) u8 {
-        const c = s.peek();
-        s.skip();
-        return c;
+    fn getBareString(s: *State) Value {
+        defer s.chars.clearRetainingCapacity();
+        return if (s.chars.items.len <= 6)
+            value.sstr.pack(s.chars.items)
+        else
+            value.istr.intern(s.chars.items, false);
     }
 
-    // Consumes whitespace and line comments.
-    fn consumeBlanks(s: *State) void {
-        while (!s.eof()) {
-            switch (s.peek()) {
-                // Allow Form Feed (^L) commonly used by Emacs users.
-                '\t', '\n', ' ', 0x0C => s.skip(),
-                ';' => s.consumeLineComment(),
-                else => return,
-            }
-        }
+    fn getQuotedString(s: *State) Value {
+        defer s.chars.clearRetainingCapacity();
+        return if (s.chars.items.len <= 6)
+            value.sstr.packQuoted(s.chars.items)
+        else
+            value.istr.intern(s.chars.items, true);
     }
 
-    fn consumeLineComment(s: *State) void {
-        while (!s.eof()) {
-            if (s.getc() == '\n') {
-                return;
-            }
-        }
+    fn getRune(s: *State) !Value {
+        defer s.chars.clearRetainingCapacity();
+        return if (s.chars.items.len <= 6)
+            value.rune.pack(s.chars.items)
+        else
+            error.RuneTooLong;
     }
 
-    fn isWhitespace(s: *State) bool {
-        return switch (s.peek()) {
-            '\t', '\n', ' ' => true,
-            else => false,
-        };
+    fn push(s: *State, next: Fn) !void {
+        try s.stack.append(s.stack_alloc, .{ .next = next });
     }
-};
 
-const CharPred = fn (u8) bool;
-const ShortStringPack = fn ([]const u8) Value;
+    fn pushContext(s: *State, next: Fn) !void {
+        try s.stack.append(s.stack_alloc, .{
+            .next = next,
+            .val = s.context.val,
+            .char = s.context.char,
+        });
+    }
 
-// Helper function to read runes and short strings.
-fn readShortString(
-    s: *State,
-    pred: CharPred,
-    pack: ShortStringPack,
-) ?Value {
-    var str = ShortString{};
-    while (!s.eof() and pred(s.peek())) {
-        str.append(s.getc()) catch return null;
+    fn subr(s: *State, start: Fn, next: Fn) !void {
+        try s.pushContext(next);
+        s.context.next = start;
     }
-    return pack(str.constSlice());
-}
 
-// Probably best *not* to use function pointers here, but rather dispatch to
-// functions manually based on enum value.  This should help the optimizer.
+    fn jump(s: *State, next: Fn, val: ?Value) void {
+        if (val) |v| s.result = v;
+        s.context.next = next;
+    }
 
-const Fn = enum {
-    start_parse,
-    start_datum,
-    end_join_datum,
-    end_label_datum,
-    end_hash_datum,
-    end_quote_datum,
-    continue_list,
-    finish_improper_list,
-    end_improper_list,
-    done,
+    fn abort(s: *State, next: Fn, unused_c: u8) void {
+        s.result = value.undef;
+        s.unused_char = unused_c;
+        s.context.next = next;
+    }
+
+    fn retval(s: *State, val: Value) void {
+        s.result = val;
+        if (s.stack.pop()) |c| {
+            s.context = c;
+        } else {
+            s.context.next = .done;
+        }
+    }
 };
 
 pub fn parse(input: std.io.AnyReader) Value {
@@ -437,437 +457,620 @@ pub fn parse(input: std.io.AnyReader) Value {
     else
         std.heap.smp_allocator;
 
-    var sfa = std.heap.stackFallback(4096, alloc);
-
-    var s: State = .init(input, sfa.get());
+    const stack_mem = init_stack_capacity * @sizeOf(Context);
+    const chars_mem = init_chars_capacity;
+    var stack_sfa = std.heap.stackFallback(stack_mem, alloc);
+    var chars_sfa = std.heap.stackFallback(chars_mem, alloc);
+    const stack_alloc = stack_sfa.get();
+    const chars_alloc = chars_sfa.get();
+    var s = State.init(input, stack_alloc, chars_alloc) catch @panic("");
     defer s.deinit();
 
-    while (s.context.next != .done) call(&s);
-    return s.retval;
+    while (s.context.next != .done) callNext(&s) catch |e| switch (e) {
+        else => @panic(s.err_msg), // TODO
+    };
+    if (s.unused_char) |_| {
+        @panic("invalid character");
+    }
+    return s.result;
 }
 
-fn call(s: *State) void {
-    // std.debug.print("{}\n", .{s.next});
-    switch (s.context.next) {
-        .start_parse => startParse(s),
-        .start_datum => startDatum(s),
-        .end_join_datum => endJoinedDatum(s),
-        .end_label_datum => endLabelDatum(s),
+const Fn = enum {
+    parse_unit,
+    return_context,
+    end_one_datum,
+    parse_join_datum,
+    end_join_datum,
+    join_data,
+    parse_hash_datum,
+    end_hash_datum,
+    parse_rune_datum,
+    end_rune_datum,
+    end_label_datum,
+    parse_list_element,
+    continue_list,
+    parse_list_tail,
+    end_improper_list,
+    close_improper_list,
+    end_quote_expr,
+    done,
+};
+
+fn callNext(s: *State) !void {
+    if (detailed_debug) {
+        std.debug.print("\n{}:{} ctx:'{c}' unused:'{c}' \n", .{
+            s.stack.items.len,
+            s.context.next,
+            s.context.char,
+            s.unused_char orelse '_',
+        });
+    }
+    try switch (s.context.next) {
+        .parse_unit => parseUnit(s),
+        .return_context => returnContext(s),
+        .end_one_datum => endOneDatum(s),
+        .parse_join_datum => parseJoinDatum(s),
+        .end_join_datum => endJoinDatum(s),
+        .join_data => joinData(s),
+        .parse_hash_datum => parseHashDatum(s),
         .end_hash_datum => endHashDatum(s),
-        .end_quote_datum => endQuoteDatum(s),
+        .parse_rune_datum => parseRuneDatum(s),
+        .end_rune_datum => endRuneDatum(s),
+        .end_label_datum => endLabelDatum(s),
+        .parse_list_element => parseListElement(s),
         .continue_list => continueList(s),
-        .finish_improper_list => finishImproperList(s),
+        .parse_list_tail => parseListTail(s),
         .end_improper_list => endImproperList(s),
+        .close_improper_list => endImproperList(s),
+        .end_quote_expr => endQuoteExpr(s),
         .done => unreachable,
-    }
+    };
 }
 
-fn startParse(s: *State) void {
-    s.consumeBlanks();
-    if (s.eof()) {
-        return s.returnDatum(value.eof.eof);
-    }
-    switch (s.peek()) {
-        // whitespace already consumed
-        0...32, 127...255 => err(s, "invalid character"),
-        ')', ']', '}' => err(s, "unexpected closing bracket"),
-        else => startDatum(s),
+fn parseUnit(s: *State) !void {
+    var c1 = s.getUnused() orelse try s.read();
+    while (c1) |c| : (c1 = try s.read()) {
+        switch (try checkBlank(s, c)) {
+            .yes => {},
+            .skip_unit => {
+                // Simply push another parse_unit onto the stack, which will
+                // ignore the result of the current one and start anew; then
+                // keep looping to read the datum that will be ignored.
+                try s.push(.parse_unit);
+            },
+            .skip_line => try s.skipLine(),
+            .no => return parseDatum(s, c),
+        }
     }
+    return s.retval(value.eof.eof);
 }
 
-// This is called when we *immediately* expect a datum and nothing else; for
-// example, no whitespace or comments, because they've already been consumed.
-fn startDatum(s: *State) void {
-    if (s.eof()) {
-        return err(s, "expected datum, got EOF");
-    }
-    if (s.isWhitespace()) {
-        return err(s, "expected datum, got whitespace");
-    }
-    switch (s.peek()) {
-        // whitespace checked above
-        0...32, 127...255 => err(s, "invalid character"),
-
-        ')', ']', '}' => err(s, "unexpected closing bracket"),
-
-        ';' => err(s, "expected datum, got semicolon"),
-
-        '#' => handleHash(s),
-
-        '"' => startQuotedString(s),
-
-        '\'', '`', ',' => startQuote(s),
+fn checkBlank(s: *State, c: u8) !enum { yes, skip_unit, skip_line, no } {
+    return switch (c) {
+        '\t'...'\r', ' ' => .yes,
+        ';' => switch (try s.read() orelse '\n') {
+            '\n' => .yes,
+            '~' => .skip_unit,
+            else => .skip_line,
+        },
+        else => .no,
+    };
+}
 
-        '(', '[', '{' => startList(s),
+fn parseDatum(s: *State, c: u8) !void {
+    return parseOneDatum(s, c, .end_one_datum);
+}
 
-        else => startBareString(s),
+fn endOneDatum(s: *State) !void {
+    const d = s.result;
+    if (d.eq(value.undef)) {
+        return s.retval(d);
     }
+    const c1 = s.getUnused() orelse try s.read();
+    if (c1) |c| {
+        switch (try checkBlank(s, c)) {
+            .yes => {},
+            .skip_unit => return skipUnitAndReturn(s, d),
+            .skip_line => try s.skipLine(),
+            .no => return parseJoin(s, d, c),
+        }
+    }
+    return s.retval(d);
 }
 
-fn endDatum(s: *State, d: Value) void {
-    //
-    // We're at the end of a datum; check for the various ways data can be
-    // joined together, like DATUM|DATUM or DATUM|.DATUM etc.
-    //
-
-    if (isEndOfDatum(s)) {
-        // Nope, end it.
-        return s.returnDatum(d);
-    }
+fn skipUnitAndReturn(s: *State, d: Value) !void {
+    s.context.val = d;
+    return s.subr(.parse_unit, .return_context);
+}
 
-    // There's a stupid special-case we have to handle here, where a datum
-    // comment may fool us into thinking there's something to join: foo|#;bar
+fn returnContext(s: *State) !void {
+    return s.retval(s.context.val);
+}
 
-    const c = s.peek();
-    switch (c) {
-        '.', ':', '|' => s.skip(),
-        '#' => if (checkTrailingDatumComment(s)) {
-            return s.returnDatum(d);
-        },
-        else => {},
-    }
+fn parseJoin(s: *State, d: Value, c: u8) !void {
     s.context.val = d;
     s.context.char = c;
-    s.recurParse(.start_datum, .end_join_datum);
+    s.unused_char = switch (c) {
+        '.', ':', '|' => try s.readNoEof("start of joined datum"),
+        else => c,
+    };
+    return s.subr(.parse_join_datum, .join_data);
 }
 
-fn checkTrailingDatumComment(s: *State) bool {
-    const pos = s.pos;
-    s.skip();
-    if (s.eof()) {
-        // Error, but let it be handled later.
-        return false;
-    }
-    const c = s.peek();
-    s.pos = pos;
-    return c == ';';
+fn parseJoinDatum(s: *State) !void {
+    return parseOneDatum(s, s.getUnused().?, .end_join_datum);
 }
 
-fn isEndOfDatum(s: *State) bool {
-    return s.eof() or switch (s.peek()) {
-        '\t', '\n', ' ', ';', ')', ']', '}' => true,
-        else => false,
-    };
+fn endJoinDatum(s: *State) !void {
+    return s.retval(s.result);
 }
 
-fn endJoinedDatum(s: *State) void {
-    const rune = switch (s.context.char) {
+fn joinData(s: *State) !void {
+    const head = s.context.val;
+    const join = s.context.char;
+    const tail = s.result;
+    if (tail.eq(value.undef)) {
+        return s.retval(head);
+    }
+    const rune = switch (join) {
         '.' => DOT,
         ':' => COLON,
         '|' => PIPE,
         else => JOIN,
     };
-    endDatum(s, cons(rune, cons(s.context.val, s.retval)));
-}
-
-fn handleHash(s: *State) void {
-    s.skip();
-    //
-    // We just consumed a hash.  Possibilities include:
-    //
-    //   #|foo          ;rune
-    //
-    //   #|n#[=DATUM]   ;datum label, with or without datum
-    //
-    //   #|;DATUM       ;datum comment
-    //
-    //   #|DATUM        ;hash-datum
-    //
-
-    if (s.eof()) {
-        return err(s, "EOF after hash");
-    }
-    if (s.isWhitespace()) {
-        return err(s, "whitespace after hash");
-    }
-
-    switch (s.peek()) {
-        'a'...'z', 'A'...'Z' => handleRune(s),
-        '0'...'9' => handleDatumLabel(s),
-        ';' => {
-            s.skip();
-            // Don't change next in this case.  Just let the parser redo what it
-            // was doing as soon as the commented-out datum has been read.
-            s.recurParse(.start_datum, s.context.next);
-        },
-        else => s.recurParse(.start_datum, .end_hash_datum),
+    const result = cons(rune, cons(head, tail));
+    return s.jump(.end_one_datum, result);
+}
+
+fn parseOneDatum(s: *State, c: u8, next: Fn) !void {
+    if (isBareChar(c)) {
+        const d, s.unused_char = try parseBareString(s, c);
+        return s.jump(next, d);
+    }
+    return parseCladDatum(s, c, next);
+}
+
+fn parseCladDatum(s: *State, c: u8, next: Fn) !void {
+    if (c == '\\') {
+        const bs, s.unused_char = try parseBareEscString(s);
+        return s.jump(next, bs);
+    }
+    if (c == '"') {
+        const qs = try parseQuotedString(s);
+        return s.jump(next, qs);
     }
+    return switch (c) {
+        '#' => parseHashExpression(s, next),
+        '(', '[', '{' => parseList(s, c, next),
+        '\'', '`', ',' => parseQuoteExpr(s, c, next),
+        else => s.abort(next, c),
+    };
 }
 
-fn handleRune(s: *State) void {
-    const r = readRune(s) orelse return err(s, "rune too long");
-    endDatum(s, r);
+fn isBareChar(c: u8) bool {
+    return switch (c) {
+        // zig fmt: off
+        'a'...'z' , 'A'...'Z' , '0'...'9',
+        '!' , '$' , '%' , '&' , '*' , '+',
+        '-' , '/' , '<' , '=' , '>' , '?',
+        '@' , '^' , '_' , '~' => true,
+        // zig fmt: on
+        else => false,
+    };
 }
 
-fn readRune(s: *State) ?Value {
-    return readShortString(s, std.ascii.isAlphanumeric, value.rune.pack);
+fn isBareEsc(c: u8) bool {
+    return switch (c) {
+        33...126 => true,
+        else => false,
+    };
 }
 
-fn handleDatumLabel(s: *State) void {
-    const n = readDatumLabel(s) orelse return err(s, "datum label too long");
-    //
-    // We're at the end of the numeric label now; possibilities are:
-    //
-    //   #n|#
-    //
-    //   #n|#=DATUM
-    //
+fn parseBareString(s: *State, c: u8) !struct { Value, ?u8 } {
+    try s.addChar(c);
+    return parseBareStringRest(s);
+}
 
-    if (s.eof()) {
-        return err(s, "unexpected EOF while reading datum label");
-    }
-    if (s.getc() != '#') {
-        return err(s, "invalid character while reading datum label");
-    }
+fn parseBareEscString(s: *State) !struct { Value, ?u8 } {
+    try s.addChar(try parseBareEsc(s));
+    return parseBareStringRest(s);
+}
 
-    if (s.eof() or s.isWhitespace()) {
-        return endDatum(s, cons(LABEL, n));
+fn parseBareStringRest(s: *State) !struct { Value, ?u8 } {
+    while (try s.read()) |c| {
+        if (isBareChar(c)) {
+            try s.addChar(c);
+        } else if (c == '\\') {
+            try s.addChar(try parseBareEsc(s));
+        } else {
+            return .{ s.getBareString(), c };
+        }
     }
+    return .{ s.getBareString(), null };
+}
 
-    if (s.getc() != '=') {
-        return err(s, "invalid character after numeric datum label");
+fn parseBareEsc(s: *State) !u8 {
+    const c = try s.readNoEof("bare escape");
+    if (isBareEsc(c)) {
+        return c;
+    } else {
+        return s.err(error.InvalidCharacter, "bare escape");
     }
-
-    s.context.val = n;
-    s.recurParse(.start_datum, .end_label_datum);
 }
 
-fn readDatumLabel(s: *State) ?Value {
-    return readShortString(s, std.ascii.isDigit, value.sstr.pack);
+fn parseQuotedString(s: *State) !Value {
+    while (try s.read()) |c| {
+        if (c == '"') {
+            return s.getQuotedString();
+        }
+        if (c != '\\') {
+            try s.addChar(c);
+        } else {
+            try parseQuotedEsc(s);
+        }
+    }
+    return error.UnclosedString;
 }
 
-fn endLabelDatum(s: *State) void {
-    endDatum(s, cons(LABEL, cons(s.context.val, s.retval)));
+fn parseQuotedEsc(s: *State) !void {
+    const c = try s.readNoEof("quoted escape");
+    if (c == 'u') {
+        return parseUniHexHandleErrors(s);
+    }
+    try s.addChar(switch (c) {
+        '\\', '"' => c,
+        '0' => 0,
+        'a' => 7,
+        'b' => 8,
+        't' => 9,
+        'n' => 10,
+        'v' => 11,
+        'f' => 12,
+        'r' => 13,
+        'e' => 27,
+        'x' => try parseHexByte(s, "hex escape"),
+        else => return s.err(error.InvalidCharacter, "quoted escape"),
+    });
 }
 
-fn endHashDatum(s: *State) void {
-    endDatum(s, cons(HASH, s.retval));
+fn parseUniHexHandleErrors(s: *State) !void {
+    return parseUniHex(s) catch |err| switch (err) {
+        error.Utf8CannotEncodeSurrogateHalf => e: {
+            s.err_code = err;
+            s.err_msg = "unicode escape";
+            break :e error.UnicodeError;
+        },
+        else => |e| e,
+    };
 }
 
-fn startQuotedString(s: *State) void {
-    // We're at |"..." so consume the opening quote before we start reading.
-    s.skip();
+fn parseUniHex(s: *State) !void {
+    const msg = "unicode escape";
 
-    const str = readQuotedString(s) catch return err(s, "unclosed string");
-    endDatum(s, str);
+    if (try s.readNoEof(msg) != '{') {
+        return s.err(error.InvalidCharacter, msg);
+    }
+
+    const uc, const unused_c = try parseHex(s, u21, msg);
+    if (unused_c) |c| {
+        if (c != '}') {
+            return s.err(error.InvalidCharacter, msg);
+        }
+    } else {
+        return s.err(error.UnexpectedEof, msg);
+    }
+
+    const n = try std.unicode.utf8CodepointSequenceLength(uc);
+    const buf = try s.chars.addManyAsSlice(s.chars_alloc, n);
+    _ = try std.unicode.utf8Encode(uc, buf);
 }
 
-// RQS = Read Quoted String
-const RqsError = enum { Unclosed };
+fn parseHashExpression(s: *State, next: Fn) !void {
+    const c = try s.readNoEof("hash expression");
+    if (try checkBlank(s, c) != .no) {
+        return s.err(error.InvalidCharacter, "hash expression");
+    }
+    if (std.ascii.isAlphabetic(c)) {
+        const r, const unused_c = try parseRune(s, c);
+        return parseRuneEnd(s, r, unused_c, next);
+    }
+    if (c == '%') {
+        const l, const unused_c = try parseLabel(s);
+        return parseLabelEnd(s, l, unused_c, next);
+    }
+    if (isBareChar(c)) {
+        // Reserved for future extensions to syntax sugar.
+        return s.err(error.InvalidCharacter, "hash expression");
+    }
+    // fast-path to avoid subr
+    if (c == '\\') {
+        const bs, s.unused_char = try parseBareEscString(s);
+        return s.jump(next, cons(HASH, bs));
+    }
+    if (c == '"') {
+        const qs = try parseQuotedString(s);
+        return s.jump(next, cons(HASH, qs));
+    }
+    s.unused_char = c;
+    return s.subr(.parse_hash_datum, next);
+}
 
-fn readQuotedString(s: *State) !Value {
-    return try readQuotedSstr(s) orelse readQuotedLongString(s);
+fn parseHashDatum(s: *State) !void {
+    return parseCladDatum(s, s.getUnused().?, .end_hash_datum);
 }
 
-fn readQuotedSstr(s: *State) !?Value {
-    const start_pos = s.pos;
+fn endHashDatum(s: *State) !void {
+    if (s.result.eq(value.undef)) {
+        return s.err(error.InvalidCharacter, "hash datum");
+    }
+    return s.retval(cons(HASH, s.result));
+}
 
-    // TODO: Handle escapes.
-    var buf: [6]u8 = undefined;
-    var i: u8 = 0;
-    while (!s.eof()) : (i += 1) {
-        const c = s.getc();
-        if (c == '"') {
-            // ok, return what we accumulated
-            return value.sstr.packQuoted(buf[0..i]);
-        }
-        if (i == 6) {
-            // failed; reset and bail out
-            s.pos = start_pos;
-            return null;
+fn parseRune(s: *State, c1: u8) !struct { Value, ?u8 } {
+    try s.addChar(c1);
+    var len: usize = 1;
+    while (try s.read()) |c| : (len += 1) {
+        if (len == 6 or !std.ascii.isAlphanumeric(c)) {
+            return .{ try s.getRune(), c };
         }
-        // ok, save this byte and go on
-        buf[i] = c;
+        try s.addChar(c);
     }
-    return error.Unclosed;
+    return .{ try s.getRune(), null };
 }
 
-fn readQuotedLongString(s: *State) !Value {
-    var chars = std.ArrayList(u8).init(s.alloc);
-    defer chars.deinit();
-
-    // TODO: Handle escapes.
-    while (!s.eof()) {
-        const c = s.getc();
-        if (c == '"') {
-            return intern(chars.items, true);
-        }
-        chars.append(c) catch @panic("OOM");
+fn parseRuneEnd(s: *State, r: Value, c1: ?u8, next: Fn) !void {
+    const c = c1 orelse return s.jump(next, r);
+    if (c == '\\') {
+        const bs, s.unused_char = try parseBareString(s, c);
+        return s.jump(next, cons(r, bs));
+    }
+    if (c == '"') {
+        const qs = try parseQuotedString(s);
+        return s.jump(next, cons(r, qs));
+    }
+    s.unused_char = c;
+    switch (c) {
+        '#', '(', '[', '{', '\'', '`', ',' => {
+            try s.push(next);
+            return s.jump(.parse_rune_datum, r);
+        },
+        else => return s.jump(next, r),
     }
-    return error.Unclosed;
 }
 
-fn startBareString(s: *State) void {
-    // We're at |foo so start reading directly.
-    const str = readBareSstr(s) orelse readBareLongString(s);
-    endDatum(s, str);
+fn parseRuneDatum(s: *State) !void {
+    s.context.val = s.result;
+    return parseCladDatum(s, s.getUnused().?, .end_rune_datum);
 }
 
-fn readBareSstr(s: *State) ?Value {
-    const pos = s.pos;
-    if (readShortString(s, isSstrChar, value.sstr.pack)) |sstr| {
-        return sstr;
-    } else {
-        s.pos = pos;
-        return null;
+fn endRuneDatum(s: *State) !void {
+    const r = s.context.val;
+    const d = s.result;
+    if (d.eq(value.undef)) {
+        s.retval(r);
     }
+    return s.retval(cons(r, d));
 }
 
-fn isSstrChar(c: u8) bool {
-    // We will ignore illegal characters here, because they aren't consumed by
-    // this function; whatever code comes next must handle them.
-    return switch (c) {
-        '(', ')', '[', ']', '{', '}', ';', '#', '"', '\'', '`', ',' => false,
-        0...32, 127...255 => false,
-        else => true,
-    };
+fn parseLabel(s: *State) !struct { Value, ?u8 } {
+    const label, const unused_c = try parseHex(s, u48, "datum label");
+    return .{ value.fixnum.pack(label), unused_c };
 }
 
-fn readBareLongString(s: *State) Value {
-    var chars = std.ArrayList(u8).init(s.alloc);
-    defer chars.deinit();
+fn parseLabelEnd(s: *State, l: Value, c1: ?u8, next: Fn) !void {
+    const c = c1 orelse return s.err(error.UnexpectedEof, "datum label");
+    if (c == '%') {
+        return s.jump(next, cons(LABEL, l));
+    }
+    if (c == '=') {
+        try s.push(next);
+        s.context.val = l;
+        return s.subr(.parse_unit, .end_label_datum);
+    }
+    return s.err(error.InvalidCharacter, "datum label");
+}
 
-    while (!s.eof()) {
-        const c = s.peek();
-        if (isSstrChar(c)) {
-            chars.append(c) catch @panic("OOM");
-            s.skip();
-        } else {
-            break;
-        }
+fn endLabelDatum(s: *State) !void {
+    const l = s.context.val;
+    const d = s.result;
+    if (d.eq(value.undef)) {
+        return s.err(error.InvalidCharacter, "label datum");
     }
-    return intern(chars.items, false);
+    return s.retval(cons(LABEL, cons(l, d)));
 }
 
-fn startQuote(s: *State) void {
-    // We're at one of:  |'...  |`...  |,...
-    s.context.val = switch (s.getc()) {
-        '\'' => QUOTE,
-        '`' => GRAVE,
-        ',' => COMMA,
+fn parseList(s: *State, open: u8, next: Fn) !void {
+    const head = switch (open) {
+        '(' => value.nil.nil,
+        '[' => cons(SQUARE, value.nil.nil),
+        '{' => cons(BRACE, value.nil.nil),
+        else => unreachable,
+    };
+    const close: u8 = switch (open) {
+        '(' => ')',
+        '[' => ']',
+        '{' => '}',
         else => unreachable,
     };
-    s.recurParse(.start_datum, .end_quote_datum);
+    while (try s.read()) |c| {
+        if (c == close) {
+            return s.jump(next, head);
+        }
+        switch (try checkBlank(s, c)) {
+            .yes => {},
+            .skip_unit => {
+                try listParserSetup(s, head, close, next);
+                // Parse twice in a row, ignoring the first result.
+                return s.subr(.parse_unit, .parse_unit);
+            },
+            .skip_line => try s.skipLine(),
+            .no => {
+                try listParserSetup(s, head, close, next);
+                s.unused_char = c;
+                return s.jump(.parse_list_element, null);
+            },
+        }
+    }
+    return s.err(error.UnexpectedEof, "list");
 }
 
-fn endQuoteDatum(s: *State) void {
-    endDatum(s, cons(s.context.val, s.retval));
+fn listParserSetup(s: *State, head: Value, close: u8, next: Fn) !void {
+    s.context.val = head;
+    s.context.char = close;
+    try s.push(next);
+    try s.pushContext(.continue_list);
 }
 
-// List processing is, unsurprisingly, the most complicated, and it's made even
-// more complicated by the possibility of datum comments in strange places...
-
-// Make sure to use .start_parse instead of .start_datum to handle elements, so
-// that an arbitrary number of datum comments, separated by blanks (whitespace
-// and line comments) are handled automatically.
+fn parseListElement(s: *State) !void {
+    return parseDatum(s, s.getUnused().?);
+}
 
-fn startList(s: *State) void {
-    const open = s.getc();
+fn continueList(s: *State) !void {
+    const close = s.context.char;
 
-    s.consumeBlanks();
-    if (s.eof()) {
-        return err(s, "unexpected EOF while parsing list");
+    if (s.result.eq(value.undef)) {
+        const c = s.getUnused().?;
+        if (c == close) {
+            return endList(s);
+        }
+        if (c == '.') {
+            return s.jump(.parse_list_tail, null);
+        }
+        return s.err(error.InvalidCharacter, "list");
     }
 
-    s.context.val = value.nil.nil;
-    s.context.char = open;
-    if (isEndOfList(s)) {
-        endList(s);
-    } else {
-        s.recurParse(.start_parse, .continue_list);
+    s.context.val = cons(s.result, s.context.val);
+
+    var c1 = s.unused_char orelse try s.read();
+    while (c1) |c| : (c1 = try s.read()) {
+        if (c == close) {
+            return endList(s);
+        }
+        if (c == '.') {
+            return s.jump(.parse_list_tail, null);
+        }
+        switch (try checkBlank(s, c)) {
+            .yes => {},
+            .skip_unit => {
+                try s.pushContext(.continue_list);
+                return s.subr(.parse_unit, .parse_unit);
+            },
+            .skip_line => try s.skipLine(),
+            .no => {
+                s.unused_char = c;
+                return s.jump(.parse_list_element, null);
+            },
+        }
     }
+    return s.err(error.UnexpectedEof, "list");
 }
 
-fn isEndOfList(s: *State) bool {
-    return switch (s.peek()) {
-        ')', ']', '}' => true,
-        else => false,
-    };
+fn endList(s: *State) !void {
+    return s.retval(lib.list.reverse(s.context.val));
 }
 
-fn endList(s: *State) void {
-    const open = s.context.char;
-    const char = s.getc();
-
-    if (open == '(' and char == ')') {
-        return endDatum(s, s.context.val);
-    }
-    if (open == '[' and char == ']') {
-        return endDatum(s, cons(SQUARE, s.context.val));
-    }
-    if (open == '{' and char == '}') {
-        return endDatum(s, cons(BRACE, s.context.val));
+fn parseListTail(s: *State) !void {
+    const c = try s.readNoEof("list tail");
+    try s.pushContext(.end_improper_list);
+    switch (try checkBlank(s, c)) {
+        .yes => {},
+        .skip_unit => return s.subr(.parse_unit, .parse_unit),
+        .skip_line => try s.skipLine(),
+        // One blank mandatory here.
+        .no => return s.err(error.InvalidCharacter, "list tail"),
     }
-
-    err(s, "wrong closing bracket for list");
+    return s.jump(.parse_unit, null);
 }
 
-fn continueList(s: *State) void {
-    // Note that this accumulates list elements in reverse.
-    s.context.val = value.pair.cons(s.retval, s.context.val);
-
-    s.consumeBlanks();
-    if (s.eof()) {
-        return err(s, "unexpected EOF while parsing list");
+fn endImproperList(s: *State) !void {
+    const tail = s.result;
+    if (tail.eq(value.undef)) {
+        return s.err(error.InvalidCharacter, "list tail");
     }
+    s.context.val = lib.list.reverseWithTail(s.context.val, tail);
+    return closeImproperList(s);
+}
 
-    if (isEndOfList(s)) {
-        s.context.val = lib.list.reverse(s.context.val);
-        return endList(s);
+fn closeImproperList(s: *State) !void {
+    const close = s.context.char;
+    var c1 = s.getUnused() orelse try s.read();
+    while (c1) |c| : (c1 = try s.read()) {
+        switch (try checkBlank(s, c)) {
+            .yes => {},
+            .skip_unit => return s.subr(.parse_unit, .close_improper_list),
+            .skip_line => try s.skipLine(),
+            .no => {
+                if (c == close) {
+                    return s.retval(s.context.val);
+                }
+                return s.err(error.InvalidCharacter, "after list tail");
+            },
+        }
     }
+    return s.err(error.UnexpectedEof, "after list tail");
+}
 
-    // Check if there's an improper-list ending.
-    if (s.peek() == '.') {
-        const pos = s.pos;
-        s.skip();
-        if (s.eof()) {
-            return err(s, "unexpected EOF while parsing list");
-        }
-        // Scheme allows (foo .(bar)) but we don't.  Mind your spaces!
-        if (s.isWhitespace()) {
-            s.skip();
-            return s.recurParse(.start_parse, .finish_improper_list);
-        }
-        // Nope, reset.
-        s.pos = pos;
+fn parseQuoteExpr(s: *State, c1: u8, next: Fn) !void {
+    const q = switch (c1) {
+        '\'' => QUOTE,
+        '`' => GRAVE,
+        ',' => COMMA,
+        else => unreachable,
+    };
+
+    // fast-path to avoid subr
+    const c = try s.readNoEof("quote expression");
+    if (isBareChar(c) or c == '\\') {
+        const bs, s.unused_char = try parseBareString(s, c);
+        return s.jump(next, cons(q, bs));
     }
 
-    s.recurParse(.start_parse, .continue_list);
+    s.context.val = q;
+    return s.subr(.parse_unit, .end_quote_expr);
 }
 
-fn finishImproperList(s: *State) void {
-    s.context.val = lib.list.reverseWithTail(s.context.val, s.retval);
-    endImproperList(s);
+fn endQuoteExpr(s: *State) !void {
+    const q = s.context.val;
+    const d = s.result;
+    return s.retval(cons(q, d));
 }
 
-// Handling the end of an improper list is a bit awkward, because there may be
-// datum comments *after* the final cdr, where we don't actually want to parse
-// any further data.  So we keep looping here just looking for datum comments.
+// Helpers
 
-fn endImproperList(s: *State) void {
-    s.consumeBlanks();
-    if (s.eof()) {
-        return err(s, "unexpected EOF at end of improper list");
-    }
+fn parseHex(
+    s: *State,
+    u_type: type,
+    emsg: []const u8,
+) !struct { u_type, ?u8 } {
+    var uc: u_type = undefined;
 
-    if (isEndOfList(s)) {
-        return endList(s);
-    }
+    const c1 = try s.readNoEof(emsg);
+    uc = try parseHexDigit(s, c1, emsg);
 
-    if (s.getc() == '#') {
-        if (s.eof()) {
-            return err(s, "unexpected hash and EOF at end of improper list");
-        }
-        if (s.getc() == ';') {
-            return s.recurParse(.start_datum, .end_improper_list);
+    while (try s.read()) |c| {
+        if (!std.ascii.isHex(c)) {
+            return .{ uc, c };
         }
+        const shl = std.math.shlExact;
+        uc = shl(u_type, uc, 4) catch return s.err(error.OutOfRange, emsg);
+        uc |= try parseHexDigit(s, c, emsg);
     }
+    return .{ uc, null };
+}
 
-    err(s, "malformed list / extra datum at end of improper list");
+fn parseHexByte(s: *State, emsg: []const u8) !u8 {
+    const h1 = try s.readNoEof(emsg);
+    const h2 = try s.readNoEof(emsg);
+    const hi = try parseHexDigit(s, h1, emsg);
+    const lo = try parseHexDigit(s, h2, emsg);
+    return hi << 4 | lo;
 }
 
-fn err(s: *State, msg: []const u8) noreturn {
-    std.debug.print("{s}\n", .{msg});
-    std.debug.print("position: {}\n", .{s.counter});
-    @panic("parse error");
+fn parseHexDigit(s: *State, c: u8, emsg: []const u8) !u8 {
+    return switch (c) {
+        '0'...'9' => c - '0',
+        'A'...'F' => c - 'A' + 10,
+        'a'...'f' => c - 'a' + 10,
+        else => s.err(error.InvalidCharacter, emsg),
+    };
 }
diff --git a/src/libzisp/value.zig b/src/libzisp/value.zig
index 2561ca7..fbe907a 100644
--- a/src/libzisp/value.zig
+++ b/src/libzisp/value.zig
@@ -285,6 +285,10 @@ pub const Value = packed union {
         std.debug.dumpHex(std.mem.asBytes(&v));
     }
 
+    pub fn eq(v1: Value, v2: Value) bool {
+        return v1.bits == v2.bits;
+    }
+
     // The following aren't type predicates per se, but rather determine which
     // general category the value is in.  The exceptions are fixnum and double,
     // since those aren't sub-categorized into further types.
diff --git a/src/main.zig b/src/main.zig
index c9a5404..24c7959 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -1,11 +1,6 @@
-//! By convention, main.zig is where your main function lives in the case that
-//! you are building an executable. If you are making a library, the convention
-//! is to delete this file and start with root.zig instead.
-
 const std = @import("std");
 
-/// This imports the separate module containing `root.zig`. Take a look in `build.zig` for details.
-const zisp = @import("zisp_lib");
+const zisp = @import("libzisp");
 
 pub fn main() !void {
     const T = extern union {
@@ -33,6 +28,19 @@ pub fn main() !void {
     x.*.double = f1.* / f2.*;
     std.debug.print("-0/0: {x}\n", .{x.*.bits});
 
+    for (0..1000000) |i| {
+        _ = i;
+        const p = zisp.value.pair.cons(
+            zisp.value.fixnum.pack(0),
+            zisp.value.fixnum.pack(1),
+        );
+        std.mem.doNotOptimizeAway(p);
+        // std.debug.print("fx: {}\n", .{zisp.value.pair.car(p)});
+    }
+
+    const istr = zisp.value.istr.intern("foo", false);
+    std.debug.print("istr: {s}\n", .{zisp.value.istr.getHeader(istr).bytes()});
+
     // // Prints to stderr (it's a shortcut based on `std.io.getStdErr()`)
     // std.debug.print("All your {s} are belong to us.\n", .{"codebase"});
author	Taylan Kammer <taylan.kammer@gmail.com>	2025-03-27 21:18:09 +0100
committer	Taylan Kammer <taylan.kammer@gmail.com>	2025-03-27 21:18:09 +0100
commit	f2b18d64448ab09dd5e5e6a180d38d90d5aaf367 (patch)
tree	d83510fc892bf31feed56688af3ec350012cdeb9
parent	91629014bfe50e1d45cffedd618ab28a063f7689 (diff)