summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTaylan Kammer <taylan.kammer@gmail.com>2025-03-27 21:18:09 +0100
committerTaylan Kammer <taylan.kammer@gmail.com>2025-03-27 21:18:09 +0100
commitf2b18d64448ab09dd5e5e6a180d38d90d5aaf367 (patch)
treed83510fc892bf31feed56688af3ec350012cdeb9
parent91629014bfe50e1d45cffedd618ab28a063f7689 (diff)
new parser
-rw-r--r--_tests/antlr.bnf99
-rw-r--r--_tests/string1
-rw-r--r--_tests/switchtable.zig93
-rw-r--r--_tests/test.c (renamed from test.c)0
-rw-r--r--_tests/test.scm (renamed from test.scm)0
-rw-r--r--_tests/test.zig84
-rw-r--r--spec/parser.ebnf75
-rw-r--r--spec/syntax.md34
-rw-r--r--src/libzisp.zig188
-rw-r--r--src/libzisp/io/parser.zig1181
-rw-r--r--src/libzisp/value.zig4
-rw-r--r--src/main.zig20
12 files changed, 1195 insertions, 584 deletions
diff --git a/_tests/antlr.bnf b/_tests/antlr.bnf
new file mode 100644
index 0000000..7b0cf83
--- /dev/null
+++ b/_tests/antlr.bnf
@@ -0,0 +1,99 @@
+grammar ExprLexer;
+
+LF : '\n' ;
+SP : ' ' ;
+WS_CC : [\t\r\f] ;
+SEMI : ';' ;
+DOT : '.' ;
+COLON : ':' ;
+PIPE : '|' ;
+TILDE : '~' ;
+BSLASH : '\\' ;
+DQUOTE : '"' ;
+HASH : '#' ;
+LPAREN : '(' ;
+RPAREN : ')' ;
+LSQBR : '[' ;
+RSQBR : ']' ;
+LCURLY : '{' ;
+RCURLY : '}' ;
+EQUAL : '=' ;
+APOS : '\'' ;
+GRAVE : '`' ;
+COMMA : ',' ;
+
+ : [\u0000-\u0009\u000b-\u00ff] ;
+
+BARE_CHAR : [a-zA-Z0-9!$%&*+/<=>?@^_~-] | '\\' BARE_ESC ;
+
+BARE_ESC : [\u0021-\u007e] ;
+
+QUOTED_CHAR : [^"\\] | '\\' QUOTED_ESC ;
+
+QUOTED_ESC : [\\"abefnrtv]
+ | 'x' HEX_DIGIT HEX_DIGIT
+ | 'u' '{' HEX_DIGIT+ '}'
+ ;
+
+
+HEX_DIGIT : [0-9a-fA-F] ;
+ALPHA : [a-zA-Z] ;
+ALNUM : [a-zA-Z0-9] ;
+
+
+
+parse_unit : blank* datum blank? ;
+
+
+blank : LF | SP | WS_CC | comment ;
+
+datum : datum_one ( join_char? datum )? ;
+
+
+comment : ';' ( skip_datum | skip_line ) ;
+
+
+datum_one : bare_string | fancy_datum ;
+
+join_char : '.' | ':' | '|' ;
+
+
+skip_datum : '~' parse_unit ;
+
+skip_line : ANY_BUT_LF* LF? ;
+
+
+bare_string : BARE_CHAR+ ;
+
+fancy_datum : '\\' bare_esc_str
+ | '"' quoted_str '"'
+ | '#' hash_expr
+ | '(' list_body ')'
+ | '[' list_body ']'
+ | '{' list_body '}'
+ | quote_expr
+ ;
+
+
+bare_esc_str : '\\' BARE_ESC BARE_CHAR* ;
+
+quoted_str : QUOTED_CHAR* ;
+
+hash_expr : rune fancy_datum?
+ | label ( '=' fancy_datum )?
+ | fancy_datum
+ ;
+
+list_body : blank* ( list_head list_tail? )? ;
+
+quote_expr : ( '\'' | '`' | ',' ) datum ;
+
+
+rune : ALPHA ALNUM* ;
+
+label : '%' HEX_DIGIT+ ;
+
+
+list_head : datum blank* list_head? ;
+
+list_tail : blank+ '.' blank+ datum blank* ;
diff --git a/_tests/string b/_tests/string
new file mode 100644
index 0000000..c365d58
--- /dev/null
+++ b/_tests/string
@@ -0,0 +1 @@
+\0\a\b\t\n\v\f\r\e\e\r\f\v\n\t\b\a\0 \ No newline at end of file
diff --git a/_tests/switchtable.zig b/_tests/switchtable.zig
new file mode 100644
index 0000000..722ecdd
--- /dev/null
+++ b/_tests/switchtable.zig
@@ -0,0 +1,93 @@
+const std = @import("std");
+
+const Reader = std.io.AnyReader;
+
+pub fn main() !u8 {
+ return f();
+}
+
+fn f() !u8 {
+ const file = try std.fs.cwd().openFile("string", .{});
+ defer file.close();
+
+ var br = std.io.bufferedReader(file.reader());
+ const r = br.reader().any();
+
+ var n: u8 = 0;
+ for (0..1_000_000) |i| {
+ _ = i;
+ while (r.readByte() catch null) |c| {
+ n +%= try f1(r, c);
+ }
+ br.start = 0;
+ br.end = 0;
+ try file.seekTo(0);
+ }
+ return n;
+}
+
+fn f1(r: Reader, c1: u8) !u8 {
+ if (c1 != '\\') return c1;
+ const c = try r.readByte();
+ if (c == 'u') return unknown1();
+ return switch (c) {
+ '\\', '"' => c,
+ '0' => 0,
+ 'a' => 7,
+ 'b' => 8,
+ 't' => 9,
+ 'n' => 10,
+ 'v' => 11,
+ 'f' => 12,
+ 'r' => 13,
+ 'e' => 27,
+ 'x' => unknown2(),
+ else => unknown3(),
+ };
+}
+
+fn f2(r: Reader, c1: u8) !u8 {
+ if (c1 != '\\') return c1;
+ const c = try r.readByte();
+ if (c == 'u') return unknown1();
+ if (c == 'x') return unknown2();
+ if (c == '\\' or c == '"') return c;
+ const itable = .{ '0', 'a', 'b', 't', 'n', 'v', 'f', 'r', 'e' };
+ const ctable: []const u8 = &.{ 0, 7, 8, 9, 10, 11, 12, 13, 27 };
+ const i = std.mem.indexOfScalar(u8, &itable, c) orelse return unknown3();
+ return ctable[i];
+}
+
+fn f3(r: Reader, c1: u8) !u8 {
+ if (c1 != '\\') return c1;
+ const c = try r.readByte();
+ if (c == 'u') return unknown1();
+ if (c == 'x') return unknown2();
+ if (c == '\\' or c == '"') return c;
+ if (c == '0') return 0;
+ const table = comptime t: {
+ var table: [26]u8 = .{0} ** 26;
+ table['a' - 'a'] = 7;
+ table['b' - 'a'] = 8;
+ table['t' - 'a'] = 9;
+ table['n' - 'a'] = 10;
+ table['v' - 'a'] = 11;
+ table['f' - 'a'] = 12;
+ table['r' - 'a'] = 13;
+ table['e' - 'a'] = 27;
+ break :t table;
+ };
+ if (c < 'a') return unknown3();
+ const result = table[c - 'a'];
+ return if (result != 0) result else unknown3();
+}
+
+fn unknown1() u8 {
+ return 0;
+}
+fn unknown2() u8 {
+ return 0;
+}
+fn unknown3() u8 {
+ return 0;
+}
diff --git a/test.c b/_tests/test.c
index 0b0917e..0b0917e 100644
--- a/test.c
+++ b/_tests/test.c
diff --git a/test.scm b/_tests/test.scm
index d893c9f..d893c9f 100644
--- a/test.scm
+++ b/_tests/test.scm
diff --git a/_tests/test.zig b/_tests/test.zig
new file mode 100644
index 0000000..7b4a04c
--- /dev/null
+++ b/_tests/test.zig
@@ -0,0 +1,84 @@
+const std = @import("std");
+
+pub fn main() void {
+ // const y: [3]u64 = .{ 1, 2, 3 };
+ // const x: struct { u8, u64, u8 } = y;
+ // @import("std").debug.print("{}\n", .{x[0] + x[1] + x[2]});
+
+ std.debug.print("{}\n", .{@sizeOf(struct { a: u8, b: u64, c: u8, d: bool })});
+}
+
+// const x: ?u8 = 5;
+// if (x == null) {
+// return 1;
+// } else |val| {
+// return val;
+// }
+// var list = std.ArrayList(u8).init(std.heap.smp_allocator);
+// try parseUniHex("1f4a9", &list);
+// std.debug.print("{s}\n", .{list.items});
+
+// fn parseUniHex(
+// str: []const u8,
+// s: *std.ArrayList(u8),
+// ) !void {
+// var uc: u21 = parseHexDigit1(str[0]);
+// for (str[1..]) |c| {
+// uc = try std.math.shlExact(u21, uc, 4);
+// uc |= parseHexDigit1(c);
+// }
+
+// std.debug.print("{u}\n", .{uc});
+
+// const n = try std.unicode.utf8CodepointSequenceLength(uc);
+// const buf = try s.addManyAsSlice(n);
+// _ = try std.unicode.utf8Encode(uc, buf);
+// }
+
+// fn parseHexByte1(h1: u8, h2: u8) u8 {
+// const hi = parseHexDigit1(h1);
+// const lo = parseHexDigit1(h2);
+// return hi << 4 | lo;
+// }
+
+// fn parseHexDigit1(c: u8) u8 {
+// return switch (c) {
+// '0'...'9' => c - '0',
+// 'A'...'F' => c - 'A' + 10,
+// 'a'...'f' => c - 'a' + 10,
+// else => @panic(""),
+// };
+// }
+
+// fn parseHexByte2(h1: u8, h2: u8) u8 {
+// const hi: u8 = parseHexDigit2(h1);
+// const lo = parseHexDigit2(h2);
+// return hi << 4 | lo;
+// }
+
+// fn parseHexDigit2(c: u8) u4 {
+// return @intCast(switch (c) {
+// '0'...'9' => c - '0',
+// 'A'...'F' => c - 'A' + 10,
+// 'a'...'f' => c - 'a' + 10,
+// else => @panic(""),
+// });
+// }
+
+// fn parseUniHex1(str: []const u8) !u21 {
+// var uc: u21 = parseHexDigit1(str[0]);
+// for (str[1..]) |c| {
+// uc = try std.math.shlExact(u21, uc, 4);
+// uc |= parseHexDigit1(c);
+// }
+// return uc;
+// }
+
+// fn parseUniHex2(str: []const u8) !u21 {
+// var uc: u21 = parseHexDigit2(str[0]);
+// for (str[1..]) |c| {
+// uc = try std.math.shlExact(u21, uc, 4);
+// uc |= parseHexDigit2(c);
+// }
+// return uc;
+// }
diff --git a/spec/parser.ebnf b/spec/parser.ebnf
new file mode 100644
index 0000000..9e02fba
--- /dev/null
+++ b/spec/parser.ebnf
@@ -0,0 +1,75 @@
+unit : blank* ( datum blank? | EOF ) ;
+
+
+blank : 9...13 | comment ;
+
+datum : one_datum ( join_char? one_datum )* ;
+
+join_char : '.' | ':' | '|' ;
+
+
+comment : ';' ( skip_unit | skip_line ) ;
+
+skip_unit : '~' unit ;
+
+skip_line : ( ~LF )* LF? ;
+
+
+one_datum : ( bare_str | clad_datum ) ;
+
+bare_str : bare_str_elt+ ;
+
+clad_datum : '\' bare_esc_str
+ | '"' quoted_str '"'
+ | '#' hash_expr
+ | '(' blank* list? ')'
+ | '[' blank* list? ']'
+ | '{' blank* list? '}'
+ | quote_expr
+ ;
+
+
+bare_str_elt : bare_char | '\' bare_esc ;
+
+
+bare_esc_str : bare_esc bare_str_elt* ;
+
+quoted_str : ( quoted_char | '\' quoted_esc )* ;
+
+hash_expr : rune clad_datum?
+ | '%' label ( '%' | '=' unit )
+ | clad_datum
+ ;
+
+list : unit+ ( '.' blank+ unit )? blank* ;
+
+quote_expr : ( "'" | "`" | "," ) datum ;
+
+
+bare_char : letter | digit
+ | '!' | '$' | '%' | '&' | '*' | '+' | '-' | '/'
+ | '<' | '=' | '>' | '?' | '@' | '^' | '_' | '~'
+ ;
+
+bare_esc : 33...126 ;
+
+
+quoted_char : ~( '"' | '\' ) ;
+
+quoted_esc : '\' | '"' | 'a' | 'b' | 'e'
+ | 'f' | 'n' | 'r' | 't' | 'v'
+ | 'x' hex_digit{2}
+ | 'u' '{' hex_digit+ '}'
+ ;
+
+
+rune : letter ( letter | digit ){0,5} ;
+
+label : hex_digit{1,12} ;
+
+
+letter : 'a'...'z' | 'A'...'Z' ;
+
+digit : '0'...'9' ;
+
+hex_digit : '0'...'9' | 'a'...'f' | 'A'...'F' ;
diff --git a/spec/syntax.md b/spec/syntax.md
new file mode 100644
index 0000000..b85ed78
--- /dev/null
+++ b/spec/syntax.md
@@ -0,0 +1,34 @@
+# Zisp S-Expression Syntax
+
+We use a BNF notation with the following rules:
+
+* Concatenation of expressions is implicit: `foo bar` means `foo`
+ followed by `bar`.
+
+* Expressions may be followed by `?`, `*`, `+`, `{N}`, or `{N,M}`,
+ which have the meanings they have in regular expressions.
+
+* The syntax is defined in terms of bytes, not characters. Terminals
+ `'c'` and `"c"` refer to the ASCII value of the given character `c`.
+ Numbers are in decimal and refer to a byte with the given value.
+
+* The `~` prefix means NOT. It only applies to rules that match one
+ byte, and negates them. For example, `~( 'a' | 'b' )` matches any
+ byte other than 97 and 98.
+
+* Ranges of terminal values are expressed as `x...y` (inclusive).
+
+* There is no ambiguity, backtracking, or look-ahead beyond the byte
+ currently being matched. Rules match left to right, depth-first,
+ and greedy. As soon as the input matches the first terminal of a
+ rule, it must match that rule to the end.
+
+The last rule means that the BNF is very simple to translate to code.
+
+The parser consumes one `unit` from an input stream every time it's
+called; it returns the `datum` therein, or EOF.
+
+```
+
+
+```
diff --git a/src/libzisp.zig b/src/libzisp.zig
index 90508be..e6c8ac5 100644
--- a/src/libzisp.zig
+++ b/src/libzisp.zig
@@ -13,6 +13,20 @@ pub const Hval = gc.Hval;
pub const ShortString = value.ShortString;
pub const Value = value.Value;
+fn benchmark(name: []const u8, iters: usize, func: fn () anyerror!void) !void {
+ var timer = try std.time.Timer.start();
+ for (0..iters) |i| {
+ _ = i;
+ try func();
+ }
+ const ns: f64 = @floatFromInt(timer.lap());
+ const secs = ns / 1_000_000_000;
+ std.debug.print(
+ "bench {s} x {}: {d:.3}s\n",
+ .{ name, iters, secs },
+ );
+}
+
test "double" {
const d1: f64 = 0.123456789;
const d2: f64 = -0.987654321;
@@ -128,7 +142,7 @@ test "sstr" {
.ReleaseSafe => 100_000_000,
.ReleaseFast => 1_000_000_000,
};
- std.debug.print("Benchmarking with {} iters.\n", .{iters});
+ std.debug.print("Benchmarking sstr with {} iters.\n", .{iters});
inline for (impls, 0..) |impl, i| {
try benchmarkSstr(impl, i, iters);
}
@@ -288,7 +302,7 @@ test "parse" {
test "parse2" {
const val = parseString(
\\ ;; Testing some crazy datum comments
- \\ #;"bar"#;([x #"y"]{##`,'z}) #"foo"
+ \\ ;~"bar"([x #"y"]{##`,'z}) #"foo"
\\ ;; end
);
@@ -302,90 +316,86 @@ test "parse2" {
try std.testing.expectEqualStrings("foo", f.slice());
}
-test "parse3" {
- const val = parseString(
- \\(foo #;x #;(x y) #;x #bar [#x #"baz"] 'bat)
- );
-
- const car = value.pair.car;
- const cdr = value.pair.cdr;
-
- const e1 = car(val);
- const e2 = car(cdr(val));
- const e3 = car(cdr(cdr(val)));
- const e4 = car(cdr(cdr(cdr(val))));
-
- try std.testing.expect(value.sstr.check(e1));
- try std.testing.expect(value.rune.check(e2));
- try std.testing.expect(value.pair.check(e3));
- try std.testing.expect(value.pair.check(e4));
-}
-
-test "parse4" {
- const val = parseString("(foo . #;x bar #;y)");
-
- const s = value.sstr.unpack(value.pair.car(val));
- try std.testing.expectEqualStrings("foo", s.slice());
-
- const f = value.sstr.unpack(value.pair.cdr(val));
- try std.testing.expectEqualStrings("bar", f.slice());
-}
-
-fn parseBench(path: []const u8) !void {
- const iters = 1000;
-
- var buf: [8196]u8 = undefined;
- const file = try std.fs.cwd().openFile(path, .{});
- defer file.close();
- const count = try file.readAll(&buf);
-
- var fbs = std.io.fixedBufferStream(buf[0..count]);
- const reader = fbs.reader().any();
-
- var timer = try std.time.Timer.start();
- for (0..iters) |i| {
- _ = i;
- var v: Value = undefined;
- while (true) {
- v = io.parser.parse(reader);
- if (value.eof.check(v)) {
- break;
- }
- }
- try fbs.seekTo(0);
- }
- const ns: f64 = @floatFromInt(timer.lap());
- const secs = ns / 1_000_000_000;
- std.debug.print(
- "parse {s} x {}: {d:.3}s\n",
- .{ path, iters, secs },
- );
-}
-
-test "parse bench" {
- try parseBench("test-data/parser-test-1.scm");
- try parseBench("test-data/parser-test-2.scm");
-}
-
-test "unparse" {
- var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init;
- var out: std.ArrayList(u8) = .init(gpa.allocator());
-
- const w = out.writer();
- const v = parseString("#foo");
- try io.unparser.unparse(w, v);
- try std.testing.expectEqualStrings("#foo", try out.toOwnedSlice());
-}
-
-test "unparse2" {
- var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init;
- var out: std.ArrayList(u8) = .init(gpa.allocator());
-
- const w = out.writer();
- const v = parseString("#{foo bar['x]}");
- try io.unparser.unparse(w, v);
- try std.testing.expectEqualStrings(
- "(#HASH #BRACE foo (#JOIN bar #SQUARE (#QUOTE . x)))",
- try out.toOwnedSlice(),
- );
-}
+// test "parse3" {
+// const val = parseString(
+// \\(foo ;~x ;~(x y) ;~x #bar [#x #"baz"] 'bat)
+// );
+
+// const car = value.pair.car;
+// const cdr = value.pair.cdr;
+
+// const e1 = car(val);
+// const e2 = car(cdr(val));
+// const e3 = car(cdr(cdr(val)));
+// const e4 = car(cdr(cdr(cdr(val))));
+
+// try std.testing.expect(value.sstr.check(e1));
+// try std.testing.expect(value.rune.check(e2));
+// try std.testing.expect(value.pair.check(e3));
+// try std.testing.expect(value.pair.check(e4));
+// }
+
+// test "parse4" {
+// const val = parseString("(foo . ;~x bar ;~y)");
+
+// const s = value.sstr.unpack(value.pair.car(val));
+// try std.testing.expectEqualStrings("foo", s.slice());
+
+// const f = value.sstr.unpack(value.pair.cdr(val));
+// try std.testing.expectEqualStrings("bar", f.slice());
+// }
+
+// fn parseBench(path: []const u8, iters: usize) !void {
+// const file = try std.fs.cwd().openFile(path, .{});
+// defer file.close();
+
+// var timer = try std.time.Timer.start();
+// for (0..iters) |i| {
+// _ = i;
+// var br = std.io.bufferedReader(file.reader());
+// const reader = br.reader().any();
+// var v: Value = undefined;
+// while (true) {
+// v = io.parser.parse(reader);
+// if (value.eof.check(v)) {
+// break;
+// }
+// }
+// try file.seekTo(0);
+// }
+// const ns: f64 = @floatFromInt(timer.lap());
+// const secs = ns / 1_000_000_000;
+// std.debug.print(
+// "parse {s} x {}: {d:.3}s\n",
+// .{ path, iters, secs },
+// );
+// }
+
+// test "parse bench" {
+// // try parseBench("test-data/parser-test-1.scm", 200);
+// // try parseBench("test-data/parser-test-2.scm", 800);
+// try parseBench("test-data/parser-torture.scm", 1);
+// }
+
+// test "unparse" {
+// var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init;
+// var out: std.ArrayList(u8) = .init(gpa.allocator());
+
+// const w = out.writer();
+// const v = parseString("#foo");
+// try io.unparser.unparse(w, v);
+// try std.testing.expectEqualStrings("#foo", try out.toOwnedSlice());
+// }
+
+// test "unparse2" {
+// var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init;
+// var out: std.ArrayList(u8) = .init(gpa.allocator());
+
+// const w = out.writer();
+// const v = parseString("#{foo bar['x]}");
+// try io.unparser.unparse(w, v);
+// try std.testing.expectEqualStrings(
+// "(#HASH #BRACE foo (#JOIN bar #SQUARE (#QUOTE . x)))",
+// try out.toOwnedSlice(),
+// );
+// }
diff --git a/src/libzisp/io/parser.zig b/src/libzisp/io/parser.zig
index d02eda3..651d124 100644
--- a/src/libzisp/io/parser.zig
+++ b/src/libzisp/io/parser.zig
@@ -3,23 +3,23 @@
//
// Zisp s-expressions are defined in terms of an extremely minimal set of data
// types; only that which is necessary to build representations of more complex
-// expressions and types:
+// expressions and data types:
//
-// type format/examples comment
-// ---- --------------- -------
+// +---------+-------------+---------------+--------+---------+--------+
+// | TYPE | Bare String | Quoted String | Rune | Pair | Nil |
+// +---------+-------------+---------------+--------+---------+--------+
+// | EXAMPLE | foobar | "foo bar" | #name | (X . Y) | () |
+// +---------+-------------+---------------+--------+---------+--------+
//
-// string foo , "foo bar" quoted strings are flagged as such
+// Bare strings and quoted strings are polymorphic sub-types of the generic
+// string type. This lets the compiler distinguish between identifiers and
+// string literals.
//
-// rune #name name is: [a-zA-Z][a-zA-Z0-9]{0,5}
+// There are also non-negative integers, but they are only used for datum
+// labels; regular number literals are handled by the "decoder" (see below).
//
-// pair (DATUM . DATUM) the only composite data type supported
-//
-// nil () we prefer the term nil over null
-//
-// The parser recognizes various "syntax sugar" and transforms uses of it into
-// uses of the above types.
-//
-// The most ubiquitous example is of course the list syntax:
+// The parser recognizes various "syntax sugar" and transforms it into uses of
+// the above data types. The most ubiquitous example is of course the list:
//
// (datum1 datum2 ...) -> (datum1 . (datum2 . (... . ())))
//
@@ -35,18 +35,39 @@
//
// `datum -> (#GRAVE . datum) dat1|dat2 -> (#PIPE dat1 . dat2)
//
-// ,datum -> (#COMMA . datum) #n#=datum -> (#LABEL n . datum)
+// ,datum -> (#COMMA . datum) #%n=datum -> (#LABEL hex . datum)
+//
+// #%n -> (#LABEL . hex)
//
-// #n# -> (#LABEL . n)
+// A separate process called "decoding" can transform these objects into other
+// data types. For example, (#HASH x y z) could become a vector, so that the
+// expression #(x y z) works just like in Scheme.
//
-// Notes:
+// Decoding also resolves datum labels, and goes over bare strings to find ones
+// that are actually a number literal. This lets us offload the complexity of
+// number parsing elsewhere, so the parser remains extremely simple.
+//
+// Further notes about the syntax sugar table and examples above:
//
// * The terms datum, dat1, and dat2 each refer to an arbitrary datum; ellipsis
-// means zero or more data; n is a non-negative integer.
+// means zero or more data; hex is a hexadecimal number of up to 12 digits.
//
// * The #datum form only applies to expressions that cannot be mistaken for a
-// rune, such as for example: #(...) or #"..." or #'string etc.; following a
-// hash sign with a plain string will parse as a rune instead.
+// rune, such as #(...) or #"..." or #'string etc.; a hash sign followed by a
+// bare string will parse as a rune instead (or raise an error if it contains
+// characters that are illegal in rune names).
+//
+// * A backslash causes the immediately following character to lose any special
+// meaning it would normally have, and be considered as part of a bare string
+// instead. (This does not apply to whitespace.) For example, the following
+// three character sequences are each a valid bare string:
+//
+// foo\(bar\) \]blah \#\'xyz
+//
+// This can be used to follow a hash with a bare string without it being
+// parsed as a rune:
+//
+// #\foo -> (#HASH . foo)
//
// * Though not represented in the table due to notational difficulty, the
// format "#rune(...)" doesn't require a list in the second position; any
@@ -66,47 +87,39 @@
// The parser will see this as an attempt to use an 8-letter rune name, and
// raise an error, since rune names are limited to 6 characters.
//
-// Syntax sugar can combine arbitrarily; some examples follow:
-//
-// #{...} -> (#HASH #BRACE ...)
-//
-// #'foo -> (#HASH #QUOTE . foo)
+// * Syntax sugar can combine arbitrarily; some examples follow:
//
-// ##'[...] -> (#HASH #HASH #QUOTE #SQUARE ...)
+// #{...} -> (#HASH #BRACE ...)
//
-// {x y}[i j] -> (#JOIN (#BRACE x y) #SQUARE i j)
+// #'foo -> (#HASH #QUOTE . foo)
//
-// foo.bar.baz{x y} -> (#JOIN (#DOT (#DOT foo . bar) . baz) #BRACE x y)
+// ##'[...] -> (#HASH #HASH #QUOTE #SQUARE ...)
//
-// Runes are case-sensitive, and the parser only emits runes using upper-case
-// letters when expressing syntax sugar, so there can be no accidental clash
-// with runes that appear verbatim in code.
+// {x y}[i j] -> (#JOIN (#BRACE x y) #SQUARE i j)
//
-// Although strings and symbols aren't disjoint types in Zisp, the parser flags
-// double-quoted string literals to allow distinguishing them from bare strings.
-// Otherwise, it would not be possible for the compiler to tell the difference
-// between an identifier and a string literal.
+// foo.bar.baz{x y} -> (#JOIN (#DOT (#DOT foo . bar) . baz) #BRACE x y)
//
-// You may be wondering about numbers. As far as the parser is concerned,
-// numbers are strings. It's the decoder (see below) that will turn bare
-// strings into numbers where appropriate.
+// * While 'foo parses as (quote foo) in Scheme, it parses as (#QUOTE . foo) in
+// Zisp; the operand of #QUOTE is the entire cdr. The same principle is used
+// when parsing other sugar; some examples follow:
//
-// Note that 'foo becomes (quote foo) in Scheme, but (#QUOTE . foo) in Zisp.
-// The operand of #QUOTE is the entire cdr. The same principle is used when
-// parsing other sugar; some examples follow:
+// Incorrect Correct
//
-// Incorrect Correct
+// #(x y z) -> (#HASH (x y z)) #(x y z) -> (#HASH x y z)
//
-// #(x y z) -> (#HASH (x y z)) #(x y z) -> (#HASH x y z)
+// [x y z] -> (#SQUARE (x y z)) [x y z] -> (#SQUARE x y z)
//
-// [x y z] -> (#SQUARE (x y z)) [x y z] -> (#SQUARE x y z)
+// #{x} -> (#HASH (#BRACE (x))) #{x} -> (#HASH #BRACE x)
//
-// #{x} -> (#HASH (#BRACE (x))) #{x} -> (#HASH #BRACE x)
+// foo(x y) -> (#JOIN foo (x y)) foo(x y) -> (#JOIN foo x y)
//
-// foo(x y) -> (#JOIN foo (x y)) foo(x y) -> (#JOIN foo x y)
+// * Runes are case-sensitive, and the parser only emits runes using upper-case
+// letters when expressing syntax sugar. This way, there can be no accidental
+// clash with runes that appear verbatim in code, as long as only lower-case
+// letters are used for rune literals in code.
//
//
-// === Decoder ===
+// === Decoding ===
//
// A separate process called "decoding" can transform simple data structures,
// consisting of only the above types, into a richer set of Zisp data types.
@@ -118,8 +131,8 @@
// whose head they appear in. This can implement #true and #false.
//
// The decoder may also perform arbitrary transforms on any type; for example,
-// it may turn bare strings (those not flagged as double-quoted) into numbers
-// when appropriate. This can implement number literals.
+// it may turn bare strings into numbers wher appropriate. This can implement
+// number literals.
//
// The decoder recognizes (#QUOTE ...) to implement the traditional quoting
// mechanism, but with a significant difference:
@@ -162,11 +175,11 @@
// call one of the starting functions.
//
// If a function wants to make the parser return, either from a recursive parse
-// or from the main loop, it sets the .retval field, and tries to pop the saved
+// or from the main loop, it sets the .result field, and tries to pop the saved
// context. If the context stack was empty, the main loop returns.
//
// While it's possible to just set .next and return, to make the main loop call
-// another function next (possibly even setting .retval to pass a value to it),
+// another function next (possibly even setting .result to pass a value to it),
// this is completely unnecessary. A few non-recursive calls won't blow the
// stack. It's only recursive parsing that we use the above strategy for.
//
@@ -201,7 +214,7 @@
// would still waste memory and the time it takes to allocate memory.
//
//
-// === Buffering strategy ===
+// === Buffering ===
//
// We use a small circular buffer just so we can backtrack. We read bytes into
// this buffer one by one, because we don't want to consume more bytes from the
@@ -235,11 +248,20 @@ const std = @import("std");
const lib = @import("../lib.zig");
const value = @import("../value.zig");
-const ShortString = value.ShortString;
+const List = std.ArrayListUnmanaged;
+
const Value = value.Value;
const cons = value.pair.cons;
-const intern = value.istr.intern;
+
+const is_test = builtin.is_test;
+const is_debug = builtin.mode == .Debug;
+
+const detailed_debug = true;
+
+// In debug, we want to see if we leak, so very small numbers.
+const init_stack_capacity = if (is_debug) 20 else 32;
+const init_chars_capacity = if (is_debug) 100 else 512;
// zig fmt: off
const DOT = value.rune.pack("DOT");
@@ -257,166 +279,164 @@ const BRACE = value.rune.pack("BRACE");
const Context = struct {
// What to do next.
- next: Fn = .start_parse,
- // For storing some context value, like accumulated list elements.
+ next: Fn = .parse_unit,
+ // For storing a context value, like accumulated list elements.
val: Value = undefined,
- // For storing some context char, like opening bracket.
+ // For storing a context char, like list opening bracket.
char: u8 = undefined,
};
-// Make sure these are in sync, as we will use +% to increment the position.
-// Size 16 is enough because the largest amount by which we backtrack is short
-// strings / runes, which are limited to 6 bytes.
-const BUF_SIZE = 16;
-const POS_TYPE = u4;
-
-const is_test = builtin.is_test;
-const is_debug = builtin.mode == .Debug;
+const ParseError = error{
+ InvalidCharacter,
+ UnclosedString,
+ UnexpectedEof,
+ UnicodeError,
+ RuneTooLong,
+ OutOfMemory,
+ OutOfRange,
+ ReadError,
+};
const State = struct {
input: std.io.AnyReader,
- alloc: std.mem.Allocator,
+ stack_alloc: std.mem.Allocator,
+ chars_alloc: std.mem.Allocator,
- counter: usize = 0,
+ context: Context = .{},
+ stack: List(Context) = undefined,
- buf: [BUF_SIZE]u8 = undefined,
- pos: POS_TYPE = 0,
- write_pos: POS_TYPE = 0,
+ chars: List(u8) = undefined,
- context: Context = .{},
- stack: std.ArrayListUnmanaged(Context) = .{},
- retval: Value = undefined,
+ // TODO: Implement bool flag for when skipping a unit
- // For debugging.
- checked_eof: bool = false,
+ result: Value = undefined,
+ unused_char: ?u8 = null,
- fn init(input: std.io.AnyReader, alloc: std.mem.Allocator) State {
- return .{ .input = input, .alloc = alloc };
+ err_code: anyerror = undefined,
+ err_msg: []const u8 = undefined,
+
+ fn init(
+ input: std.io.AnyReader,
+ stack_alloc: std.mem.Allocator,
+ chars_alloc: std.mem.Allocator,
+ ) !State {
+ var s: State = .{
+ .input = input,
+ .stack_alloc = stack_alloc,
+ .chars_alloc = chars_alloc,
+ };
+ s.stack = try .initCapacity(s.stack_alloc, init_stack_capacity);
+ s.chars = try .initCapacity(s.chars_alloc, init_chars_capacity);
+ return s;
}
fn deinit(s: *State) void {
- s.stack.deinit(s.alloc);
+ s.stack.deinit(s.stack_alloc);
+ s.chars.deinit(s.chars_alloc);
}
- fn recurParse(s: *State, start: Fn, end: Fn) void {
- s.stack.append(s.alloc, .{
- .next = end,
- .val = s.context.val,
- .char = s.context.char,
- }) catch @panic("OOM");
- s.context.next = start;
+ fn err(s: *State, e: ParseError, msg: []const u8) ParseError {
+ s.err_msg = msg;
+ return e;
}
- fn returnDatum(s: *State, val: Value) void {
- s.retval = val;
- if (s.stack.pop()) |c| {
- s.context = c;
- } else {
- s.context.next = .done;
+ fn read(s: *State) !?u8 {
+ if (is_debug and s.unused_char != null) {
+ @panic("Called read() while there was an unused character!");
}
+ const c = s.input.readByte() catch |e| switch (e) {
+ error.EndOfStream => return null,
+ else => {
+ s.err_code = e;
+ return error.ReadError;
+ },
+ };
+ if (detailed_debug) {
+ std.debug.print("{c}", .{c});
+ }
+ return c;
}
- fn readNext(s: *State) !void {
- if (s.pos != s.write_pos) {
- return;
- }
- s.buf[s.pos] = try s.input.readByte();
- s.write_pos +%= 1;
+ fn readNoEof(s: *State, emsg: []const u8) !u8 {
+ return if (try s.read()) |c| c else s.err(error.UnexpectedEof, emsg);
}
- fn eof(s: *State) bool {
- if (is_debug) {
- s.checked_eof = true;
+ fn getUnused(s: *State) ?u8 {
+ if (s.unused_char) |c| {
+ s.unused_char = null;
+ return c;
}
- readNext(s) catch |e| switch (e) {
- error.EndOfStream => return true,
- else => @panic("read error"),
- };
- return false;
+ return null;
}
- fn peek(s: *State) u8 {
- if (is_debug) {
- if (!s.checked_eof) {
- @panic("Didn't check EOF before calling peek()!");
- }
- }
- return s.buf[s.pos];
+ fn skipLine(s: *State) !void {
+ while (try s.read()) |c| if (c == '\n') break;
}
- fn skip(s: *State) void {
- if (is_debug) {
- s.checked_eof = false;
- }
- // std.debug.print("{c}\n", .{s.buf[s.pos]});
- s.pos +%= 1;
- s.counter += 1;
+ fn addChar(s: *State, c: u8) !void {
+ try s.chars.append(s.chars_alloc, c);
}
- fn getc(s: *State) u8 {
- const c = s.peek();
- s.skip();
- return c;
+ fn getBareString(s: *State) Value {
+ defer s.chars.clearRetainingCapacity();
+ return if (s.chars.items.len <= 6)
+ value.sstr.pack(s.chars.items)
+ else
+ value.istr.intern(s.chars.items, false);
}
- // Consumes whitespace and line comments.
- fn consumeBlanks(s: *State) void {
- while (!s.eof()) {
- switch (s.peek()) {
- // Allow Form Feed (^L) commonly used by Emacs users.
- '\t', '\n', ' ', 0x0C => s.skip(),
- ';' => s.consumeLineComment(),
- else => return,
- }
- }
+ fn getQuotedString(s: *State) Value {
+ defer s.chars.clearRetainingCapacity();
+ return if (s.chars.items.len <= 6)
+ value.sstr.packQuoted(s.chars.items)
+ else
+ value.istr.intern(s.chars.items, true);
}
- fn consumeLineComment(s: *State) void {
- while (!s.eof()) {
- if (s.getc() == '\n') {
- return;
- }
- }
+ fn getRune(s: *State) !Value {
+ defer s.chars.clearRetainingCapacity();
+ return if (s.chars.items.len <= 6)
+ value.rune.pack(s.chars.items)
+ else
+ error.RuneTooLong;
}
- fn isWhitespace(s: *State) bool {
- return switch (s.peek()) {
- '\t', '\n', ' ' => true,
- else => false,
- };
+ fn push(s: *State, next: Fn) !void {
+ try s.stack.append(s.stack_alloc, .{ .next = next });
}
-};
-const CharPred = fn (u8) bool;
-const ShortStringPack = fn ([]const u8) Value;
+ fn pushContext(s: *State, next: Fn) !void {
+ try s.stack.append(s.stack_alloc, .{
+ .next = next,
+ .val = s.context.val,
+ .char = s.context.char,
+ });
+ }
-// Helper function to read runes and short strings.
-fn readShortString(
- s: *State,
- pred: CharPred,
- pack: ShortStringPack,
-) ?Value {
- var str = ShortString{};
- while (!s.eof() and pred(s.peek())) {
- str.append(s.getc()) catch return null;
+ fn subr(s: *State, start: Fn, next: Fn) !void {
+ try s.pushContext(next);
+ s.context.next = start;
}
- return pack(str.constSlice());
-}
-// Probably best *not* to use function pointers here, but rather dispatch to
-// functions manually based on enum value. This should help the optimizer.
+ fn jump(s: *State, next: Fn, val: ?Value) void {
+ if (val) |v| s.result = v;
+ s.context.next = next;
+ }
-const Fn = enum {
- start_parse,
- start_datum,
- end_join_datum,
- end_label_datum,
- end_hash_datum,
- end_quote_datum,
- continue_list,
- finish_improper_list,
- end_improper_list,
- done,
+ fn abort(s: *State, next: Fn, unused_c: u8) void {
+ s.result = value.undef;
+ s.unused_char = unused_c;
+ s.context.next = next;
+ }
+
+ fn retval(s: *State, val: Value) void {
+ s.result = val;
+ if (s.stack.pop()) |c| {
+ s.context = c;
+ } else {
+ s.context.next = .done;
+ }
+ }
};
pub fn parse(input: std.io.AnyReader) Value {
@@ -437,437 +457,620 @@ pub fn parse(input: std.io.AnyReader) Value {
else
std.heap.smp_allocator;
- var sfa = std.heap.stackFallback(4096, alloc);
-
- var s: State = .init(input, sfa.get());
+ const stack_mem = init_stack_capacity * @sizeOf(Context);
+ const chars_mem = init_chars_capacity;
+ var stack_sfa = std.heap.stackFallback(stack_mem, alloc);
+ var chars_sfa = std.heap.stackFallback(chars_mem, alloc);
+ const stack_alloc = stack_sfa.get();
+ const chars_alloc = chars_sfa.get();
+ var s = State.init(input, stack_alloc, chars_alloc) catch @panic("");
defer s.deinit();
- while (s.context.next != .done) call(&s);
- return s.retval;
+ while (s.context.next != .done) callNext(&s) catch |e| switch (e) {
+ else => @panic(s.err_msg), // TODO
+ };
+ if (s.unused_char) |_| {
+ @panic("invalid character");
+ }
+ return s.result;
}
-fn call(s: *State) void {
- // std.debug.print("{}\n", .{s.next});
- switch (s.context.next) {
- .start_parse => startParse(s),
- .start_datum => startDatum(s),
- .end_join_datum => endJoinedDatum(s),
- .end_label_datum => endLabelDatum(s),
+const Fn = enum {
+ parse_unit,
+ return_context,
+ end_one_datum,
+ parse_join_datum,
+ end_join_datum,
+ join_data,
+ parse_hash_datum,
+ end_hash_datum,
+ parse_rune_datum,
+ end_rune_datum,
+ end_label_datum,
+ parse_list_element,
+ continue_list,
+ parse_list_tail,
+ end_improper_list,
+ close_improper_list,
+ end_quote_expr,
+ done,
+};
+
+fn callNext(s: *State) !void {
+ if (detailed_debug) {
+ std.debug.print("\n{}:{} ctx:'{c}' unused:'{c}' \n", .{
+ s.stack.items.len,
+ s.context.next,
+ s.context.char,
+ s.unused_char orelse '_',
+ });
+ }
+ try switch (s.context.next) {
+ .parse_unit => parseUnit(s),
+ .return_context => returnContext(s),
+ .end_one_datum => endOneDatum(s),
+ .parse_join_datum => parseJoinDatum(s),
+ .end_join_datum => endJoinDatum(s),
+ .join_data => joinData(s),
+ .parse_hash_datum => parseHashDatum(s),
.end_hash_datum => endHashDatum(s),
- .end_quote_datum => endQuoteDatum(s),
+ .parse_rune_datum => parseRuneDatum(s),
+ .end_rune_datum => endRuneDatum(s),
+ .end_label_datum => endLabelDatum(s),
+ .parse_list_element => parseListElement(s),
.continue_list => continueList(s),
- .finish_improper_list => finishImproperList(s),
+ .parse_list_tail => parseListTail(s),
.end_improper_list => endImproperList(s),
+ .close_improper_list => endImproperList(s),
+ .end_quote_expr => endQuoteExpr(s),
.done => unreachable,
- }
+ };
}
-fn startParse(s: *State) void {
- s.consumeBlanks();
- if (s.eof()) {
- return s.returnDatum(value.eof.eof);
- }
- switch (s.peek()) {
- // whitespace already consumed
- 0...32, 127...255 => err(s, "invalid character"),
- ')', ']', '}' => err(s, "unexpected closing bracket"),
- else => startDatum(s),
+fn parseUnit(s: *State) !void {
+ var c1 = s.getUnused() orelse try s.read();
+ while (c1) |c| : (c1 = try s.read()) {
+ switch (try checkBlank(s, c)) {
+ .yes => {},
+ .skip_unit => {
+ // Simply push another parse_unit onto the stack, which will
+ // ignore the result of the current one and start anew; then
+ // keep looping to read the datum that will be ignored.
+ try s.push(.parse_unit);
+ },
+ .skip_line => try s.skipLine(),
+ .no => return parseDatum(s, c),
+ }
}
+ return s.retval(value.eof.eof);
}
-// This is called when we *immediately* expect a datum and nothing else; for
-// example, no whitespace or comments, because they've already been consumed.
-fn startDatum(s: *State) void {
- if (s.eof()) {
- return err(s, "expected datum, got EOF");
- }
- if (s.isWhitespace()) {
- return err(s, "expected datum, got whitespace");
- }
- switch (s.peek()) {
- // whitespace checked above
- 0...32, 127...255 => err(s, "invalid character"),
-
- ')', ']', '}' => err(s, "unexpected closing bracket"),
-
- ';' => err(s, "expected datum, got semicolon"),
-
- '#' => handleHash(s),
-
- '"' => startQuotedString(s),
-
- '\'', '`', ',' => startQuote(s),
+fn checkBlank(s: *State, c: u8) !enum { yes, skip_unit, skip_line, no } {
+ return switch (c) {
+ '\t'...'\r', ' ' => .yes,
+ ';' => switch (try s.read() orelse '\n') {
+ '\n' => .yes,
+ '~' => .skip_unit,
+ else => .skip_line,
+ },
+ else => .no,
+ };
+}
- '(', '[', '{' => startList(s),
+fn parseDatum(s: *State, c: u8) !void {
+ return parseOneDatum(s, c, .end_one_datum);
+}
- else => startBareString(s),
+fn endOneDatum(s: *State) !void {
+ const d = s.result;
+ if (d.eq(value.undef)) {
+ return s.retval(d);
}
+ const c1 = s.getUnused() orelse try s.read();
+ if (c1) |c| {
+ switch (try checkBlank(s, c)) {
+ .yes => {},
+ .skip_unit => return skipUnitAndReturn(s, d),
+ .skip_line => try s.skipLine(),
+ .no => return parseJoin(s, d, c),
+ }
+ }
+ return s.retval(d);
}
-fn endDatum(s: *State, d: Value) void {
- //
- // We're at the end of a datum; check for the various ways data can be
- // joined together, like DATUM|DATUM or DATUM|.DATUM etc.
- //
-
- if (isEndOfDatum(s)) {
- // Nope, end it.
- return s.returnDatum(d);
- }
+fn skipUnitAndReturn(s: *State, d: Value) !void {
+ s.context.val = d;
+ return s.subr(.parse_unit, .return_context);
+}
- // There's a stupid special-case we have to handle here, where a datum
- // comment may fool us into thinking there's something to join: foo|#;bar
+fn returnContext(s: *State) !void {
+ return s.retval(s.context.val);
+}
- const c = s.peek();
- switch (c) {
- '.', ':', '|' => s.skip(),
- '#' => if (checkTrailingDatumComment(s)) {
- return s.returnDatum(d);
- },
- else => {},
- }
+fn parseJoin(s: *State, d: Value, c: u8) !void {
s.context.val = d;
s.context.char = c;
- s.recurParse(.start_datum, .end_join_datum);
+ s.unused_char = switch (c) {
+ '.', ':', '|' => try s.readNoEof("start of joined datum"),
+ else => c,
+ };
+ return s.subr(.parse_join_datum, .join_data);
}
-fn checkTrailingDatumComment(s: *State) bool {
- const pos = s.pos;
- s.skip();
- if (s.eof()) {
- // Error, but let it be handled later.
- return false;
- }
- const c = s.peek();
- s.pos = pos;
- return c == ';';
+fn parseJoinDatum(s: *State) !void {
+ return parseOneDatum(s, s.getUnused().?, .end_join_datum);
}
-fn isEndOfDatum(s: *State) bool {
- return s.eof() or switch (s.peek()) {
- '\t', '\n', ' ', ';', ')', ']', '}' => true,
- else => false,
- };
+fn endJoinDatum(s: *State) !void {
+ return s.retval(s.result);
}
-fn endJoinedDatum(s: *State) void {
- const rune = switch (s.context.char) {
+fn joinData(s: *State) !void {
+ const head = s.context.val;
+ const join = s.context.char;
+ const tail = s.result;
+ if (tail.eq(value.undef)) {
+ return s.retval(head);
+ }
+ const rune = switch (join) {
'.' => DOT,
':' => COLON,
'|' => PIPE,
else => JOIN,
};
- endDatum(s, cons(rune, cons(s.context.val, s.retval)));
-}
-
-fn handleHash(s: *State) void {
- s.skip();
- //
- // We just consumed a hash. Possibilities include:
- //
- // #|foo ;rune
- //
- // #|n#[=DATUM] ;datum label, with or without datum
- //
- // #|;DATUM ;datum comment
- //
- // #|DATUM ;hash-datum
- //
-
- if (s.eof()) {
- return err(s, "EOF after hash");
- }
- if (s.isWhitespace()) {
- return err(s, "whitespace after hash");
- }
-
- switch (s.peek()) {
- 'a'...'z', 'A'...'Z' => handleRune(s),
- '0'...'9' => handleDatumLabel(s),
- ';' => {
- s.skip();
- // Don't change next in this case. Just let the parser redo what it
- // was doing as soon as the commented-out datum has been read.
- s.recurParse(.start_datum, s.context.next);
- },
- else => s.recurParse(.start_datum, .end_hash_datum),
+ const result = cons(rune, cons(head, tail));
+ return s.jump(.end_one_datum, result);
+}
+
+fn parseOneDatum(s: *State, c: u8, next: Fn) !void {
+ if (isBareChar(c)) {
+ const d, s.unused_char = try parseBareString(s, c);
+ return s.jump(next, d);
+ }
+ return parseCladDatum(s, c, next);
+}
+
+fn parseCladDatum(s: *State, c: u8, next: Fn) !void {
+ if (c == '\\') {
+ const bs, s.unused_char = try parseBareEscString(s);
+ return s.jump(next, bs);
+ }
+ if (c == '"') {
+ const qs = try parseQuotedString(s);
+ return s.jump(next, qs);
}
+ return switch (c) {
+ '#' => parseHashExpression(s, next),
+ '(', '[', '{' => parseList(s, c, next),
+ '\'', '`', ',' => parseQuoteExpr(s, c, next),
+ else => s.abort(next, c),
+ };
}
-fn handleRune(s: *State) void {
- const r = readRune(s) orelse return err(s, "rune too long");
- endDatum(s, r);
+fn isBareChar(c: u8) bool {
+ return switch (c) {
+ // zig fmt: off
+ 'a'...'z' , 'A'...'Z' , '0'...'9',
+ '!' , '$' , '%' , '&' , '*' , '+',
+ '-' , '/' , '<' , '=' , '>' , '?',
+ '@' , '^' , '_' , '~' => true,
+ // zig fmt: on
+ else => false,
+ };
}
-fn readRune(s: *State) ?Value {
- return readShortString(s, std.ascii.isAlphanumeric, value.rune.pack);
+fn isBareEsc(c: u8) bool {
+ return switch (c) {
+ 33...126 => true,
+ else => false,
+ };
}
-fn handleDatumLabel(s: *State) void {
- const n = readDatumLabel(s) orelse return err(s, "datum label too long");
- //
- // We're at the end of the numeric label now; possibilities are:
- //
- // #n|#
- //
- // #n|#=DATUM
- //
+fn parseBareString(s: *State, c: u8) !struct { Value, ?u8 } {
+ try s.addChar(c);
+ return parseBareStringRest(s);
+}
- if (s.eof()) {
- return err(s, "unexpected EOF while reading datum label");
- }
- if (s.getc() != '#') {
- return err(s, "invalid character while reading datum label");
- }
+fn parseBareEscString(s: *State) !struct { Value, ?u8 } {
+ try s.addChar(try parseBareEsc(s));
+ return parseBareStringRest(s);
+}
- if (s.eof() or s.isWhitespace()) {
- return endDatum(s, cons(LABEL, n));
+fn parseBareStringRest(s: *State) !struct { Value, ?u8 } {
+ while (try s.read()) |c| {
+ if (isBareChar(c)) {
+ try s.addChar(c);
+ } else if (c == '\\') {
+ try s.addChar(try parseBareEsc(s));
+ } else {
+ return .{ s.getBareString(), c };
+ }
}
+ return .{ s.getBareString(), null };
+}
- if (s.getc() != '=') {
- return err(s, "invalid character after numeric datum label");
+fn parseBareEsc(s: *State) !u8 {
+ const c = try s.readNoEof("bare escape");
+ if (isBareEsc(c)) {
+ return c;
+ } else {
+ return s.err(error.InvalidCharacter, "bare escape");
}
-
- s.context.val = n;
- s.recurParse(.start_datum, .end_label_datum);
}
-fn readDatumLabel(s: *State) ?Value {
- return readShortString(s, std.ascii.isDigit, value.sstr.pack);
+fn parseQuotedString(s: *State) !Value {
+ while (try s.read()) |c| {
+ if (c == '"') {
+ return s.getQuotedString();
+ }
+ if (c != '\\') {
+ try s.addChar(c);
+ } else {
+ try parseQuotedEsc(s);
+ }
+ }
+ return error.UnclosedString;
}
-fn endLabelDatum(s: *State) void {
- endDatum(s, cons(LABEL, cons(s.context.val, s.retval)));
+fn parseQuotedEsc(s: *State) !void {
+ const c = try s.readNoEof("quoted escape");
+ if (c == 'u') {
+ return parseUniHexHandleErrors(s);
+ }
+ try s.addChar(switch (c) {
+ '\\', '"' => c,
+ '0' => 0,
+ 'a' => 7,
+ 'b' => 8,
+ 't' => 9,
+ 'n' => 10,
+ 'v' => 11,
+ 'f' => 12,
+ 'r' => 13,
+ 'e' => 27,
+ 'x' => try parseHexByte(s, "hex escape"),
+ else => return s.err(error.InvalidCharacter, "quoted escape"),
+ });
}
-fn endHashDatum(s: *State) void {
- endDatum(s, cons(HASH, s.retval));
+fn parseUniHexHandleErrors(s: *State) !void {
+ return parseUniHex(s) catch |err| switch (err) {
+ error.Utf8CannotEncodeSurrogateHalf => e: {
+ s.err_code = err;
+ s.err_msg = "unicode escape";
+ break :e error.UnicodeError;
+ },
+ else => |e| e,
+ };
}
-fn startQuotedString(s: *State) void {
- // We're at |"..." so consume the opening quote before we start reading.
- s.skip();
+fn parseUniHex(s: *State) !void {
+ const msg = "unicode escape";
- const str = readQuotedString(s) catch return err(s, "unclosed string");
- endDatum(s, str);
+ if (try s.readNoEof(msg) != '{') {
+ return s.err(error.InvalidCharacter, msg);
+ }
+
+ const uc, const unused_c = try parseHex(s, u21, msg);
+ if (unused_c) |c| {
+ if (c != '}') {
+ return s.err(error.InvalidCharacter, msg);
+ }
+ } else {
+ return s.err(error.UnexpectedEof, msg);
+ }
+
+ const n = try std.unicode.utf8CodepointSequenceLength(uc);
+ const buf = try s.chars.addManyAsSlice(s.chars_alloc, n);
+ _ = try std.unicode.utf8Encode(uc, buf);
}
-// RQS = Read Quoted String
-const RqsError = enum { Unclosed };
+fn parseHashExpression(s: *State, next: Fn) !void {
+ const c = try s.readNoEof("hash expression");
+ if (try checkBlank(s, c) != .no) {
+ return s.err(error.InvalidCharacter, "hash expression");
+ }
+ if (std.ascii.isAlphabetic(c)) {
+ const r, const unused_c = try parseRune(s, c);
+ return parseRuneEnd(s, r, unused_c, next);
+ }
+ if (c == '%') {
+ const l, const unused_c = try parseLabel(s);
+ return parseLabelEnd(s, l, unused_c, next);
+ }
+ if (isBareChar(c)) {
+ // Reserved for future extensions to syntax sugar.
+ return s.err(error.InvalidCharacter, "hash expression");
+ }
+ // fast-path to avoid subr
+ if (c == '\\') {
+ const bs, s.unused_char = try parseBareEscString(s);
+ return s.jump(next, cons(HASH, bs));
+ }
+ if (c == '"') {
+ const qs = try parseQuotedString(s);
+ return s.jump(next, cons(HASH, qs));
+ }
+ s.unused_char = c;
+ return s.subr(.parse_hash_datum, next);
+}
-fn readQuotedString(s: *State) !Value {
- return try readQuotedSstr(s) orelse readQuotedLongString(s);
+fn parseHashDatum(s: *State) !void {
+ return parseCladDatum(s, s.getUnused().?, .end_hash_datum);
}
-fn readQuotedSstr(s: *State) !?Value {
- const start_pos = s.pos;
+fn endHashDatum(s: *State) !void {
+ if (s.result.eq(value.undef)) {
+ return s.err(error.InvalidCharacter, "hash datum");
+ }
+ return s.retval(cons(HASH, s.result));
+}
- // TODO: Handle escapes.
- var buf: [6]u8 = undefined;
- var i: u8 = 0;
- while (!s.eof()) : (i += 1) {
- const c = s.getc();
- if (c == '"') {
- // ok, return what we accumulated
- return value.sstr.packQuoted(buf[0..i]);
- }
- if (i == 6) {
- // failed; reset and bail out
- s.pos = start_pos;
- return null;
+fn parseRune(s: *State, c1: u8) !struct { Value, ?u8 } {
+ try s.addChar(c1);
+ var len: usize = 1;
+ while (try s.read()) |c| : (len += 1) {
+ if (len == 6 or !std.ascii.isAlphanumeric(c)) {
+ return .{ try s.getRune(), c };
}
- // ok, save this byte and go on
- buf[i] = c;
+ try s.addChar(c);
}
- return error.Unclosed;
+ return .{ try s.getRune(), null };
}
-fn readQuotedLongString(s: *State) !Value {
- var chars = std.ArrayList(u8).init(s.alloc);
- defer chars.deinit();
-
- // TODO: Handle escapes.
- while (!s.eof()) {
- const c = s.getc();
- if (c == '"') {
- return intern(chars.items, true);
- }
- chars.append(c) catch @panic("OOM");
+fn parseRuneEnd(s: *State, r: Value, c1: ?u8, next: Fn) !void {
+ const c = c1 orelse return s.jump(next, r);
+ if (c == '\\') {
+ const bs, s.unused_char = try parseBareString(s, c);
+ return s.jump(next, cons(r, bs));
+ }
+ if (c == '"') {
+ const qs = try parseQuotedString(s);
+ return s.jump(next, cons(r, qs));
+ }
+ s.unused_char = c;
+ switch (c) {
+ '#', '(', '[', '{', '\'', '`', ',' => {
+ try s.push(next);
+ return s.jump(.parse_rune_datum, r);
+ },
+ else => return s.jump(next, r),
}
- return error.Unclosed;
}
-fn startBareString(s: *State) void {
- // We're at |foo so start reading directly.
- const str = readBareSstr(s) orelse readBareLongString(s);
- endDatum(s, str);
+fn parseRuneDatum(s: *State) !void {
+ s.context.val = s.result;
+ return parseCladDatum(s, s.getUnused().?, .end_rune_datum);
}
-fn readBareSstr(s: *State) ?Value {
- const pos = s.pos;
- if (readShortString(s, isSstrChar, value.sstr.pack)) |sstr| {
- return sstr;
- } else {
- s.pos = pos;
- return null;
+fn endRuneDatum(s: *State) !void {
+ const r = s.context.val;
+ const d = s.result;
+ if (d.eq(value.undef)) {
+ s.retval(r);
}
+ return s.retval(cons(r, d));
}
-fn isSstrChar(c: u8) bool {
- // We will ignore illegal characters here, because they aren't consumed by
- // this function; whatever code comes next must handle them.
- return switch (c) {
- '(', ')', '[', ']', '{', '}', ';', '#', '"', '\'', '`', ',' => false,
- 0...32, 127...255 => false,
- else => true,
- };
+fn parseLabel(s: *State) !struct { Value, ?u8 } {
+ const label, const unused_c = try parseHex(s, u48, "datum label");
+ return .{ value.fixnum.pack(label), unused_c };
}
-fn readBareLongString(s: *State) Value {
- var chars = std.ArrayList(u8).init(s.alloc);
- defer chars.deinit();
+fn parseLabelEnd(s: *State, l: Value, c1: ?u8, next: Fn) !void {
+ const c = c1 orelse return s.err(error.UnexpectedEof, "datum label");
+ if (c == '%') {
+ return s.jump(next, cons(LABEL, l));
+ }
+ if (c == '=') {
+ try s.push(next);
+ s.context.val = l;
+ return s.subr(.parse_unit, .end_label_datum);
+ }
+ return s.err(error.InvalidCharacter, "datum label");
+}
- while (!s.eof()) {
- const c = s.peek();
- if (isSstrChar(c)) {
- chars.append(c) catch @panic("OOM");
- s.skip();
- } else {
- break;
- }
+fn endLabelDatum(s: *State) !void {
+ const l = s.context.val;
+ const d = s.result;
+ if (d.eq(value.undef)) {
+ return s.err(error.InvalidCharacter, "label datum");
}
- return intern(chars.items, false);
+ return s.retval(cons(LABEL, cons(l, d)));
}
-fn startQuote(s: *State) void {
- // We're at one of: |'... |`... |,...
- s.context.val = switch (s.getc()) {
- '\'' => QUOTE,
- '`' => GRAVE,
- ',' => COMMA,
+fn parseList(s: *State, open: u8, next: Fn) !void {
+ const head = switch (open) {
+ '(' => value.nil.nil,
+ '[' => cons(SQUARE, value.nil.nil),
+ '{' => cons(BRACE, value.nil.nil),
+ else => unreachable,
+ };
+ const close: u8 = switch (open) {
+ '(' => ')',
+ '[' => ']',
+ '{' => '}',
else => unreachable,
};
- s.recurParse(.start_datum, .end_quote_datum);
+ while (try s.read()) |c| {
+ if (c == close) {
+ return s.jump(next, head);
+ }
+ switch (try checkBlank(s, c)) {
+ .yes => {},
+ .skip_unit => {
+ try listParserSetup(s, head, close, next);
+ // Parse twice in a row, ignoring the first result.
+ return s.subr(.parse_unit, .parse_unit);
+ },
+ .skip_line => try s.skipLine(),
+ .no => {
+ try listParserSetup(s, head, close, next);
+ s.unused_char = c;
+ return s.jump(.parse_list_element, null);
+ },
+ }
+ }
+ return s.err(error.UnexpectedEof, "list");
}
-fn endQuoteDatum(s: *State) void {
- endDatum(s, cons(s.context.val, s.retval));
+fn listParserSetup(s: *State, head: Value, close: u8, next: Fn) !void {
+ s.context.val = head;
+ s.context.char = close;
+ try s.push(next);
+ try s.pushContext(.continue_list);
}
-// List processing is, unsurprisingly, the most complicated, and it's made even
-// more complicated by the possibility of datum comments in strange places...
-
-// Make sure to use .start_parse instead of .start_datum to handle elements, so
-// that an arbitrary number of datum comments, separated by blanks (whitespace
-// and line comments) are handled automatically.
+fn parseListElement(s: *State) !void {
+ return parseDatum(s, s.getUnused().?);
+}
-fn startList(s: *State) void {
- const open = s.getc();
+fn continueList(s: *State) !void {
+ const close = s.context.char;
- s.consumeBlanks();
- if (s.eof()) {
- return err(s, "unexpected EOF while parsing list");
+ if (s.result.eq(value.undef)) {
+ const c = s.getUnused().?;
+ if (c == close) {
+ return endList(s);
+ }
+ if (c == '.') {
+ return s.jump(.parse_list_tail, null);
+ }
+ return s.err(error.InvalidCharacter, "list");
}
- s.context.val = value.nil.nil;
- s.context.char = open;
- if (isEndOfList(s)) {
- endList(s);
- } else {
- s.recurParse(.start_parse, .continue_list);
+ s.context.val = cons(s.result, s.context.val);
+
+ var c1 = s.unused_char orelse try s.read();
+ while (c1) |c| : (c1 = try s.read()) {
+ if (c == close) {
+ return endList(s);
+ }
+ if (c == '.') {
+ return s.jump(.parse_list_tail, null);
+ }
+ switch (try checkBlank(s, c)) {
+ .yes => {},
+ .skip_unit => {
+ try s.pushContext(.continue_list);
+ return s.subr(.parse_unit, .parse_unit);
+ },
+ .skip_line => try s.skipLine(),
+ .no => {
+ s.unused_char = c;
+ return s.jump(.parse_list_element, null);
+ },
+ }
}
+ return s.err(error.UnexpectedEof, "list");
}
-fn isEndOfList(s: *State) bool {
- return switch (s.peek()) {
- ')', ']', '}' => true,
- else => false,
- };
+fn endList(s: *State) !void {
+ return s.retval(lib.list.reverse(s.context.val));
}
-fn endList(s: *State) void {
- const open = s.context.char;
- const char = s.getc();
-
- if (open == '(' and char == ')') {
- return endDatum(s, s.context.val);
- }
- if (open == '[' and char == ']') {
- return endDatum(s, cons(SQUARE, s.context.val));
- }
- if (open == '{' and char == '}') {
- return endDatum(s, cons(BRACE, s.context.val));
+fn parseListTail(s: *State) !void {
+ const c = try s.readNoEof("list tail");
+ try s.pushContext(.end_improper_list);
+ switch (try checkBlank(s, c)) {
+ .yes => {},
+ .skip_unit => return s.subr(.parse_unit, .parse_unit),
+ .skip_line => try s.skipLine(),
+ // One blank mandatory here.
+ .no => return s.err(error.InvalidCharacter, "list tail"),
}
-
- err(s, "wrong closing bracket for list");
+ return s.jump(.parse_unit, null);
}
-fn continueList(s: *State) void {
- // Note that this accumulates list elements in reverse.
- s.context.val = value.pair.cons(s.retval, s.context.val);
-
- s.consumeBlanks();
- if (s.eof()) {
- return err(s, "unexpected EOF while parsing list");
+fn endImproperList(s: *State) !void {
+ const tail = s.result;
+ if (tail.eq(value.undef)) {
+ return s.err(error.InvalidCharacter, "list tail");
}
+ s.context.val = lib.list.reverseWithTail(s.context.val, tail);
+ return closeImproperList(s);
+}
- if (isEndOfList(s)) {
- s.context.val = lib.list.reverse(s.context.val);
- return endList(s);
+fn closeImproperList(s: *State) !void {
+ const close = s.context.char;
+ var c1 = s.getUnused() orelse try s.read();
+ while (c1) |c| : (c1 = try s.read()) {
+ switch (try checkBlank(s, c)) {
+ .yes => {},
+ .skip_unit => return s.subr(.parse_unit, .close_improper_list),
+ .skip_line => try s.skipLine(),
+ .no => {
+ if (c == close) {
+ return s.retval(s.context.val);
+ }
+ return s.err(error.InvalidCharacter, "after list tail");
+ },
+ }
}
+ return s.err(error.UnexpectedEof, "after list tail");
+}
- // Check if there's an improper-list ending.
- if (s.peek() == '.') {
- const pos = s.pos;
- s.skip();
- if (s.eof()) {
- return err(s, "unexpected EOF while parsing list");
- }
- // Scheme allows (foo .(bar)) but we don't. Mind your spaces!
- if (s.isWhitespace()) {
- s.skip();
- return s.recurParse(.start_parse, .finish_improper_list);
- }
- // Nope, reset.
- s.pos = pos;
+fn parseQuoteExpr(s: *State, c1: u8, next: Fn) !void {
+ const q = switch (c1) {
+ '\'' => QUOTE,
+ '`' => GRAVE,
+ ',' => COMMA,
+ else => unreachable,
+ };
+
+ // fast-path to avoid subr
+ const c = try s.readNoEof("quote expression");
+ if (isBareChar(c) or c == '\\') {
+ const bs, s.unused_char = try parseBareString(s, c);
+ return s.jump(next, cons(q, bs));
}
- s.recurParse(.start_parse, .continue_list);
+ s.context.val = q;
+ return s.subr(.parse_unit, .end_quote_expr);
}
-fn finishImproperList(s: *State) void {
- s.context.val = lib.list.reverseWithTail(s.context.val, s.retval);
- endImproperList(s);
+fn endQuoteExpr(s: *State) !void {
+ const q = s.context.val;
+ const d = s.result;
+ return s.retval(cons(q, d));
}
-// Handling the end of an improper list is a bit awkward, because there may be
-// datum comments *after* the final cdr, where we don't actually want to parse
-// any further data. So we keep looping here just looking for datum comments.
+// Helpers
-fn endImproperList(s: *State) void {
- s.consumeBlanks();
- if (s.eof()) {
- return err(s, "unexpected EOF at end of improper list");
- }
+fn parseHex(
+ s: *State,
+ u_type: type,
+ emsg: []const u8,
+) !struct { u_type, ?u8 } {
+ var uc: u_type = undefined;
- if (isEndOfList(s)) {
- return endList(s);
- }
+ const c1 = try s.readNoEof(emsg);
+ uc = try parseHexDigit(s, c1, emsg);
- if (s.getc() == '#') {
- if (s.eof()) {
- return err(s, "unexpected hash and EOF at end of improper list");
- }
- if (s.getc() == ';') {
- return s.recurParse(.start_datum, .end_improper_list);
+ while (try s.read()) |c| {
+ if (!std.ascii.isHex(c)) {
+ return .{ uc, c };
}
+ const shl = std.math.shlExact;
+ uc = shl(u_type, uc, 4) catch return s.err(error.OutOfRange, emsg);
+ uc |= try parseHexDigit(s, c, emsg);
}
+ return .{ uc, null };
+}
- err(s, "malformed list / extra datum at end of improper list");
+fn parseHexByte(s: *State, emsg: []const u8) !u8 {
+ const h1 = try s.readNoEof(emsg);
+ const h2 = try s.readNoEof(emsg);
+ const hi = try parseHexDigit(s, h1, emsg);
+ const lo = try parseHexDigit(s, h2, emsg);
+ return hi << 4 | lo;
}
-fn err(s: *State, msg: []const u8) noreturn {
- std.debug.print("{s}\n", .{msg});
- std.debug.print("position: {}\n", .{s.counter});
- @panic("parse error");
+fn parseHexDigit(s: *State, c: u8, emsg: []const u8) !u8 {
+ return switch (c) {
+ '0'...'9' => c - '0',
+ 'A'...'F' => c - 'A' + 10,
+ 'a'...'f' => c - 'a' + 10,
+ else => s.err(error.InvalidCharacter, emsg),
+ };
}
diff --git a/src/libzisp/value.zig b/src/libzisp/value.zig
index 2561ca7..fbe907a 100644
--- a/src/libzisp/value.zig
+++ b/src/libzisp/value.zig
@@ -285,6 +285,10 @@ pub const Value = packed union {
std.debug.dumpHex(std.mem.asBytes(&v));
}
+ pub fn eq(v1: Value, v2: Value) bool {
+ return v1.bits == v2.bits;
+ }
+
// The following aren't type predicates per se, but rather determine which
// general category the value is in. The exceptions are fixnum and double,
// since those aren't sub-categorized into further types.
diff --git a/src/main.zig b/src/main.zig
index c9a5404..24c7959 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -1,11 +1,6 @@
-//! By convention, main.zig is where your main function lives in the case that
-//! you are building an executable. If you are making a library, the convention
-//! is to delete this file and start with root.zig instead.
-
const std = @import("std");
-/// This imports the separate module containing `root.zig`. Take a look in `build.zig` for details.
-const zisp = @import("zisp_lib");
+const zisp = @import("libzisp");
pub fn main() !void {
const T = extern union {
@@ -33,6 +28,19 @@ pub fn main() !void {
x.*.double = f1.* / f2.*;
std.debug.print("-0/0: {x}\n", .{x.*.bits});
+ for (0..1000000) |i| {
+ _ = i;
+ const p = zisp.value.pair.cons(
+ zisp.value.fixnum.pack(0),
+ zisp.value.fixnum.pack(1),
+ );
+ std.mem.doNotOptimizeAway(p);
+ // std.debug.print("fx: {}\n", .{zisp.value.pair.car(p)});
+ }
+
+ const istr = zisp.value.istr.intern("foo", false);
+ std.debug.print("istr: {s}\n", .{zisp.value.istr.getHeader(istr).bytes()});
+
// // Prints to stderr (it's a shortcut based on `std.io.getStdErr()`)
// std.debug.print("All your {s} are belong to us.\n", .{"codebase"});