diff options
| author | Taylan Kammer <taylan.kammer@gmail.com> | 2025-03-27 21:18:09 +0100 |
|---|---|---|
| committer | Taylan Kammer <taylan.kammer@gmail.com> | 2025-03-27 21:18:09 +0100 |
| commit | f2b18d64448ab09dd5e5e6a180d38d90d5aaf367 (patch) | |
| tree | d83510fc892bf31feed56688af3ec350012cdeb9 | |
| parent | 91629014bfe50e1d45cffedd618ab28a063f7689 (diff) | |
new parser
| -rw-r--r-- | _tests/antlr.bnf | 99 | ||||
| -rw-r--r-- | _tests/string | 1 | ||||
| -rw-r--r-- | _tests/switchtable.zig | 93 | ||||
| -rw-r--r-- | _tests/test.c (renamed from test.c) | 0 | ||||
| -rw-r--r-- | _tests/test.scm (renamed from test.scm) | 0 | ||||
| -rw-r--r-- | _tests/test.zig | 84 | ||||
| -rw-r--r-- | spec/parser.ebnf | 75 | ||||
| -rw-r--r-- | spec/syntax.md | 34 | ||||
| -rw-r--r-- | src/libzisp.zig | 188 | ||||
| -rw-r--r-- | src/libzisp/io/parser.zig | 1181 | ||||
| -rw-r--r-- | src/libzisp/value.zig | 4 | ||||
| -rw-r--r-- | src/main.zig | 20 |
12 files changed, 1195 insertions, 584 deletions
diff --git a/_tests/antlr.bnf b/_tests/antlr.bnf new file mode 100644 index 0000000..7b0cf83 --- /dev/null +++ b/_tests/antlr.bnf @@ -0,0 +1,99 @@ +grammar ExprLexer; + +LF : '\n' ; +SP : ' ' ; +WS_CC : [\t\r\f] ; +SEMI : ';' ; +DOT : '.' ; +COLON : ':' ; +PIPE : '|' ; +TILDE : '~' ; +BSLASH : '\\' ; +DQUOTE : '"' ; +HASH : '#' ; +LPAREN : '(' ; +RPAREN : ')' ; +LSQBR : '[' ; +RSQBR : ']' ; +LCURLY : '{' ; +RCURLY : '}' ; +EQUAL : '=' ; +APOS : '\'' ; +GRAVE : '`' ; +COMMA : ',' ; + + : [\u0000-\u0009\u000b-\u00ff] ; + +BARE_CHAR : [a-zA-Z0-9!$%&*+/<=>?@^_~-] | '\\' BARE_ESC ; + +BARE_ESC : [\u0021-\u007e] ; + +QUOTED_CHAR : [^"\\] | '\\' QUOTED_ESC ; + +QUOTED_ESC : [\\"abefnrtv] + | 'x' HEX_DIGIT HEX_DIGIT + | 'u' '{' HEX_DIGIT+ '}' + ; + + +HEX_DIGIT : [0-9a-fA-F] ; +ALPHA : [a-zA-Z] ; +ALNUM : [a-zA-Z0-9] ; + + + +parse_unit : blank* datum blank? ; + + +blank : LF | SP | WS_CC | comment ; + +datum : datum_one ( join_char? datum )? ; + + +comment : ';' ( skip_datum | skip_line ) ; + + +datum_one : bare_string | fancy_datum ; + +join_char : '.' | ':' | '|' ; + + +skip_datum : '~' parse_unit ; + +skip_line : ANY_BUT_LF* LF? ; + + +bare_string : BARE_CHAR+ ; + +fancy_datum : '\\' bare_esc_str + | '"' quoted_str '"' + | '#' hash_expr + | '(' list_body ')' + | '[' list_body ']' + | '{' list_body '}' + | quote_expr + ; + + +bare_esc_str : '\\' BARE_ESC BARE_CHAR* ; + +quoted_str : QUOTED_CHAR* ; + +hash_expr : rune fancy_datum? + | label ( '=' fancy_datum )? + | fancy_datum + ; + +list_body : blank* ( list_head list_tail? )? ; + +quote_expr : ( '\'' | '`' | ',' ) datum ; + + +rune : ALPHA ALNUM* ; + +label : '%' HEX_DIGIT+ ; + + +list_head : datum blank* list_head? ; + +list_tail : blank+ '.' blank+ datum blank* ; diff --git a/_tests/string b/_tests/string new file mode 100644 index 0000000..c365d58 --- /dev/null +++ b/_tests/string @@ -0,0 +1 @@ +\0\a\b\t\n\v\f\r\e\e\r\f\v\n\t\b\a\0
\ No newline at end of file diff --git a/_tests/switchtable.zig b/_tests/switchtable.zig new file mode 100644 index 0000000..722ecdd --- /dev/null +++ b/_tests/switchtable.zig @@ -0,0 +1,93 @@ +const std = @import("std"); + +const Reader = std.io.AnyReader; + +pub fn main() !u8 { + return f(); +} + +fn f() !u8 { + const file = try std.fs.cwd().openFile("string", .{}); + defer file.close(); + + var br = std.io.bufferedReader(file.reader()); + const r = br.reader().any(); + + var n: u8 = 0; + for (0..1_000_000) |i| { + _ = i; + while (r.readByte() catch null) |c| { + n +%= try f1(r, c); + } + br.start = 0; + br.end = 0; + try file.seekTo(0); + } + return n; +} + +fn f1(r: Reader, c1: u8) !u8 { + if (c1 != '\\') return c1; + const c = try r.readByte(); + if (c == 'u') return unknown1(); + return switch (c) { + '\\', '"' => c, + '0' => 0, + 'a' => 7, + 'b' => 8, + 't' => 9, + 'n' => 10, + 'v' => 11, + 'f' => 12, + 'r' => 13, + 'e' => 27, + 'x' => unknown2(), + else => unknown3(), + }; +} + +fn f2(r: Reader, c1: u8) !u8 { + if (c1 != '\\') return c1; + const c = try r.readByte(); + if (c == 'u') return unknown1(); + if (c == 'x') return unknown2(); + if (c == '\\' or c == '"') return c; + const itable = .{ '0', 'a', 'b', 't', 'n', 'v', 'f', 'r', 'e' }; + const ctable: []const u8 = &.{ 0, 7, 8, 9, 10, 11, 12, 13, 27 }; + const i = std.mem.indexOfScalar(u8, &itable, c) orelse return unknown3(); + return ctable[i]; +} + +fn f3(r: Reader, c1: u8) !u8 { + if (c1 != '\\') return c1; + const c = try r.readByte(); + if (c == 'u') return unknown1(); + if (c == 'x') return unknown2(); + if (c == '\\' or c == '"') return c; + if (c == '0') return 0; + const table = comptime t: { + var table: [26]u8 = .{0} ** 26; + table['a' - 'a'] = 7; + table['b' - 'a'] = 8; + table['t' - 'a'] = 9; + table['n' - 'a'] = 10; + table['v' - 'a'] = 11; + table['f' - 'a'] = 12; + table['r' - 'a'] = 13; + table['e' - 'a'] = 27; + break :t table; + }; + if (c < 'a') return unknown3(); + const result = table[c - 'a']; + return if (result != 0) result else unknown3(); +} + +fn unknown1() u8 { + return 0; +} +fn unknown2() u8 { + return 0; +} +fn unknown3() u8 { + return 0; +} diff --git a/test.scm b/_tests/test.scm index d893c9f..d893c9f 100644 --- a/test.scm +++ b/_tests/test.scm diff --git a/_tests/test.zig b/_tests/test.zig new file mode 100644 index 0000000..7b4a04c --- /dev/null +++ b/_tests/test.zig @@ -0,0 +1,84 @@ +const std = @import("std"); + +pub fn main() void { + // const y: [3]u64 = .{ 1, 2, 3 }; + // const x: struct { u8, u64, u8 } = y; + // @import("std").debug.print("{}\n", .{x[0] + x[1] + x[2]}); + + std.debug.print("{}\n", .{@sizeOf(struct { a: u8, b: u64, c: u8, d: bool })}); +} + +// const x: ?u8 = 5; +// if (x == null) { +// return 1; +// } else |val| { +// return val; +// } +// var list = std.ArrayList(u8).init(std.heap.smp_allocator); +// try parseUniHex("1f4a9", &list); +// std.debug.print("{s}\n", .{list.items}); + +// fn parseUniHex( +// str: []const u8, +// s: *std.ArrayList(u8), +// ) !void { +// var uc: u21 = parseHexDigit1(str[0]); +// for (str[1..]) |c| { +// uc = try std.math.shlExact(u21, uc, 4); +// uc |= parseHexDigit1(c); +// } + +// std.debug.print("{u}\n", .{uc}); + +// const n = try std.unicode.utf8CodepointSequenceLength(uc); +// const buf = try s.addManyAsSlice(n); +// _ = try std.unicode.utf8Encode(uc, buf); +// } + +// fn parseHexByte1(h1: u8, h2: u8) u8 { +// const hi = parseHexDigit1(h1); +// const lo = parseHexDigit1(h2); +// return hi << 4 | lo; +// } + +// fn parseHexDigit1(c: u8) u8 { +// return switch (c) { +// '0'...'9' => c - '0', +// 'A'...'F' => c - 'A' + 10, +// 'a'...'f' => c - 'a' + 10, +// else => @panic(""), +// }; +// } + +// fn parseHexByte2(h1: u8, h2: u8) u8 { +// const hi: u8 = parseHexDigit2(h1); +// const lo = parseHexDigit2(h2); +// return hi << 4 | lo; +// } + +// fn parseHexDigit2(c: u8) u4 { +// return @intCast(switch (c) { +// '0'...'9' => c - '0', +// 'A'...'F' => c - 'A' + 10, +// 'a'...'f' => c - 'a' + 10, +// else => @panic(""), +// }); +// } + +// fn parseUniHex1(str: []const u8) !u21 { +// var uc: u21 = parseHexDigit1(str[0]); +// for (str[1..]) |c| { +// uc = try std.math.shlExact(u21, uc, 4); +// uc |= parseHexDigit1(c); +// } +// return uc; +// } + +// fn parseUniHex2(str: []const u8) !u21 { +// var uc: u21 = parseHexDigit2(str[0]); +// for (str[1..]) |c| { +// uc = try std.math.shlExact(u21, uc, 4); +// uc |= parseHexDigit2(c); +// } +// return uc; +// } diff --git a/spec/parser.ebnf b/spec/parser.ebnf new file mode 100644 index 0000000..9e02fba --- /dev/null +++ b/spec/parser.ebnf @@ -0,0 +1,75 @@ +unit : blank* ( datum blank? | EOF ) ; + + +blank : 9...13 | comment ; + +datum : one_datum ( join_char? one_datum )* ; + +join_char : '.' | ':' | '|' ; + + +comment : ';' ( skip_unit | skip_line ) ; + +skip_unit : '~' unit ; + +skip_line : ( ~LF )* LF? ; + + +one_datum : ( bare_str | clad_datum ) ; + +bare_str : bare_str_elt+ ; + +clad_datum : '\' bare_esc_str + | '"' quoted_str '"' + | '#' hash_expr + | '(' blank* list? ')' + | '[' blank* list? ']' + | '{' blank* list? '}' + | quote_expr + ; + + +bare_str_elt : bare_char | '\' bare_esc ; + + +bare_esc_str : bare_esc bare_str_elt* ; + +quoted_str : ( quoted_char | '\' quoted_esc )* ; + +hash_expr : rune clad_datum? + | '%' label ( '%' | '=' unit ) + | clad_datum + ; + +list : unit+ ( '.' blank+ unit )? blank* ; + +quote_expr : ( "'" | "`" | "," ) datum ; + + +bare_char : letter | digit + | '!' | '$' | '%' | '&' | '*' | '+' | '-' | '/' + | '<' | '=' | '>' | '?' | '@' | '^' | '_' | '~' + ; + +bare_esc : 33...126 ; + + +quoted_char : ~( '"' | '\' ) ; + +quoted_esc : '\' | '"' | 'a' | 'b' | 'e' + | 'f' | 'n' | 'r' | 't' | 'v' + | 'x' hex_digit{2} + | 'u' '{' hex_digit+ '}' + ; + + +rune : letter ( letter | digit ){0,5} ; + +label : hex_digit{1,12} ; + + +letter : 'a'...'z' | 'A'...'Z' ; + +digit : '0'...'9' ; + +hex_digit : '0'...'9' | 'a'...'f' | 'A'...'F' ; diff --git a/spec/syntax.md b/spec/syntax.md new file mode 100644 index 0000000..b85ed78 --- /dev/null +++ b/spec/syntax.md @@ -0,0 +1,34 @@ +# Zisp S-Expression Syntax + +We use a BNF notation with the following rules: + +* Concatenation of expressions is implicit: `foo bar` means `foo` + followed by `bar`. + +* Expressions may be followed by `?`, `*`, `+`, `{N}`, or `{N,M}`, + which have the meanings they have in regular expressions. + +* The syntax is defined in terms of bytes, not characters. Terminals + `'c'` and `"c"` refer to the ASCII value of the given character `c`. + Numbers are in decimal and refer to a byte with the given value. + +* The `~` prefix means NOT. It only applies to rules that match one + byte, and negates them. For example, `~( 'a' | 'b' )` matches any + byte other than 97 and 98. + +* Ranges of terminal values are expressed as `x...y` (inclusive). + +* There is no ambiguity, backtracking, or look-ahead beyond the byte + currently being matched. Rules match left to right, depth-first, + and greedy. As soon as the input matches the first terminal of a + rule, it must match that rule to the end. + +The last rule means that the BNF is very simple to translate to code. + +The parser consumes one `unit` from an input stream every time it's +called; it returns the `datum` therein, or EOF. + +``` + + +``` diff --git a/src/libzisp.zig b/src/libzisp.zig index 90508be..e6c8ac5 100644 --- a/src/libzisp.zig +++ b/src/libzisp.zig @@ -13,6 +13,20 @@ pub const Hval = gc.Hval; pub const ShortString = value.ShortString; pub const Value = value.Value; +fn benchmark(name: []const u8, iters: usize, func: fn () anyerror!void) !void { + var timer = try std.time.Timer.start(); + for (0..iters) |i| { + _ = i; + try func(); + } + const ns: f64 = @floatFromInt(timer.lap()); + const secs = ns / 1_000_000_000; + std.debug.print( + "bench {s} x {}: {d:.3}s\n", + .{ name, iters, secs }, + ); +} + test "double" { const d1: f64 = 0.123456789; const d2: f64 = -0.987654321; @@ -128,7 +142,7 @@ test "sstr" { .ReleaseSafe => 100_000_000, .ReleaseFast => 1_000_000_000, }; - std.debug.print("Benchmarking with {} iters.\n", .{iters}); + std.debug.print("Benchmarking sstr with {} iters.\n", .{iters}); inline for (impls, 0..) |impl, i| { try benchmarkSstr(impl, i, iters); } @@ -288,7 +302,7 @@ test "parse" { test "parse2" { const val = parseString( \\ ;; Testing some crazy datum comments - \\ #;"bar"#;([x #"y"]{##`,'z}) #"foo" + \\ ;~"bar"([x #"y"]{##`,'z}) #"foo" \\ ;; end ); @@ -302,90 +316,86 @@ test "parse2" { try std.testing.expectEqualStrings("foo", f.slice()); } -test "parse3" { - const val = parseString( - \\(foo #;x #;(x y) #;x #bar [#x #"baz"] 'bat) - ); - - const car = value.pair.car; - const cdr = value.pair.cdr; - - const e1 = car(val); - const e2 = car(cdr(val)); - const e3 = car(cdr(cdr(val))); - const e4 = car(cdr(cdr(cdr(val)))); - - try std.testing.expect(value.sstr.check(e1)); - try std.testing.expect(value.rune.check(e2)); - try std.testing.expect(value.pair.check(e3)); - try std.testing.expect(value.pair.check(e4)); -} - -test "parse4" { - const val = parseString("(foo . #;x bar #;y)"); - - const s = value.sstr.unpack(value.pair.car(val)); - try std.testing.expectEqualStrings("foo", s.slice()); - - const f = value.sstr.unpack(value.pair.cdr(val)); - try std.testing.expectEqualStrings("bar", f.slice()); -} - -fn parseBench(path: []const u8) !void { - const iters = 1000; - - var buf: [8196]u8 = undefined; - const file = try std.fs.cwd().openFile(path, .{}); - defer file.close(); - const count = try file.readAll(&buf); - - var fbs = std.io.fixedBufferStream(buf[0..count]); - const reader = fbs.reader().any(); - - var timer = try std.time.Timer.start(); - for (0..iters) |i| { - _ = i; - var v: Value = undefined; - while (true) { - v = io.parser.parse(reader); - if (value.eof.check(v)) { - break; - } - } - try fbs.seekTo(0); - } - const ns: f64 = @floatFromInt(timer.lap()); - const secs = ns / 1_000_000_000; - std.debug.print( - "parse {s} x {}: {d:.3}s\n", - .{ path, iters, secs }, - ); -} - -test "parse bench" { - try parseBench("test-data/parser-test-1.scm"); - try parseBench("test-data/parser-test-2.scm"); -} - -test "unparse" { - var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; - var out: std.ArrayList(u8) = .init(gpa.allocator()); - - const w = out.writer(); - const v = parseString("#foo"); - try io.unparser.unparse(w, v); - try std.testing.expectEqualStrings("#foo", try out.toOwnedSlice()); -} - -test "unparse2" { - var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; - var out: std.ArrayList(u8) = .init(gpa.allocator()); - - const w = out.writer(); - const v = parseString("#{foo bar['x]}"); - try io.unparser.unparse(w, v); - try std.testing.expectEqualStrings( - "(#HASH #BRACE foo (#JOIN bar #SQUARE (#QUOTE . x)))", - try out.toOwnedSlice(), - ); -} +// test "parse3" { +// const val = parseString( +// \\(foo ;~x ;~(x y) ;~x #bar [#x #"baz"] 'bat) +// ); + +// const car = value.pair.car; +// const cdr = value.pair.cdr; + +// const e1 = car(val); +// const e2 = car(cdr(val)); +// const e3 = car(cdr(cdr(val))); +// const e4 = car(cdr(cdr(cdr(val)))); + +// try std.testing.expect(value.sstr.check(e1)); +// try std.testing.expect(value.rune.check(e2)); +// try std.testing.expect(value.pair.check(e3)); +// try std.testing.expect(value.pair.check(e4)); +// } + +// test "parse4" { +// const val = parseString("(foo . ;~x bar ;~y)"); + +// const s = value.sstr.unpack(value.pair.car(val)); +// try std.testing.expectEqualStrings("foo", s.slice()); + +// const f = value.sstr.unpack(value.pair.cdr(val)); +// try std.testing.expectEqualStrings("bar", f.slice()); +// } + +// fn parseBench(path: []const u8, iters: usize) !void { +// const file = try std.fs.cwd().openFile(path, .{}); +// defer file.close(); + +// var timer = try std.time.Timer.start(); +// for (0..iters) |i| { +// _ = i; +// var br = std.io.bufferedReader(file.reader()); +// const reader = br.reader().any(); +// var v: Value = undefined; +// while (true) { +// v = io.parser.parse(reader); +// if (value.eof.check(v)) { +// break; +// } +// } +// try file.seekTo(0); +// } +// const ns: f64 = @floatFromInt(timer.lap()); +// const secs = ns / 1_000_000_000; +// std.debug.print( +// "parse {s} x {}: {d:.3}s\n", +// .{ path, iters, secs }, +// ); +// } + +// test "parse bench" { +// // try parseBench("test-data/parser-test-1.scm", 200); +// // try parseBench("test-data/parser-test-2.scm", 800); +// try parseBench("test-data/parser-torture.scm", 1); +// } + +// test "unparse" { +// var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; +// var out: std.ArrayList(u8) = .init(gpa.allocator()); + +// const w = out.writer(); +// const v = parseString("#foo"); +// try io.unparser.unparse(w, v); +// try std.testing.expectEqualStrings("#foo", try out.toOwnedSlice()); +// } + +// test "unparse2" { +// var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; +// var out: std.ArrayList(u8) = .init(gpa.allocator()); + +// const w = out.writer(); +// const v = parseString("#{foo bar['x]}"); +// try io.unparser.unparse(w, v); +// try std.testing.expectEqualStrings( +// "(#HASH #BRACE foo (#JOIN bar #SQUARE (#QUOTE . x)))", +// try out.toOwnedSlice(), +// ); +// } diff --git a/src/libzisp/io/parser.zig b/src/libzisp/io/parser.zig index d02eda3..651d124 100644 --- a/src/libzisp/io/parser.zig +++ b/src/libzisp/io/parser.zig @@ -3,23 +3,23 @@ // // Zisp s-expressions are defined in terms of an extremely minimal set of data // types; only that which is necessary to build representations of more complex -// expressions and types: +// expressions and data types: // -// type format/examples comment -// ---- --------------- ------- +// +---------+-------------+---------------+--------+---------+--------+ +// | TYPE | Bare String | Quoted String | Rune | Pair | Nil | +// +---------+-------------+---------------+--------+---------+--------+ +// | EXAMPLE | foobar | "foo bar" | #name | (X . Y) | () | +// +---------+-------------+---------------+--------+---------+--------+ // -// string foo , "foo bar" quoted strings are flagged as such +// Bare strings and quoted strings are polymorphic sub-types of the generic +// string type. This lets the compiler distinguish between identifiers and +// string literals. // -// rune #name name is: [a-zA-Z][a-zA-Z0-9]{0,5} +// There are also non-negative integers, but they are only used for datum +// labels; regular number literals are handled by the "decoder" (see below). // -// pair (DATUM . DATUM) the only composite data type supported -// -// nil () we prefer the term nil over null -// -// The parser recognizes various "syntax sugar" and transforms uses of it into -// uses of the above types. -// -// The most ubiquitous example is of course the list syntax: +// The parser recognizes various "syntax sugar" and transforms it into uses of +// the above data types. The most ubiquitous example is of course the list: // // (datum1 datum2 ...) -> (datum1 . (datum2 . (... . ()))) // @@ -35,18 +35,39 @@ // // `datum -> (#GRAVE . datum) dat1|dat2 -> (#PIPE dat1 . dat2) // -// ,datum -> (#COMMA . datum) #n#=datum -> (#LABEL n . datum) +// ,datum -> (#COMMA . datum) #%n=datum -> (#LABEL hex . datum) +// +// #%n -> (#LABEL . hex) // -// #n# -> (#LABEL . n) +// A separate process called "decoding" can transform these objects into other +// data types. For example, (#HASH x y z) could become a vector, so that the +// expression #(x y z) works just like in Scheme. // -// Notes: +// Decoding also resolves datum labels, and goes over bare strings to find ones +// that are actually a number literal. This lets us offload the complexity of +// number parsing elsewhere, so the parser remains extremely simple. +// +// Further notes about the syntax sugar table and examples above: // // * The terms datum, dat1, and dat2 each refer to an arbitrary datum; ellipsis -// means zero or more data; n is a non-negative integer. +// means zero or more data; hex is a hexadecimal number of up to 12 digits. // // * The #datum form only applies to expressions that cannot be mistaken for a -// rune, such as for example: #(...) or #"..." or #'string etc.; following a -// hash sign with a plain string will parse as a rune instead. +// rune, such as #(...) or #"..." or #'string etc.; a hash sign followed by a +// bare string will parse as a rune instead (or raise an error if it contains +// characters that are illegal in rune names). +// +// * A backslash causes the immediately following character to lose any special +// meaning it would normally have, and be considered as part of a bare string +// instead. (This does not apply to whitespace.) For example, the following +// three character sequences are each a valid bare string: +// +// foo\(bar\) \]blah \#\'xyz +// +// This can be used to follow a hash with a bare string without it being +// parsed as a rune: +// +// #\foo -> (#HASH . foo) // // * Though not represented in the table due to notational difficulty, the // format "#rune(...)" doesn't require a list in the second position; any @@ -66,47 +87,39 @@ // The parser will see this as an attempt to use an 8-letter rune name, and // raise an error, since rune names are limited to 6 characters. // -// Syntax sugar can combine arbitrarily; some examples follow: -// -// #{...} -> (#HASH #BRACE ...) -// -// #'foo -> (#HASH #QUOTE . foo) +// * Syntax sugar can combine arbitrarily; some examples follow: // -// ##'[...] -> (#HASH #HASH #QUOTE #SQUARE ...) +// #{...} -> (#HASH #BRACE ...) // -// {x y}[i j] -> (#JOIN (#BRACE x y) #SQUARE i j) +// #'foo -> (#HASH #QUOTE . foo) // -// foo.bar.baz{x y} -> (#JOIN (#DOT (#DOT foo . bar) . baz) #BRACE x y) +// ##'[...] -> (#HASH #HASH #QUOTE #SQUARE ...) // -// Runes are case-sensitive, and the parser only emits runes using upper-case -// letters when expressing syntax sugar, so there can be no accidental clash -// with runes that appear verbatim in code. +// {x y}[i j] -> (#JOIN (#BRACE x y) #SQUARE i j) // -// Although strings and symbols aren't disjoint types in Zisp, the parser flags -// double-quoted string literals to allow distinguishing them from bare strings. -// Otherwise, it would not be possible for the compiler to tell the difference -// between an identifier and a string literal. +// foo.bar.baz{x y} -> (#JOIN (#DOT (#DOT foo . bar) . baz) #BRACE x y) // -// You may be wondering about numbers. As far as the parser is concerned, -// numbers are strings. It's the decoder (see below) that will turn bare -// strings into numbers where appropriate. +// * While 'foo parses as (quote foo) in Scheme, it parses as (#QUOTE . foo) in +// Zisp; the operand of #QUOTE is the entire cdr. The same principle is used +// when parsing other sugar; some examples follow: // -// Note that 'foo becomes (quote foo) in Scheme, but (#QUOTE . foo) in Zisp. -// The operand of #QUOTE is the entire cdr. The same principle is used when -// parsing other sugar; some examples follow: +// Incorrect Correct // -// Incorrect Correct +// #(x y z) -> (#HASH (x y z)) #(x y z) -> (#HASH x y z) // -// #(x y z) -> (#HASH (x y z)) #(x y z) -> (#HASH x y z) +// [x y z] -> (#SQUARE (x y z)) [x y z] -> (#SQUARE x y z) // -// [x y z] -> (#SQUARE (x y z)) [x y z] -> (#SQUARE x y z) +// #{x} -> (#HASH (#BRACE (x))) #{x} -> (#HASH #BRACE x) // -// #{x} -> (#HASH (#BRACE (x))) #{x} -> (#HASH #BRACE x) +// foo(x y) -> (#JOIN foo (x y)) foo(x y) -> (#JOIN foo x y) // -// foo(x y) -> (#JOIN foo (x y)) foo(x y) -> (#JOIN foo x y) +// * Runes are case-sensitive, and the parser only emits runes using upper-case +// letters when expressing syntax sugar. This way, there can be no accidental +// clash with runes that appear verbatim in code, as long as only lower-case +// letters are used for rune literals in code. // // -// === Decoder === +// === Decoding === // // A separate process called "decoding" can transform simple data structures, // consisting of only the above types, into a richer set of Zisp data types. @@ -118,8 +131,8 @@ // whose head they appear in. This can implement #true and #false. // // The decoder may also perform arbitrary transforms on any type; for example, -// it may turn bare strings (those not flagged as double-quoted) into numbers -// when appropriate. This can implement number literals. +// it may turn bare strings into numbers wher appropriate. This can implement +// number literals. // // The decoder recognizes (#QUOTE ...) to implement the traditional quoting // mechanism, but with a significant difference: @@ -162,11 +175,11 @@ // call one of the starting functions. // // If a function wants to make the parser return, either from a recursive parse -// or from the main loop, it sets the .retval field, and tries to pop the saved +// or from the main loop, it sets the .result field, and tries to pop the saved // context. If the context stack was empty, the main loop returns. // // While it's possible to just set .next and return, to make the main loop call -// another function next (possibly even setting .retval to pass a value to it), +// another function next (possibly even setting .result to pass a value to it), // this is completely unnecessary. A few non-recursive calls won't blow the // stack. It's only recursive parsing that we use the above strategy for. // @@ -201,7 +214,7 @@ // would still waste memory and the time it takes to allocate memory. // // -// === Buffering strategy === +// === Buffering === // // We use a small circular buffer just so we can backtrack. We read bytes into // this buffer one by one, because we don't want to consume more bytes from the @@ -235,11 +248,20 @@ const std = @import("std"); const lib = @import("../lib.zig"); const value = @import("../value.zig"); -const ShortString = value.ShortString; +const List = std.ArrayListUnmanaged; + const Value = value.Value; const cons = value.pair.cons; -const intern = value.istr.intern; + +const is_test = builtin.is_test; +const is_debug = builtin.mode == .Debug; + +const detailed_debug = true; + +// In debug, we want to see if we leak, so very small numbers. +const init_stack_capacity = if (is_debug) 20 else 32; +const init_chars_capacity = if (is_debug) 100 else 512; // zig fmt: off const DOT = value.rune.pack("DOT"); @@ -257,166 +279,164 @@ const BRACE = value.rune.pack("BRACE"); const Context = struct { // What to do next. - next: Fn = .start_parse, - // For storing some context value, like accumulated list elements. + next: Fn = .parse_unit, + // For storing a context value, like accumulated list elements. val: Value = undefined, - // For storing some context char, like opening bracket. + // For storing a context char, like list opening bracket. char: u8 = undefined, }; -// Make sure these are in sync, as we will use +% to increment the position. -// Size 16 is enough because the largest amount by which we backtrack is short -// strings / runes, which are limited to 6 bytes. -const BUF_SIZE = 16; -const POS_TYPE = u4; - -const is_test = builtin.is_test; -const is_debug = builtin.mode == .Debug; +const ParseError = error{ + InvalidCharacter, + UnclosedString, + UnexpectedEof, + UnicodeError, + RuneTooLong, + OutOfMemory, + OutOfRange, + ReadError, +}; const State = struct { input: std.io.AnyReader, - alloc: std.mem.Allocator, + stack_alloc: std.mem.Allocator, + chars_alloc: std.mem.Allocator, - counter: usize = 0, + context: Context = .{}, + stack: List(Context) = undefined, - buf: [BUF_SIZE]u8 = undefined, - pos: POS_TYPE = 0, - write_pos: POS_TYPE = 0, + chars: List(u8) = undefined, - context: Context = .{}, - stack: std.ArrayListUnmanaged(Context) = .{}, - retval: Value = undefined, + // TODO: Implement bool flag for when skipping a unit - // For debugging. - checked_eof: bool = false, + result: Value = undefined, + unused_char: ?u8 = null, - fn init(input: std.io.AnyReader, alloc: std.mem.Allocator) State { - return .{ .input = input, .alloc = alloc }; + err_code: anyerror = undefined, + err_msg: []const u8 = undefined, + + fn init( + input: std.io.AnyReader, + stack_alloc: std.mem.Allocator, + chars_alloc: std.mem.Allocator, + ) !State { + var s: State = .{ + .input = input, + .stack_alloc = stack_alloc, + .chars_alloc = chars_alloc, + }; + s.stack = try .initCapacity(s.stack_alloc, init_stack_capacity); + s.chars = try .initCapacity(s.chars_alloc, init_chars_capacity); + return s; } fn deinit(s: *State) void { - s.stack.deinit(s.alloc); + s.stack.deinit(s.stack_alloc); + s.chars.deinit(s.chars_alloc); } - fn recurParse(s: *State, start: Fn, end: Fn) void { - s.stack.append(s.alloc, .{ - .next = end, - .val = s.context.val, - .char = s.context.char, - }) catch @panic("OOM"); - s.context.next = start; + fn err(s: *State, e: ParseError, msg: []const u8) ParseError { + s.err_msg = msg; + return e; } - fn returnDatum(s: *State, val: Value) void { - s.retval = val; - if (s.stack.pop()) |c| { - s.context = c; - } else { - s.context.next = .done; + fn read(s: *State) !?u8 { + if (is_debug and s.unused_char != null) { + @panic("Called read() while there was an unused character!"); } + const c = s.input.readByte() catch |e| switch (e) { + error.EndOfStream => return null, + else => { + s.err_code = e; + return error.ReadError; + }, + }; + if (detailed_debug) { + std.debug.print("{c}", .{c}); + } + return c; } - fn readNext(s: *State) !void { - if (s.pos != s.write_pos) { - return; - } - s.buf[s.pos] = try s.input.readByte(); - s.write_pos +%= 1; + fn readNoEof(s: *State, emsg: []const u8) !u8 { + return if (try s.read()) |c| c else s.err(error.UnexpectedEof, emsg); } - fn eof(s: *State) bool { - if (is_debug) { - s.checked_eof = true; + fn getUnused(s: *State) ?u8 { + if (s.unused_char) |c| { + s.unused_char = null; + return c; } - readNext(s) catch |e| switch (e) { - error.EndOfStream => return true, - else => @panic("read error"), - }; - return false; + return null; } - fn peek(s: *State) u8 { - if (is_debug) { - if (!s.checked_eof) { - @panic("Didn't check EOF before calling peek()!"); - } - } - return s.buf[s.pos]; + fn skipLine(s: *State) !void { + while (try s.read()) |c| if (c == '\n') break; } - fn skip(s: *State) void { - if (is_debug) { - s.checked_eof = false; - } - // std.debug.print("{c}\n", .{s.buf[s.pos]}); - s.pos +%= 1; - s.counter += 1; + fn addChar(s: *State, c: u8) !void { + try s.chars.append(s.chars_alloc, c); } - fn getc(s: *State) u8 { - const c = s.peek(); - s.skip(); - return c; + fn getBareString(s: *State) Value { + defer s.chars.clearRetainingCapacity(); + return if (s.chars.items.len <= 6) + value.sstr.pack(s.chars.items) + else + value.istr.intern(s.chars.items, false); } - // Consumes whitespace and line comments. - fn consumeBlanks(s: *State) void { - while (!s.eof()) { - switch (s.peek()) { - // Allow Form Feed (^L) commonly used by Emacs users. - '\t', '\n', ' ', 0x0C => s.skip(), - ';' => s.consumeLineComment(), - else => return, - } - } + fn getQuotedString(s: *State) Value { + defer s.chars.clearRetainingCapacity(); + return if (s.chars.items.len <= 6) + value.sstr.packQuoted(s.chars.items) + else + value.istr.intern(s.chars.items, true); } - fn consumeLineComment(s: *State) void { - while (!s.eof()) { - if (s.getc() == '\n') { - return; - } - } + fn getRune(s: *State) !Value { + defer s.chars.clearRetainingCapacity(); + return if (s.chars.items.len <= 6) + value.rune.pack(s.chars.items) + else + error.RuneTooLong; } - fn isWhitespace(s: *State) bool { - return switch (s.peek()) { - '\t', '\n', ' ' => true, - else => false, - }; + fn push(s: *State, next: Fn) !void { + try s.stack.append(s.stack_alloc, .{ .next = next }); } -}; -const CharPred = fn (u8) bool; -const ShortStringPack = fn ([]const u8) Value; + fn pushContext(s: *State, next: Fn) !void { + try s.stack.append(s.stack_alloc, .{ + .next = next, + .val = s.context.val, + .char = s.context.char, + }); + } -// Helper function to read runes and short strings. -fn readShortString( - s: *State, - pred: CharPred, - pack: ShortStringPack, -) ?Value { - var str = ShortString{}; - while (!s.eof() and pred(s.peek())) { - str.append(s.getc()) catch return null; + fn subr(s: *State, start: Fn, next: Fn) !void { + try s.pushContext(next); + s.context.next = start; } - return pack(str.constSlice()); -} -// Probably best *not* to use function pointers here, but rather dispatch to -// functions manually based on enum value. This should help the optimizer. + fn jump(s: *State, next: Fn, val: ?Value) void { + if (val) |v| s.result = v; + s.context.next = next; + } -const Fn = enum { - start_parse, - start_datum, - end_join_datum, - end_label_datum, - end_hash_datum, - end_quote_datum, - continue_list, - finish_improper_list, - end_improper_list, - done, + fn abort(s: *State, next: Fn, unused_c: u8) void { + s.result = value.undef; + s.unused_char = unused_c; + s.context.next = next; + } + + fn retval(s: *State, val: Value) void { + s.result = val; + if (s.stack.pop()) |c| { + s.context = c; + } else { + s.context.next = .done; + } + } }; pub fn parse(input: std.io.AnyReader) Value { @@ -437,437 +457,620 @@ pub fn parse(input: std.io.AnyReader) Value { else std.heap.smp_allocator; - var sfa = std.heap.stackFallback(4096, alloc); - - var s: State = .init(input, sfa.get()); + const stack_mem = init_stack_capacity * @sizeOf(Context); + const chars_mem = init_chars_capacity; + var stack_sfa = std.heap.stackFallback(stack_mem, alloc); + var chars_sfa = std.heap.stackFallback(chars_mem, alloc); + const stack_alloc = stack_sfa.get(); + const chars_alloc = chars_sfa.get(); + var s = State.init(input, stack_alloc, chars_alloc) catch @panic(""); defer s.deinit(); - while (s.context.next != .done) call(&s); - return s.retval; + while (s.context.next != .done) callNext(&s) catch |e| switch (e) { + else => @panic(s.err_msg), // TODO + }; + if (s.unused_char) |_| { + @panic("invalid character"); + } + return s.result; } -fn call(s: *State) void { - // std.debug.print("{}\n", .{s.next}); - switch (s.context.next) { - .start_parse => startParse(s), - .start_datum => startDatum(s), - .end_join_datum => endJoinedDatum(s), - .end_label_datum => endLabelDatum(s), +const Fn = enum { + parse_unit, + return_context, + end_one_datum, + parse_join_datum, + end_join_datum, + join_data, + parse_hash_datum, + end_hash_datum, + parse_rune_datum, + end_rune_datum, + end_label_datum, + parse_list_element, + continue_list, + parse_list_tail, + end_improper_list, + close_improper_list, + end_quote_expr, + done, +}; + +fn callNext(s: *State) !void { + if (detailed_debug) { + std.debug.print("\n{}:{} ctx:'{c}' unused:'{c}' \n", .{ + s.stack.items.len, + s.context.next, + s.context.char, + s.unused_char orelse '_', + }); + } + try switch (s.context.next) { + .parse_unit => parseUnit(s), + .return_context => returnContext(s), + .end_one_datum => endOneDatum(s), + .parse_join_datum => parseJoinDatum(s), + .end_join_datum => endJoinDatum(s), + .join_data => joinData(s), + .parse_hash_datum => parseHashDatum(s), .end_hash_datum => endHashDatum(s), - .end_quote_datum => endQuoteDatum(s), + .parse_rune_datum => parseRuneDatum(s), + .end_rune_datum => endRuneDatum(s), + .end_label_datum => endLabelDatum(s), + .parse_list_element => parseListElement(s), .continue_list => continueList(s), - .finish_improper_list => finishImproperList(s), + .parse_list_tail => parseListTail(s), .end_improper_list => endImproperList(s), + .close_improper_list => endImproperList(s), + .end_quote_expr => endQuoteExpr(s), .done => unreachable, - } + }; } -fn startParse(s: *State) void { - s.consumeBlanks(); - if (s.eof()) { - return s.returnDatum(value.eof.eof); - } - switch (s.peek()) { - // whitespace already consumed - 0...32, 127...255 => err(s, "invalid character"), - ')', ']', '}' => err(s, "unexpected closing bracket"), - else => startDatum(s), +fn parseUnit(s: *State) !void { + var c1 = s.getUnused() orelse try s.read(); + while (c1) |c| : (c1 = try s.read()) { + switch (try checkBlank(s, c)) { + .yes => {}, + .skip_unit => { + // Simply push another parse_unit onto the stack, which will + // ignore the result of the current one and start anew; then + // keep looping to read the datum that will be ignored. + try s.push(.parse_unit); + }, + .skip_line => try s.skipLine(), + .no => return parseDatum(s, c), + } } + return s.retval(value.eof.eof); } -// This is called when we *immediately* expect a datum and nothing else; for -// example, no whitespace or comments, because they've already been consumed. -fn startDatum(s: *State) void { - if (s.eof()) { - return err(s, "expected datum, got EOF"); - } - if (s.isWhitespace()) { - return err(s, "expected datum, got whitespace"); - } - switch (s.peek()) { - // whitespace checked above - 0...32, 127...255 => err(s, "invalid character"), - - ')', ']', '}' => err(s, "unexpected closing bracket"), - - ';' => err(s, "expected datum, got semicolon"), - - '#' => handleHash(s), - - '"' => startQuotedString(s), - - '\'', '`', ',' => startQuote(s), +fn checkBlank(s: *State, c: u8) !enum { yes, skip_unit, skip_line, no } { + return switch (c) { + '\t'...'\r', ' ' => .yes, + ';' => switch (try s.read() orelse '\n') { + '\n' => .yes, + '~' => .skip_unit, + else => .skip_line, + }, + else => .no, + }; +} - '(', '[', '{' => startList(s), +fn parseDatum(s: *State, c: u8) !void { + return parseOneDatum(s, c, .end_one_datum); +} - else => startBareString(s), +fn endOneDatum(s: *State) !void { + const d = s.result; + if (d.eq(value.undef)) { + return s.retval(d); } + const c1 = s.getUnused() orelse try s.read(); + if (c1) |c| { + switch (try checkBlank(s, c)) { + .yes => {}, + .skip_unit => return skipUnitAndReturn(s, d), + .skip_line => try s.skipLine(), + .no => return parseJoin(s, d, c), + } + } + return s.retval(d); } -fn endDatum(s: *State, d: Value) void { - // - // We're at the end of a datum; check for the various ways data can be - // joined together, like DATUM|DATUM or DATUM|.DATUM etc. - // - - if (isEndOfDatum(s)) { - // Nope, end it. - return s.returnDatum(d); - } +fn skipUnitAndReturn(s: *State, d: Value) !void { + s.context.val = d; + return s.subr(.parse_unit, .return_context); +} - // There's a stupid special-case we have to handle here, where a datum - // comment may fool us into thinking there's something to join: foo|#;bar +fn returnContext(s: *State) !void { + return s.retval(s.context.val); +} - const c = s.peek(); - switch (c) { - '.', ':', '|' => s.skip(), - '#' => if (checkTrailingDatumComment(s)) { - return s.returnDatum(d); - }, - else => {}, - } +fn parseJoin(s: *State, d: Value, c: u8) !void { s.context.val = d; s.context.char = c; - s.recurParse(.start_datum, .end_join_datum); + s.unused_char = switch (c) { + '.', ':', '|' => try s.readNoEof("start of joined datum"), + else => c, + }; + return s.subr(.parse_join_datum, .join_data); } -fn checkTrailingDatumComment(s: *State) bool { - const pos = s.pos; - s.skip(); - if (s.eof()) { - // Error, but let it be handled later. - return false; - } - const c = s.peek(); - s.pos = pos; - return c == ';'; +fn parseJoinDatum(s: *State) !void { + return parseOneDatum(s, s.getUnused().?, .end_join_datum); } -fn isEndOfDatum(s: *State) bool { - return s.eof() or switch (s.peek()) { - '\t', '\n', ' ', ';', ')', ']', '}' => true, - else => false, - }; +fn endJoinDatum(s: *State) !void { + return s.retval(s.result); } -fn endJoinedDatum(s: *State) void { - const rune = switch (s.context.char) { +fn joinData(s: *State) !void { + const head = s.context.val; + const join = s.context.char; + const tail = s.result; + if (tail.eq(value.undef)) { + return s.retval(head); + } + const rune = switch (join) { '.' => DOT, ':' => COLON, '|' => PIPE, else => JOIN, }; - endDatum(s, cons(rune, cons(s.context.val, s.retval))); -} - -fn handleHash(s: *State) void { - s.skip(); - // - // We just consumed a hash. Possibilities include: - // - // #|foo ;rune - // - // #|n#[=DATUM] ;datum label, with or without datum - // - // #|;DATUM ;datum comment - // - // #|DATUM ;hash-datum - // - - if (s.eof()) { - return err(s, "EOF after hash"); - } - if (s.isWhitespace()) { - return err(s, "whitespace after hash"); - } - - switch (s.peek()) { - 'a'...'z', 'A'...'Z' => handleRune(s), - '0'...'9' => handleDatumLabel(s), - ';' => { - s.skip(); - // Don't change next in this case. Just let the parser redo what it - // was doing as soon as the commented-out datum has been read. - s.recurParse(.start_datum, s.context.next); - }, - else => s.recurParse(.start_datum, .end_hash_datum), + const result = cons(rune, cons(head, tail)); + return s.jump(.end_one_datum, result); +} + +fn parseOneDatum(s: *State, c: u8, next: Fn) !void { + if (isBareChar(c)) { + const d, s.unused_char = try parseBareString(s, c); + return s.jump(next, d); + } + return parseCladDatum(s, c, next); +} + +fn parseCladDatum(s: *State, c: u8, next: Fn) !void { + if (c == '\\') { + const bs, s.unused_char = try parseBareEscString(s); + return s.jump(next, bs); + } + if (c == '"') { + const qs = try parseQuotedString(s); + return s.jump(next, qs); } + return switch (c) { + '#' => parseHashExpression(s, next), + '(', '[', '{' => parseList(s, c, next), + '\'', '`', ',' => parseQuoteExpr(s, c, next), + else => s.abort(next, c), + }; } -fn handleRune(s: *State) void { - const r = readRune(s) orelse return err(s, "rune too long"); - endDatum(s, r); +fn isBareChar(c: u8) bool { + return switch (c) { + // zig fmt: off + 'a'...'z' , 'A'...'Z' , '0'...'9', + '!' , '$' , '%' , '&' , '*' , '+', + '-' , '/' , '<' , '=' , '>' , '?', + '@' , '^' , '_' , '~' => true, + // zig fmt: on + else => false, + }; } -fn readRune(s: *State) ?Value { - return readShortString(s, std.ascii.isAlphanumeric, value.rune.pack); +fn isBareEsc(c: u8) bool { + return switch (c) { + 33...126 => true, + else => false, + }; } -fn handleDatumLabel(s: *State) void { - const n = readDatumLabel(s) orelse return err(s, "datum label too long"); - // - // We're at the end of the numeric label now; possibilities are: - // - // #n|# - // - // #n|#=DATUM - // +fn parseBareString(s: *State, c: u8) !struct { Value, ?u8 } { + try s.addChar(c); + return parseBareStringRest(s); +} - if (s.eof()) { - return err(s, "unexpected EOF while reading datum label"); - } - if (s.getc() != '#') { - return err(s, "invalid character while reading datum label"); - } +fn parseBareEscString(s: *State) !struct { Value, ?u8 } { + try s.addChar(try parseBareEsc(s)); + return parseBareStringRest(s); +} - if (s.eof() or s.isWhitespace()) { - return endDatum(s, cons(LABEL, n)); +fn parseBareStringRest(s: *State) !struct { Value, ?u8 } { + while (try s.read()) |c| { + if (isBareChar(c)) { + try s.addChar(c); + } else if (c == '\\') { + try s.addChar(try parseBareEsc(s)); + } else { + return .{ s.getBareString(), c }; + } } + return .{ s.getBareString(), null }; +} - if (s.getc() != '=') { - return err(s, "invalid character after numeric datum label"); +fn parseBareEsc(s: *State) !u8 { + const c = try s.readNoEof("bare escape"); + if (isBareEsc(c)) { + return c; + } else { + return s.err(error.InvalidCharacter, "bare escape"); } - - s.context.val = n; - s.recurParse(.start_datum, .end_label_datum); } -fn readDatumLabel(s: *State) ?Value { - return readShortString(s, std.ascii.isDigit, value.sstr.pack); +fn parseQuotedString(s: *State) !Value { + while (try s.read()) |c| { + if (c == '"') { + return s.getQuotedString(); + } + if (c != '\\') { + try s.addChar(c); + } else { + try parseQuotedEsc(s); + } + } + return error.UnclosedString; } -fn endLabelDatum(s: *State) void { - endDatum(s, cons(LABEL, cons(s.context.val, s.retval))); +fn parseQuotedEsc(s: *State) !void { + const c = try s.readNoEof("quoted escape"); + if (c == 'u') { + return parseUniHexHandleErrors(s); + } + try s.addChar(switch (c) { + '\\', '"' => c, + '0' => 0, + 'a' => 7, + 'b' => 8, + 't' => 9, + 'n' => 10, + 'v' => 11, + 'f' => 12, + 'r' => 13, + 'e' => 27, + 'x' => try parseHexByte(s, "hex escape"), + else => return s.err(error.InvalidCharacter, "quoted escape"), + }); } -fn endHashDatum(s: *State) void { - endDatum(s, cons(HASH, s.retval)); +fn parseUniHexHandleErrors(s: *State) !void { + return parseUniHex(s) catch |err| switch (err) { + error.Utf8CannotEncodeSurrogateHalf => e: { + s.err_code = err; + s.err_msg = "unicode escape"; + break :e error.UnicodeError; + }, + else => |e| e, + }; } -fn startQuotedString(s: *State) void { - // We're at |"..." so consume the opening quote before we start reading. - s.skip(); +fn parseUniHex(s: *State) !void { + const msg = "unicode escape"; - const str = readQuotedString(s) catch return err(s, "unclosed string"); - endDatum(s, str); + if (try s.readNoEof(msg) != '{') { + return s.err(error.InvalidCharacter, msg); + } + + const uc, const unused_c = try parseHex(s, u21, msg); + if (unused_c) |c| { + if (c != '}') { + return s.err(error.InvalidCharacter, msg); + } + } else { + return s.err(error.UnexpectedEof, msg); + } + + const n = try std.unicode.utf8CodepointSequenceLength(uc); + const buf = try s.chars.addManyAsSlice(s.chars_alloc, n); + _ = try std.unicode.utf8Encode(uc, buf); } -// RQS = Read Quoted String -const RqsError = enum { Unclosed }; +fn parseHashExpression(s: *State, next: Fn) !void { + const c = try s.readNoEof("hash expression"); + if (try checkBlank(s, c) != .no) { + return s.err(error.InvalidCharacter, "hash expression"); + } + if (std.ascii.isAlphabetic(c)) { + const r, const unused_c = try parseRune(s, c); + return parseRuneEnd(s, r, unused_c, next); + } + if (c == '%') { + const l, const unused_c = try parseLabel(s); + return parseLabelEnd(s, l, unused_c, next); + } + if (isBareChar(c)) { + // Reserved for future extensions to syntax sugar. + return s.err(error.InvalidCharacter, "hash expression"); + } + // fast-path to avoid subr + if (c == '\\') { + const bs, s.unused_char = try parseBareEscString(s); + return s.jump(next, cons(HASH, bs)); + } + if (c == '"') { + const qs = try parseQuotedString(s); + return s.jump(next, cons(HASH, qs)); + } + s.unused_char = c; + return s.subr(.parse_hash_datum, next); +} -fn readQuotedString(s: *State) !Value { - return try readQuotedSstr(s) orelse readQuotedLongString(s); +fn parseHashDatum(s: *State) !void { + return parseCladDatum(s, s.getUnused().?, .end_hash_datum); } -fn readQuotedSstr(s: *State) !?Value { - const start_pos = s.pos; +fn endHashDatum(s: *State) !void { + if (s.result.eq(value.undef)) { + return s.err(error.InvalidCharacter, "hash datum"); + } + return s.retval(cons(HASH, s.result)); +} - // TODO: Handle escapes. - var buf: [6]u8 = undefined; - var i: u8 = 0; - while (!s.eof()) : (i += 1) { - const c = s.getc(); - if (c == '"') { - // ok, return what we accumulated - return value.sstr.packQuoted(buf[0..i]); - } - if (i == 6) { - // failed; reset and bail out - s.pos = start_pos; - return null; +fn parseRune(s: *State, c1: u8) !struct { Value, ?u8 } { + try s.addChar(c1); + var len: usize = 1; + while (try s.read()) |c| : (len += 1) { + if (len == 6 or !std.ascii.isAlphanumeric(c)) { + return .{ try s.getRune(), c }; } - // ok, save this byte and go on - buf[i] = c; + try s.addChar(c); } - return error.Unclosed; + return .{ try s.getRune(), null }; } -fn readQuotedLongString(s: *State) !Value { - var chars = std.ArrayList(u8).init(s.alloc); - defer chars.deinit(); - - // TODO: Handle escapes. - while (!s.eof()) { - const c = s.getc(); - if (c == '"') { - return intern(chars.items, true); - } - chars.append(c) catch @panic("OOM"); +fn parseRuneEnd(s: *State, r: Value, c1: ?u8, next: Fn) !void { + const c = c1 orelse return s.jump(next, r); + if (c == '\\') { + const bs, s.unused_char = try parseBareString(s, c); + return s.jump(next, cons(r, bs)); + } + if (c == '"') { + const qs = try parseQuotedString(s); + return s.jump(next, cons(r, qs)); + } + s.unused_char = c; + switch (c) { + '#', '(', '[', '{', '\'', '`', ',' => { + try s.push(next); + return s.jump(.parse_rune_datum, r); + }, + else => return s.jump(next, r), } - return error.Unclosed; } -fn startBareString(s: *State) void { - // We're at |foo so start reading directly. - const str = readBareSstr(s) orelse readBareLongString(s); - endDatum(s, str); +fn parseRuneDatum(s: *State) !void { + s.context.val = s.result; + return parseCladDatum(s, s.getUnused().?, .end_rune_datum); } -fn readBareSstr(s: *State) ?Value { - const pos = s.pos; - if (readShortString(s, isSstrChar, value.sstr.pack)) |sstr| { - return sstr; - } else { - s.pos = pos; - return null; +fn endRuneDatum(s: *State) !void { + const r = s.context.val; + const d = s.result; + if (d.eq(value.undef)) { + s.retval(r); } + return s.retval(cons(r, d)); } -fn isSstrChar(c: u8) bool { - // We will ignore illegal characters here, because they aren't consumed by - // this function; whatever code comes next must handle them. - return switch (c) { - '(', ')', '[', ']', '{', '}', ';', '#', '"', '\'', '`', ',' => false, - 0...32, 127...255 => false, - else => true, - }; +fn parseLabel(s: *State) !struct { Value, ?u8 } { + const label, const unused_c = try parseHex(s, u48, "datum label"); + return .{ value.fixnum.pack(label), unused_c }; } -fn readBareLongString(s: *State) Value { - var chars = std.ArrayList(u8).init(s.alloc); - defer chars.deinit(); +fn parseLabelEnd(s: *State, l: Value, c1: ?u8, next: Fn) !void { + const c = c1 orelse return s.err(error.UnexpectedEof, "datum label"); + if (c == '%') { + return s.jump(next, cons(LABEL, l)); + } + if (c == '=') { + try s.push(next); + s.context.val = l; + return s.subr(.parse_unit, .end_label_datum); + } + return s.err(error.InvalidCharacter, "datum label"); +} - while (!s.eof()) { - const c = s.peek(); - if (isSstrChar(c)) { - chars.append(c) catch @panic("OOM"); - s.skip(); - } else { - break; - } +fn endLabelDatum(s: *State) !void { + const l = s.context.val; + const d = s.result; + if (d.eq(value.undef)) { + return s.err(error.InvalidCharacter, "label datum"); } - return intern(chars.items, false); + return s.retval(cons(LABEL, cons(l, d))); } -fn startQuote(s: *State) void { - // We're at one of: |'... |`... |,... - s.context.val = switch (s.getc()) { - '\'' => QUOTE, - '`' => GRAVE, - ',' => COMMA, +fn parseList(s: *State, open: u8, next: Fn) !void { + const head = switch (open) { + '(' => value.nil.nil, + '[' => cons(SQUARE, value.nil.nil), + '{' => cons(BRACE, value.nil.nil), + else => unreachable, + }; + const close: u8 = switch (open) { + '(' => ')', + '[' => ']', + '{' => '}', else => unreachable, }; - s.recurParse(.start_datum, .end_quote_datum); + while (try s.read()) |c| { + if (c == close) { + return s.jump(next, head); + } + switch (try checkBlank(s, c)) { + .yes => {}, + .skip_unit => { + try listParserSetup(s, head, close, next); + // Parse twice in a row, ignoring the first result. + return s.subr(.parse_unit, .parse_unit); + }, + .skip_line => try s.skipLine(), + .no => { + try listParserSetup(s, head, close, next); + s.unused_char = c; + return s.jump(.parse_list_element, null); + }, + } + } + return s.err(error.UnexpectedEof, "list"); } -fn endQuoteDatum(s: *State) void { - endDatum(s, cons(s.context.val, s.retval)); +fn listParserSetup(s: *State, head: Value, close: u8, next: Fn) !void { + s.context.val = head; + s.context.char = close; + try s.push(next); + try s.pushContext(.continue_list); } -// List processing is, unsurprisingly, the most complicated, and it's made even -// more complicated by the possibility of datum comments in strange places... - -// Make sure to use .start_parse instead of .start_datum to handle elements, so -// that an arbitrary number of datum comments, separated by blanks (whitespace -// and line comments) are handled automatically. +fn parseListElement(s: *State) !void { + return parseDatum(s, s.getUnused().?); +} -fn startList(s: *State) void { - const open = s.getc(); +fn continueList(s: *State) !void { + const close = s.context.char; - s.consumeBlanks(); - if (s.eof()) { - return err(s, "unexpected EOF while parsing list"); + if (s.result.eq(value.undef)) { + const c = s.getUnused().?; + if (c == close) { + return endList(s); + } + if (c == '.') { + return s.jump(.parse_list_tail, null); + } + return s.err(error.InvalidCharacter, "list"); } - s.context.val = value.nil.nil; - s.context.char = open; - if (isEndOfList(s)) { - endList(s); - } else { - s.recurParse(.start_parse, .continue_list); + s.context.val = cons(s.result, s.context.val); + + var c1 = s.unused_char orelse try s.read(); + while (c1) |c| : (c1 = try s.read()) { + if (c == close) { + return endList(s); + } + if (c == '.') { + return s.jump(.parse_list_tail, null); + } + switch (try checkBlank(s, c)) { + .yes => {}, + .skip_unit => { + try s.pushContext(.continue_list); + return s.subr(.parse_unit, .parse_unit); + }, + .skip_line => try s.skipLine(), + .no => { + s.unused_char = c; + return s.jump(.parse_list_element, null); + }, + } } + return s.err(error.UnexpectedEof, "list"); } -fn isEndOfList(s: *State) bool { - return switch (s.peek()) { - ')', ']', '}' => true, - else => false, - }; +fn endList(s: *State) !void { + return s.retval(lib.list.reverse(s.context.val)); } -fn endList(s: *State) void { - const open = s.context.char; - const char = s.getc(); - - if (open == '(' and char == ')') { - return endDatum(s, s.context.val); - } - if (open == '[' and char == ']') { - return endDatum(s, cons(SQUARE, s.context.val)); - } - if (open == '{' and char == '}') { - return endDatum(s, cons(BRACE, s.context.val)); +fn parseListTail(s: *State) !void { + const c = try s.readNoEof("list tail"); + try s.pushContext(.end_improper_list); + switch (try checkBlank(s, c)) { + .yes => {}, + .skip_unit => return s.subr(.parse_unit, .parse_unit), + .skip_line => try s.skipLine(), + // One blank mandatory here. + .no => return s.err(error.InvalidCharacter, "list tail"), } - - err(s, "wrong closing bracket for list"); + return s.jump(.parse_unit, null); } -fn continueList(s: *State) void { - // Note that this accumulates list elements in reverse. - s.context.val = value.pair.cons(s.retval, s.context.val); - - s.consumeBlanks(); - if (s.eof()) { - return err(s, "unexpected EOF while parsing list"); +fn endImproperList(s: *State) !void { + const tail = s.result; + if (tail.eq(value.undef)) { + return s.err(error.InvalidCharacter, "list tail"); } + s.context.val = lib.list.reverseWithTail(s.context.val, tail); + return closeImproperList(s); +} - if (isEndOfList(s)) { - s.context.val = lib.list.reverse(s.context.val); - return endList(s); +fn closeImproperList(s: *State) !void { + const close = s.context.char; + var c1 = s.getUnused() orelse try s.read(); + while (c1) |c| : (c1 = try s.read()) { + switch (try checkBlank(s, c)) { + .yes => {}, + .skip_unit => return s.subr(.parse_unit, .close_improper_list), + .skip_line => try s.skipLine(), + .no => { + if (c == close) { + return s.retval(s.context.val); + } + return s.err(error.InvalidCharacter, "after list tail"); + }, + } } + return s.err(error.UnexpectedEof, "after list tail"); +} - // Check if there's an improper-list ending. - if (s.peek() == '.') { - const pos = s.pos; - s.skip(); - if (s.eof()) { - return err(s, "unexpected EOF while parsing list"); - } - // Scheme allows (foo .(bar)) but we don't. Mind your spaces! - if (s.isWhitespace()) { - s.skip(); - return s.recurParse(.start_parse, .finish_improper_list); - } - // Nope, reset. - s.pos = pos; +fn parseQuoteExpr(s: *State, c1: u8, next: Fn) !void { + const q = switch (c1) { + '\'' => QUOTE, + '`' => GRAVE, + ',' => COMMA, + else => unreachable, + }; + + // fast-path to avoid subr + const c = try s.readNoEof("quote expression"); + if (isBareChar(c) or c == '\\') { + const bs, s.unused_char = try parseBareString(s, c); + return s.jump(next, cons(q, bs)); } - s.recurParse(.start_parse, .continue_list); + s.context.val = q; + return s.subr(.parse_unit, .end_quote_expr); } -fn finishImproperList(s: *State) void { - s.context.val = lib.list.reverseWithTail(s.context.val, s.retval); - endImproperList(s); +fn endQuoteExpr(s: *State) !void { + const q = s.context.val; + const d = s.result; + return s.retval(cons(q, d)); } -// Handling the end of an improper list is a bit awkward, because there may be -// datum comments *after* the final cdr, where we don't actually want to parse -// any further data. So we keep looping here just looking for datum comments. +// Helpers -fn endImproperList(s: *State) void { - s.consumeBlanks(); - if (s.eof()) { - return err(s, "unexpected EOF at end of improper list"); - } +fn parseHex( + s: *State, + u_type: type, + emsg: []const u8, +) !struct { u_type, ?u8 } { + var uc: u_type = undefined; - if (isEndOfList(s)) { - return endList(s); - } + const c1 = try s.readNoEof(emsg); + uc = try parseHexDigit(s, c1, emsg); - if (s.getc() == '#') { - if (s.eof()) { - return err(s, "unexpected hash and EOF at end of improper list"); - } - if (s.getc() == ';') { - return s.recurParse(.start_datum, .end_improper_list); + while (try s.read()) |c| { + if (!std.ascii.isHex(c)) { + return .{ uc, c }; } + const shl = std.math.shlExact; + uc = shl(u_type, uc, 4) catch return s.err(error.OutOfRange, emsg); + uc |= try parseHexDigit(s, c, emsg); } + return .{ uc, null }; +} - err(s, "malformed list / extra datum at end of improper list"); +fn parseHexByte(s: *State, emsg: []const u8) !u8 { + const h1 = try s.readNoEof(emsg); + const h2 = try s.readNoEof(emsg); + const hi = try parseHexDigit(s, h1, emsg); + const lo = try parseHexDigit(s, h2, emsg); + return hi << 4 | lo; } -fn err(s: *State, msg: []const u8) noreturn { - std.debug.print("{s}\n", .{msg}); - std.debug.print("position: {}\n", .{s.counter}); - @panic("parse error"); +fn parseHexDigit(s: *State, c: u8, emsg: []const u8) !u8 { + return switch (c) { + '0'...'9' => c - '0', + 'A'...'F' => c - 'A' + 10, + 'a'...'f' => c - 'a' + 10, + else => s.err(error.InvalidCharacter, emsg), + }; } diff --git a/src/libzisp/value.zig b/src/libzisp/value.zig index 2561ca7..fbe907a 100644 --- a/src/libzisp/value.zig +++ b/src/libzisp/value.zig @@ -285,6 +285,10 @@ pub const Value = packed union { std.debug.dumpHex(std.mem.asBytes(&v)); } + pub fn eq(v1: Value, v2: Value) bool { + return v1.bits == v2.bits; + } + // The following aren't type predicates per se, but rather determine which // general category the value is in. The exceptions are fixnum and double, // since those aren't sub-categorized into further types. diff --git a/src/main.zig b/src/main.zig index c9a5404..24c7959 100644 --- a/src/main.zig +++ b/src/main.zig @@ -1,11 +1,6 @@ -//! By convention, main.zig is where your main function lives in the case that -//! you are building an executable. If you are making a library, the convention -//! is to delete this file and start with root.zig instead. - const std = @import("std"); -/// This imports the separate module containing `root.zig`. Take a look in `build.zig` for details. -const zisp = @import("zisp_lib"); +const zisp = @import("libzisp"); pub fn main() !void { const T = extern union { @@ -33,6 +28,19 @@ pub fn main() !void { x.*.double = f1.* / f2.*; std.debug.print("-0/0: {x}\n", .{x.*.bits}); + for (0..1000000) |i| { + _ = i; + const p = zisp.value.pair.cons( + zisp.value.fixnum.pack(0), + zisp.value.fixnum.pack(1), + ); + std.mem.doNotOptimizeAway(p); + // std.debug.print("fx: {}\n", .{zisp.value.pair.car(p)}); + } + + const istr = zisp.value.istr.intern("foo", false); + std.debug.print("istr: {s}\n", .{zisp.value.istr.getHeader(istr).bytes()}); + // // Prints to stderr (it's a shortcut based on `std.io.getStdErr()`) // std.debug.print("All your {s} are belong to us.\n", .{"codebase"}); |
