From f2b18d64448ab09dd5e5e6a180d38d90d5aaf367 Mon Sep 17 00:00:00 2001 From: Taylan Kammer Date: Thu, 27 Mar 2025 21:18:09 +0100 Subject: new parser --- spec/parser.ebnf | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ spec/syntax.md | 34 +++++++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 spec/parser.ebnf create mode 100644 spec/syntax.md (limited to 'spec') diff --git a/spec/parser.ebnf b/spec/parser.ebnf new file mode 100644 index 0000000..9e02fba --- /dev/null +++ b/spec/parser.ebnf @@ -0,0 +1,75 @@ +unit : blank* ( datum blank? | EOF ) ; + + +blank : 9...13 | comment ; + +datum : one_datum ( join_char? one_datum )* ; + +join_char : '.' | ':' | '|' ; + + +comment : ';' ( skip_unit | skip_line ) ; + +skip_unit : '~' unit ; + +skip_line : ( ~LF )* LF? ; + + +one_datum : ( bare_str | clad_datum ) ; + +bare_str : bare_str_elt+ ; + +clad_datum : '\' bare_esc_str + | '"' quoted_str '"' + | '#' hash_expr + | '(' blank* list? ')' + | '[' blank* list? ']' + | '{' blank* list? '}' + | quote_expr + ; + + +bare_str_elt : bare_char | '\' bare_esc ; + + +bare_esc_str : bare_esc bare_str_elt* ; + +quoted_str : ( quoted_char | '\' quoted_esc )* ; + +hash_expr : rune clad_datum? + | '%' label ( '%' | '=' unit ) + | clad_datum + ; + +list : unit+ ( '.' blank+ unit )? blank* ; + +quote_expr : ( "'" | "`" | "," ) datum ; + + +bare_char : letter | digit + | '!' | '$' | '%' | '&' | '*' | '+' | '-' | '/' + | '<' | '=' | '>' | '?' 
| '@' | '^' | '_' | '~' + ; + +bare_esc : 33...126 ; + + +quoted_char : ~( '"' | '\' ) ; + +quoted_esc : '\' | '"' | 'a' | 'b' | 'e' + | 'f' | 'n' | 'r' | 't' | 'v' + | 'x' hex_digit{2} + | 'u' '{' hex_digit+ '}' + ; + + +rune : letter ( letter | digit ){0,5} ; + +label : hex_digit{1,12} ; + + +letter : 'a'...'z' | 'A'...'Z' ; + +digit : '0'...'9' ; + +hex_digit : '0'...'9' | 'a'...'f' | 'A'...'F' ; diff --git a/spec/syntax.md b/spec/syntax.md new file mode 100644 index 0000000..b85ed78 --- /dev/null +++ b/spec/syntax.md @@ -0,0 +1,34 @@ +# Zisp S-Expression Syntax + +We use a BNF notation with the following rules: + +* Concatenation of expressions is implicit: `foo bar` means `foo` + followed by `bar`. + +* Expressions may be followed by `?`, `*`, `+`, `{N}`, or `{N,M}`, + which have the meanings they have in regular expressions. + +* The syntax is defined in terms of bytes, not characters. Terminals + `'c'` and `"c"` refer to the ASCII value of the given character `c`. + Numbers are in decimal and refer to a byte with the given value. + +* The `~` prefix means NOT. It only applies to rules that match one + byte, and negates them. For example, `~( 'a' | 'b' )` matches any + byte other than 97 and 98. + +* Ranges of terminal values are expressed as `x...y` (inclusive). + +* There is no ambiguity, backtracking, or look-ahead beyond the byte + currently being matched. Rules match left to right, depth-first, + and greedy. As soon as the input matches the first terminal of a + rule, it must match that rule to the end. + +The last rule means that the BNF is very simple to translate to code. + +The parser consumes one `unit` from an input stream every time it's +called; it returns the `datum` therein, or EOF. 
+ +``` + + +``` -- cgit v1.2.3 From 00fd32b6c0d35140bdc160aa759bbac52242d7d0 Mon Sep 17 00:00:00 2001 From: Taylan Kammer Date: Fri, 28 Mar 2025 12:19:54 +0100 Subject: blah --- _tests/test.zig | 2 +- spec/parser.ebnf | 2 +- src/libzisp.zig | 194 ++++++++++++++++++++++++++-------------------- src/libzisp/io/parser.zig | 147 +++++++++++++++++++---------------- 4 files changed, 193 insertions(+), 152 deletions(-) (limited to 'spec') diff --git a/_tests/test.zig b/_tests/test.zig index 7b4a04c..5acb628 100644 --- a/_tests/test.zig +++ b/_tests/test.zig @@ -5,7 +5,7 @@ pub fn main() void { // const x: struct { u8, u64, u8 } = y; // @import("std").debug.print("{}\n", .{x[0] + x[1] + x[2]}); - std.debug.print("{}\n", .{@sizeOf(struct { a: u8, b: u64, c: u8, d: bool })}); + std.debug.print("{}\n", .{@sizeOf(struct { u64, ?u8 })}); } // const x: ?u8 = 5; diff --git a/spec/parser.ebnf b/spec/parser.ebnf index 9e02fba..44b1967 100644 --- a/spec/parser.ebnf +++ b/spec/parser.ebnf @@ -12,7 +12,7 @@ comment : ';' ( skip_unit | skip_line ) ; skip_unit : '~' unit ; -skip_line : ( ~LF )* LF? ; +skip_line : ( ~10 )* 10? ; one_datum : ( bare_str | clad_datum ) ; diff --git a/src/libzisp.zig b/src/libzisp.zig index e6c8ac5..df8422b 100644 --- a/src/libzisp.zig +++ b/src/libzisp.zig @@ -316,86 +316,114 @@ test "parse2" { try std.testing.expectEqualStrings("foo", f.slice()); } -// test "parse3" { -// const val = parseString( -// \\(foo ;~x ;~(x y) ;~x #bar [#x #"baz"] 'bat) -// ); - -// const car = value.pair.car; -// const cdr = value.pair.cdr; - -// const e1 = car(val); -// const e2 = car(cdr(val)); -// const e3 = car(cdr(cdr(val))); -// const e4 = car(cdr(cdr(cdr(val)))); - -// try std.testing.expect(value.sstr.check(e1)); -// try std.testing.expect(value.rune.check(e2)); -// try std.testing.expect(value.pair.check(e3)); -// try std.testing.expect(value.pair.check(e4)); -// } - -// test "parse4" { -// const val = parseString("(foo . 
;~x bar ;~y)"); - -// const s = value.sstr.unpack(value.pair.car(val)); -// try std.testing.expectEqualStrings("foo", s.slice()); - -// const f = value.sstr.unpack(value.pair.cdr(val)); -// try std.testing.expectEqualStrings("bar", f.slice()); -// } - -// fn parseBench(path: []const u8, iters: usize) !void { -// const file = try std.fs.cwd().openFile(path, .{}); -// defer file.close(); - -// var timer = try std.time.Timer.start(); -// for (0..iters) |i| { -// _ = i; -// var br = std.io.bufferedReader(file.reader()); -// const reader = br.reader().any(); -// var v: Value = undefined; -// while (true) { -// v = io.parser.parse(reader); -// if (value.eof.check(v)) { -// break; -// } -// } -// try file.seekTo(0); -// } -// const ns: f64 = @floatFromInt(timer.lap()); -// const secs = ns / 1_000_000_000; -// std.debug.print( -// "parse {s} x {}: {d:.3}s\n", -// .{ path, iters, secs }, -// ); -// } - -// test "parse bench" { -// // try parseBench("test-data/parser-test-1.scm", 200); -// // try parseBench("test-data/parser-test-2.scm", 800); -// try parseBench("test-data/parser-torture.scm", 1); -// } - -// test "unparse" { -// var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; -// var out: std.ArrayList(u8) = .init(gpa.allocator()); - -// const w = out.writer(); -// const v = parseString("#foo"); -// try io.unparser.unparse(w, v); -// try std.testing.expectEqualStrings("#foo", try out.toOwnedSlice()); -// } - -// test "unparse2" { -// var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; -// var out: std.ArrayList(u8) = .init(gpa.allocator()); - -// const w = out.writer(); -// const v = parseString("#{foo bar['x]}"); -// try io.unparser.unparse(w, v); -// try std.testing.expectEqualStrings( -// "(#HASH #BRACE foo (#JOIN bar #SQUARE (#QUOTE . 
x)))", -// try out.toOwnedSlice(), -// ); -// } +test "parse3" { + const val = parseString( + \\(foo ;~x ;~(x y) ;~x #bar [#x #"baz"] 'bat) + ); + + const car = value.pair.car; + const cdr = value.pair.cdr; + + const e1 = car(val); + const e2 = car(cdr(val)); + const e3 = car(cdr(cdr(val))); + const e4 = car(cdr(cdr(cdr(val)))); + + try std.testing.expect(value.sstr.check(e1)); + try std.testing.expect(value.rune.check(e2)); + try std.testing.expect(value.pair.check(e3)); + try std.testing.expect(value.pair.check(e4)); +} + +test "parse4" { + const val = parseString("(foo . ;~x bar ;~y)"); + + const s = value.sstr.unpack(value.pair.car(val)); + try std.testing.expectEqualStrings("foo", s.slice()); + + const f = value.sstr.unpack(value.pair.cdr(val)); + try std.testing.expectEqualStrings("bar", f.slice()); +} + +fn parseBench(path: []const u8, iters: usize) !void { + const file = try std.fs.cwd().openFile(path, .{}); + defer file.close(); + + var timer = try std.time.Timer.start(); + for (0..iters) |i| { + _ = i; + var br = std.io.bufferedReader(file.reader()); + const reader = br.reader().any(); + var v: Value = undefined; + while (true) { + v = io.parser.parse(reader); + if (value.eof.check(v)) { + break; + } + } + try file.seekTo(0); + } + const ns: f64 = @floatFromInt(timer.lap()); + const secs = ns / 1_000_000_000; + std.debug.print( + "parse {s} x {}: {d:.3}s\n", + .{ path, iters, secs }, + ); +} + +test "parse bench" { + try parseBench("test-data/parser-test-1.scm", 1000); + try parseBench("test-data/parser-test-2.scm", 1000); + // try parseBench("test-data/parser-torture.scm", 1); +} + +test "unparse" { + var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; + var out: std.ArrayList(u8) = .init(gpa.allocator()); + + const w = out.writer(); + const v = parseString("#foo"); + try io.unparser.unparse(w, v); + try std.testing.expectEqualStrings("#foo", try out.toOwnedSlice()); +} + +test "unparse2" { + var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; + 
var out: std.ArrayList(u8) = .init(gpa.allocator()); + + const w = out.writer(); + const v = parseString("#{foo bar['x]}"); + try io.unparser.unparse(w, v); + try std.testing.expectEqualStrings( + "(#HASH #BRACE foo (#JOIN bar #SQUARE (#QUOTE . x)))", + try out.toOwnedSlice(), + ); +} + +test "unparse3" { + const w = std.io.getStdErr().writer(); + const v = parseString("#{foo bar['x](y)(z)}"); + try io.unparser.unparse(w, v); + try w.writeByte('\n'); +} + +test "unparse4" { + const w = std.io.getStdErr().writer(); + const v = parseString("(foo ;~bar)"); + try io.unparser.unparse(w, v); + try w.writeByte('\n'); +} + +test "unparse5" { + const w = std.io.getStdErr().writer(); + const v = parseString("(;~foo foo ;~bar . ;~bar bar ;~bar)"); + try io.unparser.unparse(w, v); + try w.writeByte('\n'); +} + +test "unparse6" { + const w = std.io.getStdErr().writer(); + const v = parseString("(foo .bar ... baz. bat.(qux))"); + try io.unparser.unparse(w, v); + try w.writeByte('\n'); +} diff --git a/src/libzisp/io/parser.zig b/src/libzisp/io/parser.zig index 651d124..643f7e8 100644 --- a/src/libzisp/io/parser.zig +++ b/src/libzisp/io/parser.zig @@ -257,11 +257,11 @@ const cons = value.pair.cons; const is_test = builtin.is_test; const is_debug = builtin.mode == .Debug; -const detailed_debug = true; +const detailed_debug = false; // In debug, we want to see if we leak, so very small numbers. -const init_stack_capacity = if (is_debug) 20 else 32; -const init_chars_capacity = if (is_debug) 100 else 512; +const init_stack_capacity = if (is_debug) 32 else 32; +const init_chars_capacity = if (is_debug) 512 else 512; // zig fmt: off const DOT = value.rune.pack("DOT"); @@ -277,6 +277,8 @@ const SQUARE = value.rune.pack("SQUARE"); const BRACE = value.rune.pack("BRACE"); // zig fmt: on +const S_DOT = value.sstr.pack("."); + const Context = struct { // What to do next. 
next: Fn = .parse_unit, @@ -393,12 +395,9 @@ const State = struct { value.istr.intern(s.chars.items, true); } - fn getRune(s: *State) !Value { + fn getRune(s: *State) Value { defer s.chars.clearRetainingCapacity(); - return if (s.chars.items.len <= 6) - value.rune.pack(s.chars.items) - else - error.RuneTooLong; + return value.rune.pack(s.chars.items); } fn push(s: *State, next: Fn) !void { @@ -466,11 +465,21 @@ pub fn parse(input: std.io.AnyReader) Value { var s = State.init(input, stack_alloc, chars_alloc) catch @panic(""); defer s.deinit(); - while (s.context.next != .done) callNext(&s) catch |e| switch (e) { - else => @panic(s.err_msg), // TODO + while (s.context.next != .done) callNext(&s) catch { + if (s.unused_char) |c| { + std.debug.panic( + "Parse error: {} at: {s}, char: {c}\n", + .{ s.err_code, s.err_msg, c }, + ); + } else { + std.debug.panic( + "Parse error: {} at: {s}\n", + .{ s.err_code, s.err_msg }, + ); + } }; - if (s.unused_char) |_| { - @panic("invalid character"); + if (s.unused_char) |c| { + std.debug.panic("Invalid character: {c}\n", .{c}); } return s.result; } @@ -489,7 +498,6 @@ const Fn = enum { end_label_datum, parse_list_element, continue_list, - parse_list_tail, end_improper_list, close_improper_list, end_quote_expr, @@ -498,12 +506,24 @@ const Fn = enum { fn callNext(s: *State) !void { if (detailed_debug) { - std.debug.print("\n{}:{} ctx:'{c}' unused:'{c}' \n", .{ - s.stack.items.len, + const stack = s.stack.items; + std.debug.print("\n\n{}:{} ctx:'{c}' unused:'{c}' \n", .{ + stack.len, s.context.next, s.context.char, s.unused_char orelse '_', }); + if (stack.len > 0) { + var i = stack.len; + while (i > 0) : (i -= 1) { + const prev = stack[i - 1]; + std.debug.print("{}:{} ctx:'{c}'\n", .{ + i - 1, + prev.next, + prev.char, + }); + } + } } try switch (s.context.next) { .parse_unit => parseUnit(s), @@ -519,9 +539,8 @@ fn callNext(s: *State) !void { .end_label_datum => endLabelDatum(s), .parse_list_element => parseListElement(s), 
.continue_list => continueList(s), - .parse_list_tail => parseListTail(s), .end_improper_list => endImproperList(s), - .close_improper_list => endImproperList(s), + .close_improper_list => closeImproperList(s), .end_quote_expr => endQuoteExpr(s), .done => unreachable, }; @@ -532,12 +551,7 @@ fn parseUnit(s: *State) !void { while (c1) |c| : (c1 = try s.read()) { switch (try checkBlank(s, c)) { .yes => {}, - .skip_unit => { - // Simply push another parse_unit onto the stack, which will - // ignore the result of the current one and start anew; then - // keep looping to read the datum that will be ignored. - try s.push(.parse_unit); - }, + .skip_unit => try s.push(.parse_unit), .skip_line => try s.skipLine(), .no => return parseDatum(s, c), } @@ -562,10 +576,10 @@ fn parseDatum(s: *State, c: u8) !void { } fn endOneDatum(s: *State) !void { - const d = s.result; - if (d.eq(value.undef)) { - return s.retval(d); + if (s.result.eq(value.undef)) { + return s.retval(value.undef); } + const d = s.result; const c1 = s.getUnused() orelse try s.read(); if (c1) |c| { switch (try checkBlank(s, c)) { @@ -590,10 +604,16 @@ fn returnContext(s: *State) !void { fn parseJoin(s: *State, d: Value, c: u8) !void { s.context.val = d; s.context.char = c; - s.unused_char = switch (c) { - '.', ':', '|' => try s.readNoEof("start of joined datum"), - else => c, - }; + switch (c) { + '.', ':', '|' => { + s.context.char = c; + s.unused_char = try s.readNoEof("join datum"); + }, + else => { + s.context.char = 0; + s.unused_char = c; + }, + } return s.subr(.parse_join_datum, .join_data); } @@ -610,16 +630,21 @@ fn joinData(s: *State) !void { const join = s.context.char; const tail = s.result; if (tail.eq(value.undef)) { - return s.retval(head); + if (join == 0) { + return s.retval(head); + } else { + return s.err(error.InvalidCharacter, "join datum"); + } } const rune = switch (join) { + 0 => JOIN, '.' 
=> DOT, ':' => COLON, '|' => PIPE, - else => JOIN, + else => unreachable, }; - const result = cons(rune, cons(head, tail)); - return s.jump(.end_one_datum, result); + const data = cons(head, tail); + return s.jump(.end_one_datum, cons(rune, data)); } fn parseOneDatum(s: *State, c: u8, next: Fn) !void { @@ -653,7 +678,7 @@ fn isBareChar(c: u8) bool { 'a'...'z' , 'A'...'Z' , '0'...'9', '!' , '$' , '%' , '&' , '*' , '+', '-' , '/' , '<' , '=' , '>' , '?', - '@' , '^' , '_' , '~' => true, + '@' , '^' , '_' , '~' , '.' => true, // zig fmt: on else => false, }; @@ -811,11 +836,11 @@ fn parseRune(s: *State, c1: u8) !struct { Value, ?u8 } { var len: usize = 1; while (try s.read()) |c| : (len += 1) { if (len == 6 or !std.ascii.isAlphanumeric(c)) { - return .{ try s.getRune(), c }; + return .{ s.getRune(), c }; } try s.addChar(c); } - return .{ try s.getRune(), null }; + return .{ s.getRune(), null }; } fn parseRuneEnd(s: *State, r: Value, c1: ?u8, next: Fn) !void { @@ -933,22 +958,20 @@ fn continueList(s: *State) !void { if (c == close) { return endList(s); } - if (c == '.') { - return s.jump(.parse_list_tail, null); - } return s.err(error.InvalidCharacter, "list"); } + if (s.result.eq(S_DOT)) { + return s.subr(.parse_unit, .end_improper_list); + } + s.context.val = cons(s.result, s.context.val); - var c1 = s.unused_char orelse try s.read(); + var c1 = s.getUnused() orelse try s.read(); while (c1) |c| : (c1 = try s.read()) { if (c == close) { return endList(s); } - if (c == '.') { - return s.jump(.parse_list_tail, null); - } switch (try checkBlank(s, c)) { .yes => {}, .skip_unit => { @@ -958,7 +981,7 @@ fn continueList(s: *State) !void { .skip_line => try s.skipLine(), .no => { s.unused_char = c; - return s.jump(.parse_list_element, null); + return s.subr(.parse_list_element, .continue_list); }, } } @@ -969,19 +992,6 @@ fn endList(s: *State) !void { return s.retval(lib.list.reverse(s.context.val)); } -fn parseListTail(s: *State) !void { - const c = try s.readNoEof("list 
tail"); - try s.pushContext(.end_improper_list); - switch (try checkBlank(s, c)) { - .yes => {}, - .skip_unit => return s.subr(.parse_unit, .parse_unit), - .skip_line => try s.skipLine(), - // One blank mandatory here. - .no => return s.err(error.InvalidCharacter, "list tail"), - } - return s.jump(.parse_unit, null); -} - fn endImproperList(s: *State) !void { const tail = s.result; if (tail.eq(value.undef)) { @@ -992,22 +1002,21 @@ fn endImproperList(s: *State) !void { } fn closeImproperList(s: *State) !void { + const result = s.context.val; const close = s.context.char; var c1 = s.getUnused() orelse try s.read(); - while (c1) |c| : (c1 = try s.read()) { + while (c1) |c| : (c1 = try s.readNoEof("after list tail")) { + if (c == close) { + return s.retval(result); + } switch (try checkBlank(s, c)) { .yes => {}, .skip_unit => return s.subr(.parse_unit, .close_improper_list), .skip_line => try s.skipLine(), - .no => { - if (c == close) { - return s.retval(s.context.val); - } - return s.err(error.InvalidCharacter, "after list tail"); - }, + .no => return s.err(error.InvalidCharacter, "after list tail"), } } - return s.err(error.UnexpectedEof, "after list tail"); + unreachable; } fn parseQuoteExpr(s: *State, c1: u8, next: Fn) !void { @@ -1026,10 +1035,14 @@ fn parseQuoteExpr(s: *State, c1: u8, next: Fn) !void { } s.context.val = q; - return s.subr(.parse_unit, .end_quote_expr); + s.unused_char = c; + return s.subr(.parse_list_element, .end_quote_expr); } fn endQuoteExpr(s: *State) !void { + if (s.result.eq(value.undef)) { + return s.err(error.InvalidCharacter, "quote expression datum"); + } const q = s.context.val; const d = s.result; return s.retval(cons(q, d)); -- cgit v1.2.3 From 6eedf5394997b91467a392732cdb7fbb80a790b8 Mon Sep 17 00:00:00 2001 From: Taylan Kammer Date: Fri, 28 Mar 2025 18:02:38 +0100 Subject: blub --- _tests/test.zig | 4 +- spec/parser.ebnf | 30 ++++-- src/libzisp.zig | 16 ++- src/libzisp/io/parser.zig | 250 
+++++++++++++++++++++++---------------------- src/libzisp/value/rune.zig | 4 + 5 files changed, 170 insertions(+), 134 deletions(-) (limited to 'spec') diff --git a/_tests/test.zig b/_tests/test.zig index 5acb628..e746851 100644 --- a/_tests/test.zig +++ b/_tests/test.zig @@ -1,11 +1,13 @@ const std = @import("std"); -pub fn main() void { +pub fn main() u8 { // const y: [3]u64 = .{ 1, 2, 3 }; // const x: struct { u8, u64, u8 } = y; // @import("std").debug.print("{}\n", .{x[0] + x[1] + x[2]}); std.debug.print("{}\n", .{@sizeOf(struct { u64, ?u8 })}); + + return while (true) if (true) break 1; } // const x: ?u8 = 5; diff --git a/spec/parser.ebnf b/spec/parser.ebnf index 44b1967..60f7890 100644 --- a/spec/parser.ebnf +++ b/spec/parser.ebnf @@ -1,11 +1,14 @@ -unit : blank* ( datum blank? | EOF ) ; +unit : empty_unit | datum_unit ; -blank : 9...13 | comment ; +empty_unit : blank* EOF ; -datum : one_datum ( join_char? one_datum )* ; +datum_unit : blank* datum blank? ; -join_char : '.' | ':' | '|' ; + +blank : 9...13 | comment ; + +datum : join_data | dot_string ; comment : ';' ( skip_unit | skip_line ) ; @@ -15,9 +18,18 @@ skip_unit : '~' unit ; skip_line : ( ~10 )* 10? ; -one_datum : ( bare_str | clad_datum ) ; +join_data : one_datum ( join_char? one_datum )* ; + +join_char : '.' | ':' | '|' ; + +dot_string : '.'{2,} ; + + -bare_str : bare_str_elt+ ; +one_datum : ( num_string | bare_string | clad_datum ) ; + +num_string : ( '+' | '-' )? digit ( bare_str_elt | '.' )* ; + +bare_string : bare_str_elt+ ; clad_datum : '\' bare_esc_str | '"' quoted_str '"' @@ -37,11 +49,13 @@ bare_esc_str : bare_esc bare_str_elt* ; quoted_str : ( quoted_char | '\' quoted_esc )* ; hash_expr : rune clad_datum? - | '%' label ( '%' | '=' unit ) + | '%' label ( '%' | '=' datum_unit ) | clad_datum ; -list : unit+ ( '.' blank+ unit )? blank* ; +list : datum_unit+ list_tail? blank* ; + +list_tail : '.'
blank+ datum_unit ; quote_expr : ( "'" | "`" | "," ) datum ; diff --git a/src/libzisp.zig b/src/libzisp.zig index df8422b..ceee3f6 100644 --- a/src/libzisp.zig +++ b/src/libzisp.zig @@ -352,11 +352,17 @@ fn parseBench(path: []const u8, iters: usize) !void { var timer = try std.time.Timer.start(); for (0..iters) |i| { _ = i; - var br = std.io.bufferedReader(file.reader()); - const reader = br.reader().any(); + // var br = std.io.bufferedReader(file.reader()); + // const reader = br.reader().any(); + const reader = file.reader().any(); var v: Value = undefined; while (true) { - v = io.parser.parse(reader); + v = io.parser._parse(reader) catch |e| { + std.debug.print("\nfile pos: {}\n", .{ + try file.getPos(), + }); + return e; + }; if (value.eof.check(v)) { break; } @@ -374,7 +380,7 @@ fn parseBench(path: []const u8, iters: usize) !void { test "parse bench" { try parseBench("test-data/parser-test-1.scm", 1000); try parseBench("test-data/parser-test-2.scm", 1000); - // try parseBench("test-data/parser-torture.scm", 1); + try parseBench("test-data/parser-torture.scm", 1); } test "unparse" { @@ -423,7 +429,7 @@ test "unparse5" { test "unparse6" { const w = std.io.getStdErr().writer(); - const v = parseString("(foo .bar ... baz. bat.(qux))"); + const v = parseString("(foo bar ... baz bat.(qux))"); try io.unparser.unparse(w, v); try w.writeByte('\n'); } diff --git a/src/libzisp/io/parser.zig b/src/libzisp/io/parser.zig index 643f7e8..8093ffe 100644 --- a/src/libzisp/io/parser.zig +++ b/src/libzisp/io/parser.zig @@ -275,10 +275,10 @@ const GRAVE = value.rune.pack("GRAVE"); const COMMA = value.rune.pack("COMMA"); const SQUARE = value.rune.pack("SQUARE"); const BRACE = value.rune.pack("BRACE"); +const VOID = value.rune.packForced(""); +const LSTAIL = value.rune.packForced("."); // zig fmt: on -const S_DOT = value.sstr.pack("."); - const Context = struct { // What to do next.
next: Fn = .parse_unit, @@ -288,7 +288,7 @@ const Context = struct { char: u8 = undefined, }; -const ParseError = error{ +const ParseError = enum { InvalidCharacter, UnclosedString, UnexpectedEof, @@ -314,7 +314,6 @@ const State = struct { result: Value = undefined, unused_char: ?u8 = null, - err_code: anyerror = undefined, err_msg: []const u8 = undefined, fn init( @@ -337,9 +336,13 @@ const State = struct { s.chars.deinit(s.chars_alloc); } - fn err(s: *State, e: ParseError, msg: []const u8) ParseError { - s.err_msg = msg; - return e; + fn err( + s: *State, + comptime e: ParseError, + comptime msg: []const u8, + ) error{ParseError} { + s.err_msg = @tagName(e) ++ " at: " ++ msg; + return error.ParseError; } fn read(s: *State) !?u8 { @@ -348,10 +351,7 @@ const State = struct { } const c = s.input.readByte() catch |e| switch (e) { error.EndOfStream => return null, - else => { - s.err_code = e; - return error.ReadError; - }, + else => return s.err(.ReadError, "???"), }; if (detailed_debug) { std.debug.print("{c}", .{c}); @@ -359,8 +359,8 @@ const State = struct { return c; } - fn readNoEof(s: *State, emsg: []const u8) !u8 { - return if (try s.read()) |c| c else s.err(error.UnexpectedEof, emsg); + fn readNoEof(s: *State, comptime emsg: []const u8) !u8 { + return if (try s.read()) |c| c else s.err(.UnexpectedEof, emsg); } fn getUnused(s: *State) ?u8 { @@ -371,10 +371,6 @@ const State = struct { return null; } - fn skipLine(s: *State) !void { - while (try s.read()) |c| if (c == '\n') break; - } - fn addChar(s: *State, c: u8) !void { try s.chars.append(s.chars_alloc, c); } @@ -423,7 +419,7 @@ const State = struct { } fn abort(s: *State, next: Fn, unused_c: u8) void { - s.result = value.undef; + s.result = VOID; s.unused_char = unused_c; s.context.next = next; } @@ -438,7 +434,7 @@ const State = struct { } }; -pub fn parse(input: std.io.AnyReader) Value { +pub fn _parse(input: std.io.AnyReader) !Value { var debug_alloc: std.heap.DebugAllocator(.{}) = undefined; if (!is_test 
and is_debug) { debug_alloc = .init; @@ -465,25 +461,28 @@ pub fn parse(input: std.io.AnyReader) Value { var s = State.init(input, stack_alloc, chars_alloc) catch @panic(""); defer s.deinit(); - while (s.context.next != .done) callNext(&s) catch { - if (s.unused_char) |c| { - std.debug.panic( - "Parse error: {} at: {s}, char: {c}\n", - .{ s.err_code, s.err_msg, c }, - ); - } else { - std.debug.panic( - "Parse error: {} at: {s}\n", - .{ s.err_code, s.err_msg }, - ); - } + while (s.context.next != .done) callNext(&s) catch |e| { + // _ = e; + // if (s.unused_char) |c| { + // std.debug.panic( + // "Parse error: {s}, unused_char: 0x{x}\n", + // .{ s.err_msg, c }, + // ); + // } else { + // std.debug.panic("Parse error: {s}\n", .{s.err_msg}); + // } + return e; }; if (s.unused_char) |c| { - std.debug.panic("Invalid character: {c}\n", .{c}); + std.debug.panic("Invalid trailing character: {c}\n", .{c}); } return s.result; } +pub fn parse(input: std.io.AnyReader) Value { + return _parse(input) catch @panic(""); +} + const Fn = enum { parse_unit, return_context, @@ -549,43 +548,74 @@ fn callNext(s: *State) !void { fn parseUnit(s: *State) !void { var c1 = s.getUnused() orelse try s.read(); while (c1) |c| : (c1 = try s.read()) { - switch (try checkBlank(s, c)) { + switch (try checkBlanks(s, c)) { .yes => {}, .skip_unit => try s.push(.parse_unit), - .skip_line => try s.skipLine(), .no => return parseDatum(s, c), } } return s.retval(value.eof.eof); } -fn checkBlank(s: *State, c: u8) !enum { yes, skip_unit, skip_line, no } { +fn checkBlanks(s: *State, c: u8) !enum { yes, skip_unit, no } { return switch (c) { '\t'...'\r', ' ' => .yes, ';' => switch (try s.read() orelse '\n') { '\n' => .yes, '~' => .skip_unit, - else => .skip_line, + else => while (try s.read() != '\n') {} else .yes, }, else => .no, }; } fn parseDatum(s: *State, c: u8) !void { + if (c == '.') { + return parseDotString(s); + } return parseOneDatum(s, c, .end_one_datum); } +fn parseDotString(s: *State) !void { + try 
s.addChar('.'); + while (try s.read()) |c| { + switch (try checkBlanks(s, c)) { + .yes => return dotString(s, false), + .skip_unit => return dotString(s, true), + .no => switch (c) { + '.' => try s.addChar('.'), + ')', ']', '}' => { + s.unused_char = c; + return dotString(s, false); + }, + else => return s.err(.InvalidCharacter, "dot string"), + }, + } + } + unreachable; +} + +fn dotString(s: *State, skip_unit: bool) !void { + const lstail = s.chars.items.len == 1; + const result = if (lstail) LSTAIL else s.getBareString(); + if (skip_unit) { + s.context.val = result; + return s.subr(.parse_unit, .return_context); + } else { + return s.retval(result); + } +} + fn endOneDatum(s: *State) !void { - if (s.result.eq(value.undef)) { - return s.retval(value.undef); + if (s.result.eq(VOID)) { + return s.retval(VOID); } const d = s.result; const c1 = s.getUnused() orelse try s.read(); if (c1) |c| { - switch (try checkBlank(s, c)) { + switch (try checkBlanks(s, c)) { .yes => {}, .skip_unit => return skipUnitAndReturn(s, d), - .skip_line => try s.skipLine(), .no => return parseJoin(s, d, c), } } @@ -629,11 +659,11 @@ fn joinData(s: *State) !void { const head = s.context.val; const join = s.context.char; const tail = s.result; - if (tail.eq(value.undef)) { + if (tail.eq(VOID)) { if (join == 0) { return s.retval(head); } else { - return s.err(error.InvalidCharacter, "join datum"); + return s.err(.InvalidCharacter, "join datum"); } } const rune = switch (join) { @@ -649,20 +679,17 @@ fn joinData(s: *State) !void { fn parseOneDatum(s: *State, c: u8, next: Fn) !void { if (isBareChar(c)) { - const d, s.unused_char = try parseBareString(s, c); - return s.jump(next, d); + return s.jump(next, try parseBareString(s, c)); } return parseCladDatum(s, c, next); } fn parseCladDatum(s: *State, c: u8, next: Fn) !void { if (c == '\\') { - const bs, s.unused_char = try parseBareEscString(s); - return s.jump(next, bs); + return s.jump(next, try parseBareEscString(s)); } if (c == '"') { - const 
qs = try parseQuotedString(s); - return s.jump(next, qs); + return s.jump(next, try parseQuotedString(s)); } return switch (c) { '#' => parseHashExpression(s, next), @@ -675,10 +702,8 @@ fn parseCladDatum(s: *State, c: u8, next: Fn) !void { fn isBareChar(c: u8) bool { return switch (c) { // zig fmt: off - 'a'...'z' , 'A'...'Z' , '0'...'9', - '!' , '$' , '%' , '&' , '*' , '+', - '-' , '/' , '<' , '=' , '>' , '?', - '@' , '^' , '_' , '~' , '.' => true, + 'a'...'z' , 'A'...'Z' , '0'...'9' , '!' , '$' , '%' , '&' , '*' , + '+' , '-' , '/' , '<' , '=' , '>' , '?' , '@' , '^' , '_' , '~' => true, // zig fmt: on else => false, }; @@ -691,27 +716,28 @@ fn isBareEsc(c: u8) bool { }; } -fn parseBareString(s: *State, c: u8) !struct { Value, ?u8 } { +fn parseBareString(s: *State, c: u8) !Value { try s.addChar(c); return parseBareStringRest(s); } -fn parseBareEscString(s: *State) !struct { Value, ?u8 } { +fn parseBareEscString(s: *State) !Value { try s.addChar(try parseBareEsc(s)); return parseBareStringRest(s); } -fn parseBareStringRest(s: *State) !struct { Value, ?u8 } { +fn parseBareStringRest(s: *State) !Value { while (try s.read()) |c| { if (isBareChar(c)) { try s.addChar(c); } else if (c == '\\') { try s.addChar(try parseBareEsc(s)); } else { - return .{ s.getBareString(), c }; + s.unused_char = c; + break; } } - return .{ s.getBareString(), null }; + return s.getBareString(); } fn parseBareEsc(s: *State) !u8 { @@ -719,7 +745,7 @@ fn parseBareEsc(s: *State) !u8 { if (isBareEsc(c)) { return c; } else { - return s.err(error.InvalidCharacter, "bare escape"); + return s.err(.InvalidCharacter, "bare escape"); } } @@ -754,17 +780,16 @@ fn parseQuotedEsc(s: *State) !void { 'r' => 13, 'e' => 27, 'x' => try parseHexByte(s, "hex escape"), - else => return s.err(error.InvalidCharacter, "quoted escape"), + else => return s.err(.InvalidCharacter, "quoted escape"), }); } fn parseUniHexHandleErrors(s: *State) !void { return parseUniHex(s) catch |err| switch (err) { - 
error.Utf8CannotEncodeSurrogateHalf => e: { - s.err_code = err; - s.err_msg = "unicode escape"; - break :e error.UnicodeError; - }, + error.Utf8CannotEncodeSurrogateHalf => s.err( + .UnicodeError, + "unicode escape", + ), else => |e| e, }; } @@ -773,16 +798,16 @@ fn parseUniHex(s: *State) !void { const msg = "unicode escape"; if (try s.readNoEof(msg) != '{') { - return s.err(error.InvalidCharacter, msg); + return s.err(.InvalidCharacter, msg); } const uc, const unused_c = try parseHex(s, u21, msg); if (unused_c) |c| { if (c != '}') { - return s.err(error.InvalidCharacter, msg); + return s.err(.InvalidCharacter, msg); } } else { - return s.err(error.UnexpectedEof, msg); + return s.err(.UnexpectedEof, msg); } const n = try std.unicode.utf8CodepointSequenceLength(uc); @@ -792,8 +817,8 @@ fn parseUniHex(s: *State) !void { fn parseHashExpression(s: *State, next: Fn) !void { const c = try s.readNoEof("hash expression"); - if (try checkBlank(s, c) != .no) { - return s.err(error.InvalidCharacter, "hash expression"); + if (try checkBlanks(s, c) != .no) { + return s.err(.InvalidCharacter, "hash expression"); } if (std.ascii.isAlphabetic(c)) { const r, const unused_c = try parseRune(s, c); @@ -805,16 +830,14 @@ fn parseHashExpression(s: *State, next: Fn) !void { } if (isBareChar(c)) { // Reserved for future extensions to syntax sugar. 
- return s.err(error.InvalidCharacter, "hash expression"); + return s.err(.InvalidCharacter, "hash expression"); } // fast-path to avoid subr if (c == '\\') { - const bs, s.unused_char = try parseBareEscString(s); - return s.jump(next, cons(HASH, bs)); + return s.jump(next, cons(HASH, try parseBareEscString(s))); } if (c == '"') { - const qs = try parseQuotedString(s); - return s.jump(next, cons(HASH, qs)); + return s.jump(next, cons(HASH, try parseQuotedString(s))); } s.unused_char = c; return s.subr(.parse_hash_datum, next); @@ -825,8 +848,8 @@ fn parseHashDatum(s: *State) !void { } fn endHashDatum(s: *State) !void { - if (s.result.eq(value.undef)) { - return s.err(error.InvalidCharacter, "hash datum"); + if (s.result.eq(VOID)) { + return s.err(.InvalidCharacter, "hash datum"); } return s.retval(cons(HASH, s.result)); } @@ -846,12 +869,10 @@ fn parseRune(s: *State, c1: u8) !struct { Value, ?u8 } { fn parseRuneEnd(s: *State, r: Value, c1: ?u8, next: Fn) !void { const c = c1 orelse return s.jump(next, r); if (c == '\\') { - const bs, s.unused_char = try parseBareString(s, c); - return s.jump(next, cons(r, bs)); + return s.jump(next, cons(r, try parseBareString(s, c))); } if (c == '"') { - const qs = try parseQuotedString(s); - return s.jump(next, cons(r, qs)); + return s.jump(next, cons(r, try parseQuotedString(s))); } s.unused_char = c; switch (c) { @@ -869,12 +890,10 @@ fn parseRuneDatum(s: *State) !void { } fn endRuneDatum(s: *State) !void { - const r = s.context.val; - const d = s.result; - if (d.eq(value.undef)) { - s.retval(r); + if (s.result.eq(VOID)) { + s.retval(s.context.val); } - return s.retval(cons(r, d)); + return s.retval(cons(s.context.val, s.result)); } fn parseLabel(s: *State) !struct { Value, ?u8 } { @@ -883,7 +902,7 @@ fn parseLabel(s: *State) !struct { Value, ?u8 } { } fn parseLabelEnd(s: *State, l: Value, c1: ?u8, next: Fn) !void { - const c = c1 orelse return s.err(error.UnexpectedEof, "datum label"); + const c = c1 orelse return 
s.err(.UnexpectedEof, "datum label"); if (c == '%') { return s.jump(next, cons(LABEL, l)); } @@ -892,16 +911,14 @@ fn parseLabelEnd(s: *State, l: Value, c1: ?u8, next: Fn) !void { s.context.val = l; return s.subr(.parse_unit, .end_label_datum); } - return s.err(error.InvalidCharacter, "datum label"); + return s.err(.InvalidCharacter, "datum label"); } fn endLabelDatum(s: *State) !void { - const l = s.context.val; - const d = s.result; - if (d.eq(value.undef)) { - return s.err(error.InvalidCharacter, "label datum"); + if (s.result.eq(VOID)) { + return s.err(.InvalidCharacter, "label datum"); } - return s.retval(cons(LABEL, cons(l, d))); + return s.retval(cons(LABEL, cons(s.context.val, s.result))); } fn parseList(s: *State, open: u8, next: Fn) !void { @@ -921,14 +938,13 @@ fn parseList(s: *State, open: u8, next: Fn) !void { if (c == close) { return s.jump(next, head); } - switch (try checkBlank(s, c)) { + switch (try checkBlanks(s, c)) { .yes => {}, .skip_unit => { try listParserSetup(s, head, close, next); // Parse twice in a row, ignoring the first result. 
return s.subr(.parse_unit, .parse_unit); }, - .skip_line => try s.skipLine(), .no => { try listParserSetup(s, head, close, next); s.unused_char = c; @@ -936,7 +952,7 @@ fn parseList(s: *State, open: u8, next: Fn) !void { }, } } - return s.err(error.UnexpectedEof, "list"); + return s.err(.UnexpectedEof, "list"); } fn listParserSetup(s: *State, head: Value, close: u8, next: Fn) !void { @@ -953,15 +969,15 @@ fn parseListElement(s: *State) !void { fn continueList(s: *State) !void { const close = s.context.char; - if (s.result.eq(value.undef)) { + if (s.result.eq(VOID)) { const c = s.getUnused().?; if (c == close) { return endList(s); } - return s.err(error.InvalidCharacter, "list"); + return s.err(.InvalidCharacter, "list"); } - if (s.result.eq(S_DOT)) { + if (s.result.eq(LSTAIL)) { return s.subr(.parse_unit, .end_improper_list); } @@ -972,20 +988,19 @@ fn continueList(s: *State) !void { if (c == close) { return endList(s); } - switch (try checkBlank(s, c)) { + switch (try checkBlanks(s, c)) { .yes => {}, .skip_unit => { try s.pushContext(.continue_list); return s.subr(.parse_unit, .parse_unit); }, - .skip_line => try s.skipLine(), .no => { s.unused_char = c; return s.subr(.parse_list_element, .continue_list); }, } } - return s.err(error.UnexpectedEof, "list"); + return s.err(.UnexpectedEof, "list"); } fn endList(s: *State) !void { @@ -993,11 +1008,10 @@ fn endList(s: *State) !void { } fn endImproperList(s: *State) !void { - const tail = s.result; - if (tail.eq(value.undef)) { - return s.err(error.InvalidCharacter, "list tail"); + if (s.result.eq(VOID)) { + return s.err(.InvalidCharacter, "list tail"); } - s.context.val = lib.list.reverseWithTail(s.context.val, tail); + s.context.val = lib.list.reverseWithTail(s.context.val, s.result); return closeImproperList(s); } @@ -1009,11 +1023,10 @@ fn closeImproperList(s: *State) !void { if (c == close) { return s.retval(result); } - switch (try checkBlank(s, c)) { + switch (try checkBlanks(s, c)) { .yes => {}, .skip_unit => 
return s.subr(.parse_unit, .close_improper_list), - .skip_line => try s.skipLine(), - .no => return s.err(error.InvalidCharacter, "after list tail"), + .no => return s.err(.InvalidCharacter, "after list tail"), } } unreachable; @@ -1030,8 +1043,7 @@ fn parseQuoteExpr(s: *State, c1: u8, next: Fn) !void { // fast-path to avoid subr const c = try s.readNoEof("quote expression"); if (isBareChar(c) or c == '\\') { - const bs, s.unused_char = try parseBareString(s, c); - return s.jump(next, cons(q, bs)); + return s.jump(next, cons(q, try parseBareString(s, c))); } s.context.val = q; @@ -1040,12 +1052,10 @@ fn parseQuoteExpr(s: *State, c1: u8, next: Fn) !void { } fn endQuoteExpr(s: *State) !void { - if (s.result.eq(value.undef)) { - return s.err(error.InvalidCharacter, "quote expression datum"); + if (s.result.eq(VOID)) { + return s.err(.InvalidCharacter, "quote expression datum"); } - const q = s.context.val; - const d = s.result; - return s.retval(cons(q, d)); + return s.retval(cons(s.context.val, s.result)); } // Helpers @@ -1053,7 +1063,7 @@ fn endQuoteExpr(s: *State) !void { fn parseHex( s: *State, u_type: type, - emsg: []const u8, + comptime emsg: []const u8, ) !struct { u_type, ?u8 } { var uc: u_type = undefined; @@ -1065,13 +1075,13 @@ fn parseHex( return .{ uc, c }; } const shl = std.math.shlExact; - uc = shl(u_type, uc, 4) catch return s.err(error.OutOfRange, emsg); + uc = shl(u_type, uc, 4) catch return s.err(.OutOfRange, emsg); uc |= try parseHexDigit(s, c, emsg); } return .{ uc, null }; } -fn parseHexByte(s: *State, emsg: []const u8) !u8 { +fn parseHexByte(s: *State, comptime emsg: []const u8) !u8 { const h1 = try s.readNoEof(emsg); const h2 = try s.readNoEof(emsg); const hi = try parseHexDigit(s, h1, emsg); @@ -1079,11 +1089,11 @@ fn parseHexByte(s: *State, emsg: []const u8) !u8 { return hi << 4 | lo; } -fn parseHexDigit(s: *State, c: u8, emsg: []const u8) !u8 { +fn parseHexDigit(s: *State, c: u8, comptime emsg: []const u8) !u8 { return switch (c) { 
'0'...'9' => c - '0', 'A'...'F' => c - 'A' + 10, 'a'...'f' => c - 'a' + 10, - else => s.err(error.InvalidCharacter, emsg), + else => s.err(.InvalidCharacter, emsg), }; } diff --git a/src/libzisp/value/rune.zig b/src/libzisp/value/rune.zig index 154ec13..195210e 100644 --- a/src/libzisp/value/rune.zig +++ b/src/libzisp/value/rune.zig @@ -44,6 +44,10 @@ fn assertValidRune(s: []const u8) void { pub fn pack(s: []const u8) Value { assertValidRune(s); + return packForced(s); +} + +pub fn packForced(s: []const u8) Value { var v = Value{ .rune = .{ .name = 0 } }; const dest: [*]u8 = @ptrCast(&v.rune.name); @memcpy(dest, s); -- cgit v1.2.3 From d714cf3b57e39979b208369f9369b526409172b3 Mon Sep 17 00:00:00 2001 From: Taylan Kammer Date: Fri, 28 Mar 2025 20:44:01 +0100 Subject: blip --- spec/parser.ebnf | 4 +- src/libzisp.zig | 101 ++++++++++++++++++++++---------------------- src/libzisp/io/parser.zig | 65 ++++++++++++++++++++-------- src/libzisp/io/unparser.zig | 23 ++++++++++ src/libzisp/value/istr.zig | 4 ++ 5 files changed, 126 insertions(+), 71 deletions(-) (limited to 'spec') diff --git a/spec/parser.ebnf b/spec/parser.ebnf index 60f7890..ce7fa83 100644 --- a/spec/parser.ebnf +++ b/spec/parser.ebnf @@ -49,7 +49,7 @@ bare_esc_str : bare_esc bare_str_elt* ; quoted_str : ( quoted_char | '\' quoted_esc )* ; hash_expr : rune clad_datum? - | '%' label ( '%' | '=' datum_unit ) + | '%' label ( '%' | '=' blank* datum ) | clad_datum ; @@ -57,7 +57,7 @@ list : datum_unit+ list_tail? blank* ; list_tail : '.' 
blank+ datum_unit -quote_expr : ( "'" | "`" | "," ) datum ; +quote_expr : ( "'" | "`" | "," ) blank* datum ; bare_char : letter | digit diff --git a/src/libzisp.zig b/src/libzisp.zig index ceee3f6..de3f2e6 100644 --- a/src/libzisp.zig +++ b/src/libzisp.zig @@ -345,6 +345,56 @@ test "parse4" { try std.testing.expectEqualStrings("bar", f.slice()); } +test "unparse" { + var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; + var out: std.ArrayList(u8) = .init(gpa.allocator()); + + const w = out.writer(); + const v = parseString("#foo"); + try io.unparser.unparse(w, v); + try std.testing.expectEqualStrings("#foo", try out.toOwnedSlice()); +} + +test "unparse2" { + var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; + var out: std.ArrayList(u8) = .init(gpa.allocator()); + + const w = out.writer(); + const v = parseString("#{foo bar['x]}"); + try io.unparser.unparse(w, v); + try std.testing.expectEqualStrings( + "(#HASH #BRACE foo (#JOIN bar #SQUARE (#QUOTE . x)))", + try out.toOwnedSlice(), + ); +} + +fn writeParseResult(str: []const u8) !void { + const w = std.io.getStdErr().writer(); + const v = parseString(str); + try io.unparser.unparse(w, v); + try w.writeByte('\n'); +} + +test "unparse3" { + try writeParseResult("#{foo bar['x](y)(z)}"); +} + +test "unparse4" { + try writeParseResult("(foo ;~bar)"); +} + +test "unparse5" { + try writeParseResult("(;~foo foo ;~bar . ;~bar bar ;~bar)"); +} + +test "unparse6" { + try writeParseResult("(foo bar ... baz bat.(qux))"); +} + +test "unparse7" { + try writeParseResult("#`(#,(->keyword (syntax->datum #'sym)) . 
in)"); +} + fn parseBench(path: []const u8, iters: usize) !void { const file = try std.fs.cwd().openFile(path, .{}); defer file.close(); @@ -382,54 +432,3 @@ test "parse bench" { try parseBench("test-data/parser-test-2.scm", 1000); try parseBench("test-data/parser-torture.scm", 1); } - -test "unparse" { - var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; - var out: std.ArrayList(u8) = .init(gpa.allocator()); - - const w = out.writer(); - const v = parseString("#foo"); - try io.unparser.unparse(w, v); - try std.testing.expectEqualStrings("#foo", try out.toOwnedSlice()); -} - -test "unparse2" { - var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; - var out: std.ArrayList(u8) = .init(gpa.allocator()); - - const w = out.writer(); - const v = parseString("#{foo bar['x]}"); - try io.unparser.unparse(w, v); - try std.testing.expectEqualStrings( - "(#HASH #BRACE foo (#JOIN bar #SQUARE (#QUOTE . x)))", - try out.toOwnedSlice(), - ); -} - -test "unparse3" { - const w = std.io.getStdErr().writer(); - const v = parseString("#{foo bar['x](y)(z)}"); - try io.unparser.unparse(w, v); - try w.writeByte('\n'); -} - -test "unparse4" { - const w = std.io.getStdErr().writer(); - const v = parseString("(foo ;~bar)"); - try io.unparser.unparse(w, v); - try w.writeByte('\n'); -} - -test "unparse5" { - const w = std.io.getStdErr().writer(); - const v = parseString("(;~foo foo ;~bar . ;~bar bar ;~bar)"); - try io.unparser.unparse(w, v); - try w.writeByte('\n'); -} - -test "unparse6" { - const w = std.io.getStdErr().writer(); - const v = parseString("(foo bar ... 
baz bat.(qux))"); - try io.unparser.unparse(w, v); - try w.writeByte('\n'); -} diff --git a/src/libzisp/io/parser.zig b/src/libzisp/io/parser.zig index 8093ffe..209c548 100644 --- a/src/libzisp/io/parser.zig +++ b/src/libzisp/io/parser.zig @@ -257,7 +257,7 @@ const cons = value.pair.cons; const is_test = builtin.is_test; const is_debug = builtin.mode == .Debug; -const detailed_debug = false; +pub var detailed_debug = false; // In debug, we want to see if we leak, so very small numbers. const init_stack_capacity = if (is_debug) 32 else 32; @@ -474,7 +474,9 @@ pub fn _parse(input: std.io.AnyReader) !Value { return e; }; if (s.unused_char) |c| { - std.debug.panic("Invalid trailing character: {c}\n", .{c}); + if (c != ' ') { + std.debug.panic("Invalid trailing character: {c}\n", .{c}); + } } return s.result; } @@ -577,16 +579,16 @@ fn parseDatum(s: *State, c: u8) !void { } fn parseDotString(s: *State) !void { - try s.addChar('.'); - while (try s.read()) |c| { + var n: u48 = 1; + while (try s.read()) |c| : (n += 1) { switch (try checkBlanks(s, c)) { - .yes => return dotString(s, false), - .skip_unit => return dotString(s, true), + .yes => return dotString(s, n, false), + .skip_unit => return dotString(s, n, true), .no => switch (c) { - '.' => try s.addChar('.'), + '.' 
=> {}, ')', ']', '}' => { s.unused_char = c; - return dotString(s, false); + return dotString(s, n, false); }, else => return s.err(.InvalidCharacter, "dot string"), }, @@ -595,9 +597,12 @@ fn parseDotString(s: *State) !void { unreachable; } -fn dotString(s: *State, skip_unit: bool) !void { - const lstail = s.chars.items.len == 1; - const result = if (lstail) LSTAIL else s.getBareString(); +fn dotString(s: *State, n: u48, skip_unit: bool) !void { + const result = if (n == 1) LSTAIL else r: { + const buf = try s.chars.addManyAsSlice(s.chars_alloc, n); + @memset(buf, '.'); + break :r s.getBareString(); + }; if (skip_unit) { s.context.val = result; return s.subr(.parse_unit, .return_context); @@ -619,6 +624,7 @@ fn endOneDatum(s: *State) !void { .no => return parseJoin(s, d, c), } } + s.unused_char = ' '; return s.retval(d); } @@ -628,13 +634,17 @@ fn skipUnitAndReturn(s: *State, d: Value) !void { } fn returnContext(s: *State) !void { + s.unused_char = ' '; return s.retval(s.context.val); } fn parseJoin(s: *State, d: Value, c: u8) !void { - s.context.val = d; - s.context.char = c; switch (c) { + ')', ']', '}' => { + // shortcut + s.unused_char = c; + return s.retval(d); + }, '.', ':', '|' => { s.context.char = c; s.unused_char = try s.readNoEof("join datum"); @@ -644,6 +654,7 @@ fn parseJoin(s: *State, d: Value, c: u8) !void { s.unused_char = c; }, } + s.context.val = d; return s.subr(.parse_join_datum, .join_data); } @@ -718,17 +729,34 @@ fn isBareEsc(c: u8) bool { fn parseBareString(s: *State, c: u8) !Value { try s.addChar(c); - return parseBareStringRest(s); + var is_num = false; + if (std.ascii.isDigit(c)) { + is_num = true; + } else if (c == '+' or c == '-') { + const c2 = try s.read() orelse return s.getBareString(); + if (std.ascii.isDigit(c2)) { + try s.addChar(c2); + is_num = true; + } else if (isBareChar(c2)) { + try s.addChar(c2); + } else if (c2 == '\\') { + try s.addChar(try parseBareEsc(s)); + } else { + s.unused_char = c2; + return s.getBareString(); + 
} + } + return parseBareStringRest(s, is_num); } fn parseBareEscString(s: *State) !Value { try s.addChar(try parseBareEsc(s)); - return parseBareStringRest(s); + return parseBareStringRest(s, false); } -fn parseBareStringRest(s: *State) !Value { +fn parseBareStringRest(s: *State, is_num: bool) !Value { while (try s.read()) |c| { - if (isBareChar(c)) { + if (isBareChar(c) or (is_num and c == '.')) { try s.addChar(c); } else if (c == '\\') { try s.addChar(try parseBareEsc(s)); @@ -1046,9 +1074,10 @@ fn parseQuoteExpr(s: *State, c1: u8, next: Fn) !void { return s.jump(next, cons(q, try parseBareString(s, c))); } + try s.push(next); s.context.val = q; s.unused_char = c; - return s.subr(.parse_list_element, .end_quote_expr); + return s.subr(.parse_unit, .end_quote_expr); } fn endQuoteExpr(s: *State) !void { diff --git a/src/libzisp/io/unparser.zig b/src/libzisp/io/unparser.zig index d65ffb0..d703182 100644 --- a/src/libzisp/io/unparser.zig +++ b/src/libzisp/io/unparser.zig @@ -2,6 +2,9 @@ const std = @import("std"); const value = @import("../value.zig"); +const istr = value.istr; +const seq = value.seq; + const ShortString = value.ShortString; const OtherTag = value.OtherTag; const Value = value.Value; @@ -33,6 +36,7 @@ fn unparseHeap(w: anytype, v: Value) !void { const p, const t = value.ptr.unpack(v); try switch (t) { .pair => unparsePair(w, @ptrCast(p)), + .seq => unparseSeq(w, @ptrCast(p)), else => @panic("not implemented"), }; } @@ -97,3 +101,22 @@ fn unparsePair(w: anytype, p: *[2]Value) !void { } try w.writeByte(')'); } + +fn unparseSeq(w: anytype, p: *seq.Header) !void { + const h = istr.getHeaderFromPtr(@ptrCast(p)); + switch (h.type) { + .string => try unparseString(w, h), + else => @panic("not implemented"), + } +} + +fn unparseString(w: anytype, h: *seq.Header) !void { + const info = h.info.string; + if (info.quoted) { + try w.writeByte('"'); + } + try w.writeAll(h.bytes()); + if (info.quoted) { + try w.writeByte('"'); + } +} diff --git 
a/src/libzisp/value/istr.zig b/src/libzisp/value/istr.zig index 9834716..abd0447 100644 --- a/src/libzisp/value/istr.zig +++ b/src/libzisp/value/istr.zig @@ -46,6 +46,10 @@ pub fn getHeader(v: Value) *seq.Header { return gc.istrHeader(header_ptr); } +pub fn getHeaderFromPtr(p: *Hval) *seq.Header { + return gc.istrHeader(p); +} + // Zisp API pub fn pred(v: Value) Value { -- cgit v1.2.3