diff options
| author | Taylan Kammer <taylan.kammer@gmail.com> | 2025-02-25 23:10:35 +0100 |
|---|---|---|
| committer | Taylan Kammer <taylan.kammer@gmail.com> | 2025-02-25 23:10:35 +0100 |
| commit | 34de389fe744018e808f2c8b301648d504ab610d (patch) | |
| tree | 7806846fd4ce078f89ce173769b2d41c3d0362ed | |
| parent | d08f735e8c8ca108a065d787a92f17b28f0409af (diff) | |
update
| -rw-r--r-- | src/libzisp.zig | 10 |
| -rw-r--r-- | src/libzisp/io/parser.zig | 144 |
2 files changed, 57 insertions, 97 deletions
diff --git a/src/libzisp.zig b/src/libzisp.zig index be3f683..79a54b4 100644 --- a/src/libzisp.zig +++ b/src/libzisp.zig @@ -247,7 +247,7 @@ test "pair" { } test "parse" { - const val = io.parser.parseCode("\"foo\""); + const val = io.parser.parse("\"foo\""); try std.testing.expect(value.sstr.check(val)); @@ -256,7 +256,7 @@ test "parse" { } test "parse2" { - const val = io.parser.parseCode( + const val = io.parser.parse( \\ ;; Testing some crazy datum comments \\ ##;"bar"#;([x #"y"]{##`,'z})"foo" \\ ;; end @@ -273,7 +273,7 @@ test "parse2" { } test "parse3" { - const val = io.parser.parseCode( + const val = io.parser.parse( \\(foo #;x #;(x y) #;x #bar [#x #"baz"] 'bat) ); @@ -289,7 +289,7 @@ test "parse3" { } test "parse4" { - const val = io.parser.parseCode("(foo . #;x bar #;y)"); + const val = io.parser.parse("(foo . #;x bar #;y)"); const s = value.sstr.unpack(value.pair.car(val)); try std.testing.expectEqualStrings("foo", s.slice()); @@ -301,6 +301,6 @@ test "parse4" { test "unparse" { try std.testing.expectEqualStrings( "#foo", - io.unparser.unparse(io.parser.parseCode("#foo")), + io.unparser.unparse(io.parser.parse("#foo")), ); } diff --git a/src/libzisp/io/parser.zig b/src/libzisp/io/parser.zig index 45a752e..1e61385 100644 --- a/src/libzisp/io/parser.zig +++ b/src/libzisp/io/parser.zig @@ -1,21 +1,14 @@ // // === Parser for Code & Data === // -// Zisp s-expressions come in two flavors: code (sugar) and data (no sugar). +// Zisp s-expressions are defined in terms of an extremely minimal set of data +// types; only that which is necessary to build representations of more complex +// expressions and types: // -// However, code expressions are parsed into the same data types which the data -// expressions can represent, so homoiconicity is preserved. +// type format/examples comment +// ---- --------------- ------- // -// The "sugar" used in code expressions is merely shorthand for more complex -// data expressions, which could have been written by hand. 
-// -// Data expressions have a very simple format, and are only able to express the -// bare minimum set of data types needed to represent more complex data: -// -// type format comment -// ---- ------ ------- -// -// string foo , "foo bar" symbols and strings are the same data type +// string foo , "foo bar" quoted strings are flagged as such // // rune #name name is: [a-zA-Z][a-zA-Z0-9]{0,5} // @@ -23,52 +16,59 @@ // // nil () we prefer the term nil over null // -// The list short-hand syntax is the only "syntax sugar" supported in data: +// The parser recognizes various "syntax sugar" and transforms uses of it into +// uses of the above types. +// +// The most ubiquitous example is of course the list syntax: // -// (DATUM DATUM DATUM) -> (DATUM . (DATUM . (DATUM . ()))) +// (datum1 datum2 ...) -> (datum1 . (datum2 . (... . ()))) // -// We may use terms like "code parser" and "data parser" out of convenience, -// although there may only be a single parser that implements both formats by -// switching between modes. +// The following table summarizes the other supported transformations: // -// When the code parser encounters syntax sugar, it always transforms it into a -// list starting with a rune. The list of all such transformations follows. +// [...] -> (#SQUARE ...) #datum -> (#HASH . datum) // -// #datum -> (#HASH . datum) #name(...) -> (#name ...) +// {...} -> (#BRACE ...) #rune(...) -> (#rune ...) // -// [...] -> (#SQUARE ...) dat1dat2 -> (#JOIN dat1 . dat2) +// #<...> -> (#ANGLE ...) dat1dat2 -> (#JOIN dat1 . dat2) // -// {...} -> (#BRACE ...) dat1.dat2 -> (#DOT dat1 . dat2) +// 'datum -> (#QUOTE . datum) dat1.dat2 -> (#DOT dat1 . dat2) // -// 'datum -> (#QUOTE . datum) #n#=datum -> (#LABEL n . datum) +// `datum -> (#GRAVE . datum) #n#=datum -> (#LABEL n . datum) // -// `datum -> (#GRAVE . datum) #n# -> (#LABEL . n) +// ,datum -> (#COMMA . datum) #n# -> (#LABEL . n) // -// ,datum -> (#COMMA . 
datum) +// Notes: // -// (The "#datum" form refers to expressions that cannot be mistaken for a rune, -// such as for example: #(...) or #"..." etc.) +// * The terms datum, dat1, and dat2 each refer to an arbitrary datum; ellipsis +// means zero or more data; n is a non-negative integer. // -// The terms "datum", "dat1", and "dat2" refer to an arbitrary datum; "name" is -// a rune name; ellipsis mean zero or more data; "n" is a non-negative integer. +// * The #datum form applies only to expressions that cannot be mistaken for a +// rune, such as for example: #(...) or #"..." or #'string etc.; following a +// hash sign with a plain string would otherwise be parsed as a rune. // -// Though not represented in the table above due to notational difficulty, the -// format "#name(...)" doesn't require a list in the second position; any datum -// works, so long as there's no ambiguity: +// * Though not represented in the table due to notational difficulty, the +// format "#rune(...)" doesn't require a list in the second position; any +// datum works, so long as there's no ambiguity; for example: // -// #name1#name2 -> (#name1 . #name2) +// #rune1#rune2 -> (#rune1 . #rune2) // -// #name"text" -> (#name . "text") +// #rune"text" -> (#rune . "text") // -// As a counter-example, following a rune immediately with a bare string is not -// possible, since it's ambiguous: +// #rune'string -> (#rune #QUOTE . string) // -// #abcdefgh ;Could be (#abcdef . gh) or (#abcde . fgh) or ... +// As a counter-example, following a rune immediately with a bare string is +// not possible, since it's ambiguous: // -// The parser will see this as an attempt to use an 8-letter rune name, and -// raise an error, since rune names are limited to 6 characters. +// #abcdefgh ;Could be (#abcdef . gh) or (#abcde . fgh) or ... 
// -// Syntax sugar can combine arbitrarily: +// The parser will see this as an attempt to use an 8-letter rune name, and +// raise an error, since rune names are limited to 6 characters. +// +// * The #<...> form is a special case; the less-than and greater-than symbols +// are not otherwise treated as brackets; e.g., <a b c d> is actually four +// strings: "<a", "b", "c", "d>". +// +// Syntax sugar can combine arbitrarily; some examples follow: // // #{...} -> (#HASH #BRACE ...) // @@ -80,9 +80,9 @@ // // foo.bar.baz{x y} -> (#JOIN (#DOT (#DOT foo . bar) . baz) #BRACE x y) // -// Runes are case-sensitive, and the code parser only emits runes using -// upper-case letters, so lower-case runes are free for user extensions. -// Exceptions are runes used directly in code, like #true and #false. +// Runes are case-sensitive, and the parser only emits runes using upper-case +// letters when expressing syntax sugar, so there can be no accidental clash +// with runes that appear verbatim in code. // // Although strings and symbols aren't disjoint types in Zisp, the parser flags // double-quoted string literals to allow distinguishing them from bare strings. @@ -95,7 +95,7 @@ // // Note that 'foo becomes (quote foo) in Scheme, but (#QUOTE . foo) in Zisp. // The operand of #QUOTE is the entire cdr. The same principle is used when -// parsing other sugar: +// parsing other sugar; some examples follow: // // Incorrect Correct // @@ -117,13 +117,11 @@ // expect a vector literal like #(...) to work in Scheme. // // Runes may be decoded in isolation as well, rather than transforming a list -// whose head they appear in. This can implement #true and #false. (These -// would be used verbatim in code, rather than emitted by the parser.) +// whose head they appear in. This can implement #true and #false. 
// // The decoder may also perform arbitrary transforms on any type; for example, // it may turn bare strings (those not flagged as double-quoted) into numbers -// when it's decoding data representing code. This is how number literals are -// implemented in Zisp. +// when appropriate. This is how number literals are implemented. // // The decoder recognizes (#QUOTE ...) to implement the traditional quoting // mechanism, but with a significant difference: @@ -238,13 +236,10 @@ const value = @import("../value.zig"); const ShortString = value.ShortString; const Value = value.Value; -pub const Mode = enum { code, data }; - const TopState = struct { alloc: std.mem.Allocator, input: []const u8, pos: usize = 0, - mode: Mode = undefined, }; const State = struct { @@ -260,10 +255,6 @@ const State = struct { // To remember what kind of list we're in: () [] {} opening_bracket: u8 = undefined, - fn mode(s: *State) Mode { - return s.top.mode; - } - fn eof(s: *State) bool { return s.top.pos >= s.top.input.len; } @@ -380,17 +371,13 @@ const Fn = enum { perform_return, }; -pub fn parseCode(input: []const u8) Value { - return parse(input, .code); -} - -pub fn parse(input: []const u8, mode: Mode) Value { +pub fn parse(input: []const u8) Value { var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init; defer if (gpa.deinit() == .leak) @panic("leak"); const alloc = gpa.allocator(); // var pool: std.heap.MemoryPool(State) = .init(alloc); // defer pool.deinit(); - var top = TopState{ .alloc = alloc, .input = input, .mode = mode }; + var top = TopState{ .alloc = alloc, .input = input }; var s0 = State{ .top = &top }; var s = &s0; while (true) s = switch (s.next) { @@ -466,11 +453,6 @@ fn endDatum(s: *State, d: Value) *State { return s.returnDatum(d); } - // These are only allowed in code mode. 
- if (s.mode() == .data) { - return err(s, "invalid use of hash in data mode"); - } - s.context = d; if (s.peek() == '.') { @@ -515,7 +497,7 @@ fn handleHash(s: *State) *State { // // #|;DATUM ;datum comment // - // #|DATUM ;hash-datum (code mode only) + // #|DATUM ;hash-datum // if (s.eof()) { @@ -547,30 +529,12 @@ fn handleHash(s: *State) *State { // Otherwise, it must be a hash-datum. #DATUM - // But data mode doesn't allow that. - if (s.mode() == .data) { - return err(s, "use of hash-datum sequence not allowed in data mode"); - } - return s.recurParse(.start_datum, .end_hash_datum); } fn handleRune(s: *State) *State { - const rune = readRune(s) orelse return err(s, "rune too long"); - // - // Now we're at the end of the rune, but it could be a rune-datum: - // - // #foo|(...) - // - - if (isEndOfDatum(s)) { - // Nope, just a stand-alone rune. - return s.returnDatum(rune); - } - - // Otherwise, it's followed by a datum, like: #foo(...) - - return endDatum(s, rune); + const r = readRune(s) orelse return err(s, "rune too long"); + return endDatum(s, r); } fn readRune(s: *State) ?Value { @@ -719,10 +683,6 @@ fn endQuote(s: *State) *State { fn startList(s: *State) *State { const open = s.getc(); - if (s.mode() == .data and open != '(') { - return err(s, "invalid opening bracket in data mode"); - } - s.consumeBlanks(); if (s.eof()) { return err(s, "unexpected EOF while parsing list"); |
