diff options
| author | Taylan Kammer <taylan.kammer@gmail.com> | 2025-02-22 15:04:05 +0100 |
|---|---|---|
| committer | Taylan Kammer <taylan.kammer@gmail.com> | 2025-02-22 15:04:05 +0100 |
| commit | c922361115c8ee398ec4e26bb0af8cca4dcb9667 (patch) | |
| tree | 6b093f2ad960bf073d295458faf6707e5123c1e4 /src | |
| parent | 2d52864265e06a4d863dba84e1d20c8d52d8c5dd (diff) | |
update
Diffstat (limited to 'src')
| -rw-r--r-- | src/libzisp.zig | 2 | ||||
| -rw-r--r-- | src/libzisp/io/decoder.zig | 1 | ||||
| -rw-r--r-- | src/libzisp/io/encoder.zig | 1 | ||||
| -rw-r--r-- | src/libzisp/io/parser.zig (renamed from src/libzisp/parser.zig) | 292 | ||||
| -rw-r--r-- | src/libzisp/io/reader.zig | 10 | ||||
| -rw-r--r-- | src/libzisp/io/writer.zig | 1 |
6 files changed, 115 insertions, 192 deletions
diff --git a/src/libzisp.zig b/src/libzisp.zig index be8b8b7..8141994 100644 --- a/src/libzisp.zig +++ b/src/libzisp.zig @@ -6,8 +6,8 @@ const builtin = @import("builtin"); const testing = std.testing; pub const gc = @import("libzisp/gc.zig"); -pub const parser = @import("libzisp/parser.zig"); pub const value = @import("libzisp/value.zig"); +pub const parser = @import("libzisp/io/parser.zig"); pub const Value = value.Value; pub const Bucket = gc.Bucket; diff --git a/src/libzisp/io/decoder.zig b/src/libzisp/io/decoder.zig new file mode 100644 index 0000000..eb27e20 --- /dev/null +++ b/src/libzisp/io/decoder.zig @@ -0,0 +1 @@ +// wip diff --git a/src/libzisp/io/encoder.zig b/src/libzisp/io/encoder.zig new file mode 100644 index 0000000..eb27e20 --- /dev/null +++ b/src/libzisp/io/encoder.zig @@ -0,0 +1 @@ +// wip diff --git a/src/libzisp/parser.zig b/src/libzisp/io/parser.zig index 1c58687..71c6946 100644 --- a/src/libzisp/parser.zig +++ b/src/libzisp/io/parser.zig @@ -12,15 +12,13 @@ // Data expressions have a very simple format, and are only able to express a // minimal set of data types: // -// string -> foo , "foo bar" ;symbols and strings are the same data type +// string -> foo , "foo bar" ;symbols and strings are the same data type // -// number -> -1.2 , +nan.0 , ... ;this is the most complex type to represent +// rune -> #foo ;limited to 6 ASCII letters (a - z, A - Z) // -// rune -> #foo ;limited to 6 ASCII letters (a - z, A - Z) +// pair -> (DATUM . DATUM) ;the only composite data type supported // -// pair -> (DATUM . DATUM) ;the only composite data type supported -// -// nil -> () ;we prefer the term nil over null +// nil -> () ;we prefer the term nil over null // // The list short-hand syntax may be considered the only "syntax sugar" that is // supported by the data parser: @@ -57,12 +55,15 @@ // would be string literals, because Zisp doesn't differentiate strings and // symbols like traditional lisps. Also, note that although we could reuse // #QUOTE here, instead of using #STRING, this would make it impossible to -// differentiate between the expressions #'foo and #"foo" once parsing is -// finished, even though we may want to give them different meanings.) +// differentiate between the code expressions #'foo and #"foo".) // // Runes are case-sensitive, and the code parser only emits runes using // upper-case letters, so lower-case runes are free for user extensions. // +// You may be wondering about numbers. As far as the parser is concerned, +// numbers are strings. It's the decoder (see below) that will turn bare +// strings (those not marked with #STRING) into numbers. +// // Note that 'foo becomes (quote foo) in Scheme, but (#QUOTE . foo) in Zisp. // The operand of #QUOTE is the entire cdr. The same principle is used when // parsing other sugar: @@ -87,22 +88,36 @@ // Runes may be decoded in isolation as well, rather than transforming a list // whose head they appear in. This is how #true and #false are implemented. // +// The decoder may also perform arbitrary transforms on any type; for example, +// it may turn bare strings (those not marked with #STRING) into numbers when +// it's decoding data representing code. This is how number literals are +// implemented in Zisp. +// // The decoder recognizes (#QUOTE ...) to implement the traditional quoting // mechanism, but in a better way: // // Traditional quote is "unhygienic" in Scheme terms. An expression such as // '(foo bar) will always be read as (quote (foo bar)) regardless of what sort // of lexical context it appears in, so the semantics will depend on whatever -// the identifier "quote" is bound to in that lexical context. +// the identifier "quote" is bound to in that lexical context, meaning that the +// expression may end up evaluating to something other than the list (foo bar). // // The Zisp decoder, which transforms not text to text, but objects to objects, // can turn (#QUOTE ...) into an abstract object which encapsulates the notion -// of quoting, which the Zisp evaluator can recognize and act upon. +// of quoting, which the Zisp evaluator can recognize and act upon, ensuring +// that an expression like '(foo bar) always turns into the list (foo bar). // // One way to think about this, in Scheme (R6RS / syntax-case) terms, is that // expressions like '(foo bar) turn directly into a *syntax object* when read, // rather than a regular list object. // +// The decoder is, of course, configurable and extensible. The transformations +// mentioned above would be performed only when it's told to decode data which +// represents Zisp code. The decoder may be given a different configuration, +// telling it to decode, for example, data which represents a different kind of +// domain-specific data, such as application settings, build system commands, +// complex data records with non-standard data types, and so on. +// // // === Implementation details === // @@ -143,6 +158,14 @@ // .start_datum, this means no whitespace or comments would be tolerated after // the datum comment. // +// Note 3: When it comes to pairs/lists, we mainly try to parse them as lists, +// and a pair becomes a special-case of an improper list (two elements). This +// has the advantage of saving memory: If we implemented list parsing as pair +// parsing, we would be calling the parser recursively, deeper and deeper, for +// every pair that the list is made up of. Although we're not limited by stack +// space, thanks to the strategy described above, this would still waste memory +// while parsing. +// // // === Notation used in comments === // @@ -157,9 +180,9 @@ const std = @import("std"); -const gc = @import("gc.zig"); -const list = @import("list.zig"); -const value = @import("value.zig"); +const gc = @import("../gc.zig"); +const list = @import("../list.zig"); +const value = @import("../value.zig"); const Value = value.Value; @@ -175,10 +198,11 @@ const State = struct { parent: ?*State = null, - datum_rune: Value = value.boole.f, - list_stack: Value = value.nil.nil, + // Used to store various context, but most notably the stack of list + // elements parsed so far, so just initialize it to nil. + context: Value = value.nil.nil, + opening_bracket: u8 = 0, - opening_quote: enum { quote, grave, comma } = .quote, retval: Value = value.eof.eof, @@ -272,8 +296,8 @@ const Fn = enum { end_rune_datum, end_quote, continue_list, + finalize_improper_list, end_improper_list, - handle_trailing_datum_comments, perform_return, }; @@ -288,8 +312,8 @@ pub fn parse(input: []const u8) Value { .end_rune_datum => endRuneDatum(s), .end_quote => endQuote(s), .continue_list => continueList(s), + .finalize_improper_list => finalizeImproperList(s), .end_improper_list => endImproperList(s), - .handle_trailing_datum_comments => handleTrailingDatumComments(s), .perform_return => s.performReturn() orelse return s.retval, }; } @@ -301,7 +325,7 @@ fn startParse(s: *State) *State { } return switch (s.peek()) { // whitespace already consumed - 0...31, 127...255 => err(s, "invalid character"), + 0...32, 127...255 => err(s, "invalid character"), ')', ']', '}' => err(s, "unexpected closing bracket"), else => startDatum(s), }; @@ -318,7 +342,7 @@ fn startDatum(s: *State) *State { } return switch (s.peek()) { // whitespace checked above - 0...31, 127...255 => err(s, "invalid character"), + 0...32, 127...255 => err(s, "invalid character"), ')', ']', '}' => err(s, "unexpected closing bracket"), @@ -332,14 +356,10 @@ fn startDatum(s: *State) *State { '(', '[', '{' => startList(s), - '+', '-' => handlePlusMinus(s), - - '0'...'9' => handleDigit(s), - - // Periods only allowed between digits, and to express improper lists. - // Things like the following look too much like it could be a typo: + // Periods are only allowed in the middle of a string, or to express + // improper lists, because the following look too much like typos: // - // (foo .5) (foo .bar) + // (foo. bar) (foo .bar) (123. 456) (123 .456) // '.' => err(s, "misplaced period"), @@ -368,7 +388,7 @@ fn handleHash(s: *State) *State { // Is it a rune? #foo switch (s.peek()) { - 'A'...'Z', 'a'...'z' => return handleRune(s), + 'a'...'z', 'A'...'Z' => return handleRune(s), else => {}, } @@ -392,7 +412,6 @@ fn handleHash(s: *State) *State { fn handleRune(s: *State) *State { const rune = readRune(s) orelse return err(s, "rune too long"); - // // Now we're at the end of the rune, but it could be a rune-datum: // @@ -411,7 +430,7 @@ fn handleRune(s: *State) *State { return err(s, "invalid use of hash in data mode"); } - s.datum_rune = rune; + s.context = rune; return s.recurParse(.start_datum, .end_rune_datum); } @@ -443,21 +462,16 @@ fn isEndOfRune(s: *State) bool { } fn endRuneDatum(s: *State) *State { - return s.returnDatum(value.pair.cons( - s.datum_rune, - s.retval, - )); + return s.returnDatum(value.pair.cons(s.context, s.retval)); } fn endHashDatum(s: *State) *State { - return s.returnDatum(value.pair.cons( - value.rune.pack("HASH"), - s.retval, - )); + const rune = value.rune.pack("HASH"); + return s.returnDatum(value.pair.cons(rune, s.retval)); } fn startQuotedString(s: *State) *State { - // Consume opening quote. + // We're at |"..." so consume the opening quote before we start reading. s.skip(); const str = readQuotedString(s) catch return err(s, "unclosed string"); @@ -507,6 +521,7 @@ fn readQuotedLongString(s: *State) Value { } fn startBareString(s: *State) *State { + // We're at |foo so start reading directly. return readBareSstr(s) orelse readBareLongString(s); } @@ -546,27 +561,23 @@ fn readBareLongString(s: *State) *State { } fn startQuote(s: *State) *State { - s.opening_quote = switch (s.getc()) { - '\'' => .quote, - '`' => .grave, - ',' => .comma, + // We're at one of: |'... |`... |,... + s.context = value.rune.pack(switch (s.getc()) { + '\'' => "QUOTE", + '`' => "GRAVE", + ',' => "COMMA", else => unreachable, - }; + }); return s.recurParse(.start_datum, .end_quote); } fn endQuote(s: *State) *State { - const name = switch (s.opening_quote) { - .quote => "QUOTE", - .grave => "GRAVE", - .comma => "COMMA", - }; - return s.returnDatum(value.pair.cons( - value.rune.pack(name), - s.retval, - )); + return s.returnDatum(value.pair.cons(s.context, s.retval)); } +// List processing is, unsurprisingly, the most complicated, and it's made even +// more complicated by the possibility of datum comments in strange places... + fn startList(s: *State) *State { const open = s.getc(); @@ -575,195 +586,94 @@ fn startList(s: *State) *State { } s.consumeBlanks(); - if (s.eof()) { return err(s, "unexpected EOF while parsing list"); } - const char = s.peek(); - - // Check for empty lists: (), [], {} - if (open == '(' and char == ')') { - s.skip(); - return s.returnDatum(value.nil.nil); - } - if (open == '[' and char == ']') { - s.skip(); - return s.returnDatum(value.pair.cons( - value.rune.pack("SQUARE"), - value.nil.nil, - )); - } - if (open == '{' and char == '}') { - s.skip(); - return s.returnDatum(value.pair.cons( - value.rune.pack("BRACE"), - value.nil.nil, - )); - } - s.opening_bracket = open; + return if (isEndOfList(s)) + endList(s) + else + s.recurParse(.start_parse, .continue_list); +} - // Now we're facing the first element of the list. - return s.recurParse(.start_parse, .continue_list); +fn isEndOfList(s: *State) bool { + return switch (s.peek()) { + ')', ']', '}' => true, + else => false, + }; } fn continueList(s: *State) *State { - // - // We now consumed a list element and are at its end. Possibilities: - // - // (... foo| ) - // - // (... foo| . bar) - // - // (... foo| bar baz ...) - // - // (This may be the first element, or any other; doesn't matter.) - // + s.context = value.pair.cons(s.retval, s.context); s.consumeBlanks(); - if (s.eof()) { return err(s, "unexpected EOF while parsing list"); } - const stack = value.pair.cons(s.retval, s.list_stack); - - const open = s.opening_bracket; - const char = s.peek(); - - // Check for proper ending: (foo bar baz) - if (open == '(' and char == ')') { - s.skip(); - return s.returnDatum(list.reverse(stack)); - } - if (open == '[' and char == ']') { - s.skip(); - return s.returnDatum(value.pair.cons( - value.rune.pack("SQUARE"), - list.reverse(stack), - )); - } - if (open == '{' and char == '}') { - s.skip(); - return s.returnDatum(value.pair.cons( - value.rune.pack("BRACE"), - list.reverse(stack), - )); + if (isEndOfList(s)) { + s.context = list.reverse(s.context); + return endList(s); } - s.list_stack = stack; - - // Check for improper ending: (foo bar . baz) - if (char == '.') { + if (s.peek() == '.') { s.skip(); - - // We should now be at (... foo .| bar) and whitespace must follow. // Scheme allows (foo .(bar)) but we don't. Mind your spaces! if (!s.isWhitespace()) { return err(s, "misplaced period"); } - - return s.recurParse(.start_parse, .end_improper_list); + return s.recurParse(.start_parse, .finalize_improper_list); } - // OK, we're at the next element now and the list is going on. return s.recurParse(.start_parse, .continue_list); } -fn endImproperList(s: *State) *State { - // - // We're at the very end of an improper list now: - // - // (... foo . bar| ) - // - // There's another sneaky possibility though: trailing datum comments! - // - // (... foo . bar #;DATUM ... ) - // - // These were being handled "automatically" while parsing regular list - // elements, but here we have to special-handle them; see below. - // +fn finalizeImproperList(s: *State) *State { + s.context = list.reverseWithTail(s.context, s.retval); + return endImproperList(s); +} +fn endImproperList(s: *State) *State { s.consumeBlanks(); - if (s.eof()) { return err(s, "unexpected EOF while parsing list"); } - const result = list.reverseWithTail(s.list_stack, s.retval); - - const char = s.getc(); - const open = s.opening_bracket; - - if (open == '(' and char == ')') { - return s.returnDatum(result); - } - if (open == '[' and char == ']') { - const rune = value.rune.pack("SQUARE"); - return s.returnDatum(value.pair.cons(rune, result)); - } - if (open == '{' and char == '}') { - const rune = value.rune.pack("BRACE"); - return s.returnDatum(value.pair.cons(rune, result)); + if (isEndOfList(s)) { + return endList(s); } - // Handle trailing datum comments, but don't allow anything else! - - if (char == '#' and s.peek() == ';') { - s.skip(); - - // We will "abuse" the list_stack field to save the result. - s.list_stack = result; - return s.recurParse(.start_datum, .handle_trailing_datum_comments); + if (s.getc() == '#') { + if (s.eof()) { + return err(s, "unexpected EOF after hash while parsing list"); + } + if (s.getc() == ';') { + return s.recurParse(.start_datum, .end_improper_list); + } } return err(s, "malformed list / extra datum at end of improper list"); } -fn handleTrailingDatumComments(s: *State) *State { - s.consumeBlanks(); - - if (s.eof()) { - return err(s, "unexpected EOF while parsing list"); - } - - const result = s.list_stack; - - const char = s.getc(); +fn endList(s: *State) *State { const open = s.opening_bracket; + const char = s.getc(); + // Check for proper ending: (foo bar baz) if (open == '(' and char == ')') { - return s.returnDatum(result); + return s.returnDatum(s.context); } if (open == '[' and char == ']') { const rune = value.rune.pack("SQUARE"); - return s.returnDatum(value.pair.cons(rune, result)); + return s.returnDatum(value.pair.cons(rune, s.context)); } if (open == '{' and char == '}') { const rune = value.rune.pack("BRACE"); - return s.returnDatum(value.pair.cons(rune, result)); - } - - // Handle trailing datum comments, but don't allow anything else! - - if (char == '#' and s.peek() == ';') { - s.skip(); - - // We will "abuse" the list_stack field to save the result. - s.list_stack = result; - return s.recurParse(.start_datum, .handle_trailing_datum_comments); + return s.returnDatum(value.pair.cons(rune, s.context)); } - return err(s, "malformed list / extra datum at end of improper list"); -} - -fn handlePlusMinus(s: *State) *State { - return s; -} - -fn handleDigit(s: *State) *State { - return s; + return err(s, "wrong closing bracket for list"); } fn err(s: *State, msg: []const u8) noreturn { diff --git a/src/libzisp/io/reader.zig b/src/libzisp/io/reader.zig new file mode 100644 index 0000000..d6de79d --- /dev/null +++ b/src/libzisp/io/reader.zig @@ -0,0 +1,10 @@ +// See parse.zig for details. + +const parser = @import("parser.zig"); +const decoder = @import("decoder.zig"); + +const Value = @import("../value.zig").Value; + +pub fn readCode(input: []const u8) Value { + return decoder.decode(parser.parse(input)); +} diff --git a/src/libzisp/io/writer.zig b/src/libzisp/io/writer.zig new file mode 100644 index 0000000..eb27e20 --- /dev/null +++ b/src/libzisp/io/writer.zig @@ -0,0 +1 @@ +// wip |
