From b84ed4f563b3536365f7d3cc4d068407e98685b3 Mon Sep 17 00:00:00 2001 From: Taylan Kammer Date: Sat, 20 Jun 2026 22:53:50 +0200 Subject: It's a revolution baby. --- doc/0/0-value.md | 199 +++++++++++++++ doc/0/1-parse.md | 595 +++++++++++++++++++++++++++++++++++++++++++++ doc/0/2-decode.md | 44 ++++ doc/0/grammar/abnf.txt | 139 +++++++++++ doc/0/grammar/index.md | 115 +++++++++ doc/0/grammar/peg.txt | 91 +++++++ doc/0/grammar/zbnf.txt | 75 ++++++ doc/0/index.md | 34 +++ doc/c1/1-parse.md | 608 ---------------------------------------------- doc/c1/2-decode.md | 44 ---- doc/c1/grammar/abnf.txt | 141 ----------- doc/c1/grammar/index.md | 115 --------- doc/c1/grammar/peg.txt | 93 ------- doc/c1/grammar/zbnf.txt | 77 ------ doc/c1/index.md | 30 --- doc/index.md | 42 ++-- src/zisp/gc.zig | 14 +- src/zisp/gc/IstrSet.zig | 8 +- src/zisp/gc/ListPool.zig | 69 ++++++ src/zisp/gc/PairPool.zig | 34 --- src/zisp/io/Decoder.zig | 1 + src/zisp/io/Encoder.zig | 1 + src/zisp/io/Parser.zig | 195 ++++++++------- src/zisp/io/Printer.zig | 255 +++++++++++++------ src/zisp/value.zig | 521 +++++++++++---------------------------- src/zisp/value/array.zig | 8 +- src/zisp/value/istr.zig | 34 ++- src/zisp/value/list.zig | 64 +++++ src/zisp/value/ptr.zig | 68 +----- src/zisp/value/srat.zig | 1 + src/zisp/value/string.zig | 13 +- update-html.sh | 10 +- 32 files changed, 1943 insertions(+), 1795 deletions(-) create mode 100644 doc/0/0-value.md create mode 100644 doc/0/1-parse.md create mode 100644 doc/0/2-decode.md create mode 100644 doc/0/grammar/abnf.txt create mode 100644 doc/0/grammar/index.md create mode 100644 doc/0/grammar/peg.txt create mode 100644 doc/0/grammar/zbnf.txt create mode 100644 doc/0/index.md delete mode 100644 doc/c1/1-parse.md delete mode 100644 doc/c1/2-decode.md delete mode 100644 doc/c1/grammar/abnf.txt delete mode 100644 doc/c1/grammar/index.md delete mode 100644 doc/c1/grammar/peg.txt delete mode 100644 doc/c1/grammar/zbnf.txt delete mode 100644 doc/c1/index.md create mode 100644 src/zisp/gc/ListPool.zig delete mode 100644 src/zisp/gc/PairPool.zig create mode 100644 src/zisp/io/Decoder.zig create mode 100644 src/zisp/io/Encoder.zig create mode 100644 src/zisp/value/list.zig create mode 100644 src/zisp/value/srat.zig diff --git a/doc/0/0-value.md b/doc/0/0-value.md new file mode 100644 index 0000000..4bb7e0c --- /dev/null +++ b/doc/0/0-value.md @@ -0,0 +1,199 @@ +# NaN-packed Value representation + +The format of a binary64 floating-point number, in big-endian notation: + + { sign: 1 bit, exponent: 11 bits, fraction: 52 bits } + +When the 11 exponent bits are all set, it's either a NaN or an Infinity. + +For value packing, the remaining 53 bits are available, giving us `2^53` values +minus the following four bit patterns: + + *** FORBIDDEN BIT-PATTERNS *** + + 1. Negative cqNaN :: { sign = 1, exponent = MAX, fraction = 10000... } + + 2. Negative Infinity :: { sign = 1, exponent = MAX, fraction = 00000... } + + 3. Positive cqNaN :: { sign = 0, exponent = MAX, fraction = 10000... } + + 4. Positive Infinity :: { sign = 0, exponent = MAX, fraction = 00000... } + +The abbreviation "cqNaN" stands for canonical quiet NaN. + +The MSb of the fraction is also called the `is_quiet` flag, because it marks a +NaN as being "quiet" rather than signaling. The rest of the fraction being all +zero makes it the *canonical* quiet NaN for the given sign value. + +The positive and negative cqNaN are the *only* NaN values that can actually be +returned by FP operations. This is convenient, because it means we can simply +use them to represent themselves in Zisp. + +Infinity values may also be returned by FP operations, and we want them to also +exit in Zisp, so they also represent themselves. + +Beyond those four bit patterns, all values with a maximum exponent (all bits +set) are fair game for representing other values, so `2^53 - 4` possibilities. + +We split those `2^53 - 4` available values into four groups, each allowing for +`2^51 - 1` different values to be encoded. (51-bit values excluding zero.) + + sign = 1, quiet = 1 :: Negative Fixnum from -1 to -2^51+1 + + sign = 1, quiet = 0 :: Positive Fixnum from 0 to 2^51-2 + + sign = 0, quiet = 1 :: Pointers and various immediates + + sign = 0, quiet = 0 :: Internal use by interpreter + + +## Fixnums + +Negative fixnums actually represent themselves, without needing to go through +any transformation. Only the smallest 52-bit signed negative, `-2^51`, cannot +be represented, as it would step on Forbidden Value #1, Negative cqNaN. + +Positive fixnums go through a bitsiwe NOT (which can be implemented as an XOR +mask combining it with removal of NaN-related high bits) to avoid the all-zero +payload value, which would step on Forbidden Value #2, Negative Infinity. + + +## Pointers and immediates + +This region of 51-bit values is divided as follows, based on the three highest +bits, providing a payload value of 48 bits for each. + + 000 :: Pointer to heap object (type-tagged) + + 001 :: Pointer to list values (length-tagged) + + 010 :: Pointer to istr object + + 011 :: Immediate short string + + 100 :: Immediate small rational (sign bit 0) + + 101 :: Immediate small rational (sign bit 1) + + 110 :: Undefined + + 111 :: Immediate types further subdivided as follows: + + 0....... 0....... 0....... (etc.) :: Rune + + 1....... :: 128 40-bit types + + 0....... 1....... :: 16384 32-bit types + + 0....... 0....... 1....... :: 2097152 24-bit types + + (etc.) + +Forbidden Value #3, Positive cqNaN, is avoided by not using 0 as a valid type +tag value for heap pointers. + +### Type-tagged heap pointers + +Regular heap objects are allocated with 16-byte alignment, meaning the lowest +four bits are naturally zero. We exploit this by shifting down the address by +four bits, making room for more tag bits immediately following the 16 high bits +that mark the value as a heap pointer. + +Thus, a value can be checked against a specific heap object type by comparing +the 20 high bits to a combined constant value: the 16 highest bits indicating +that it's a heap pointer, and the 4 bits after that denoting the heap type. + +### Length-tagged list pointers + +Lists are arrays of Value objects, allocated without padding (8-byte alignment) +for efficient source code representation and traversal. Since the lowest three +bits are naturally zero, we use them as a 3-bit length information tag. + +A length tag value of 1 to 7 means there are exactly that many Value objects +starting at the address, while 0 means there is an array of at least eight, +terminated with a special 64-bit sentinel bit-pattern that is not otherwise +valid as a Zisp value. + +### Interned string pointers + +Interned string (istr) objects may be unaligned, so the low bits of the pointer +are not used for any special purpose. Having a separate category for this type +of pointer also streamlines the interpreter implementation + +### Short strings + +This 48-bit range is used for strings of zero to six bytes. These are NUL +terminated unless exactly six bytes, meaning that a literal NUL byte cannot +appear in them, but otherwise they allow arbitrary byte values. + +### Small rationals + +We use a 49-bit space for small rational numbers, with a signed 25-bit two's +complement integer numerator, and 24-bit unsigned integer denominator. + +### Runes and other small values + +Runes are symbols of up to 6 ASCII characters in length, used to implement +extensible reader syntax. (See Zisp decoder.) They cannot contain the NUL +byte, as they are NUL-terminated unless exactly six ASCII bytes in length. + +NOTE: The order in which the characters of the rune are encoded depends on +endianness. On little-endian systems (i.e. most modern architectures) the +characters will be in "reverse" order, with the first character in lowest +position, so the terminating NUL has to be searched from low to high. + +The fact that runes are limited to ASCII bytes, whose MSb is unset, opens up +some space for other small values to co-inhabit the same 48-bit value range. + +We divide this space into increasingly many potential types, with smaller and +smaller payloads, where the highest byte with a non-zero MSb determines which +size category we're in: If the highest byte has its MSb set, then the other +seven bits are a type tag, and each type has a 40-bit payload; if the second +highest byte has its MSb set, then the 14 non-MSb bits of the two high bytes +define the type, and each has a 32-bit payload; and so on. + +Unicode code points need 21 bits, so we use a 24-bit type for the Character +type. Miscellaneous values like True, False, EOF, etc. are placed in an 8-bit +type, since there will never be that many of them. + +A virtually unlimited number of user-defined enum types can fit into the types +with small payload values here: There is room for over 268 Million 16-bit types +(28-bit type tag) and over 34 Billion 8-bit types (35-bit type tag). + + +## Internal use values + +The final 51-bit range is used for various internal purposes by the Zisp +interpreter, mostly related to transparent code optimization. + + 000 :: Pointer to heap object as constant + + 001 :: Pointer to list values as constant + + 010 :: Pointer to istr object as constant + + 011 :: Immediate short string as constant + + 100 :: Local variable reference by index + + 101 :: Pointer to constant function-call expression + + 110 :: Pointer to variable function-call expression + + 111 :: Pointer to special-form or macro-call expression + +Forbidden Value #4, Positive Infinity, is avoided thanks to the fact that heap +pointers always have a non-zero heap-type tag. (See further above.) + +The first four categories simply mirror those of the previous 51-bit range, but +mark the values as being constants rather than code to evaluate. + +The remaining four categories are somewhat similar to VM instructions. + + + + diff --git a/doc/0/1-parse.md b/doc/0/1-parse.md new file mode 100644 index 0000000..101a3b6 --- /dev/null +++ b/doc/0/1-parse.md @@ -0,0 +1,595 @@ +# Parser for Code & Data + + + +Zisp s-expressions represent an extremely minimal set of data types; only that +which is necessary to strategically construct more complex values: + + +---------+--------+----------+------+ + | String | Rune | List | Nil | + +---------+--------+----------+------+ + | foobar | #name | (X ...) | () | + +---------+--------+----------+------+ + +The parser recognizes various *syntax sugar* which abbreviates verbose syntax, +and may result in special data structures (typically, a list with a rune in its +first position) which another Zisp component called the *decoder* can transform +into a rich set of value types. + +More details about syntax sugar, and the decoder, are explained later. + + +## Character Encoding + +The parser does not consume Unicode characters; it consumes bytes. Grammar is +generally constructed by bytes corresponding to ASCII characters. + +Some elements of the grammar, such as comments and quoted strings, may contain +arbitrary byte sequences, until terminated. These sequences may happen to be +valid UTF-8 text. This way, quoted strings and comments may contain Unicode +text encoded in UTF-8, but the parser does not check these for validity. + +Since comments and quoted strings may contain arbitrary byte sequences, a text +editor or other program displaying Zisp s-expressions may need to use a special +visual representation for bytes that don't represent valid text. + +The parser working on bytes rather than Unicode characters is not a limitation, +but rather a feature: It allows Zisp s-expressions to be used as a structured +data exchange format, which may contain binary data elements, without the need +to encode these in Base64 or other such text representations of binary data. +Consider the example: + + ((image.webp "") + (video.webm "")) + +All that needs to be done for this to work, is that any incidental occurrences +of the double-quote sign, and the backslash sign, are escaped with a backslash +within the `` data; all other bytes can appear verbatim in the strings. + + +## Stream Parsing + +The parser can be repeatedly invoked on a byte stream to consume the next datum +within. This does not require "unreading" or back-seeking within the stream; +the parser always reads a full datum, and stops after some byte which cleanly +terminates the currently parsed datum. + +This means Zisp s-expressions can be safely intermixed with other data within +the same byte stream. So long as the other data is consumed by some parser +which similarly stops reading at a clear boundary, the Zisp parser can then +continue operating on the same stream. Consider the example: + + ("image.webp" 8273) + + << 8273 bytes >> + + ("video.webm" 736) + + << 736 bytes >> + +The "header" for each file in this stream is a Zisp s-expression containing +information about how many bytes should be read after the header, before the +next file header appears. (The header data need to be terminated with a blank +ASCII character such as a newline; the closing parenthesis does not act as a +terminator unto itself due to the "join" syntax sugar.) + +To enable this stream parsing strategy, the parser does not use any automatic +buffering. If it did, it might inadvertently consume some bytes beyond the +currently parsed datum, leaving the stream inconsistent. + +If the parser is meant to be used on an input stream associated with expensive +system calls, such as a file handle or network socket, it's best to wrap that +stream in some intermediate object which asks the system for large chunks of +data at once, and stores the data in a buffer. + + +## Comments + +Two types of comment are supported: datum comments and line comments. + +* A semicolon followed by a tilde instructs the parser to consume one datum and + discard it. Whitespace may appear between the tilde and the datum to discard. + +* A semicolon, followed by a non-tilde byte, instructs the parser to consume and + discard bytes until a newline (ASCII Line Feed) is encountered. + + +## Value vs. Datum + +A Zisp *value* that has an *external representation* in the form of a sequence +of bytes is called a *datum*. Every datum is a value, but not every value is a +datum. In other words, a datum is a value that can be printed out as a byte +sequence which the parser can turn back into an equivalent datum. + +A value that is not a datum may nevertheless be *encoded* into one, allowing it +to have an external representation. After parsing, it needs to be *decoded* to +actually become the expected value. + +One may speak of an *external representation of a value* where the value is not +itself a datum, but has an encoding as one. The more strictly correct term for +this is: "The external representation of the datum encoding the value." + +### Syntax sugar + +The parser recognizes various *syntax sugar* to abbreviate an equivalent datum +construction, or express a datum that encodes a more complex value. + +As an example, the expression `#(x y z)` is an abbreviation for the equivalent +`(#HASH x y z)`. These are two external representations for the same datum; +after parsing, both will yield values that are indistinguishable in all but +their memory address. + +An example of syntax sugar that is not a mere abbreviation is a quoted string +which contains bytes that could not appear in a *bare* string: + + "foo bar" -> (#DQUOTE ) + +In this example, the visual token `` represents the actual string value +in program memory, which has no direct external representation in bytes because +it contains a space character. + +Those familiar with Lisp and Scheme may expect bare strings to be parsed into a +separate type called *symbol* while quoted strings are parsed directly into a +string type, but this is not the case in Zisp. + +### Decoder + +The *decoder* transforms Zisp data into values of more complex types, including +values that are not of a datum type. + +Combined with syntax sugar, this allows Zisp to offer familiar syntax elements. +For example, the expression `#(x y z)` which parses into `(#HASH x y z)` can be +decoded into an array, so the result is similar to the vector syntax of Scheme. + +Decoding also resolves datum labels, goes over bare strings to find ones that +represent a number literal, and takes care of a number of other transforms. +This offloads complexity, allowing the parser to remain extremely simple. + +See the dedicated documentation of the [decoder](2-decode.html) for more. + + +## Data types + +Following is a more in-depth explanation of each data type constructed by the +Zisp s-expression parser. + +These are in fact value types, though the term "data type" is often used due to +familiarity. A Zisp value that is a member of one of the following value types +is only a *datum* if it adheres to additional constraints as explained below. + +### String + +Strings can appear *bare* or be quoted in various ways. A quoted string is in +fact parsed into a list value with a rune in the first position to identify the +quotation variant that was parsed, and the string value in the second position; +or, in case of at-quoted strings, a special construct we will look at later. + + +-----------+-------------------------------+ + | Syntax | Parse output | + +-----------+-------------------------------+ + | |bytes| | (#PQSTR ) | + +-----------+-------------------------------+ + | "bytes" | (#DQSTR ) | + +-----------+-------------------------------+ + | @_bytes_ | (#ATSTR ) | + +-----------+-------------------------------+ + +The visual token `` denotes the actual string, as a Zisp value, in the +second position of the list. The visual token `` stands for a Zisp +integer value between 0 and 254. + +These external representations of strings will be explained in more detail +further below, including backslash escape sequences allowed within, and how +exactly at-quoted strings work. + +Strings have a fixed length, counted in bytes. Each byte can have any value, +including zero (ASCII NUL). The parser reads bytes, not Unicode characters; a +string may contain UTF-8 byte sequences, but these are not tested for validity. + +A string that is up to 255 bytes long is automatically *interned*, meaning any +occurrence of the same string -- equal in length and containing the same byte +values -- ends up being represented by the same bit-pattern; either a memory +address, or an immediate representation within a CPU word for short strings. +The quotation method is inconsequential to this process; for example, while +`|foo bar|` and `"foo bar"` will parse into different list values, the actual +string they hold a reference to will be the same one in program memory. This +behavior is however configurable and can be disabled entirely for cases where +large numbers of arbitrary binary strings are being parsed. + +Strings of length greater than 255 bytes are stored separately in memory, even +if they are equal in length and content. + +### Rune + +A rune is represented by an ASCII character sequence of 1 to 6 bytes, that must +begin with a letter, and may only contain letters and digits. This character +sequence of letters and digits is called the *name* of the rune. A rune that +follows this constraint is valid as a datum. + +Zisp code may explicitly construct values of the rune type that violate the +above constraints. Such runes are not valid data and cannot be printed or +parsed. + +Runes are case-sensitive, and the parser always emits runes using upper-case +letters when expressing syntax sugar. Uppercase rune names are reserved for +Zisp's internal use and standard library; users can use lowercase runes with +custom meaning without worrying about clashes, with the exception of a small +number of lowercase runes such as `#true` and `#false` that are part of the +default decoder settings and documented explicitly as such. + +Runes are always stored directly in a CPU word; never by memory address. + +### List + +A list is a contiguous array of one or more values in memory, whose length may +be encoded directly within the pointer to the head of the array, or else the +array is terminated with a special sentinel bit-pattern that is not otherwise +valid as a Zisp value. + +The parser allocates a unique array in program memory for every list, and the +list as a value is then represented by the memory address of that array, with +either an exact length tag or a tag indicating that it's sentinel-terminated. + +Lists are valid data if one of the following holds true: + +* The list encodes a quoted string, datum label, or shebang line. + +* All values in the list are a valid datum. + +Further, a structure of nested list values may not contain cyclic references +back up in the structure (which would make the above definition diverge into +infinity). Such cycles must be broken up with datum labels, or else the list +cannot be considered a datum, since it cannot be printed or parsed. + +### Nil + +The Zisp nil value is a singleton and a datum. There is exactly one nil value, +used in lieu of a list of zero length; it has the external representation `()`. + + +## Quoted strings + +Three quoted string types exist: Pipe-quoted, double-quoted, and at-quoted. +This section goes into the details of each variant. + +### Pipe-quoted + +Strings can be quoted with pipes, like symbols in R7RS Scheme, which triggers +the parser to generate a list with the structure: + + (#PQSTR ) ;; is visual aid, not syntax + +The decoder, using default settings, would emit this string verbatim as a value. +Then, during code evaluation, this would be seen as an identifier. In this way, +pipe-quoted strings are equivalent to bare strings in functionality. + +It is important to understand that the decoder sits between the parser and the +[evaluator](3-eval.html), and in opposition to Lisp and Scheme tradition, it is +common for the evaluator to receive values that are not valid as a datum; here, +a string unto itself that may not be a valid datum. Yet, it is valid as an +identifier for the purposes of the evaluator. + +### Double-quoted + +Strings wrapped in the double-quote symbol parse into: + + (#DQSTR ) ;; is visual aid, not syntax + +Under default settings, the decoder would transform this into a value which, +when evaluated as code, simply yields the contained string as a value. + +### At-quoted + +This is a special type of syntax for "raw" strings, meaning that no backslash +escapes nor any other kind of escape sequence are recognized within them. + +The syntax begins with an at sign, followed by any byte. That byte becomes a +termination marker, and the string cannot contain an occurrence of it, since +there are no escape sequences. The byte value 255 has a special meaning; see +further below. + + @"foo \ bar" -> (#ATSTR ) + +The visual tokens `` and `` represent an integer and string +value, respectively. Here, the integer would be 34, which is the ASCII value +for a double-quote sign. The string contains a literal backslash, since there +is no backslash escape parsing. + +This style of quoting can be useful, for instance, when representing regular +expressions as strings in code: + + ;; Matches e.g. foo\bar.["blah"] + + @/^foo\\(bar|baz)\.\[".*"\]$/ + +Were it not for this syntax, this regular expression would only be possible to +represent through a quoted string such as the following: + + ;; Same as above, but so many backslashes + + "^foo\\\\(bar|baz)\\t\\[\".*\"\\]$" + +The byte that follows the at sign need not be a printable character or even a +valid ASCII byte; it can be absolutely any byte value, even NUL. This can be +useful to easily encode binary data which is known to not contain a specific +byte; an example would be C strings which cannot contain NUL. + +If however the byte value is 255, then it does not stand for a sentinel, but +rather indicates that 6 more bytes follow, interpreted as a big-endian 48-bit +integer, which is the count of bytes making up the contents of the string. + +Example sequence of bytes, represented as a mixture of ASCII and raw integers: + + '@' 255 0 0 0 0 2 100 <612 bytes> -> (#ATSTR ) + +One may ask why the length is not included in the list. This is unnecessary, +since strings in Zisp already carry length information in their own metadata +structure. + + +### Backslash escapes + +In pipe-quoted and double-quoted strings, the following ASCII characters may +follow a backslash to insert a certain character. + + +-------+----------------------------+ + | Char | Meaning | + +-------+----------------------------+ + | \ | Literal backslash | + +-------+----------------------------+ + | | | Literal pipe symbol | + +-------+----------------------------+ + | " | Literal double-quote | + +-------+----------------------------+ + | 0 | ASCII NUL | + +-------+----------------------------+ + | a | ASCII Alert | + +-------+----------------------------+ + | b | ASCII Backspace | + +-------+----------------------------+ + | t | ASCII Tab (Horizontal) | + +-------+----------------------------+ + | n | ASCII Newline (Line Feed) | + +-------+----------------------------+ + | v | ASCII Vertical Tab | + +-------+----------------------------+ + | f | ASCII Form Feed | + +-------+----------------------------+ + | r | ASCII Carriage Return | + +-------+----------------------------+ + | e | ASCII Escape | + +-------+----------------------------+ + +In words: + +* A backslash, followed by a backslash, pipe, or double-quote character, is + substituted with a literal occurrence of that character. + +* The characters 0, a, b, t, n, v, f, r, and e have the same meanings as in the + C programming language, representing common ASCII control characters. + +Further, the following Regular Expression patterns following a backslash have +special meaning. + + +---------------------+-----------------------+ + | Regular Expression | Meaning | + +---------------------+-----------------------+ + | [\t ]*\n[\t ]* | Discarded | + +---------------------+-----------------------+ + | x([0-9a-fA-F]{2})*; | Arbitrary bytes | + +---------------------+-----------------------+ + | u[0-9a-fA-F]+; | Unicode Scalar Value | + +---------------------+-----------------------+ + +Explanations: + +* A backslash followed by any number of blanks (space or tab), a newline, and + again any number of blanks, is substituted with nothing. This is to allow + splitting a string into multiple lines for human readability. + + (define p "This paragraph has been visually split into multiple \ + lines, but the newline is escaped, so it's one line.") + +* An x, followed by pairs of hexadecimal digits (case insensitive), terminated + by a semicolon, is substituted with the sequence of bytes represented by the + corresponding pairs of hexadecimal digits. E.g.: `"foo\xDEADBEEF;bar"` + +* A u, followed by a hexadecimal digit sequence (case insensitive), terminated + by a semicolon, is substituted with the canonical UTF-8 byte sequence for the + Unicode Scalar Value represented by that hexadecimal number. The number must + be in the range `0` to `10FFFF`. E.g.: `"foo\u00A0;bar"` + +### Newlines in strings + +Normally, a newline in a string has no special meaning and simply becomes part +of the string. However, newlines can be backslash-escaped, which simple erases +them; the escaped newline can also be preceded or followed by any number of tab +and space characters, which are all stripped as well. (Note: It's not blanks +preceding the backslash that are stripped, but blanks following the backslash +and preceding the newline; i.e., blanks at the end of the line.) + +Following are some examples of how multi-line strings can appear in source code +with different intentions and meanings: + + (define paragraph "This paragraph has been visually split into multiple \ + lines, but the newlines are escaped, so it's one line.") + + (define json-object '| ;; use '|| so double-quotes need no escaping + { + "key": "value" + } + |) + +The second example is actually slightly problematic. It begins with a newline, +which may be undesirable, but escaping that newline would cause the first line +to have no indentation, thus the opening `{` would not line up with the closing +`}` when this string is printed out. Further, if the entire block of code is +indented, then the string contents may be more indented than intended. (No pun +or rhyme intended.) Consider: + + (let ((foo one)) + (let ((bar two)) + (let ((json-object '| + { + "key": "value" + } + |)) + (do-whatever)))) + +The string bound to `json-object` has redundant indentation. Should the parser +attempt to solve this issue? + +Thankfully, we have the decoder to handle such complexities. Under the default +settings, the rune `#HASH` is bound to a decoder rule which detects a payload +value that is a string literal, and implements the same algorithm as seen in +Java 15 Text Blocks: [JEP 378: Text Blocks](https://openjdk.org/jeps/378) + +Thus, we can do the following: + + (let ((foo one)) + (let ((bar two)) + (let ((json-object #| + ........... { + ........... "key": "value" + ........... } + ...........|)) + (do-whatever)))) + +(Dots represent whitespace that is deleted. The initial newline is, as well.) + +The only feature Zisp does not offer is a way to fence off multi-line strings +with a longer token such as `"""` as seen in Python and Java, or an arbitrary +word as seen in Bourne shell and PHP "here doc" syntax. + +However, if a programmer truly wanted to have arbitrary text blocks in code, +without needing to escape anything in them, it's possible to abuse at-quoted +string syntax, using it with an ASCII control character which is displayed +visibly by a text editor. In the following, the characters `^\` are meant to +represent a literal ASCII File Separator character in the source code: + + (define json-object #@^\ + { + "key": "value" + } + ^\) + +It works fine in Emacs, so why not? Use `C-q C-\` to insert the `^\`. + +This is indeed quite an eldritch syntax, but hopefully most programs would not +need to use it. + + +## Other syntax + +The following table summarizes commonly useful syntax abbreviations: + + [...] -> (#SQUARE ...) #datum -> (#HASH datum) + + {...} -> (#BRACE ...) #rune(...) -> (#rune ...) + + 'datum -> (#QUOTE datum) dat1dat2 -> (#JOIN dat1 dat2) + + `datum -> (#GRAVE datum) dat1.dat2 -> (#DOT dat1 dat2) + + ,datum -> (#COMMA datum) dat1:dat2 -> (#COLON dat1 dat2) + +Notes: + +* The terms datum, dat1, and dat2 each refer to an arbitrary datum; ellipsis + means zero or more data. + +* The `#datum` form only applies when the datum following the hash sign is + anything other than a bare string, since otherwise this would be ambiguous + with a rune literal. A bare string can nevertheless follow the hash sign by + separating the two with a backslash: + + #\string -> (#HASH string) + +* Though not represented in the table due to notational difficulty, the form + `#rune(...)` doesn't require a list in the second position; any datum that + works with the `#datum` syntax also works with `#rune`. + + #rune1#rune2 -> (#rune1 #rune2) + + #rune\string -> (#rune string) + + #rune'string -> (#rune (#QUOTE string)) + + #rune"string" -> (#rune (#DQSTR |string|)) + + As a counter-example, following a rune immediately with a bare string isn't + possible without the delimiting backslash, since that would be ambiguous: + + #abcdefgh ;Could be (#abcdef gh) or (#abcde fgh) or ... + +* Syntax sugar can combine arbitrarily. Some examples follow. Any of these may + or may not actually have a meaning in code; some might simply end up producing + an error during decoding, or later evaluation of code. + + #{...} -> (#HASH (#BRACE ...)) + + #'foo -> (#HASH (#QUOTE foo)) + + ##'[...] -> (#HASH (#HASH (#QUOTE (#SQUARE ...)))) + + {x y}[i j] -> (#JOIN (#BRACE x y) (#SQUARE i j)) + + foo.bar.baz{x y} -> (#JOIN (#DOT (#DOT foo bar) baz) (#BRACE x y)) + +* Those used to thinking in Lisp and Scheme may think that `(#QUOTE ...)` halts + further decoding of enclosed data. This is not so, since quoting is related + to code evaluation, not decoding. + +### Datum labels + +Valid data cannot be cyclic, since that would mean it has infinite length in +bytes. To externally represent a value with cyclic structure, one uses datum +labels in the data encoding of the value. + +A datum label either wraps another datum to assign a number to it, or contains +just a reference to a previous assignment. + + +------------------+----------------------------+ + | Syntax | Internal datum structure | + +------------------+----------------------------+ + | #%= | (#LABEL ) | + +------------------+----------------------------+ + | #%% | (#LABEL ) | + +------------------+----------------------------+ + +In this visual, the token `` stands for a hexadecimal digit sequence, the +token `` stands for any other datum, and `` is a stand-in for a +number value; that which is represented by ``. + +For clarity, concrete examples follow: + + +-------------------+------------------------------+ + | Byte sequence | Parse result | + +-------------------+------------------------------+ + | #%1234abcd=(foo) | (#LABEL <0x1234abcd> (foo)) | + +-------------------+------------------------------+ + | #%1234abcd% | (#LABEL <0x1234abcd>) | + +-------------------+------------------------------+ + +Here, the visual token `<0x1234abcd>` stands for a Zisp value of a numeric type +with an integer value. Note that the decoder may not accept a bare string here, +meaning this syntax sugar is not merely an abbreviation. + +### Shebang + +Finally, the parser recognizes the Unix *shebang* syntax and outputs a datum to +hold the string values found within: + + #!interpreter -> (#SHBANG interpreter) + + #!interpreter argline -> (#SHBANG interpreter argline) + +When executing a script file, Zisp simply stores this into a global value that +may be inspected if desired. + + + diff --git a/doc/0/2-decode.md b/doc/0/2-decode.md new file mode 100644 index 0000000..1a45824 --- /dev/null +++ b/doc/0/2-decode.md @@ -0,0 +1,44 @@ +# Decoding + +A separate process called "decoding" can transform simple data structures, +consisting of only the base datum types, into a richer set of Zisp types. + +For example, the decoder may turn `(#HASH ...)` into a vector, as one would +expect a vector literal like `#(...)` to work in Scheme. Bytevector syntax +could use a custom rune as a list prefix, like: `#u8(...)` + +Runes may be decoded in isolation as well, rather than transforming a list +whose head they appear in. This can implement Boolean constants as `#true` +and `#false` or `#t` and `#f`. + +The decoder recognizes `(#QUOTE ...)` to aid in implementing the traditional +quoting mechanism of Lisp/Scheme, but with a significant difference: + +Traditional quote is "unhygienic" in Scheme terms. An expression such as +`'(foo bar)` will always be read as `(quote (foo bar))` regardless of what +lexical context it appears in, so the semantics will depend on whatever the +identifier `quote` is bound to, meaning that the expression may end up +evaluating to something other than the list `(foo bar)`. + +The Zisp decoder, which transforms not datum to datum, but object to object, +can turn `#QUOTE` into an object which encapsulates the notion of quoting, +which the Zisp evaluator can recognize and act upon, ensuring that an +expression like `'(foo bar)` always turns into the list `(foo bar)`. + +One way to think about this, in Scheme (R6RS / syntax-case) terms, is that +expressions like `'(foo bar)` turn directly into a syntax object when read, +and the created syntax object begins with an identifier bound to `quote` in +the standard library. + +The decoder is, of course, configurable and extensible. The transformations +mentioned above would be performed only when it's told to decode data which +represents Zisp code. The decoder may be given a different configuration, +telling it to decode, for example, data which represents a different kind of +domain-specific data, such as application settings, build system commands, +complex data records with non-standard data types, and so on. + + diff --git a/doc/0/grammar/abnf.txt b/doc/0/grammar/abnf.txt new file mode 100644 index 0000000..5ab3c89 --- /dev/null +++ b/doc/0/grammar/abnf.txt @@ -0,0 +1,139 @@ +; Standards-compliant ABNF (RFC 5234, RFC 7405) + +; Compatible with: https://www.quut.com/abnfgen/ + +; Unlike PEG, grammar rules in BNF are non-deterministic, which makes +; it much more challenging to express our naive parse logic. Whether +; this ABNF file is truly accurate is difficult to assess. + +; The abnfgen(1) tool linked above can be used to generate arbitrary +; strings matching the grammar in this file. These can be fed into +; the Zisp parser to reveal some potential bugs; either in the parser +; itself, or this ABNF grammar. + +; Note that the tool may generate Zisp string literals with Unicode +; escape sequences corresponding to surrogate code points; the parser +; may reject these. This is expected; it's difficult to rewrite this +; ABNF grammar to exclude those Unicode values. + +; Other minor inaccuracies that aren't important include: This ABNF +; forces line comments to be terminated with an LF character, when in +; fact the end-of-file may also terminate them; the same applies to +; hash-bang parsing which doesn't actually have to end in LF. These +; discrepancies won't make abnfgen(1) generate invalid strings; they +; only make this ABNF more strict than the Zisp parser, so it won't +; generate some strings that the parser would actually accept. + + +Stream = [ Unit *( Blank Unit ) ] *Blank [Trail] + + +Unit = *Blank Datum + +Blank = HTAB / LF / %x0b / %x0c / CR / SP / Comment + +Trail = SkipLine / SkipUnit / ";" "~" *Blank + + +Datum = BareString / SpecialStr / CladDatum / Rune / RuneStr + / RuneDotStr / RuneClad / LabelRef / LabelDef / HashStr + / HashDotStr / HashClad / QuoteExpr / JoinExpr + +Comment = SkipLine LF / SkipUnit Blank + +SkipLine = ";" [ SkipLStart *AnyButLF ] + +SkipUnit = ";" "~" Unit + +SkipLStart = %x00-09 / %x0b-7d / %x7f-ff ; any but LF or "~" + +AnyButLF = %x00-09 / %x0b-ff + + +BareString = BareChar *( BareChar / Numeric ) + +SpecialStr = SpecStrChar *( SpecStrChar / BareChar ) + +CladDatum = "|" *( PipeStrChar / "\" StringEsc ) "|" + / DQUOTE *( QuotStrChar / "\" StringEsc ) DQUOTE + / "(" List ")" + / "[" List "]" + / "{" List "}" + +Rune = "#" RuneName + +RuneStr = "#" RuneName "\" BareString + +RuneDotStr = "#" RuneName "\" SpecialStr + +RuneClad = "#" RuneName CladDatum + +HashBang = "#" "!" *( SP / HTAB ) HBLine LF + +LabelRef = "#" "%" Label "%" + +LabelDef = "#" "%" Label "=" Datum + +HashStr = "#" "\" BareString + +HashDotStr = "#" "\" SpecialStr + +HashClad = "#" CladDatum + +QuoteExpr = "'" Datum + / "`" Datum + / "," Datum + +JoinExpr = Datum RJoinDatum + / LJoinDatum NoStartDot + / Datum ":" Datum + / NoEndDot "." Datum + + +BareChar = "!" / "$" / "%" / "&" / "*" / "/" / "<" / "=" / ">" + / "?" / "^" / "_" / "~" / ALPHA + +Numeric = "+" / "-" / DIGIT + +SpecStrChar = "." / ":" / Numeric + +PipeStrChar = %x00-5b / %x5d-7b / %x7d-ff ; any but "|" or "\" + +QuotStrChar = %x00-21 / %x23-5b / %x5d-ff ; any but DQUOTE or "\" + +StringEsc = "\" / "|" / DQUOTE / *( HTAB / SP ) LF *( HTAB / SP ) + / %s"a" / %s"b" / %s"t" / %s"n" + / %s"v" / %s"f" / %s"r" / %s"e" + / %s"x" *( 2HEXDIG ) ";" + / %s"u" ["0"] 1*5HEXDIG ";" + / %s"u" "1" "0" 4HEXDIG ";" + +List = [ Unit *( Blank Unit ) ] *Blank [SkipUnit] + + +RuneName = ALPHA *5( ALPHA / DIGIT ) + +Label = 1*12( HEXDIG ) + +HBLine = 1*HBChar [ 1*( SP / HTAB ) *HBChar ] + +HBChar = %x00-08 / %x0b-1f / %x21-ff ; any but HT, LF, SP + + +RJoinDatum = CladDatum / Rune / RuneStr / RuneDotStr / RuneClad + / LabelRef / LabelDef / HashStr / HashDotStr / HashClad + / QuoteExpr + +LJoinDatum = CladDatum / RuneClad / LabelRef / HashClad + +NoStartDot = BareString / CladDatum / Rune / RuneStr / RuneDotStr + / RuneClad / LabelRef / LabelDef / HashStr / HashDotStr + / HashClad / QuoteExpr + +NoEndDot = BareString / Rune / RuneStr / RuneClad / LabelRef + / HashStr / HashClad + + +;; Local Variables: +;; eval: (flyspell-mode -1) +;; End: diff --git a/doc/0/grammar/index.md b/doc/0/grammar/index.md new file mode 100644 index 0000000..e3716ea --- /dev/null +++ b/doc/0/grammar/index.md @@ -0,0 +1,115 @@ +# Zisp S-Expression Grammar + +The grammar is available in several different formats: + +* [ZBNF](zbnf.txt): See below for the rules of this notation +* [ABNF](abnf.txt): Compatible with the `abnfgen` tool +* [PEG](peg.txt): Compatible with `peg/leg` tool + + +## ZBNF notation + +The ZBNF grammar specification uses a BNF-like notation with PEG-like +semantics: + +* Concatenation of expressions is implicit: `foo bar` means `foo` + followed by `bar`. + +* Parentheses are used for grouping, and the pipe symbol `|` is used + for alternatives. + +* The suffixes `?`, `*`, and `+` have the same meaning as in regular + expressions, although `[foo]` is used in place of `(foo)?`. + +* The syntax is defined in terms of bytes, not characters. Terminals + `'c'` and `"c"` refer to the ASCII value of the given character `c`. + Standard C escape sequences are supported. + +* The prefix `~` means NOT. It only applies to rules that match one + byte, and negates them. For example, `~( 'a' | 'b' )` matches any + byte other than 'a' and 'b'. + +* Ranges of terminal values are expressed as `x...y` (inclusive). + +* ABNF "core rules" like `ALPHA` and `HEXDIG` are supported. + +* There is no ambiguity, or look-ahead / backtracking beyond one byte. + Rules match left to right, depth-first, and greedy. As soon as the + input matches the first terminal of a rule --explicit or implied by + recursively descending into the first non-terminal-- it must match + that rule to the end or a syntax error is reported. + +The last point makes the notation simple to translate to code. + + +## Limitations outside the grammar + +The following limits are not represented in the grammar: + +* A `UnicodeSV` is the hexadecimal representation of a Unicode scalar + value; it must represent a value in the range 0 to D7FF, or E000 to + 10FFFF, inclusive. Any other value signals an error. Valid values + are converted into a UTF-8 byte sequence encoding the value. + +* A `Rune` longer than 6 bytes is grammatical, but signals an error. + This is important because runes are not self-terminating; defining + their grammar as ending after a maximum of 6 bytes would allow + another datum beginning with an alphabetic character to follow a + rune immediately without any visual delineation, which would be + terribly confusing for a human reader. Consider: `#foobarbaz`. + This would parse as a `Datum` joining `#foobar` and `baz`. + + (The ABNF does not suffer from this issue, since it explicitly + enumerates the join possibilities anyway.) + +* A `Label` is the hexadecimal representation of a 48-bit integer, + meaning it allows for a maximum of 12 hexadecimal digits. Longer + values are grammatical, but signal an out-of-range error, so as to + avoid signaling a confusing "invalid character" error on input that + appears grammatical. Consider: `#%123456789abcd=foo`. This would + signal an invalid character error at the letter `d` if the grammar + limited a `Label` to 12 hexadecimal digits. + + (As above, the ABNF doesn't care about this. You probably don't + want to use the ABNF to generate a parser anyway.) + + +## At-quoted strings + +The mechanism of at-quoted strings is not represented in any of the +grammars, since it essentially has 256 variants. Representing it +sanely in a grammar requires the ability to save and reference +variables. + + +## Stream-parsing strategy + +The parser consumes one `Unit` from the input stream every time it's +called; it returns the `Datum` therein if found, or else it returns +the Zisp EOF token. + +Since a `Datum` is not self-terminating, the parser must read beyond +it to realize that it has ended (if not followed by the EOF). Thus, +it will consume one more `Blank` following the `Unit` that it parsed. +If this `Blank` is a comment, it will be consumed entirely, ensuring +that parsing resumes properly on a subsequent parser call on the same +input stream, without needing to store any state in between. + +Since comments of type `SkipUnit` are likewise not self-terminating, +an arbitrary number of chained `SkipUnit` comments may need to be +consumed before the parser is finally allowed to return. + +The following illustration shows the positions at which the parser +will stop consuming input when called repeatedly on the same input +stream. The dots represent the extent of each `Unit` being parsed, +while the caret points at the last byte the parser will consume in +that parse cycle. + +``` +foo (bar)[baz] foo;~bar foo;~bar;~baz;~bat foobar +...^..........^... ^... ^......^ +``` + +Notice how, in the fourth cycle, the parser is forced to consume all +commented-out units before it can return, since it would otherwise +leave the stream in an inappropriate state. diff --git a/doc/0/grammar/peg.txt b/doc/0/grammar/peg.txt new file mode 100644 index 0000000..1541da6 --- /dev/null +++ b/doc/0/grammar/peg.txt @@ -0,0 +1,91 @@ +# Standard PEG notation + +Stream <- Unit ( Blank Unit )* !. + + +Unit <- Blank* Datum + +Blank <- [\t-\r ] / Comment + + +Datum <- OneDatum ( JoinChar? OneDatum )* + +JoinChar <- '.' / ':' + + +Comment <- ';' ( SkipUnit / SkipLine ) + +SkipUnit <- '~' Unit + +SkipLine <- (!'\n' .)* '\n'? + + +OneDatum <- BareString / CladDatum + + +BareString <- SpecBareChar ( BareChar / JoinChar )* + / BareChar+ + +SpecBareChar <- '+' / '-' / JoinChar / DIGIT + +BareChar <- ALPHA / DIGIT + / '!' / '$' / '%' / '&' / '*' / '+' / '-' / '/' + / '<' / '=' / '>' / '?' / '^' / '_' / '~' + + +CladDatum <- PipeStr / QuoteStr / HashExpr / QuoteExpr / List + +PipeStr <- '|' ( PipeStrChar / '\' StringEsc )* '|' +QuoteStr <- '"' ( QuotStrChar / '\' StringEsc )* '"' +HashExpr <- '#' HashExprs +QuoteExpr <- "'" Datum / '`' Datum / ',' Datum +List <- ParenList / SquareList / BraceList + + +PipeStrChar <- (![|\\] .) +QuotStrChar <- (!["\\] .) + +StringEsc <- '\' / '|' / '"' / ( HTAB / SP )* LF ( HTAB / SP )* + / '0' / 'a' / 'b' / 't' / 'n' / 'v' / 'f' / 'r' / 'e' + / 'x' HexByte* ';' + / 'u' UnicodeSV ';' + +HexByte <- HEXDIG HEXDIG +UnicodeSV <- HEXDIG+ + + +HashExprs <- '!' [\t ]* HBangLine '\n'? + / '%' Label ( '%' / '=' Datum ) + / '\' BareString / CladDatum + / Rune ( '\' BareString / CladDatum )? + +HBangLine <- HBChars+ [\t ]* ( HBChars+ )? +HBChars <- (![\t\n ] .) +Label <- HEXDIG+ +Rune <- ALPHA ( ALPHA / DIGIT )* + + +ParenList <- '(' Unit* ')' +SquareList <- '[' Unit* ']' +BraceList <- '{' Unit* '}' + + +DIGIT <- [0-9] +ALPHA <- [a-zA-Z] +HEXDIG <- [0-9a-fA-F] + + +# Keep this in sync line-for-line with the ZBNF grammar for easy +# comparison between the two. + +# This file is meant to be compatible with: +# https://piumarta.com/software/peg + +# Due to a quirk in the peg tool this file is used with, the grammar +# must not allow an empty stream. Therefore, the Unit rule has its +# Datum declared as mandatory rather than optional. + + +# Local Variables: +# eval: (flyspell-mode -1) +# End: diff --git a/doc/0/grammar/zbnf.txt b/doc/0/grammar/zbnf.txt new file mode 100644 index 0000000..c04b813 --- /dev/null +++ b/doc/0/grammar/zbnf.txt @@ -0,0 +1,75 @@ +; Custom notation with PEG semantics + +Stream : Unit ( Blank Unit )* + + +Unit : Blank* [Datum] + +Blank : '\t'...'\r' | SP | Comment + + +Datum : OneDatum ( [JoinChar] OneDatum )* + +JoinChar : '.' | ':' + + +Comment : ';' ( SkipUnit | SkipLine ) + +SkipUnit : '~' Unit + +SkipLine : ( ~LF )* [LF] + + +OneDatum : BareString | CladDatum + + +BareString : SpecBareChar ( BareChar | JoinChar )* + | BareChar+ + +SpecBareChar : '+' | '-' | JoinChar | DIGIT + +BareChar : ALPHA | DIGIT + | '!' | '$' | '%' | '&' | '*' | '+' | '-' | '/' + | '<' | '=' | '>' | '?' | '^' | '_' | '~' + + +CladDatum : PipeStr | QuoteStr | HashExpr | QuoteExpr | List + +PipeStr : '|' ( PipeStrChar | '\' StringEsc )* '|' +QuoteStr : '"' ( QuotStrChar | '\' StringEsc )* '"' +HashExpr : '#' HashExprs +QuoteExpr : "'" Datum | '`' Datum | ',' Datum +List : ParenList | SquareList | BraceList + + +PipeStrChar : ~( '|' | '\' ) +QuotStrChar : ~( '"' | '\' ) + +StringEsc : '\' | '|' | '"' | ( HTAB | SP )* LF ( HTAB | SP )* + | '0' | 'a' | 'b' | 't' | 'n' | 'v' | 'f' | 'r' | 'e' + | 'x' HexByte* ';' + | 'u' UnicodeSV ';' + +HexByte : HEXDIG HEXDIG +UnicodeSV : HEXDIG+ + + +HashExprs : '!' ( SP | HTAB )* HBangLine [ LF ] + | '%' Label ( '%' | '=' Datum ) + | '\' BareString | CladDatum + | Rune [ '\' BareString | CladDatum ] + +HBangLine : HBChars+ ( SP | HTAB )* [ HBChars+ ] +HBChars : ~( SP | HTAB | LF ) +Label : HEXDIG+ +Rune : ALPHA ( ALPHA | DIGIT )* + + +ParenList : '(' Unit* ')' +SquareList : '[' Unit* ']' +BraceList : '{' Unit* '}' + + +;; Local Variables: +;; eval: (flyspell-mode -1) +;; End: diff --git a/doc/0/index.md b/doc/0/index.md new file mode 100644 index 0000000..f0da216 --- /dev/null +++ b/doc/0/index.md @@ -0,0 +1,34 @@ +# Chapter 0: Genesis + +This chapter explains the core value representation of Zisp, and goes +through the processes of parsing, decoding, running, and optionally +compiling code. + +0. [Value](0-value.html) + + Zisp uses a uniform 64-bit representation for all values, densely + packed into signaling or non-canonical NaN values. + +1. [Parse](1-parse.html) (see also [grammar](grammar/)) + + The parser receives a stream of bytes and transforms them into a + minimal set of data types with very little processing. + +2. [Decode](2-decode.html) + + The decoder runs configurable and extensible pre-processing steps + over data received from the parser, enriching it with more complex + value types, and handling primitive source code transforms. + +3. [Eval](3-eval.html) + + Code is evaluated within a mutable module context, on which it can + have side effects such as creating new definitions or establishing + links to other modules. + +4. [Compile](4-compile.html) + + Procedures from within the compiler module can be used to demand + the compilation of other modules, with various options, yielding + static or dynamic object files. These may be loaded immediately, + replacing the previously uncompiled module code in memory. diff --git a/doc/c1/1-parse.md b/doc/c1/1-parse.md deleted file mode 100644 index d4c4c2e..0000000 --- a/doc/c1/1-parse.md +++ /dev/null @@ -1,608 +0,0 @@ -# Parser for Code & Data - - - -Zisp s-expressions represent an extremely minimal set of data types; only that -which is necessary to strategically construct more complex values: - - +-------+---------+--------+----------+------+ - | TYPE | String | Rune | Pair | Nil | - +-------+---------+--------+----------+------+ - | E.G. | foobar | #name | (X & Y) | () | - +-------+---------+--------+----------+------+ - -The parser recognizes various *syntax sugar* which abbreviates verbose syntax, -and may result in special data structures (typically, a pair with a rune in its -first, and payload in its second position) which another Zisp component called -the *decoder* can transform into a rich set of value types. - -The most ubiquitous syntax sugar is the list, which abbreviates a sequence of -tail-linked pairs, terminated with a special nil value represented as `()`: - - (x) -> (x & ()) - - (x y) -> (x & (y & ())) - - (x y z) -> (x & (y & (z & ()))) - -The following are so-called *improper lists*, ending in a non-nil value: - - (x y & z) -> (x & (y & z)) - - (x y z & t) -> (x & (y & (z & t))) - -More details about syntax sugar, and the decoder, are explained later. - - -## Character Encoding - -The parser does not consume Unicode characters; it consumes bytes. Grammar is -generally constructed by bytes corresponding to ASCII characters. - -Some elements of the grammar, such as comments and quoted strings, may contain -arbitrary byte sequences, until terminated. These sequences may happen to be -valid UTF-8 text. This way, quoted strings and comments may contain Unicode -text encoded in UTF-8, but the parser does not check these for validity. - -Since comments and quoted strings may contain arbitrary byte sequences, a text -editor or other program displaying Zisp s-expressions may need to use a special -visual representation for bytes that don't represent valid text. - -The parser working on bytes rather than Unicode characters is not a limitation, -but rather a feature: It allows Zisp s-expressions to be used as a structured -data exchange format, which may contain binary data elements, without the need -to encode these in Base64 or other such text representations of binary data. -Consider the example: - - ((image.webp "") - (video.webm "")) - -All that needs to be done for this to work, is that any incidental occurrences -of the double-quote sign, and the backslash sign, are escaped with a backslash -within the `` data; all other bytes can appear verbatim in the strings. - - -## Stream Parsing - -The parser can be repeatedly invoked on a byte stream to consume the next datum -within. This does not require "unreading" or back-seeking within the stream; -the parser always reads a full datum, and stops after some byte which cleanly -terminates the currently parsed datum. - -This means Zisp s-expressions can be safely intermixed with other data within -the same byte stream. So long as the other data is consumed by some parser -which similarly stops reading at a clear boundary, the Zisp parser can then -continue operating on the same stream. Consider the example: - - ("image.webp" 8273) - - << 8273 bytes >> - - ("video.webm" 736) - - << 736 bytes >> - -The "header" for each file in this stream is a Zisp s-expression containing -information about how many bytes should be read after the header, before the -next file header appears. (The header data need to be terminated with a blank -ASCII character such as a newline; the closing parenthesis does not act as a -terminator unto itself due to the "join" syntax sugar.) - -To enable this stream parsing strategy, the parser does not use any automatic -buffering. If it did, it might inadvertently consume some bytes beyond the -currently parsed datum, leaving the stream inconsistent. - -If the parser is meant to be used on an input stream associated with expensive -system calls, such as a file handle or network socket, it's best to wrap that -stream in some intermediate object which asks the system for large chunks of -data at once, and stores the data in a buffer. - - -## Comments - -Two types of comment are supported: datum comments and line comments. - -* A semicolon followed by a tilde instructs the parser to consume one datum and - discard it. Whitespace may appear between the tilde and the datum to discard. - -* A semicolon, followed by a non-tilde byte, instructs the parser to consume and - discard bytes until a newline (ASCII Line Feed) is encountered. - - -## Value vs. Datum - -A Zisp *value* that has an *external representation* in the form of a sequence -of bytes is called a *datum*. Every datum is a value, but not every value is a -datum. In other words, a datum is a value that can be printed out as a byte -sequence which the parser can turn back into an equivalent datum. - -A value that is not a datum may nevertheless be *encoded* into one, allowing it -to have an external representation. After parsing, it needs to be *decoded* to -actually become the expected value. - -One may speak of an *external representation of a value* where the value is not -itself a datum, but can be encoded as one. The more strictly correct term for -this is: "The external representation of a datum that encodes the value." - -### Syntax sugar - -The parser recognizes various *syntax sugar* to abbreviate an equivalent datum -construction, or express a datum that encodes a more complex value. - -As an example, the expression `#(x y z)` is an abbreviation for the equivalent -`(#HASH x y z)`. These are two external representations for the same datum; -after parsing, both will yield values that are indistinguishable in all but -their memory address. - -An example of syntax sugar that is not a mere abbreviation is a quoted string -which contains bytes that could not appear in a *bare* string: - - "foo bar" -> (#DQUOTE & ) - -In this example, the visual token `` represents the actual string value -in program memory, which has no direct external representation in bytes because -it contains a space character. - -Those familiar with Lisp and Scheme may expect bare strings to be parsed into a -separate type called *symbol* while quoted strings are parsed directly into a -string type, but this is not the case in Zisp. - -### Decoder - -The *decoder* transforms Zisp data into values of more complex types, including -values that are not of a datum type. - -Combined with syntax sugar, this allows Zisp to offer familiar syntax elements. -For example, the expression `#(x y z)` which parses into `(#HASH x y z)` can be -decoded into an array, so the result is similar to the vector syntax of Scheme. - -Decoding also resolves datum labels, goes over bare strings to find ones that -represent a number literal, and takes care of a number of other transforms. -This offloads complexity, allowing the parser to remain extremely simple. - -See the dedicated documentation of the [decoder](2-decode.html) for more. - - -## Data types - -Following is a more in-depth explanation of each data type constructed by the -Zisp s-expression parser. - -These are in fact value types, though the term "data type" is often used due to -familiarity. A Zisp value that is a member of one of the following value types -is only a *datum* if it adheres to additional constraints as explained below. - -### String - -Strings can appear *bare* or be quoted in various ways. A quoted string is in -fact parsed into a pair value with a rune in the first position to identify the -quotation variant that was parsed, and the string value in the second position; -or, in case of at-quoted strings, a special construct we will look at later. - - +-----------+-----------------------------+ - | Syntax | Parse output | - +-----------+-----------------------------+ - | |bytes| | (#PQSTR & ) | - +-----------+-----------------------------+ - | "bytes" | (#DQSTR & ) | - +-----------+-----------------------------+ - | @_bytes_ | (#ATSTR & ) | - +-----------+-----------------------------+ - -The visual token `` denotes the actual string, as a Zisp value, in the -second position of the pair. The visual token `` stands for an integer -Zisp value between 0 and 255. - -These external representations of strings will be explained in more detail -further below, including backslash escape sequences allowed within, and how -exactly at-quoted strings work. - -Strings have a fixed length, counted in bytes. Each byte can have any value, -including zero (ASCII NUL). The parser reads bytes, not Unicode characters; a -string may contain UTF-8 byte sequences, but these are not tested for validity. - -A string that is up to 255 bytes long is automatically *interned*, meaning any -occurrence of the same string -- equal in length and containing the same byte -values -- ends up being represented by the same bit-pattern; either a memory -address, or an immediate representation within a CPU word for short strings. -The quotation method is inconsequential to this process; for example, while -`|foobar|` and `"foobar"` will parse into different pair values, the actual -string they hold will be the same one in program memory. - -Strings of length greater than 255 bytes are stored separately in memory, even -if they are equal in length and content. - -### Rune - -A rune is represented by an ASCII character sequence of 1 to 6 bytes, that must -begin with a letter, and may only contain letters and digits. This character -sequence of letters and digits is called the *name* of the rune. A rune that -follows this constraint is valid as a datum. - -Zisp code may explicitly construct values of the rune type that violate the -above constraints. Such runes are not valid data and cannot be printed or -parsed. - -Runes are case-sensitive, and the parser always emits runes using upper-case -letters when expressing syntax sugar. Uppercase rune names are reserved for -Zisp's internal use and standard library; users can use lowercase runes with -custom meaning without worrying about clashes, with the exception of a small -number of lowercase runes such as `#true` and `#false` that are part of the -default decoder settings and documented explicitly as such. - -Runes are always stored directly in a CPU word; never by memory address. - -### Pair - -A pair is a tuple of two values: the first value and the second value. In Lisp -tradition, these are also called the `car` and `cdr` of the pair, respectively. - -The parser allocates a unique two-word cell in program memory for every pair, -and represents that pair through the memory address of the cell. - -Pairs are valid data if one of the following holds true: - -* The pair encodes a quoted string, datum label, or shebang line. - -* Both the first and second value in the pair is a valid datum. - -Further, a structure of nested pair values may not contain cyclic references -back up in the structure (which would make the above definition diverge into -infinity). Such cycles must be broken up with datum labels, or else the pair -cannot be considered a datum, since it cannot be printed or parsed. - -### Nil - -The Zisp nil value is a singleton and a datum. There is exactly one nil value -and it is used to terminate a chain of pairs representing a list of values; it -has the external representation `()`. - - -## Quoted strings - -Three quoted string types exist: Pipe-quoted, double-quoted, and at-quoted. -This section goes into the details of each variant. - -### Pipe-quoted - -Strings can be quoted with pipes, like symbols in R7RS Scheme, which triggers -the parser to generate a pair with the structure: - - (#PQSTR & ) ;; is visual aid, not syntax - -The decoder, using default settings, would emit this string verbatim as a value. -Then, during code evaluation, this would be seen as an identifier. In this way, -pipe-quoted strings are equivalent to bare strings in functionality. - -It is important to understand that the decoder sits between the parser and the -[evaluator](3-execute.html), and in opposition to Lisp and Scheme tradition, it -is common for the evaluator to receive values that are not valid as a datum; in -this case, a string unto itself that may not be a valid datum, due to not being -possible to be represented as a bare string. Yet, it is valid as an identifier -for the purposes of the evaluator, since it is a string *value* like any other. - -### Double-quoted - -Strings wrapped in the double-quote symbol parse into: - - (#DQSTR & ) ;; is visual aid, not syntax - -Under default settings, the decoder would transform this into a value which, -when evaluated as code, simply yields the contained string as a value. - -### At-quoted - -This is a special type of syntax for "raw" strings, meaning that no backslash -escapes nor any other kind of escape sequence are recognized within them. - -The syntax begins with an at sign, followed by any byte. That byte becomes a -termination marker, and the string cannot contain an occurrence of it, since -there are no escape sequences. - - @"foo \ bar" -> (#ATSTR & ) - -In the above, the visual tokens `` and `` represent an integer -value and a string value, respectively. In this example, the integer value -would be 34; the ASCII value for the double-quote sign. The string value -contains a literal backslash, since there is no backslash escape parsing. - -This style of quoting can be useful, for instance, when representing regular -expressions as strings in code: - - ;; Matches e.g. foo\bar.["blah"] - - @/^foo\\(bar|baz)\.\[".*"\]$/ - -Were it not for this syntax, this regular expression would only be possible to -represent through a quoted string such as the following: - - ;; Same as above, but so many backslashes - - "^foo\\\\(bar|baz)\\t\\[\".*\"\\]$" - -The byte that follows the at sign need not be a printable character or even a -valid ASCII byte; it can be absolutely any byte value, even NUL. This can be -useful to easily encode binary data which is known to not contain a specific -byte; an example would be C strings which cannot contain NUL. - -### Backslash escapes - -In pipe-quoted and double-quoted strings, the following ASCII characters may -follow a backslash to insert a certain character. - - +-------+----------------------------+ - | Char | Meaning | - +-------+----------------------------+ - | \ | Literal backslash | - +-------+----------------------------+ - | | | Literal pipe symbol | - +-------+----------------------------+ - | " | Literal double-quote | - +-------+----------------------------+ - | 0 | ASCII NUL | - +-------+----------------------------+ - | a | ASCII Alert | - +-------+----------------------------+ - | b | ASCII Backspace | - +-------+----------------------------+ - | t | ASCII Tab (Horizontal) | - +-------+----------------------------+ - | n | ASCII Newline (Line Feed) | - +-------+----------------------------+ - | v | ASCII Vertical Tab | - +-------+----------------------------+ - | f | ASCII Form Feed | - +-------+----------------------------+ - | r | ASCII Carriage Return | - +-------+----------------------------+ - | e | ASCII Escape | - +-------+----------------------------+ - -In words: - -* A backslash followed by a backslash, pipe, or double-quote character is - substituted with a literal occurrence of that character. - -* The characters 0, a, b, t, n, v, f, r, and e have the same meanings as in the - C programming language, representing common ASCII control characters. - -Further, the following Regular Expression patterns following a backslash have -special meaning. - - +---------------------+-----------------------+ - | Regular Expression | Meaning | - +---------------------+-----------------------+ - | [\t ]*\n[\t ]* | Discarded | - +---------------------+-----------------------+ - | x([0-9a-fA-F]{2})*; | Arbitrary bytes | - +---------------------+-----------------------+ - | u[0-9a-fA-F]+; | Unicode Scalar Value | - +---------------------+-----------------------+ - -Explanations: - -* A backslash followed by any number of blanks (space or tab), a newline, and - again any number of blanks, is substituted with nothing. This is to allow - splitting a string into multiple lines for human readability. - - (define p "This paragraph has been visually split into multiple \ - lines, but the newline is escaped, so it's one line.") - -* An x, followed by pairs of hexadecimal digits (case insensitive), terminated - by a semicolon, is substituted with the sequence of bytes represented by the - corresponding pairs of hexadecimal digits. E.g.: `"foo\xDEADBEEF;bar"` - -* A u, followed by a hexadecimal digit sequence (case insensitive), terminated - by a semicolon, is substituted with the canonical UTF-8 byte sequence for the - Unicode Scalar Value represented by that hexadecimal number. The number must - be in the range `0` to `10FFFF`. E.g.: `"foo\u00A0;bar"` - -### Newlines in strings - -Normally, a newline in a string has no special meaning and simply becomes part -of the string. However, newlines can be backslash-escaped, which simple erases -them; the escaped newline can also be preceded or followed by any number of tab -and space characters, which are all stripped as well. (Note: It's not blanks -preceding the backslash that are stripped, but blanks following the backslash -and preceding the newline; i.e., blanks at the end of the line.) - -Following are some examples of how multi-line strings can appear in source code -with different intentions and meanings: - - (define paragraph "This paragraph has been visually split into multiple \ - lines, but the newlines are escaped, so it's one line.") - - (define json-object '| ;; use '|| so double-quotes need no escaping - { - "key": "value" - } - |) - -The second example is actually slightly problematic. It begins with a newline, -which may be undesirable, but escaping that newline would cause the first line -to have no indentation, thus the opening `{` would not line up with the closing -`}` when this string is printed out. Further, if the entire block of code is -indented, then the string contents may be more indented than intended. (No pun -or rhyme intended.) Consider: - - (let ((foo one)) - (let ((bar two)) - (let ((json-object '| - { - "key": "value" - } - |)) - (do-whatever)))) - -The string bound to `json-object` has redundant indentation. Should the parser -attempt to solve this issue? - -Thankfully, we have the decoder to handle such complexities. Under the default -settings, the rune `#HASH` is bound to a decoder rule which detects a payload -value that is a string literal, and implements the same algorithm as seen in -Java 15 Text Blocks: [JEP 378: Text Blocks](https://openjdk.org/jeps/378) - -Thus, we can do the following: - - (let ((foo one)) - (let ((bar two)) - (let ((json-object #| - ........... { - ........... "key": "value" - ........... } - ...........|)) - (do-whatever)))) - -(Dots represent whitespace that is deleted. The initial newline is, as well.) - -The only feature Zisp does not offer is a way to fence off multi-line strings -with a longer token such as `"""` as seen in Python and Java, or an arbitrary -word as seen in Bourne shell and PHP "here doc" syntax. - -However, if a programmer truly wanted to have arbitrary text blocks in code, -without needing to escape anything in them, it's possible to abuse at-quoted -string syntax, using it with an ASCII control character which is displayed -visibly by a text editor. In the following, the characters `^\` are meant to -represent a literal ASCII File Separator character in the source code: - - (define json-object #@^\ - { - "key": "value" - } - ^\) - -It works fine in Emacs, so why not? Use `C-q C-\` to insert the `^\`. - -This is indeed quite an eldritch syntax, but hopefully most programs would not -need to use it. - - -## Other syntax - -The following table summarizes commonly useful syntax abbreviations: - - [...] -> (#SQUARE ...) #datum -> (#HASH & datum) - - {...} -> (#BRACE ...) #rune(...) -> (#rune ...) - - 'datum -> (#QUOTE & datum) dat1dat2 -> (#JOIN dat1 & dat2) - - `datum -> (#GRAVE & datum) dat1.dat2 -> (#DOT dat1 & dat2) - - ,datum -> (#COMMA & datum) dat1:dat2 -> (#COLON dat1 & dat2) - -Notes: - -* The terms datum, dat1, and dat2 each refer to an arbitrary datum; ellipsis - means zero or more data. - -* The `#datum` form only applies when the datum following the hash sign is - anything other than a bare string, since otherwise this would be ambiguous - with a rune literal. A bare string can nevertheless follow the hash sign by - separating the two with a backslash: - - #\string -> (#HASH & string) - -* Though not represented in the table due to notational difficulty, the form - `#rune(...)` doesn't require a list in the second position; any datum that - works with the `#datum` syntax also works with `#rune`. - - #rune1#rune2 -> (#rune1 & #rune2) - - #rune\string -> (rune & string) - - #rune'string -> (#rune #QUOTE & string) - - #rune"string" -> (#rune #DQSTR & |string|) - - As a counter-example, following a rune immediately with a bare string isn't - possible without the delimiting backslash, since that would be ambiguous: - - #abcdefgh ;Could be (#abcdef & gh) or (#abcde & fgh) or ... - -* Syntax sugar can combine arbitrarily. Some examples follow. Any of these may - or may not actually have a meaning in code; many could simply end up producing - an error during decoding, or later evaluation of code. - - #{...} -> (#HASH #BRACE ...) - - #'foo -> (#HASH #QUOTE & foo) - - ##'[...] -> (#HASH #HASH #QUOTE #SQUARE ...) - - {x y}[i j] -> (#JOIN (#BRACE x y) #SQUARE i j) - - foo.bar.baz{x y} -> (#JOIN (#DOT (#DOT foo & bar) & baz) #BRACE x y) - -* While in Lisp and Scheme `'foo` parses as `(quote foo)`, in Zisp it parses as - `(#QUOTE & foo)`; a single pair with the quoted datum in the second position. - - The same principle is used when parsing other sugar; some examples follow: - - Incorrect Correct - - #(x y z) -> (#HASH (x y z)) #(x y z) -> (#HASH x y z) - - [x y z] -> (#SQUARE (x y z)) [x y z] -> (#SQUARE x y z) - - #{x} -> (#HASH (#BRACE (x))) #{x} -> (#HASH #BRACE x) - - foo(x y) -> (#JOIN foo (x y)) foo(x y) -> (#JOIN foo x y) - -* Those used to thinking in Lisp and Scheme may think that `(#QUOTE ...)` halts - further decoding of enclosed data. This is not so, since quoting is related - to code evaluation, not decoding. - -### Datum labels - -Valid data cannot be cyclic, since that would mean it has infinite length in -bytes. To externally represent a value with cyclic structure, one uses datum -labels in the data encoding of the value. - -A datum label either wraps another datum to assign a number to it, or contains -just a reference to a previous assignment. - - +------------------+------------------------------+ - | Syntax | Internal datum structure | - +------------------+------------------------------+ - | #%= | (#LABEL & ) | - +------------------+------------------------------+ - | #%% | (#LABEL & ) | - +------------------+------------------------------+ - -In this visual, the token `` stands for a hexadecimal digit sequence, the -token `` stands for any other datum, and `` is a stand-in for a -number value; that which is represented by ``. - -For clarity, concrete examples follow: - - +-------------------+-------------------------------+ - | Byte sequence | Parse result | - +-------------------+-------------------------------+ - | #%1234abcd=(foo) | (#LABEL <0x1234abcd> & (foo)) | - +-------------------+-------------------------------+ - | #%1234abcd% | (#LABEL & <0x1234abcd>) | - +-------------------+-------------------------------+ - -Here, the visual token `<0x1234abcd>` stands for a Zisp value of a numeric type -with an integer value. Note that the decoder may not accept a bare string here, -meaning this syntax sugar is not merely an abbreviation. - -### Shebang - -Finally, the parser recognizes the Unix *shebang* syntax and outputs a datum to -hold the string values found within: - - #!interpreter -> (#SHBANG & interpreter) - - #!interpreter argline -> (#SHBANG interpreter & argline) - -When executing a script file, Zisp simply stores this into a global value that -may be inspected if desired. - - - diff --git a/doc/c1/2-decode.md b/doc/c1/2-decode.md deleted file mode 100644 index 379c74b..0000000 --- a/doc/c1/2-decode.md +++ /dev/null @@ -1,44 +0,0 @@ -# Decoding - -A separate process called "decoding" can transform simple data structures, -consisting of only the base datum types, into a richer set of Zisp types. - -For example, the decoder may turn `(#HASH ...)` into a vector, as one would -expect a vector literal like `#(...)` to work in Scheme. Bytevector syntax -could use a custom rune as a list prefix, like: `#u8(...)` - -Runes may be decoded in isolation as well, rather than transforming a list -whose head they appear in. This can implement Boolean constants as `#true` -and `#false` or `#t` and `#f`. - -The decoder recognizes `(#QUOTE ...)` to aid in implementing the traditional -quoting mechanism of Lisp/Scheme, but with a significant difference: - -Traditional quote is "unhygienic" in Scheme terms. An expression such as -`'(foo bar)` will always be read as `(quote (foo bar))` regardless of what -lexical context it appears in, so the semantics will depend on whatever the -identifier `quote` is bound to, meaning that the expression may end up -evaluating to something other than the list `(foo bar)`. - -The Zisp decoder, which transforms not datum to datum, but object to object, -can turn `#QUOTE` into an object which encapsulates the notion of quoting, -which the Zisp evaluator can recognize and act upon, ensuring that an -expression like `'(foo bar)` always turns into the list `(foo bar)`. - -One way to think about this, in Scheme (R6RS / syntax-case) terms, is that -expressions like `'(foo bar)` turn directly into a syntax object when read, -and the created syntax object begins with an identifier bound to `quote` in -the standard library. - -The decoder is, of course, configurable and extensible. The transformations -mentioned above would be performed only when it's told to decode data which -represents Zisp code. The decoder may be given a different configuration, -telling it to decode, for example, data which represents a different kind of -domain-specific data, such as application settings, build system commands, -complex data records with non-standard data types, and so on. - - diff --git a/doc/c1/grammar/abnf.txt b/doc/c1/grammar/abnf.txt deleted file mode 100644 index aa67646..0000000 --- a/doc/c1/grammar/abnf.txt +++ /dev/null @@ -1,141 +0,0 @@ -; Standards-compliant ABNF (RFC 5234, RFC 7405) - -; Compatible with: https://www.quut.com/abnfgen/ - -; Unlike PEG, grammar rules in BNF are non-deterministic, which makes -; it much more challenging to express our naive parse logic. Whether -; this ABNF file is truly accurate is difficult to assess. - -; The abnfgen(1) tool linked above can be used to generate arbitrary -; strings matching the grammar in this file. These can be fed into -; the Zisp parser to reveal some potential bugs; either in the parser -; itself, or this ABNF grammar. - -; Note that the tool may generate Zisp string literals with Unicode -; escape sequences corresponding to surrogate code points; the parser -; may reject these. This is expected; it's difficult to rewrite this -; ABNF grammar to exclude those Unicode values. - -; Other minor inaccuracies that aren't important include: This ABNF -; forces line comments to be terminated with an LF character, when in -; fact the end-of-file may also terminate them; the same applies to -; hash-bang parsing which doesn't actually have to end in LF. These -; discrepancies won't make abnfgen(1) generate invalid strings; they -; only make this ABNF more strict than the Zisp parser, so it won't -; generate some strings that the parser would actually accept. - - -Stream = [ Unit *( Blank Unit ) ] *Blank [Trail] - - -Unit = *Blank Datum - -Blank = HTAB / LF / %x0b / %x0c / CR / SP / Comment - -Trail = SkipLine / SkipUnit / ";" "~" *Blank - - -Datum = BareString / SpecialStr / CladDatum / Rune / RuneStr - / RuneDotStr / RuneClad / LabelRef / LabelDef / HashStr - / HashDotStr / HashClad / QuoteExpr / JoinExpr - -Comment = SkipLine LF / SkipUnit Blank - -SkipLine = ";" [ SkipLStart *AnyButLF ] - -SkipUnit = ";" "~" Unit - -SkipLStart = %x00-09 / %x0b-7d / %x7f-ff ; any but LF or "~" - -AnyButLF = %x00-09 / %x0b-ff - - -BareString = BareChar *( BareChar / Numeric ) - -SpecialStr = SpecStrChar *( SpecStrChar / BareChar ) - -CladDatum = "|" *( PipeStrChar / "\" StringEsc ) "|" - / DQUOTE *( QuotStrChar / "\" StringEsc ) DQUOTE - / "(" List ")" - / "[" List "]" - / "{" List "}" - -Rune = "#" RuneName - -RuneStr = "#" RuneName "\" BareString - -RuneDotStr = "#" RuneName "\" SpecialStr - -RuneClad = "#" RuneName CladDatum - -HashBang = "#" "!" *( SP / HTAB ) HBLine LF - -LabelRef = "#" "%" Label "%" - -LabelDef = "#" "%" Label "=" Datum - -HashStr = "#" "\" BareString - -HashDotStr = "#" "\" SpecialStr - -HashClad = "#" CladDatum - -QuoteExpr = "'" Datum - / "`" Datum - / "," Datum - -JoinExpr = Datum RJoinDatum - / LJoinDatum NoStartDot - / Datum ":" Datum - / NoEndDot "." Datum - - -BareChar = "!" / "$" / "%" / "*" / "/" / "<" / "=" / ">" - / "?" / "^" / "_" / "~" / ALPHA - -Numeric = "+" / "-" / DIGIT - -SpecStrChar = "." / ":" / Numeric - -PipeStrChar = %x00-5b / %x5d-7b / %x7d-ff ; any but "|" or "\" - -QuotStrChar = %x00-21 / %x23-5b / %x5d-ff ; any but DQUOTE or "\" - -StringEsc = "\" / "|" / DQUOTE / *( HTAB / SP ) LF *( HTAB / SP ) - / %s"a" / %s"b" / %s"t" / %s"n" - / %s"v" / %s"f" / %s"r" / %s"e" - / %s"x" *( 2HEXDIG ) ";" - / %s"u" ["0"] 1*5HEXDIG ";" - / %s"u" "1" "0" 4HEXDIG ";" - -List = [ Unit *( Blank Unit ) ] *Blank [Tail] [SkipUnit] - -Tail = "&" Unit *Blank - - -RuneName = ALPHA *5( ALPHA / DIGIT ) - -Label = 1*12( HEXDIG ) - -HBLine = 1*HBChar [ 1*( SP / HTAB ) *HBChar ] - -HBChar = %x00-08 / %x0b-1f / %x21-ff ; any but HT, LF, SP - - -RJoinDatum = CladDatum / Rune / RuneStr / RuneDotStr / RuneClad - / LabelRef / LabelDef / HashStr / HashDotStr / HashClad - / QuoteExpr - -LJoinDatum = CladDatum / RuneClad / LabelRef / HashClad - -NoStartDot = BareString / CladDatum / Rune / RuneStr / RuneDotStr - / RuneClad / LabelRef / LabelDef / HashStr / HashDotStr - / HashClad / QuoteExpr - -NoEndDot = BareString / Rune / RuneStr / RuneClad / LabelRef - / HashStr / HashClad - - -;; Local Variables: -;; eval: (flyspell-mode -1) -;; End: diff --git a/doc/c1/grammar/index.md b/doc/c1/grammar/index.md deleted file mode 100644 index e3716ea..0000000 --- a/doc/c1/grammar/index.md +++ /dev/null @@ -1,115 +0,0 @@ -# Zisp S-Expression Grammar - -The grammar is available in several different formats: - -* [ZBNF](zbnf.txt): See below for the rules of this notation -* [ABNF](abnf.txt): Compatible with the `abnfgen` tool -* [PEG](peg.txt): Compatible with `peg/leg` tool - - -## ZBNF notation - -The ZBNF grammar specification uses a BNF-like notation with PEG-like -semantics: - -* Concatenation of expressions is implicit: `foo bar` means `foo` - followed by `bar`. - -* Parentheses are used for grouping, and the pipe symbol `|` is used - for alternatives. - -* The suffixes `?`, `*`, and `+` have the same meaning as in regular - expressions, although `[foo]` is used in place of `(foo)?`. - -* The syntax is defined in terms of bytes, not characters. Terminals - `'c'` and `"c"` refer to the ASCII value of the given character `c`. - Standard C escape sequences are supported. - -* The prefix `~` means NOT. It only applies to rules that match one - byte, and negates them. For example, `~( 'a' | 'b' )` matches any - byte other than 'a' and 'b'. - -* Ranges of terminal values are expressed as `x...y` (inclusive). - -* ABNF "core rules" like `ALPHA` and `HEXDIG` are supported. - -* There is no ambiguity, or look-ahead / backtracking beyond one byte. - Rules match left to right, depth-first, and greedy. As soon as the - input matches the first terminal of a rule --explicit or implied by - recursively descending into the first non-terminal-- it must match - that rule to the end or a syntax error is reported. - -The last point makes the notation simple to translate to code. - - -## Limitations outside the grammar - -The following limits are not represented in the grammar: - -* A `UnicodeSV` is the hexadecimal representation of a Unicode scalar - value; it must represent a value in the range 0 to D7FF, or E000 to - 10FFFF, inclusive. Any other value signals an error. Valid values - are converted into a UTF-8 byte sequence encoding the value. - -* A `Rune` longer than 6 bytes is grammatical, but signals an error. - This is important because runes are not self-terminating; defining - their grammar as ending after a maximum of 6 bytes would allow - another datum beginning with an alphabetic character to follow a - rune immediately without any visual delineation, which would be - terribly confusing for a human reader. Consider: `#foobarbaz`. - This would parse as a `Datum` joining `#foobar` and `baz`. - - (The ABNF does not suffer from this issue, since it explicitly - enumerates the join possibilities anyway.) - -* A `Label` is the hexadecimal representation of a 48-bit integer, - meaning it allows for a maximum of 12 hexadecimal digits. Longer - values are grammatical, but signal an out-of-range error, so as to - avoid signaling a confusing "invalid character" error on input that - appears grammatical. Consider: `#%123456789abcd=foo`. This would - signal an invalid character error at the letter `d` if the grammar - limited a `Label` to 12 hexadecimal digits. - - (As above, the ABNF doesn't care about this. You probably don't - want to use the ABNF to generate a parser anyway.) - - -## At-quoted strings - -The mechanism of at-quoted strings is not represented in any of the -grammars, since it essentially has 256 variants. Representing it -sanely in a grammar requires the ability to save and reference -variables. - - -## Stream-parsing strategy - -The parser consumes one `Unit` from the input stream every time it's -called; it returns the `Datum` therein if found, or else it returns -the Zisp EOF token. - -Since a `Datum` is not self-terminating, the parser must read beyond -it to realize that it has ended (if not followed by the EOF). Thus, -it will consume one more `Blank` following the `Unit` that it parsed. -If this `Blank` is a comment, it will be consumed entirely, ensuring -that parsing resumes properly on a subsequent parser call on the same -input stream, without needing to store any state in between. - -Since comments of type `SkipUnit` are likewise not self-terminating, -an arbitrary number of chained `SkipUnit` comments may need to be -consumed before the parser is finally allowed to return. - -The following illustration shows the positions at which the parser -will stop consuming input when called repeatedly on the same input -stream. The dots represent the extent of each `Unit` being parsed, -while the caret points at the last byte the parser will consume in -that parse cycle. - -``` -foo (bar)[baz] foo;~bar foo;~bar;~baz;~bat foobar -...^..........^... ^... ^......^ -``` - -Notice how, in the fourth cycle, the parser is forced to consume all -commented-out units before it can return, since it would otherwise -leave the stream in an inappropriate state. diff --git a/doc/c1/grammar/peg.txt b/doc/c1/grammar/peg.txt deleted file mode 100644 index 7b28a99..0000000 --- a/doc/c1/grammar/peg.txt +++ /dev/null @@ -1,93 +0,0 @@ -# Standard PEG notation - -Stream <- Unit ( Blank Unit )* !. - - -Unit <- Blank* Datum - -Blank <- [\t-\r ] / Comment - - -Datum <- OneDatum ( JoinChar? OneDatum )* - -JoinChar <- '.' / ':' - - -Comment <- ';' ( SkipUnit / SkipLine ) - -SkipUnit <- '~' Unit - -SkipLine <- (!'\n' .)* '\n'? - - -OneDatum <- BareString / CladDatum - - -BareString <- SpecBareChar ( BareChar / JoinChar )* - / BareChar+ - -SpecBareChar <- '+' / '-' / JoinChar / DIGIT - -BareChar <- ALPHA / DIGIT - / '!' / '$' / '%' / '*' / '+' / '-' / '/' - / '<' / '=' / '>' / '?' / '^' / '_' / '~' - - -CladDatum <- PipeStr / QuoteStr / HashExpr / QuoteExpr / List - -PipeStr <- '|' ( PipeStrChar / '\' StringEsc )* '|' -QuoteStr <- '"' ( QuotStrChar / '\' StringEsc )* '"' -HashExpr <- '#' HashExprs -QuoteExpr <- "'" Datum / '`' Datum / ',' Datum -List <- ParenList / SquareList / BraceList - - -PipeStrChar <- (![|\\] .) -QuotStrChar <- (!["\\] .) - -StringEsc <- '\' / '|' / '"' / ( HTAB / SP )* LF ( HTAB / SP )* - / '0' / 'a' / 'b' / 't' / 'n' / 'v' / 'f' / 'r' / 'e' - / 'x' HexByte* ';' - / 'u' UnicodeSV ';' - -HexByte <- HEXDIG HEXDIG -UnicodeSV <- HEXDIG+ - - -HashExprs <- '!' [\t ]* HBangLine '\n'? - / '%' Label ( '%' / '=' Datum ) - / '\' BareString / CladDatum - / Rune ( '\' BareString / CladDatum )? - -HBangLine <- HBChars+ [\t ]* ( HBChars+ )? -HBChars <- (![\t\n ] .) -Label <- HEXDIG+ -Rune <- ALPHA ( ALPHA / DIGIT )* - - -ParenList <- '(' ListBody ')' -SquareList <- '[' ListBody ']' -BraceList <- '{' ListBody '}' - -ListBody <- Unit* ( Blank* '&' Unit )? Blank* - - -DIGIT <- [0-9] -ALPHA <- [a-zA-Z] -HEXDIG <- [0-9a-fA-F] - - -# Keep this in sync line-for-line with the ZBNF grammar for easy -# comparison between the two. - -# This file is meant to be compatible with: -# https://piumarta.com/software/peg - -# Due to a quirk in the peg tool this file is used with, the grammar -# must not allow an empty stream. Therefore, the Unit rule has its -# Datum declared as mandatory rather than optional. - - -# Local Variables: -# eval: (flyspell-mode -1) -# End: diff --git a/doc/c1/grammar/zbnf.txt b/doc/c1/grammar/zbnf.txt deleted file mode 100644 index 923ac83..0000000 --- a/doc/c1/grammar/zbnf.txt +++ /dev/null @@ -1,77 +0,0 @@ -; Custom notation with PEG semantics - -Stream : Unit ( Blank Unit )* - - -Unit : Blank* [Datum] - -Blank : '\t'...'\r' | SP | Comment - - -Datum : OneDatum ( [JoinChar] OneDatum )* - -JoinChar : '.' | ':' - - -Comment : ';' ( SkipUnit | SkipLine ) - -SkipUnit : '~' Unit - -SkipLine : ( ~LF )* [LF] - - -OneDatum : BareString | CladDatum - - -BareString : SpecBareChar ( BareChar | JoinChar )* - | BareChar+ - -SpecBareChar : '+' | '-' | JoinChar | DIGIT - -BareChar : ALPHA | DIGIT - | '!' | '$' | '%' | '*' | '+' | '-' | '/' - | '<' | '=' | '>' | '?' | '^' | '_' | '~' - - -CladDatum : PipeStr | QuoteStr | HashExpr | QuoteExpr | List - -PipeStr : '|' ( PipeStrChar | '\' StringEsc )* '|' -QuoteStr : '"' ( QuotStrChar | '\' StringEsc )* '"' -HashExpr : '#' HashExprs -QuoteExpr : "'" Datum | '`' Datum | ',' Datum -List : ParenList | SquareList | BraceList - - -PipeStrChar : ~( '|' | '\' ) -QuotStrChar : ~( '"' | '\' ) - -StringEsc : '\' | '|' | '"' | ( HTAB | SP )* LF ( HTAB | SP )* - | '0' | 'a' | 'b' | 't' | 'n' | 'v' | 'f' | 'r' | 'e' - | 'x' HexByte* ';' - | 'u' UnicodeSV ';' - -HexByte : HEXDIG HEXDIG -UnicodeSV : HEXDIG+ - - -HashExprs : '!' ( SP | HTAB )* HBangLine [ LF ] - | '%' Label ( '%' | '=' Datum ) - | '\' BareString | CladDatum - | Rune [ '\' BareString | CladDatum ] - -HBangLine : HBChars+ ( SP | HTAB )* [ HBChars+ ] -HBChars : ~( SP | HTAB | LF ) -Label : HEXDIG+ -Rune : ALPHA ( ALPHA | DIGIT )* - - -ParenList : '(' ListBody ')' -SquareList : '[' ListBody ']' -BraceList : '{' ListBody '}' - -ListBody : Unit* [ Blank* '&' Unit ] Blank* - - -;; Local Variables: -;; eval: (flyspell-mode -1) -;; End: diff --git a/doc/c1/index.md b/doc/c1/index.md deleted file mode 100644 index 6cec369..0000000 --- a/doc/c1/index.md +++ /dev/null @@ -1,30 +0,0 @@ -# Chapter 1: Genesis - -This chapter goes through the processes involved in reading source -code, running it, and optionally compiling it. - -1. [Parse](1-parse.html) (see also [grammar](grammar/)) - - The parser receives a stream of bytes and transforms them into a - minimal set of data types with very little processing. - -2. [Decode](2-decode.html) - - The decoder runs configurable and extensible pre-processing steps - over data received from the parser, enriching it with more complex - data types, and handling primitive source code transforms. It's - comparable to the C pre-processor or Lisp's `DEFMACRO` mechanism, - with a few more responsibilities, such as number literal parsing. - -3. [Execute](3-execute.html) - - Code is executed (or interpreted, or evaluated) in an environment, - also called a module, which may be mutated, and linked with other - modules. Execution is immediate, without any pre-compilation. - -4. [Compile](4-compile.html) - - Procedures from within the compiler module can be used to demand - the compilation of other modules, with various options, yielding - static or dynamic object files. These may be loaded immediately, - replacing the previously uncompiled module code in memory. diff --git a/doc/index.md b/doc/index.md index beaa78c..51b92fa 100644 --- a/doc/index.md +++ b/doc/index.md @@ -2,31 +2,35 @@ This document explains the Zisp language and its implementation. -Zisp intentionally blurs the line between developers and users of the -language. After all, Zisp is software, and its users are software -developers; the easiest way to explain *why* Zisp does certain things -is often to explain *how* it does them. +Zisp intentionally blurs the line between developers and users of the language. +After all, Zisp is software, and its users are software developers; the easiest +way to explain *why* Zisp does certain things is often to explain *how* it does +them. -That doesn't mean this manual will walk you through the source code -line by line. Instead, consider it a documentation of the code base -at large, doubling as a reference to the language implemented by the -code base. +That doesn't mean this manual will walk you through the source code line by +line. Instead, consider this a documentation of the implementation at large, +doubling as a language reference. ## Table of Contents -1. [Chapter 1: Genesis](c1/) +0. [Chapter 0: Genesis](./0/) - 1. [Parse](c1/1-parse.html) - 2. [Decode](c1/2-decode.html) - 3. [Execute](c1/3-execute.html) - 4. [Compile](c1/4-compile.html) + 0. [Value](./0/0-value.html) + 1. [Parse](./0/1-parse.html) + 2. [Decode](./0/2-decode.html) + 3. [Execute](./0/3-execute.html) + 4. [Compile](./0/4-compile.html) -2. [Chapter 2: Types](c2/) - - This chapter deals with the standard data types, and the methods - Zisp offers for defining new types. +1. [Chapter 1: Taxonomy](./1/) + 0. ... 1. ... - 2. ... -3. [Chapter 3: ...](c3/) +2. [Chapter 2: ...](./2/) + + + diff --git a/src/zisp/gc.zig b/src/zisp/gc.zig index 9267087..b2fcc46 100644 --- a/src/zisp/gc.zig +++ b/src/zisp/gc.zig @@ -5,11 +5,11 @@ const Alloc = std.mem.Allocator; const value = @import("value.zig"); -pub const PairPool = @import("gc/PairPool.zig"); +pub const ListPool = @import("gc/ListPool.zig"); pub const IstrSet = @import("gc/IstrSet.zig"); var main_alloc: Alloc = undefined; -var main_pair_pool: PairPool = undefined; +var main_list_pool: ListPool = undefined; var main_istr_set: IstrSet = undefined; var init_done = false; @@ -20,24 +20,24 @@ pub fn init() !void { const alloc = std.heap.smp_allocator; initCustom( alloc, - try PairPool.init(alloc), + try ListPool.init(alloc), try IstrSet.init(alloc), ); } pub fn initCustom( alloc: Alloc, - pair_pool: PairPool, + list_pool: ListPool, istr_set: IstrSet, ) void { main_alloc = alloc; - main_pair_pool = pair_pool; + main_list_pool = list_pool; main_istr_set = istr_set; } -pub fn mainPairPool() *PairPool { +pub fn mainListPool() *ListPool { init() catch @panic("OOM"); // TODO this is only here for the test suite - return &main_pair_pool; + return &main_list_pool; } pub fn mainIstrSet() *IstrSet { diff --git a/src/zisp/gc/IstrSet.zig b/src/zisp/gc/IstrSet.zig index bc345f2..bf0db47 100644 --- a/src/zisp/gc/IstrSet.zig +++ b/src/zisp/gc/IstrSet.zig @@ -74,7 +74,7 @@ pub fn getOrNew(self: *Set, s: []const u8) !IstrPtr { /// Get the canonical istr with the same contents, or store this one as it. pub fn getOrPut(self: *Set, istr: IstrPtr) !IstrPtr { - return self.getOrPutOrNew(istr.str(), istr); + return self.getOrPutOrNew(istr.bytes(), istr); } fn getOrPutOrNew(self: *Set, s: []const u8, ptr: ?IstrPtr) !IstrPtr { @@ -100,7 +100,7 @@ fn getOrPutOrNew(self: *Set, s: []const u8, ptr: ?IstrPtr) !IstrPtr { } if (bucket.fp == fp) { const istr = bucket.istrPtr(); - if (strEq(s, istr.str())) return istr; + if (strEq(s, istr.bytes())) return istr; } } } @@ -114,7 +114,7 @@ fn resize(self: *Set) !void { const idx_mask = new_buckets.len - 1; for (old_buckets) |old| { if (old.empty()) continue; - const str = old.istrPtr().str(); + const str = old.istrPtr().bytes(); var idx = strHash(str) & idx_mask; while (!new_buckets[idx].empty()) idx = (idx + 1) & idx_mask; new_buckets[idx] = old; @@ -147,7 +147,7 @@ pub fn addStdlib(self: *Set, s: []const u8) !IstrPtr { } const istr = try self.newIstr(s); - gop.key_ptr.* = istr.str(); + gop.key_ptr.* = istr.bytes(); gop.value_ptr.* = istr; return istr; } diff --git a/src/zisp/gc/ListPool.zig b/src/zisp/gc/ListPool.zig new file mode 100644 index 0000000..6c34074 --- /dev/null +++ b/src/zisp/gc/ListPool.zig @@ -0,0 +1,69 @@ +//! List array pool +//! +//! Strategy: To ensure list arrays are allocated sequentially without any +//! padding, we allocate memory in large blocks, and whenever a list array with +//! a certain length is to be allocated, we check if the current block still has +//! enough room; otherwise we allocate the next block. +//! +//! This is only used for lists up to 7 elements. Starting at 8, they can't +//! share a cache line with other list elements anyway. + +const std = @import("std"); + +const Alloc = std.mem.Allocator; +const ArrayList = std.ArrayListUnmanaged; + +const value = @import("../value.zig"); + +const Value = value.Value; + +const Self = @This(); + +// How many Value elements fit in a block. A good value should be 512, since +// each Value is 8 bytes: 512 * 8 = 4 KiB +const block_value_cap = 512; + +// Initial capacity of ArrayList of pointers to filled blocks. Also 512, since +// pointers are 8 bytes as well. +const def_full_blocks_cap = 512; + +const Block = [block_value_cap]Value; + +alloc: Alloc, + +current_block: *Block, +current_index: usize, + +full_blocks: ArrayList(*Block), + +pub fn init(alloc: Alloc) !Self { + return .{ + .alloc = alloc, + .current_block = try alloc.create(Block), + .current_index = 0, + .full_blocks = try .initCapacity(alloc, def_full_blocks_cap), + }; +} + +pub fn deinit(self: *Self) void { + self.alloc.destroy(self.current_block); + for (self.full_blocks) |block| self.alloc.destroy(block); + self.full_blocks.deinit(self.alloc); +} + +fn newBlock(self: *Self) !void { + try self.full_blocks.append(self.alloc, self.current_block); + self.current_block = try self.alloc.create(Block); + self.current_index = 0; +} + +pub fn create(self: *Self, len: u3) ![*]Value { + std.debug.assert(len != 0); + + if (len > block_value_cap - self.current_index) { + try self.newBlock(); + } + + defer self.current_index += len; + return @ptrCast(&self.current_block[self.current_index]); +} diff --git a/src/zisp/gc/PairPool.zig b/src/zisp/gc/PairPool.zig deleted file mode 100644 index 4a77acc..0000000 --- a/src/zisp/gc/PairPool.zig +++ /dev/null @@ -1,34 +0,0 @@ -const std = @import("std"); - -const Alloc = std.mem.Allocator; -const AlignedPool = std.heap.memory_pool.Aligned; - -const value = @import("../value.zig"); - -const Value = value.Value; -const PairPtr = value.pair.PairPtr; - -const PairPool = @This(); - -alloc: Alloc, -pool: AlignedPool(value.pair.Pair, @enumFromInt(@alignOf(value.Zptr))), - -const default_init_cap = 1024; - -pub fn init(alloc: Alloc) !PairPool { - return initCustom(alloc, default_init_cap); -} - -pub fn initCustom(alloc: Alloc, init_cap: usize) !PairPool { - return .{ - .alloc = alloc, - .pool = try .initCapacity(alloc, init_cap), - }; -} - -pub fn cons(self: *PairPool, car: Value, cdr: Value) !PairPtr { - var p = try self.pool.create(self.alloc); - p.car = car; - p.cdr = cdr; - return p; -} diff --git a/src/zisp/io/Decoder.zig b/src/zisp/io/Decoder.zig new file mode 100644 index 0000000..95a0b68 --- /dev/null +++ b/src/zisp/io/Decoder.zig @@ -0,0 +1 @@ +const std = @import("std"); diff --git a/src/zisp/io/Encoder.zig b/src/zisp/io/Encoder.zig new file mode 100644 index 0000000..eb27e20 --- /dev/null +++ b/src/zisp/io/Encoder.zig @@ -0,0 +1 @@ +// wip diff --git a/src/zisp/io/Parser.zig b/src/zisp/io/Parser.zig index d4a0a68..f768468 100644 --- a/src/zisp/io/Parser.zig +++ b/src/zisp/io/Parser.zig @@ -1,7 +1,7 @@ //! //! === Syntax === //! -//! See doc/c1/1-parse.md to understand the implemented syntax. +//! See /doc/0/1-parse.md to understand the implemented syntax. //! //! //! === Trampolining strategy === @@ -43,8 +43,8 @@ const gc = @import("../gc.zig"); const lib = @import("../lib.zig"); const value = @import("../value.zig"); +const ListPool = gc.ListPool; const IstrSet = gc.IstrSet; -const PairPool = gc.PairPool; const Value = value.Value; const Parser = @This(); @@ -81,21 +81,22 @@ pub const Error = enum { }; pub const Context = struct { - // What to do next. + /// What to do next. next: ?Fn = undefined, - // For storing a context value, like datum to join in join syntax. + /// For storing a context value, like datum to join in join syntax. val: Value = undefined, - // For storing a context char, like list opening bracket. + /// For storing a context char, like list opening bracket. char: u8 = undefined, - // Count of list elements on current parse level. - list_len: usize = undefined, + /// Start index of list elements on current parse level, within the global + /// list element accumulation array. + list_start: usize = undefined, }; -alloc: Alloc, +list_pool: ?*ListPool, istr_set: ?*IstrSet, -pair_pool: *PairPool, +alloc: Alloc, -input: *Reader = undefined, +reader: *Reader = undefined, context: Context = .{}, ctx_stack: List(Context) = undefined, @@ -107,23 +108,23 @@ unread_char: ?u8 = null, err_msg: []const u8 = undefined, pub fn init(alloc: Alloc) !Parser { + const list_pool = gc.mainListPool(); const istr_set = gc.mainIstrSet(); - const pair_pool = gc.mainPairPool(); - return initCustom(alloc, 32, 2048, 32, istr_set, pair_pool); + return initCustom(list_pool, istr_set, alloc, 32, 2048, 32); } pub fn initCustom( + list_pool: ?*ListPool, + istr_set: ?*IstrSet, alloc: Alloc, init_ctx_stack_cap: usize, init_str_chars_cap: usize, init_list_elts_cap: usize, - istr_set: ?*IstrSet, - pair_pool: *PairPool, ) !Parser { var p: Parser = .{ - .alloc = alloc, + .list_pool = list_pool, .istr_set = istr_set, - .pair_pool = pair_pool, + .alloc = alloc, }; p.ctx_stack = try .initCapacity(alloc, init_ctx_stack_cap); p.str_chars = try .initCapacity(alloc, init_str_chars_cap); @@ -148,7 +149,7 @@ fn read(p: *Parser) !?u8 { .{p.unread_char.?}, ); } - const c = p.input.takeByte() catch |e| switch (e) { + const c = p.reader.takeByte() catch |e| switch (e) { error.EndOfStream => return null, else => return p.err(.ReadError, "???"), }; @@ -158,6 +159,13 @@ fn read(p: *Parser) !?u8 { return c; } +fn readIntoSlice(p: *Parser, slice: []u8) !void { + p.reader.readSliceAll(slice) catch |e| return switch (e) { + error.EndOfStream => p.err(.UnexpectedEof, "reading into slice"), + else => p.err(.ReadError, "???"), + }; +} + fn readNoEof(p: *Parser, comptime emsg: []const u8) !u8 { return try p.read() orelse p.err(.UnexpectedEof, emsg); } @@ -222,25 +230,22 @@ fn getCharsAsRune(p: *Parser) Value { } // -// Pair consing & list creation +// List creation // -fn cons(p: *Parser, car: Value, cdr: Value) !Value { - return value.pair.consInPool(p.pair_pool, car, cdr); -} - fn addListElt(p: *Parser, elt: Value) !void { try p.list_elts.append(p.alloc, elt); - p.context.list_len += 1; } -fn getList(p: *Parser, tail: Value) !Value { - var list = tail; - for (0..p.context.list_len) |_| { - const elt = p.list_elts.pop() orelse unreachable; - list = try p.cons(elt, list); - } - return list; +fn getList(p: *Parser) !Value { + if (p.list_elts.items.len == p.context.list_start) return value.nil; + defer p.list_elts.items.len = p.context.list_start; + const vals = p.list_elts.items[p.context.list_start..]; + return value.list.new(p.alloc, p.list_pool, vals); +} + +fn makeList(p: *Parser, vals: []const Value) !Value { + return value.list.new(p.alloc, p.list_pool, vals); } // @@ -259,8 +264,6 @@ const Fn = enum { endRuneDatum, endLabelDatum, continueList, - endImproperList, - closeImproperList, endQuoteExpr, }; @@ -277,14 +280,12 @@ inline fn call(p: *Parser, f: Fn) !void { .endRuneDatum => p.endRuneDatum(), .endLabelDatum => p.endLabelDatum(), .continueList => p.continueList(), - .endImproperList => p.endImproperList(), - .closeImproperList => p.closeImproperList(), .endQuoteExpr => p.endQuoteExpr(), }; } -pub fn run(p: *Parser, input: *Reader) !Value { - p.input = input; +pub fn run(p: *Parser, reader: *Reader) !Value { + p.reader = reader; p.context.next = .parseUnit; while (p.context.next) |next| { if (detailed_debug) p.printStack(); @@ -336,7 +337,7 @@ fn pushContext(p: *Parser, next: Fn) !void { .next = next, .val = p.context.val, .char = p.context.char, - .list_len = p.context.list_len, + .list_start = p.context.list_start, }); } @@ -455,7 +456,7 @@ fn endJoinDatum(p: *Parser) !void { ':' => COLON, else => unreachable, }; - const joined = try p.cons(rune, try p.cons(prev, p.result)); + const joined = try p.makeList(&.{ rune, prev, p.result }); return p.jump(.parseJoin, joined); } @@ -511,21 +512,44 @@ fn getString(p: *Parser, comptime close: u8) !Value { }; const s = try p.getCharsAsString(); return switch (close) { - '|' => try p.cons(PQSTR, s), - '"' => try p.cons(DQSTR, s), + '|' => try p.makeList(&.{ PQSTR, s }), + '"' => try p.makeList(&.{ DQSTR, s }), else => unreachable, }; } fn getAtString(p: *Parser) !Value { const stop = try p.readNoEof("at-string"); + return if (stop == 255) p.getAtLenStr() else p.getAtSentinelStr(stop); +} + +fn getAtLenStr(p: *Parser) !Value { + var len: u48 = 0; + inline for (0..6) |_| { + len <<= 8; + len += try p.readNoEof("at-length-string"); + } + const AH = value.array.ArrayHeader; + const aln: std.mem.Alignment = @enumFromInt(@alignOf(AH)); + const mem = try p.alloc.alignedAlloc(u8, aln, @sizeOf(AH) + len); + const arr: value.array.ArrayPtr = @ptrCast(mem); + arr.* = .{ + .len_or_ptr = len, + .type = .str, + .info = .{ .str = .{} }, + }; + try p.readIntoSlice(arr.bytes()); + return p.makeList(&.{ ATSTR, value.ptr.pack(.array, arr) }); +} + +fn getAtSentinelStr(p: *Parser, stop: u8) !Value { while (try p.readNoEofOpt("at-string")) |c| { if (c == stop) break; try p.addChar(c); } const str = try p.getCharsAsString(); const byte = value.fixnum.pack(stop); - return try p.cons(ATSTR, try p.cons(byte, str)); + return p.makeList(&.{ ATSTR, byte, str }); } fn skipStringLfEscape(p: *Parser) !u8 { @@ -591,8 +615,9 @@ fn parseHashExpr(p: *Parser, next: Fn) !void { }, '\\' => { const c1 = try p.readNoEof("hash-backslash"); - const bs = try p.getBareString(c1); - return p.jump(next, try p.cons(HASH, bs)); + const str = try p.getBareString(c1); + const val = try p.makeList(&.{ HASH, str }); + return p.jump(next, val); }, '!' => return p.parseHashBang(next), '%' => return p.parseLabel(next), @@ -611,7 +636,7 @@ fn endHashDatum(p: *Parser) !void { if (p.result.eq(value.none)) { return p.err(.InvalidCharacter, "hash datum"); } - return p.retval(try p.cons(HASH, p.result)); + return p.retval(try p.makeList(&.{ HASH, p.result })); } fn getRune(p: *Parser, c1: u8) !Value { @@ -635,11 +660,25 @@ fn parseRuneEnd(p: *Parser, r: Value, next: Fn) !void { switch (c) { '\\' => { const c1 = try p.readNoEof("rune-backslash"); - return p.jump(next, try p.cons(r, try p.getBareString(c1))); + const str = try p.getBareString(c1); + const val = try p.makeList(&.{ r, str }); + return p.jump(next, val); + }, + '"' => { + const str = try p.getString('"'); + const val = try p.makeList(&.{ r, str }); + return p.jump(next, val); + }, + '|' => { + const str = try p.getString('|'); + const val = try p.makeList(&.{ r, str }); + return p.jump(next, val); + }, + '@' => { + const str = try p.getAtString(); + const val = try p.makeList(&.{ r, str }); + return p.jump(next, val); }, - '"' => return p.jump(next, try p.cons(r, try p.getString('"'))), - '|' => return p.jump(next, try p.cons(r, try p.getString('|'))), - '@' => return p.jump(next, try p.cons(r, try p.getAtString())), '#', '(', '[', '{', '\'', '`', ',' => { p.unread(c); try p.push(next); @@ -654,31 +693,31 @@ fn parseRuneEnd(p: *Parser, r: Value, next: Fn) !void { } fn endRuneDatum(p: *Parser) !void { - return p.retval(try p.cons(p.context.val, p.result)); + return p.retval(try p.makeList(&.{ p.context.val, p.result })); } fn parseHashBang(p: *Parser, next: Fn) !void { - const val = try p.getHashBangValue(); - return p.jump(next, try p.cons(SHBANG, val)); + const interp, const arg_line = try p.getHashBangValue(); + if (arg_line) |args| { + return p.jump(next, try p.makeList(&.{ SHBANG, interp, args })); + } else { + return p.jump(next, try p.makeList(&.{ SHBANG, interp })); + } } -fn getHashBangValue(p: *Parser) !Value { +fn getHashBangValue(p: *Parser) !struct { Value, ?Value } { while (try p.readNoEofOpt("hash-bang")) |c| switch (c) { ' ', '\t' => continue, '\n' => return p.err(.InvalidCharacter, "hash-bang"), else => { try p.addChar(c); while (try p.read()) |c2| switch (c2) { - '\n' => return p.getCharsAsString(), + '\n' => return .{ try p.getCharsAsString(), null }, ' ', '\t' => break, else => try p.addChar(c2), }; const interp = try p.getCharsAsString(); - if (try p.getHashBangArgLine()) |arg_line| { - return try p.cons(interp, arg_line); - } else { - return interp; - } + return .{ interp, try p.getHashBangArgLine() }; }, }; unreachable; @@ -704,7 +743,7 @@ fn parseLabel(p: *Parser, next: Fn) !void { const n = try p.parseHex(u48, "datum label"); const l = value.fixnum.pack(n); switch (p.getUnread() orelse try p.readNoEof("datum label")) { - '%' => return p.jump(next, try p.cons(LABEL, l)), + '%' => return p.jump(next, try p.makeList(&.{ LABEL, l })), '=' => { try p.push(next); p.context.val = l; @@ -718,7 +757,7 @@ fn endLabelDatum(p: *Parser) !void { if (p.result.eq(value.none)) { return p.err(.InvalidCharacter, "label datum"); } - return p.retval(try p.cons(LABEL, try p.cons(p.context.val, p.result))); + return p.retval(try p.makeList(&.{ LABEL, p.context.val, p.result })); } fn parseList(p: *Parser, open: u8, next: Fn) !void { @@ -729,7 +768,7 @@ fn parseList(p: *Parser, open: u8, next: Fn) !void { '{' => '}', else => unreachable, }; - p.context.list_len = 0; + p.context.list_start = p.list_elts.items.len; switch (open) { '(' => {}, '[' => try p.addListElt(SQUARE), @@ -750,9 +789,6 @@ fn continueList(p: *Parser) !void { if (c == close) { return p.endList(); } - if (c == '&') { - return p.subr(.parseUnit, .endImproperList); - } return p.err(.InvalidCharacter, "list"); } @@ -762,32 +798,7 @@ fn continueList(p: *Parser) !void { } fn endList(p: *Parser) !void { - return p.retval(try p.getList(value.nil)); -} - -fn endImproperList(p: *Parser) !void { - if (p.result.eq(value.none)) { - return p.err(.InvalidCharacter, "list tail"); - } - p.context.val = try p.getList(p.result); - return p.closeImproperList(); -} - -fn closeImproperList(p: *Parser) !void { - const result = p.context.val; - const close = p.context.char; - var c1 = p.getUnread() orelse try p.read(); - while (c1) |c| : (c1 = try p.read()) { - if (c == close) { - return p.retval(result); - } - switch (try p.checkBlank(c)) { - .yes => {}, - .skip_unit => return p.subr(.parseUnit, .closeImproperList), - .no => return p.err(.InvalidCharacter, "after list tail"), - } - } - return p.err(.UnexpectedEof, "after list tail"); + return p.retval(try p.getList()); } fn parseQuoteExpr(p: *Parser, c1: u8, next: Fn) !void { @@ -808,7 +819,7 @@ fn endQuoteExpr(p: *Parser) !void { if (p.result.eq(value.none)) { return p.err(.InvalidCharacter, "quote expression datum"); } - return p.retval(try p.cons(p.context.val, p.result)); + return p.retval(try p.makeList(&.{ p.context.val, p.result })); } // Helpers @@ -836,7 +847,7 @@ pub fn isSpecialBareChar(c: u8) bool { pub fn isBareChar(c: u8) bool { return switch (c) { // zig fmt: off - 'a'...'z' , 'A'...'Z' , '0'...'9' , '!' , '$' , '%' , '*' , + 'a'...'z' , 'A'...'Z' , '0'...'9' , '!' , '$' , '%' , '&' , '*' , '+' , '-' , '/' , '<' , '=' , '>' , '?' , '^' , '_' , '~' , => true, // zig fmt: on else => false, diff --git a/src/zisp/io/Printer.zig b/src/zisp/io/Printer.zig index e6b3d5b..4b06005 100644 --- a/src/zisp/io/Printer.zig +++ b/src/zisp/io/Printer.zig @@ -7,7 +7,6 @@ const value = @import("../value.zig"); const Parser = io.Parser; const Value = value.Value; -const PairPtr = value.pair.PairPtr; const IstrPtr = value.istr.IstrPtr; const ArrayPtr = value.array.ArrayPtr; @@ -19,17 +18,21 @@ pub fn init(writer: *Writer) Printer { return .{ .writer = writer }; } -fn write(p: *Printer, str: []const u8) !void { - _ = try p.writer.write(str); +fn write(p: *Printer, bytes: []const u8) !void { + _ = try p.writer.write(bytes); +} + +fn writeByte(p: *Printer, c: u8) !void { + _ = try p.writer.writeByte(c); } pub fn print(p: *Printer, v: Value) anyerror!void { if (v.isSstr()) return p.printBareStr(value.sstr.unpack(&v)); if (v.isRune()) return p.printRune(v); if (v.isMisc()) return p.printMisc(v); - if (value.istr.check(v)) |str| return p.printBareStr(str.str()); - if (value.array.check(.str, v)) |str| return p.printBareStr(str.str()); - if (value.pair.check(v)) |pair| return p.printPair(pair); + if (value.list.check(v)) return p.printList(v); + if (value.istr.check(v)) return p.printIstr(v); + if (value.array.check(.str, v)) |str| return p.printBareStr(str.bytes()); @panic("Unsupported type to print."); } @@ -38,15 +41,20 @@ pub fn printBareStr(p: *Printer, s: []const u8) !void { const no_joins = Parser.isSpecialBareChar(s[0]); for (s) |c| { if (Parser.isBareChar(c)) { - try p.writer.writeByte(c); + try p.writeByte(c); } else if (no_joins and (c == '.' or c == ':')) { - try p.writer.writeByte(c); + try p.writeByte(c); } else { @panic("String needs quoting."); } } } +pub fn printIstr(p: *Printer, v: Value) !void { + const istr = value.istr.unpack(v); + try p.printBareStr(istr.bytes()); +} + pub fn printRune(p: *Printer, v: Value) !void { try p.write("#"); try p.write(value.rune.unpack(&v)); @@ -58,90 +66,201 @@ pub fn printMisc(p: *Printer, v: Value) !void { value.t.bits => p.write("#t"), value.nil.bits => p.write("()"), value.eof.bits => p.write("#EOF"), - value.none.bits => p.write("#NONE"), - value.undef.bits => p.write("#UNDEF"), else => @panic("Unsupported misc value to print."), }; } -pub fn printPair(p: *Printer, pair: PairPtr) !void { - try switch (pair.car.bits) { - Parser.PQSTR.bits => p.printQuotString(pair.cdr, '|'), - Parser.DQSTR.bits => p.printQuotString(pair.cdr, '"'), - Parser.ATSTR.bits => p.printAtString(pair.cdr), - Parser.LABEL.bits => p.printLabel(pair.cdr), - else => p.printList(pair), +pub fn printList(p: *Printer, list: Value) !void { + const len = value.list.getLenTag(list); + const ptr = value.list.getValPtr(list); + try switch (len) { + 2 => switch (ptr[0].bits) { + Parser.SHBANG.bits => p.printShebang(ptr[1]), + Parser.LABEL.bits => p.printLabel(ptr[1]), + Parser.HASH.bits => p.printHashDatum(ptr[1]), + Parser.PQSTR.bits => p.printQuotStr(ptr[1], '|'), + Parser.DQSTR.bits => p.printQuotStr(ptr[1], '"'), + Parser.ATSTR.bits => p.printAtLenStr(ptr[1]), + Parser.QUOTE.bits => p.printQuoteWithChar('\'', ptr[1]), + Parser.GRAVE.bits => p.printQuoteWithChar('`', ptr[1]), + Parser.COMMA.bits => p.printQuoteWithChar(',', ptr[1]), + Parser.SQUARE.bits => p.printListDirect(len, ptr), + Parser.BRACE.bits => p.printListDirect(len, ptr), + else => if (ptr[0].isRune()) { + return p.printRuneDatum(ptr[0], ptr[1]); + } else { + return p.printListDirect(len, ptr); + }, + }, + 3 => switch (ptr[0].bits) { + Parser.DOT.bits => p.printJoinWithChar('.', ptr[1], ptr[2]), + Parser.COLON.bits => p.printJoinWithChar(':', ptr[1], ptr[2]), + Parser.JOIN.bits => p.printJoin(ptr[1], ptr[2]), + Parser.SHBANG.bits => p.printShebangWithArgs(ptr[1], ptr[2]), + Parser.LABEL.bits => p.printLabelDatum(ptr[1], ptr[2]), + Parser.ATSTR.bits => p.printAtStr(ptr[1], ptr[2]), + else => p.printListDirect(len, ptr), + }, + else => p.printListDirect(len, ptr), }; } -fn printQuotString(p: *Printer, s: Value, comptime qchar: u8) !void { - try p.writer.writeByte(qchar); - const str = try value.string.getString(&s); - for (str) |c| switch (c) { +fn printJoinWithChar(p: *Printer, c: u8, d1: Value, d2: Value) !void { + try p.print(d1); + try p.writeByte(c); + try p.print(d2); +} + +fn printJoin(p: *Printer, d1: Value, d2: Value) !void { + try p.print(d1); + try p.print(d2); +} + +fn printShebang(p: *Printer, str: Value) !void { + try p.write("#!"); + try p.write(value.string.getBytes(&str)); + try p.write("\n"); +} + +fn printShebangWithArgs(p: *Printer, str: Value, str2: Value) !void { + try p.write("#!"); + try p.write(value.string.getBytes(&str)); + try p.write(" "); + try p.write(value.string.getBytes(&str2)); + try p.write("\n"); +} + +fn printLabel(p: *Printer, num: Value) !void { + try p.printLabelNum(num); + try p.write("%"); +} + +fn printLabelDatum(p: *Printer, num: Value, dat: Value) !void { + try p.printLabelNum(num); + try p.write("="); + try p.print(dat); +} + +fn printLabelNum(p: *Printer, num: Value) !void { + const x = value.fixnum.unpack(num); + std.debug.assert(x >= 0); + std.debug.assert(x <= std.math.maxInt(u48)); + + var buf: [12]u8 = undefined; + const end = std.fmt.printInt(&buf, x, 16, .lower, .{}); + + try p.write("#%"); + try p.write(buf[0..end]); +} + +fn printHashDatum(p: *Printer, d: Value) !void { + try p.write("#"); + if (value.string.isString(d)) try p.write("\\"); + try p.print(d); +} + +fn printRuneDatum(p: *Printer, r: Value, d: Value) !void { + try p.printRune(r); + if (value.string.isString(d)) try p.write("\\"); + try p.print(d); +} + +fn printQuotStr(p: *Printer, str: Value, comptime qchar: u8) !void { + try p.writeByte(qchar); + for (value.string.getBytes(&str)) |c| switch (c) { qchar => { - try p.writer.writeByte('\\'); - try p.writer.writeByte(qchar); + try p.writeByte('\\'); + try p.writeByte(qchar); }, '\\' => { - try p.writer.writeByte('\\'); - try p.writer.writeByte('\\'); + try p.writeByte('\\'); + try p.writeByte('\\'); }, else => { - try p.writer.writeByte(c); + try p.writeByte(c); }, }; - try p.writer.writeByte(qchar); + try p.writeByte(qchar); } -pub fn printAtString(p: *Printer, at_str_pair: Value) !void { - const pair = value.pair.assert(at_str_pair); - const byte = value.fixnum.unpack(pair.car); - std.debug.assert(byte <= 255); - const str = try value.string.getString(&pair.cdr); +fn printAtLenStr(p: *Printer, str: Value) !void { + const bytes = value.string.getBytes(&str); + try p.write("@"); - try p.writer.writeByte(@intCast(byte)); - try p.write(str); - try p.writer.writeByte(@intCast(byte)); -} + try p.writeByte(255); -pub fn printLabel(p: *Printer, v: Value) !void { - var num: Value = undefined; - var dat: ?Value = undefined; - if (value.pair.check(v)) |pair| { - num = pair.car; - dat = pair.cdr; - } else { - num = v; - dat = null; + var buf: [6]u8 = undefined; + var len = bytes.len; + inline for (0..6) |i| { + buf[5 - i] = @truncate(len); + len >>= 8; } - const x = value.fixnum.unpack(num); - std.debug.assert(x >= 0); - std.debug.assert(x <= std.math.maxInt(u48)); - var buf: [12]u8 = undefined; - const end = std.fmt.printInt(&buf, x, 16, .lower, .{}); + try p.write(buf[0..6]); + try p.write(bytes); +} - try p.write("#%"); - try p.write(buf[0..end]); - if (dat) |d| { - try p.write("="); - try p.print(d); - } else { - try p.write("%"); +fn printAtStr(p: *Printer, sentinel: Value, str: Value) !void { + const sentinel_num = value.fixnum.unpack(sentinel); + if (sentinel_num > 254) { + @panic("At-string sentinel must be <= 254."); } + + const byte: u8 = @intCast(sentinel_num); + const str_bytes = value.string.getBytes(&str); + for (str_bytes) |c| { + if (c == byte) @panic("String contains sentinel byte."); + } + try p.write("@"); + try p.writeByte(byte); + try p.write(str_bytes); + try p.writeByte(byte); +} + +fn printQuoteWithChar(p: *Printer, c: u8, d: Value) !void { + try p.writeByte(c); + try p.print(d); } -pub fn printList(p: *Printer, pair: PairPtr) !void { - try p.write("("); - try p.print(pair.car); - var cdr = pair.cdr; - while (value.pair.check(cdr)) |next| : (cdr = next.cdr) { - try p.write(" "); - try p.print(next.car); +fn printListDirect(p: *Printer, len: u3, vals: [*]Value) !void { + var close = ")"; + var i: usize = 0; + + switch (vals[0].bits) { + Parser.SQUARE.bits => { + try p.write("["); + close = "]"; + i = 1; + }, + Parser.BRACE.bits => { + try p.write("{"); + close = "}"; + i = 1; + }, + else => { + if (vals[0].isRune()) { + try p.printRune(vals[0]); + i = 1; + } + try p.write("("); + }, } - if (!value.nil.eq(cdr)) { - try p.write(" & "); - try p.print(cdr); + + if (len != 0) { + var first = true; + for (i..len) |j| { + if (!first) try p.write(" "); + first = false; + try p.print(vals[j]); + } + } else { + try p.print(vals[i]); + i += 1; + while (vals[i].bits != value.none.bits) : (i += 1) { + try p.write(" "); + try p.print(vals[i]); + } } - try p.write(")"); + + try p.write(close); } diff --git a/src/zisp/value.zig b/src/zisp/value.zig index 9fd9384..4ff683a 100644 --- a/src/zisp/value.zig +++ b/src/zisp/value.zig @@ -1,170 +1,4 @@ -//! -//! == NaN Packing Strategy == -//! -//! Format of a double, in most to least significant field order: -//! -//! { sign: u1, exponent: u11, fraction: u52 } -//! -//! When the exponent bits are all set, it's either a NaN or an Infinity. -//! -//! For value packing, almost all remaining 53 bits are available, giving us -//! about 2^53 values, except for the four following bit patterns: -//! -//! *** FORBIDDEN VALUES *** -//! -//! 1. Negative cqNaN = { sign = 1, exponent = max, fraction = 2^51 } -//! -//! 2. Negative Infinity = { sign = 1, exponent = max, fraction = 0 } -//! -//! 3. Positive cqNaN = { sign = 0, exponent = max, fraction = 2^51 } -//! -//! 4. Positive Infinity = { sign = 0, exponent = max, fraction = 0 } -//! -//! The abbreviation "cqNaN" stands for canonical quiet NaN. -//! -//! Note that 2^51 means the MSb of the 52 fraction bits being set, and the rest -//! being zero. The fraction MSb is also called the is_quiet flag, because it -//! demarcates quiet NaNs. The rest being zero makes it the canonical qNaN. -//! -//! The positive and negative cqNaN are the *only* NaN values that can actually -//! be returned by FP operations, which is convenient because it means we can -//! simply use them to represent themselves in Zisp. -//! -//! Infinity values may also be returned by FP operations, and we want them to -//! exist in Zisp as doubles as well, so they also represent themselves. -//! -//! Beyond those four bit patterns, all values with a maximum exponent (all bits -//! set) are fair game for representing other values, so 2^53 - 4 possibilities. -//! -//! We split those 2^53 - 4 available values into four groups, each allowing for -//! 2^51 - 1 different values to be encoded. (51-bit values excluding zero.) -//! -//! sign = 1, quiet = 1 :: Negative Fixnum from -1 to -2^51+1 -//! -//! sign = 1, quiet = 0 :: Positive Fixnum from 0 to 2^51-2 -//! -//! sign = 0, quiet = 1 :: Pointers and various immediates -//! -//! sign = 0, quiet = 0 :: Internal use by interpreter -//! -//! -//! === Fixnums === -//! -//! Negative fixnums actually represent themselves without needing to go through -//! any transformation. Only the smallest 52-bit signed negative, -2^51, cannot -//! be represented, as it would step on Forbidden Value #1, Negative cqNaN. -//! -//! Positive fixnums go through a bitsiwe NOT (implemented as an XOR mask to -//! combine it with removal of NaN-related high bits) to avoid the all-zero -//! payload value, which would step on Forbidden Value #2, Negative Infinity. -//! -//! -//! === Pointers and immediates === -//! -//! This region of 51-bit values is divided as follows based on the three high -//! bits, providing a payload value of 48 bits for each. -//! -//! 000 :: Regular pointer to Zisp heap (type tagged) -//! -//! 001 :: Weak pointer to Zisp heap (type tagged) -//! -//! 010 :: List segment array pointer (length tagged) -//! -//! 011 :: Interned string pointer (untagged) -//! -//! 100 :: Subdivided as follows: -//! -//! 0....... 0....... 0....... (etc.) :: Rune -//! -//! 1....... :: 128 40-bit types -//! -//! 0....... 1....... :: 16384 32-bit types -//! -//! 0....... 0....... 1....... :: 2097152 24-bit types -//! -//! (etc.) -//! -//! 101 :: Short immediate string -//! -//! 11s :: Small rational (signed) -//! -//! ==== Type-tagged pointers ==== -//! -//! Zisp heap pointers use 48-bit addresses. This is sufficient, since the -//! address space of user-land applications is effectively 48 bits on 64-bit -//! systems. Further, Zisp heap objects are allocated at 16-byte boundaries, -//! meaning the lowest 4 bits are always zero; this is used for type tagging, -//! providing immediate information about the type of object pointed to. -//! -//! Forbidden Value #3, Positive cqNaN, is avoided thanks to the fact that a -//! regular Zisp heap pointer can never be null. -//! -//! ==== Segmented lists ==== -//! -//! List segment arrays are also allocated at 16-byte boundaries, but use their -//! low 4 bits for segment length; see documentation of this API for details. -//! -//! ==== Interned strings ==== -//! -//! Interned string pointers don't use any low tag bits. Unlike other heap -//! objects, they can be allocated at any address. -//! -//! ==== Runes and misc. small values ==== -//! -//! Runes are symbols up to 6 ASCII characters used to implement reader syntax. -//! They are NUL-terminated if shorter than six characters, meaning they cannot -//! contain the NUL byte in their value. -//! -//! NOTE: The order in which the characters of the rune are encoded depends on -//! endianness. On little-endian systems (i.e. most modern architectures) the -//! characters will be in "reverse" order, with the first character in lowest -//! position, so the terminating NUL has to be searched from low to high. -//! -//! The fact that runes are limited to ASCII opens up a lot of space for other -//! small values to co-inhabit the same 48-bit range. We subdivide this space -//! into increasingly many potential types, with smaller and smaller payloads, -//! where the highest byte with a non-zero MSb determines which size category -//! we're in: If the highest byte has its MSb set, then its seven non-MSb bits -//! define a type and each type has a 40-bit value range; if the second highest -//! byte has its MSb set, then the 14 non-MSb bits of the two high bytes define -//! the type and each has a 32-bit value range; and so on. -//! -//! Unicode code points need 21 bits, so we use a 24-bit type for Characters. -//! Miscellaneous values like true, false, nil, eof, etc. are placed into an -//! 8-bit type, since there will never be that many of them. A virtually -//! unlimited number of user-defined enum types can fit here. -//! -//! ==== Short strings ==== -//! -//! Another 48-bit space is used for strings of zero to six bytes. Like runes, -//! these are NUL-terminated if shorter than six bytes, meaning that NUL cannot -//! appear in them, but otherwise they allow arbitrary bytes. -//! -//! ==== Small rationals ==== -//! -//! We use a 49-bit space for small exact rational numbers, with the numerator -//! being a two's complement 25-bit signed integer, and denominator a 24-bit -//! unsigned integer. -//! -//! -//! === Internal use values === -//! -//! The final 51-bit range is used for various internal purposes by the Zisp -//! interpreter, mostly related to transparent code optimization: -//! -//! 000 :: Direct function pointer aka CALL instruction -//! -//! 001 :: Lexically scoped variable reference by index -//! -//! 010 :: Pointer to heap object as constant data -//! -//! 011 :: Short immediate string as constant data -//! -//! 1.. :: TBD -//! -//! Forbidden Value #4, Positive Infinity, is avoided thanks to the fact that -//! direct function pointers cannot be null. -//! +//! See /doc/0/0-value.md to understand the NaN-packing strategy. const builtin = @import("builtin"); const std = @import("std"); @@ -174,17 +8,18 @@ const gc = @import("gc.zig"); pub const double = @import("value/double.zig"); pub const fixnum = @import("value/fixnum.zig"); +pub const ptr = @import("value/ptr.zig"); +pub const list = @import("value/list.zig"); +pub const istr = @import("value/istr.zig"); +pub const sstr = @import("value/sstr.zig"); +pub const srat = @import("value/srat.zig"); pub const rune = @import("value/rune.zig"); pub const char = @import("value/char.zig"); pub const misc = @import("value/misc.zig"); -pub const sstr = @import("value/sstr.zig"); -pub const boole = @import("value/boole.zig"); -pub const ptr = @import("value/ptr.zig"); -pub const pair = @import("value/pair.zig"); -pub const istr = @import("value/istr.zig"); pub const array = @import("value/array.zig"); +pub const boole = @import("value/boole.zig"); pub const string = @import("value/string.zig"); const endian = builtin.target.cpu.arch.endian(); @@ -209,11 +44,11 @@ pub fn sstrLen(x: u64) u8 { /// Transform back and forth between rune and sstr pub fn runeXsstr(v: Value) Value { - return @bitCast(v.bits ^ (1 << 48)); + return @bitCast(v.bits ^ (1 << 50)); } // Make sure false/true only differ in LSb. -pub const MiscValue = enum(u8) { f, t, nil, eof, none, undef }; +pub const MiscValue = enum(u8) { f, t, nil, eof, none }; // zig fmt: off pub const f = Value{ .misc = .{ .value = .f } }; @@ -221,56 +56,56 @@ pub const t = Value{ .misc = .{ .value = .t } }; pub const nil = Value{ .misc = .{ .value = .nil } }; pub const eof = Value{ .misc = .{ .value = .eof } }; pub const none = Value{ .misc = .{ .value = .none } }; -pub const undef = Value{ .misc = .{ .value = .undef } }; // zig fmt: on -/// A plain (unpacked, untagged) pointer into the Zisp heap. May point to any -/// kind of object. +/// A plain (unpacked, untagged) pointer into the Zisp heap. pub const Zptr = *align(16) anyopaque; /// Values for the lowest 4 bits of a heap pointer, indicating the heap type. -pub const PtrTag = enum(u4) { - /// Pair aka cons cell aka *[2]Value - pair, - /// Interned string buffer (8-bit length, then contents) - istr, +pub const HeapType = enum(u4) { + /// Unused so the 48-bit payload of a NaN-packed pointer is never zero and + /// cannot be confused for an actual NaN even in case of a null-index. + _unused = 0, + /// Array of various types (see `Array` type for details) array, - /// Procedure - proc, - pub fn ptype(self: PtrTag) type { + pub fn PtrType(self: HeapType) type { return switch (self) { - .pair => pair.PairPtr, - .istr => istr.IstrPtr, .array => array.ArrayPtr, else => @panic("not implemented"), }; } }; -pub const PtrTagUIntType = @typeInfo(PtrTag).@"enum".tag_type; +/// Make a "pointer etc." high 16-bit pattern with a 3-bit sub-range tag. +fn ptrEtc(comptime tag: u3) u16 { + return @as(u16, 0x7ff8) + tag; +} -// "Pointer etc." high bits (sign=0,exp=MAX,quiet=1) as a u13 field. -const ptr_etc: u13 = max(u13) - 1; +const hi16_tag_ptr = ptrEtc(0b000); +const hi16_tag_list = ptrEtc(0b001); +const hi16_tag_istr = ptrEtc(0b010); +const hi16_tag_sstr = ptrEtc(0b011); +const hi15_tag_srat = @as(u15, @intCast(ptrEtc(0b100) >> 1)); +const hi16_tag_rune = ptrEtc(0b111); -/// Represents a Zisp value/object. +/// Represents a Zisp value. pub const Value = packed union { - /// To get the value as a regular double. - double: f64, - /// To get an agnostic value for direct comparison with == i.e. eq? as well /// as manual bit-fiddling to test for and extract packed values. bits: u64, + /// Regular double, including +nan, -nan, +inf, -inf. + double: f64, + // Some of the structs below are just for inspection, whereas others are to - // initialize a new value of that category as well as read it that way. + // initialize a new value of that category, as well as to read it as such. + // Some of these are not used at all and are merely demonstrative. - // Note: Zig packed struct fields are ordered from LSb to MSb, contrary to - // most diagrams used to represent IEEE 754, including our own at the top. + // Note: Zig packed struct fields are ordered from LSb to MSb. /// Inspection through the lens of the general IEEE 754 double layout. - /// (Unused because we do manual bit-fiddling for optimal results.) ieee: packed struct { rest: u51, quiet: bool, @@ -278,110 +113,84 @@ pub const Value = packed union { sign: bool, }, - /// For initializing and reading fixnums. + /// 52-bit signed fixnum fixnum: packed struct { code: u51, negative: bool, - _: u11 = max(u11), + _exp: u11 = max(u11), _is_fixnum: bool = true, }, - /// For initializing and reading pointers. + /// Ordinary heap pointer ptr: packed struct { - tagged_value: u48, - is_weak: bool = false, - _1: bool = false, - _2: bool = false, - _: u13 = ptr_etc, - }, - - /// List segment array pointers - lsa: packed struct { - tagged_value: u48, - _id: u3 = 0b010, - _: u13 = ptr_etc, + index: u44, + htype: HeapType, + _hi16_tag: u16 = hi16_tag_ptr, }, - /// Interned string pointer - isp: packed struct { - pointer: u48, - _id: u3 = 0b011, - _: u13 = ptr_etc, + /// List pointer + list: packed struct { + len_tagged_ptr: u48, + _hi16_tag: u16 = hi16_tag_list, }, - /// For initializing and reading runes. - rune: packed struct { - // actually [6]u8 but packed struct cannot contain arrays - name: u48, - _id: u3 = 0b100, - _: u13 = ptr_etc, + /// Istr pointer + istr: packed struct { + ptr: u48, + _hi16_tag: u16 = hi16_tag_istr, }, - /// For initializing and reading short strings. + /// Short string sstr: packed struct { // actually [6]u8 but packed struct cannot contain arrays bytes: u48, - _id: u3 = 0b101, - _: u13 = ptr_etc, + _hi16_tag: u16 = hi16_tag_sstr, }, - /// For initializing and reading small rats (rational numbers). + /// Small rat (rational number) srat: packed struct { q: u24, p: i25, - _tag: u2 = 0b11, - _: u13 = ptr_etc, + _hi16_tag: u15 = hi15_tag_srat, + }, + + /// Rune (6-byte ASCII string) + rune: packed struct { + // actually [6]u8 but packed struct cannot contain arrays + name: u48, + _hi16_tag: u16 = hi16_tag_rune, }, // TODO: Use a general Small Value type registration mechanism. - /// For initializing and reading characters. char: packed struct { value: u24, _sv_tag: u24 = 0x000080, - _id: u3 = 0b100, - _: u13 = ptr_etc, + _hi16_tag: u16 = hi16_tag_rune, }, // TODO: Use a general Small Value type registration mechanism. - /// For initializing and reading miscellaneous values. misc: packed struct { value: MiscValue, _sv_tag: u40 = 0x0000000080, - _id: u3 = 0b100, - _: u13 = ptr_etc, + _hi16_tag: u16 = hi16_tag_rune, }, - // Disjoint masks where a specific bit or bit-group are set. - // zig fmt: off - const mask_sign: u64 = 1 << 63 ; // 1 sign bit - const mask_exp: u64 = max(u11) << 52 ; // 11 exponent bits - const mask_quiet: u64 = 1 << 51 ; // 1 quiet bit of fraction - const mask_rest: u64 = max(u51) ; // 51 rest bits of fraction - // zig fmt: on - - // Mask for pointer type tag bits - const mask_ptr_tag: u48 = max(PtrTagUIntType); - /// Dumps the value for inspection. pub fn dump(v: Value) void { - const sign: u1 = @intCast((v.bits & mask_sign) >> 63); - const exp: u11 = @intCast((v.bits & mask_exp) >> 52); - const quiet: u1 = @intCast((v.bits & mask_quiet) >> 51); - const rest: u51 = @intCast(v.bits & mask_rest); std.debug.print( \\value: 0x{x} \\sign: {}, exp: {b}, quiet: {} \\rest: 0x{x} \\ - , .{ v.bits, sign, exp, quiet, rest }); + , .{ v.bits, v.ieee.sign, v.ieee.exp, v.ieee.quiet, v.ieee.rest }); } - /// To treat the 64-bit value as an 8-byte array. + /// To treat the 64-bit value as an 8-byte const array. pub fn bufRO(v: *const Value) *const [8]u8 { return @ptrCast(v); } - /// To treat the 64-bit value as an 8-byte array. + /// To treat the 64-bit value as an 8-byte mutable array. pub fn bufRW(v: *Value) *[8]u8 { return @ptrCast(v); } @@ -391,19 +200,11 @@ pub const Value = packed union { return v1.bits == v2.bits; } - // It would be great if we could just write the most readable code here, - // using the packed struct definitions above, but the optimizer isn't - // sufficiently intelligent to turn that into optimal instructions, so - // manual bit fiddling it is. + // It would be great if we could just write the most readable code in the + // following functions, using the packed struct definitions above, but the + // optimizer isn't smart enough, so manual bit fiddling it is. - // The isDouble and isFixnum checks are not as efficient as we would like - // them to be, each requiring two 10-byte MOV instructions to load 64-bit - // values for masking and comparison. This seems difficult to improve on - // without sacrificing large value ranges in our NaN-packing scheme; it's - // probably best to leave it like this and hope that type checks will be - // hoisted outside hot-spots by a smart compiler. - - /// Checks for a Zisp double, including: +nan.0, -nan.0, +inf.0, -inf.0 + /// Checks for a double, including: +nan, -nan, +inf, -inf. pub fn isDouble(v: Value) bool { // Readable version: // @@ -411,18 +212,22 @@ pub const Value = packed union { // // Optimized: // - // 1. AND away the sign and quiet bits, since they may vary. + // 1. Shift the 12 highest bits all the way to the right, then mask out + // the sign bit. Allows efficiently checking if the exponent is all + // set by comparing to 0x7ff. // - // 2. If exp is all 1, and rest non-zero, it's a packed value, so check - // if the value is less than that. + // 2. Shift out 13 high bits (sign, expt, quiet) so we can easily check + // if the rest bits are zero. // - return v.bits & ~(mask_sign | mask_quiet) <= mask_exp; + const expt = v.bits >> 52 & 0x7ff; + const rest = v.bits << 13; + return expt != 0x7ff or rest == 0; } // Imagine there's an isPacked() implemented as !isDouble() here, used to // make the following functions more readable. - /// Checks for a fixnum. + /// Checks for a fixnum integer. pub fn isFixnum(v: Value) bool { // Readable version: // @@ -430,137 +235,93 @@ pub const Value = packed union { // // Optimized: // - // 1. AND away the quiet bit, since it may vary. + // 1. Shift the 12 highest bits all the way to the right; these need to + // be all set, since the sign bit (and all exponent bits) being set + // marks a fixnum. // - // 2. Check if highest 12 all 1, rest not all 0, with a greater-than. + // 2. But also verify that the lowest 51 bits aren't zero, since that + // would be a cqNaN or Infinity. // - return v.bits & ~mask_quiet > mask_sign | mask_exp; + const expt = v.bits >> 52; + const rest = v.bits << 13; + return expt == 0xfff and rest != 0; + } + + /// Checks if the value is any type-tagged pointer. You probably want to + /// use getPtr() or getPtrAny() instead, so you can combine the check with + /// the extraction of the pointer and type tag. + pub fn isPtrAny(v: Value) bool { + const hi: u16 = @intCast(v.bits >> 48); + const lo: u48 = @truncate(v.bits); + return hi == hi16_tag_ptr and lo != 0; } /// Checks for a pointer with a given type tag, returning null on failure - /// and the pointer value otherwise. This function is for when a value is - /// assumed to be of a heap type, such as when `car` is called on it, and - /// the "high" pointer tags (49-51) that encode properties like weakness - /// don't matter; we only care if it's some kind of valid heap pointer of - /// the given type, so we only check the type tag in the low bits (0-2). - pub fn getPtr(v: Value, comptime tag: PtrTag) ?tag.ptype() { - // Readable version: - // - // if (v.isPacked() and !v.ieee.sign and v.ieee.quiet ...) - // - // Optimized: - // - // I'm not explaining that shit. Figure it out yourself. - // - // Jokes aside, the code should be self-explanatory, except for: - // - // Intermediate values should be given explicit types here, for optimal - // code-gen. And it should be noted that the null pointer, with no - // other high bits set (49-51), which would represent +cqNaN, also - // results in a null return, so it can't be mistaken for a pointer. - // - // This function currently assumes that bits 50 and 51 being set, which - // has no defined semantics right now, doesn't change the fact that the - // pointer is a Zisp heap pointer. If we decide to use those bits for - // something else, simply change the hi_bits check to include them, so - // that for example bits 50 and 51 must be zero, or 51 must be zero - // while 50 is still ignored, and so on, as appropriate. - // - const hi_bits: u14 = @intCast(v.bits >> 50); - const ptr_val: u48 = @intCast(v.bits & ~mask_ptr_tag); - const tag_bits: PtrTagUIntType = @intCast(v.bits & mask_ptr_tag); - const is_ptr = hi_bits == 0b01111111111110; - const is_tag = tag_bits == @intFromEnum(tag); - return if (is_ptr and is_tag) @ptrFromInt(ptr_val) else null; + /// and the pointer value otherwise. + pub fn getPtr(v: Value, comptime ht: HeapType) ?ht.PtrType() { + // Check if the 20 highest bits equal the combination of the 16-bit + // pattern marking a pointer, followed by the 4-bit heap type tag. + const hi20_bits: u20 = @intCast(v.bits >> 44); + const ptr_bits: u20 = hi16_tag_ptr; + const tag_bits: u20 = @intFromEnum(ht); + if (hi20_bits != ptr_bits << 4 | tag_bits) return null; + + // A zero payload value here would actually indicate that the value is + // not a pointer at all, but the Positive cqNaN value; this is totally + // fine, because @ptrFromInt() turns zero into null anyway. + return @ptrFromInt(v.bits << 20 >> 16); } /// Checks for a pointer and returns the value and tag separately, or null - /// if this isn't a pointer at all. This could be useful for a dispatch - /// table based on type. - pub fn getPtrAny(v: Value) ?struct { Zptr, PtrTag } { - // See last function, which is almost identical. - const hi_bits: u14 = @intCast(v.bits >> 50); - const ptr_val: u48 = @intCast(v.bits & ~mask_ptr_tag); - const tag_bits: PtrTagUIntType = @intCast(v.bits & mask_ptr_tag); - const is_ptr = hi_bits == 0b01111111111110; - const zptr: Zptr = @ptrFromInt(ptr_val); - const tag: PtrTag = @enumFromInt(tag_bits); - return if (is_ptr) .{ zptr, tag } else null; + /// if this isn't a pointer at all. Could be useful for a dispatch table. + pub fn getPtrAny(v: Value) ?struct { Zptr, HeapType } { + const hi16_bits: u16 = @intCast(v.bits >> 48); + if (hi16_bits != hi16_tag_ptr) return null; + + const ptr_val: u48 = @intCast(v.bits << 20 >> 16); + if (ptr_val == 0) return null; + + const ht_val: u4 = @intCast(v.bits << 16 >> 60); + return .{ @ptrFromInt(ptr_val), @enumFromInt(ht_val) }; } - /// Checks whether the value is a pointer with certain property bits, not - /// caring about the type tag bits. NOTE: This function doesn't check for - /// +cqNaN, which will be mis-identified as a pointer with zero props and - /// address. Therefore, don't expose this function to Zisp! Parameter - /// `props_mask` specifies props bits to mask away with a NAND, so it's - /// possible to only check for bits you care for. For example, to check - /// weakness, pass `0b110` for `props_mask` and `0b001` for `props`. - pub fn isPtrProps(v: Value, props_mask: u3, props: u3) bool { - const hi_bits: u16 = @intCast(v.bits >> 48 & ~@as(u16, props_mask)); - return hi_bits == 0b0111111111111000 | @as(u16, props); + /// Checks if the value is a list pointer. + pub fn isList(v: Value) bool { + return v.bits >> 48 == hi16_tag_list; } - /// Checks whether the value is any kind of pointer, making sure not to - /// mis-identify +cqNaN as a null pointer with zero property bits, which - /// makes this a bit slower than `isPtrProps()`. - pub fn isPtrAny(v: Value) bool { - const hi: u14 = @intCast(v.bits >> 50); - const lo: u50 = @truncate(v.bits); - return hi == 0b01111111111110 and lo != 0; + /// Checks if the value is an istr pointer. + pub fn isIstr(v: Value) bool { + return v.bits >> 48 == hi16_tag_istr; + } + + /// Checks if the value is a short string. + pub fn isSstr(v: Value) bool { + return v.bits >> 48 == hi16_tag_sstr; } - // TODO: Needs update from here on, and test the above + /// Checks if the value is a small rat (rational number). + pub fn isSrat(v: Value) bool { + return v.bits >> 49 == hi15_tag_srat; + } /// Checks for a rune. pub fn isRune(v: Value) bool { - // - // 1. AND away non-MSb bits of the low 6 bytes, since they may vary. - // - // 2. Check if the rest is the exact pattern we expect, with sign = 0, - // exp all 1, quiet = 0, 3-bit tag = 0, and MSb of remaining bytes 0, - // i.e., only the exp bits are set. - // - // 3. Ensnure that the "first" (based on endianness) of the 6 lowest - // bytes is not NUL, which would mean it's the illegal "empty string" - // rune corresponding to +Infinity. - // - // Note that, while this is a rather complicated check for a type that - // appears ubiquitously in parser output, we rarely use this type check, - // instead using direct comparison to specific rune values, so it's not - // important if this is a little inefficient. - // - const msb_mask = 0xffff808080808080; - const nul_mask = switch (endian) { - .big => 127 << 40, - .little => 127, - }; - return v.bits & msb_mask == mask_exp and v.bits & nul_mask != 0; + // This check isn't used much, since one typically checks for specific + // runes via direct equality, so the efficiency here is unimportant. + const hi = v.bits >> 48; + const lo = v.bits << 16; + const msb_mask = 0x7f7f7f7f7f7f0000; + return hi == 0x7fff and lo != 0 and lo & msb_mask == lo; } - // The rest are straightforward compared to the above, since the cqNaN and - // Infinity special-cases are out of the way and we just need to compare N - // high bits to a specific pattern. - - /// Checks for a 24-bit character value. This makes no guarantees related - /// to Unicode, such as being a valid Code Point or Unicode Scalar Value. + // TODO: Use a general Small Value type registration mechanism. pub fn isChar(v: Value) bool { - // TODO: Use a general Small Value type registration mechanism. - return v.bits >> 24 == 0x7ff0000080; + return v.bits >> 24 == 0x7fff000080; } - /// Checks for an 8-bit miscellaneous value. + // TODO: Use a general Small Value type registration mechanism. pub fn isMisc(v: Value) bool { - // TODO: Use a general Small Value type registration mechanism. - return v.bits >> 8 == 0x7ff00000000080; - } - - /// Checks if the value is a short string of either kind. - pub fn isSstr(v: Value) bool { - return v.bits >> 48 == 0x7ff1; - } - - /// Checks if the value is a small rat (rational number). - pub fn isSrat(v: Value) bool { - return v.bits >> 49 == 0x7ff2 >> 1; + return v.bits >> 8 == 0x7fff0000000080; } }; diff --git a/src/zisp/value/array.zig b/src/zisp/value/array.zig index ad04ce2..5d326a1 100644 --- a/src/zisp/value/array.zig +++ b/src/zisp/value/array.zig @@ -140,7 +140,7 @@ pub const ArrayHeader = packed struct(u64) { } } - pub fn str(self: *Self) []const u8 { + pub fn bytes(self: *Self) []u8 { if (self.is_slice) { const buf = self.bufU8(); const start, const end = self.sliceInfo(); @@ -149,11 +149,15 @@ pub const ArrayHeader = packed struct(u64) { var arr = self; if (self.is_ptr) { arr = self.arrPointer() orelse { - @panic("Called str() on array with direct buffer pointer."); + @panic("Called bytes() on array with direct buffer pointer."); }; } return arr.bufContent()[0..arr.size()]; } + + pub fn bytesRO(self: *Self) []const u8 { + return self.bytes(); + } }; const ArrayType = enum(u2) { int, flt, val, str }; diff --git a/src/zisp/value/istr.zig b/src/zisp/value/istr.zig index 35093ed..dfe4614 100644 --- a/src/zisp/value/istr.zig +++ b/src/zisp/value/istr.zig @@ -15,7 +15,7 @@ const Value = value.Value; // Zig API /// Pointer to an interned string. First byte is length. -pub const IstrPtr = *align(@alignOf(value.Zptr)) IstrHead; +pub const IstrPtr = *IstrHead; pub const IstrHead = packed struct(u8) { len: u8, @@ -24,7 +24,7 @@ pub const IstrHead = packed struct(u8) { return @ptrCast(self); } - pub fn str(self: *@This()) []const u8 { + pub fn bytes(self: *@This()) []const u8 { const start = @sizeOf(IstrHead); const len: usize = self.len; return self.bufU8()[start .. start + len]; @@ -37,15 +37,15 @@ pub const IstrHead = packed struct(u8) { } }; -pub fn check(v: Value) ?IstrPtr { - return v.getPtr(.istr) orelse null; +pub fn check(v: Value) bool { + return v.isIstr(); } -pub fn assert(v: Value) IstrPtr { - return check(v) orelse { +pub fn assert(v: Value) void { + if (!check(v)) { v.dump(); @panic("not istr"); - }; + } } pub fn isValidIstr(s: []const u8) bool { @@ -68,7 +68,12 @@ pub fn new(alloc: Alloc, s: []const u8) !IstrPtr { } pub fn pack(istr: IstrPtr) Value { - return value.ptr.pack(.istr, istr); + const ptr = @intFromPtr(istr); + return .{ .istr = .{ .ptr = @intCast(ptr) } }; +} + +pub fn unpack(v: Value) IstrPtr { + return @ptrFromInt(v.istr.ptr); } pub fn getOrNew(s: []const u8) !Value { @@ -80,19 +85,22 @@ pub fn getOrNewInSet(set: *IstrSet, s: []const u8) !Value { return pack(try set.getOrNew(s)); } -pub fn getStr(v: Value) []const u8 { - return assert(v).str(); +pub fn getBytes(v: Value) []const u8 { + assert(v); + const istr: IstrPtr = @ptrFromInt(v.istr.ptr); + return istr.bytes(); } // Zisp API pub fn pred(v: Value) Value { - return value.boole.pack(check(v) != null); + return value.boole.pack(check(v)); } pub fn getLen(v: Value) Value { - const istr = assert(v); - return value.fixnum.pack(@intCast(istr.len)); + assert(v); + const istr: IstrPtr = @ptrFromInt(v.istr.ptr); + return value.fixnum.pack(istr.len); } // TODO: Zisp representation of IstrSet & ability to intern in given IstrSet diff --git a/src/zisp/value/list.zig b/src/zisp/value/list.zig new file mode 100644 index 0000000..d040a34 --- /dev/null +++ b/src/zisp/value/list.zig @@ -0,0 +1,64 @@ +//! List array + +const std = @import("std"); + +const Alloc = std.mem.Allocator; + +const gc = @import("../gc.zig"); +const value = @import("../value.zig"); + +const ListPool = gc.ListPool; +const Value = value.Value; + +// Zig API + +pub fn check(v: Value) bool { + return v.isList(); +} + +pub fn assert(v: Value) void { + if (!check(v)) { + v.dump(); + @panic("not istr"); + } +} + +pub fn new(alloc: Alloc, pool: ?*ListPool, vals: []const Value) !Value { + var len_tagged_ptr: u48 = undefined; + if (vals.len < 8 and pool != null) { + const ptr = try pool.?.create(@intCast(vals.len)); + for (vals, 0..) |v, i| ptr[i] = v; + len_tagged_ptr = @intCast(@intFromPtr(ptr) | vals.len); + } else { + const ptr = try alloc.alloc(Value, vals.len + 1); + for (vals, 0..) |v, i| ptr[i] = v; + ptr[vals.len] = value.none; + len_tagged_ptr = @intCast(@intFromPtr(ptr.ptr)); + } + return .{ .list = .{ .len_tagged_ptr = len_tagged_ptr } }; +} + +pub fn getLenTag(v: Value) u3 { + return @truncate(v.bits); +} + +pub fn getValPtr(v: Value) [*]Value { + return @ptrFromInt(v.bits & 0x0000fffffffffff8); +} + +// Zisp API + +pub fn pred(v: Value) Value { + return value.boole.pack(check(v)); +} + +pub fn getLen(v: Value) Value { + assert(v); + var len = getLenTag(v); + if (len == 0) { + const ptr = getValPtr(v); + len = 8; + while (ptr[len].bits != value.none.bits) len += 1; + } + return value.fixnum.pack(@intCast(len)); +} diff --git a/src/zisp/value/ptr.zig b/src/zisp/value/ptr.zig index ef12f79..3a6995b 100644 --- a/src/zisp/value/ptr.zig +++ b/src/zisp/value/ptr.zig @@ -3,7 +3,7 @@ const std = @import("std"); const gc = @import("../gc.zig"); const value = @import("../value.zig"); -const PtrTag = value.PtrTag; +const HeapType = value.HeapType; const Value = value.Value; const Zptr = value.Zptr; @@ -22,70 +22,12 @@ pub fn assert(v: Value) void { } } -pub fn checkWeak(v: Value) bool { - return v.isPtrProps(0b110, 0b001); +pub fn pack(comptime ht: HeapType, ptr: ht.PtrType()) Value { + const index: u44 = @intCast(@intFromPtr(ptr) >> 4); + return .{ .ptr = .{ .index = index, .htype = ht } }; } -pub fn assertWeak(v: Value) void { - if (!checkWeak(v)) { - v.dump(); - @panic("not weak pointer"); - } -} - -pub fn _pack(comptime tag: PtrTag, ptr: tag.ptype(), is_weak: bool) Value { - const ptr_val: usize = @intFromPtr(ptr); - std.debug.assert(ptr_val < std.math.maxInt(u48)); - const tagged: u48 = @intCast(ptr_val | @intFromEnum(tag)); - return .{ .ptr = .{ .tagged_value = tagged, .is_weak = is_weak } }; -} - -pub fn pack(comptime tag: PtrTag, ptr: tag.ptype()) Value { - return _pack(tag, ptr, false); -} - -pub fn packWeak(comptime tag: PtrTag, ptr: tag.ptype()) Value { - return _pack(tag, ptr, true); -} - -pub fn setWeakNull(v: *Value) void { - assertWeak(v.*); - v.ptr.tagged_value = 0; -} - -pub fn isWeakNull(v: Value) bool { - assertWeak(v); - return v.ptr.tagged_value == 0; -} - -pub fn unpack(v: Value) struct { Zptr, PtrTag } { +pub fn unpack(v: Value) struct { Zptr, HeapType } { assert(v); return v.getPtrAny() orelse unreachable; } - -// Zisp API - -pub fn makeWeak(v: Value) Value { - assert(v); - var copy = v; - copy.ptr.is_weak = true; - return copy; -} - -pub fn predWeak(v: Value) Value { - return value.boole.pack(checkWeak(v)); -} - -pub fn predWeakNull(v: Value) Value { - return value.boole.pack(isWeakNull(v)); -} - -pub fn getWeak(v: Value) Value { - if (isWeakNull(v)) { - return value.f; - } else { - var copy = v; - copy.ptr.is_weak = false; - return copy; - } -} diff --git a/src/zisp/value/srat.zig b/src/zisp/value/srat.zig new file mode 100644 index 0000000..65b3dba --- /dev/null +++ b/src/zisp/value/srat.zig @@ -0,0 +1 @@ +// todo diff --git a/src/zisp/value/string.zig b/src/zisp/value/string.zig index 129e35b..1fe5e5e 100644 --- a/src/zisp/value/string.zig +++ b/src/zisp/value/string.zig @@ -4,10 +4,17 @@ const value = @import("../value.zig"); const Value = value.Value; -pub fn getString(s: *const Value) ![]const u8 { +pub fn isString(v: Value) bool { + if (value.sstr.check(v)) return true; + if (value.istr.check(v)) return true; + if (value.array.check(.str, v)) |_| return true; + return false; +} + +pub fn getBytes(s: *const Value) []const u8 { if (value.sstr.check(s.*)) return value.sstr.unpack(s); - if (value.istr.check(s.*)) |istr_ptr| return istr_ptr.str(); - if (value.array.check(.str, s.*)) |arr_ptr| return arr_ptr.str(); + if (value.istr.check(s.*)) return value.istr.unpack(s.*).bytes(); + if (value.array.check(.str, s.*)) |arr_ptr| return arr_ptr.bytesRO(); s.dump(); @panic("This is not a string."); } diff --git a/update-html.sh b/update-html.sh index 0c37ffd..e611006 100755 --- a/update-html.sh +++ b/update-html.sh @@ -20,7 +20,10 @@ md2ht() { fi echo "$src -> $dst" title=$(sed 's/# //; s/&/\\&/; s_/_\\/_; q' "$src") - if [ "${src##*/}" = index.md ] + if [ "$src" = html/index.md ] # It's the root + then + nav='' + elif [ "${src##*/}" = index.md ] then nav='^ Up' else @@ -34,7 +37,10 @@ md2ht() { { sed "s/__TITLE__/$title/" html/prelude.html echo "" - sed "1 a $nav" < "$src" | markdown2 -x "$ext" + if [ "$nav" ] + then + sed "1 a $nav" < "$src" | markdown2 -x "$ext" + fi echo "" echo "" } > "$dst" -- cgit v1.2.3