diff options
| -rw-r--r-- | docs/c1/1-parse.md | 4 | ||||
| -rw-r--r-- | src/zisp/gc/IstrSet.zig | 53 | ||||
| -rw-r--r-- | src/zisp/value/istr.zig | 20 |
3 files changed, 69 insertions, 8 deletions
diff --git a/docs/c1/1-parse.md b/docs/c1/1-parse.md index 31e340a..0fc0da5 100644 --- a/docs/c1/1-parse.md +++ b/docs/c1/1-parse.md @@ -228,12 +228,12 @@ including zero (aka ASCII NULL). The parser reads bytes, not characters, and has no concept of a character encoding, which means that a string can contain UTF-8 byte sequences, but these are not tested for validity. -A string that is up to 64 bytes long is automatically *interned*, meaning any +A string that is up to 255 bytes long is automatically *interned*, meaning any occurrence of the same string -- equal in length and containing the same byte values -- ends up being represented by the same bit-pattern; either a memory address, or an immediate representation within a CPU word for short strings. -Strings with a length greater than 64 bytes end up being represented by a +Strings with a length greater than 255 bytes end up being represented by a distinct memory address, even if they are equal in length and content. diff --git a/src/zisp/gc/IstrSet.zig b/src/zisp/gc/IstrSet.zig new file mode 100644 index 0000000..c90f5dd --- /dev/null +++ b/src/zisp/gc/IstrSet.zig @@ -0,0 +1,53 @@ +//! +//! Efficient hash set for short(ish) strings. +//! +//! Layout: []Bucket +//! +//! * Bucket: [8]Sector +//! +//! * Sector: [8]u8 or u64 +//! +//! Strategy: Hybrid between open addressing with linear probing, and chaining, +//! with special markers for indirection; see details below. +//! +//! The Zisp parser interns strings up to 255 bytes in length, using the `Istr` +//! heap representation with a len prefix of one byte and hash cache suffix of +//! eight bytes. This means that strings of up to 55 bytes of content can fit +//! within 64 bytes. We store these inline in our buckets. A pointer to that +//! bucket is then itself the Istr. +//! +//! A bucket can contain multiple strings if they fit. The strings are aligned +//! at 8 byte "sectors" so a pointer to a sector can also be a valid Istr. +//! +//! A sector may appear empty (all 0 bytes) but actually be part of a string; +//! this is not a problem since strings are length-prefixed. Linear probing +//! starts at the beginning of a bucket, and jumps over occupied sectors as +//! indicated by a string's length prefix. This can continue across buckets. +//! When an empty sector is found after jumping over a string, we check if +//! there's enough space within the bucket to hold the string, otherwise we +//! advance into the next bucket and keep probing. +//! +//! Splitting across buckets is not allowed, since this introduces ambiguity: +//! Strings can contain ASCII NUL bytes, so if a string ending in a series of +//! NUL bytes were allowed to spill into the next bucket, it would appear to +//! begin with an empty sector. A later probe sequence beginning with that +//! bucket would then see it as empty and overwrite part of another string. +//! +//! Since strings up to 6 bytes are packed into NaN, they can never appear here, +//! which means the empty string cannot appear, so a length prefix cannot be 0. +//! Thus there's no ambiguity between empty sector and empty string. +//! +//! A sector can begin with a 0 byte, but have a non-0 value as a u64; this +//! indicates a packed pointer to an Istr elsewhere on the heap. The low 48 +//! bits are the pointer, while the high bits encode the length of the string +//! being pointed to, so we don't need to follow the pointer for the equality +//! check in case the lengths are different. Also, in this case, the cached +//! hash follows directly in the next sector, so we also needn't follow the +//! pointer when resizing the hash set. +//! + +/// Bucket count +bcount: usize = 512, // 512 + +used_bytes: usize, +total_bytes: usize, diff --git a/src/zisp/value/istr.zig b/src/zisp/value/istr.zig index f34dbd0..2ca2dad 100644 --- a/src/zisp/value/istr.zig +++ b/src/zisp/value/istr.zig @@ -1,25 +1,33 @@ const std = @import("std"); -const value = @import("../value.zig"); const gc = @import("../gc.zig"); +const value = @import("../value.zig"); const Value = value.Value; // Zig API -/// Pointer to beginning of an interned string with a 1-byte length prefix. -/// Aligned to 8 bytes like all Zisp heap pointers. +/// Pointer to beginning of an interned string with a 1-byte length prefix and +/// 8-byte hash cache suffix. Aligned to 8 bytes, like all Zisp heap pointers. pub const Istr = *align(8) struct { - pub fn buf(self: *@This()) [*]u8 { + fn buf(self: *@This()) [*]u8 { return @ptrCast(self); } pub fn len(self: *@This()) u8 { - return buf()[0]; + return self.buf()[0]; } pub fn str(self: *@This()) []u8 { - return buf()[1 .. 1 + len()]; + const end = 1 + self.len(); + return self.buf()[1..end]; + } + + pub fn getHash(self: *@This()) u64 { + // Hash stored in the next naturally aligned u64 + const hash_idx = ((self.len() + 8) & -8) / 8; + const buf: [*]u64 = @ptrCast(self); + return buf[hash_idx]; } }; |
