summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/zisp/gc/IstrSet.zig53
-rw-r--r--src/zisp/value/istr.zig20
2 files changed, 67 insertions, 6 deletions
diff --git a/src/zisp/gc/IstrSet.zig b/src/zisp/gc/IstrSet.zig
new file mode 100644
index 0000000..c90f5dd
--- /dev/null
+++ b/src/zisp/gc/IstrSet.zig
@@ -0,0 +1,53 @@
+//!
+//! Efficient hash set for short(ish) strings.
+//!
+//! Layout: []Bucket
+//!
+//! * Bucket: [8]Sector
+//!
+//! * Sector: [8]u8 or u64
+//!
+//! Strategy: Hybrid between open addressing with linear probing, and chaining,
+//! with special markers for indirection; see details below.
+//!
+//! The Zisp parser interns strings up to 255 bytes in length, using the `Istr`
+//! heap representation with a len prefix of one byte and hash cache suffix of
+//! eight bytes. This means that strings of up to 55 bytes of content can fit
+//! within 64 bytes. We store these inline in our buckets. A pointer to that
+//! bucket is then itself the Istr.
+//!
+//! A bucket can contain multiple strings if they fit. The strings are aligned
+//! at 8 byte "sectors" so a pointer to a sector can also be a valid Istr.
+//!
+//! A sector may appear empty (all 0 bytes) but actually be part of a string;
+//! this is not a problem since strings are length-prefixed. Linear probing
+//! starts at the beginning of a bucket, and jumps over occupied sectors as
+//! indicated by a string's length prefix. This can continue across buckets.
+//! When an empty sector is found after jumping over a string, we check if
+//! there's enough space within the bucket to hold the string, otherwise we
+//! advance into the next bucket and keep probing.
+//!
+//! Splitting across buckets is not allowed, since this introduces ambiguity:
+//! Strings can contain ASCII NUL bytes, so if a string ending in a series of
+//! NUL bytes were allowed to spill into the next bucket, it would appear to
+//! begin with an empty sector. A later probe sequence beginning with that
+//! bucket would then see it as empty and overwrite part of another string.
+//!
+//! Since strings up to 6 bytes are packed into NaN, they can never appear here,
+//! which means the empty string cannot appear, so a length prefix cannot be 0.
+//! Thus there's no ambiguity between empty sector and empty string.
+//!
+//! A sector can begin with a 0 byte, but have a non-0 value as a u64; this
+//! indicates a packed pointer to an Istr elsewhere on the heap. The low 48
+//! bits are the pointer, while the high bits encode the length of the string
+//! being pointed to, so we don't need to follow the pointer for the equality
+//! check in case the lengths are different. Also, in this case, the cached
+//! hash follows directly in the next sector, so we also needn't follow the
+//! pointer when resizing the hash set.
+//!
+
+/// Bucket count
+bcount: usize = 512, // 512
+
+used_bytes: usize,
+total_bytes: usize,
diff --git a/src/zisp/value/istr.zig b/src/zisp/value/istr.zig
index f34dbd0..2ca2dad 100644
--- a/src/zisp/value/istr.zig
+++ b/src/zisp/value/istr.zig
@@ -1,25 +1,33 @@
const std = @import("std");
-const value = @import("../value.zig");
const gc = @import("../gc.zig");
+const value = @import("../value.zig");
const Value = value.Value;
// Zig API
-/// Pointer to beginning of an interned string with a 1-byte length prefix.
-/// Aligned to 8 bytes like all Zisp heap pointers.
+/// Pointer to beginning of an interned string with a 1-byte length prefix and
+/// 8-byte hash cache suffix. Aligned to 8 bytes, like all Zisp heap pointers.
pub const Istr = *align(8) struct {
- pub fn buf(self: *@This()) [*]u8 {
+ fn buf(self: *@This()) [*]u8 {
return @ptrCast(self);
}
pub fn len(self: *@This()) u8 {
- return buf()[0];
+ return self.buf()[0];
}
pub fn str(self: *@This()) []u8 {
- return buf()[1 .. 1 + len()];
+ const end = 1 + self.len();
+ return self.buf()[1..end];
+ }
+
+ pub fn getHash(self: *@This()) u64 {
+ // Hash stored in the next naturally aligned u64
+ const hash_idx = ((self.len() + 8) & -8) / 8;
+ const buf: [*]u64 = @ptrCast(self);
+ return buf[hash_idx];
}
};