summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTaylan Kammer <taylan.kammer@gmail.com>2026-05-30 23:04:38 +0200
committerTaylan Kammer <taylan.kammer@gmail.com>2026-05-30 23:04:38 +0200
commita21c485635f74c6fb86db630110fa4ddebc8c0c0 (patch)
treecaf5d2bc53d8a3e194c6417d4c9bfe5ffa7a0d11
parent237d2e48af8dc5203dbf4c6b2414af6792a56243 (diff)
Improve Istr stuff.
-rw-r--r--src/zisp/gc/IstrSet.zig192
-rw-r--r--src/zisp/value/istr.zig38
2 files changed, 163 insertions, 67 deletions
diff --git a/src/zisp/gc/IstrSet.zig b/src/zisp/gc/IstrSet.zig
index c90f5dd..31bc850 100644
--- a/src/zisp/gc/IstrSet.zig
+++ b/src/zisp/gc/IstrSet.zig
@@ -1,53 +1,139 @@
-//!
-//! Efficient hash set for short(ish) strings.
-//!
-//! Layout: []Bucket
-//!
-//! * Bucket: [8]Sector
-//!
-//! * Sector: [8]u8 or u64
-//!
-//! Strategy: Hybrid between open addressing with linear probing, and chaining,
-//! with special markers for indirection; see details below.
-//!
-//! The Zisp parser interns strings up to 255 bytes in length, using the `Istr`
-//! heap representation with a len prefix of one byte and hash cache suffix of
-//! eight bytes. This means that strings of up to 55 bytes of content can fit
-//! within 64 bytes. We store these inline in our buckets. A pointer to that
-//! bucket is then itself the Istr.
-//!
-//! A bucket can contain multiple strings if they fit. The strings are aligned
-//! at 8 byte "sectors" so a pointer to a sector can also be a valid Istr.
-//!
-//! A sector may appear empty (all 0 bytes) but actually be part of a string;
-//! this is not a problem since strings are length-prefixed. Linear probing
-//! starts at the beginning of a bucket, and jumps over occupied sectors as
-//! indicated by a string's length prefix. This can continue across buckets.
-//! When an empty sector is found after jumping over a string, we check if
-//! there's enough space within the bucket to hold the string, otherwise we
-//! advance into the next bucket and keep probing.
-//!
-//! Splitting across buckets is not allowed, since this introduces ambiguity:
-//! Strings can contain ASCII NUL bytes, so if a string ending in a series of
-//! NUL bytes were allowed to spill into the next bucket, it would appear to
-//! begin with an empty sector. A later probe sequence beginning with that
-//! bucket would then see it as empty and overwrite part of another string.
-//!
-//! Since strings up to 6 bytes are packed into NaN, they can never appear here,
-//! which means the empty string cannot appear, so a length prefix cannot be 0.
-//! Thus there's no ambiguity between empty sector and empty string.
-//!
-//! A sector can begin with a 0 byte, but have a non-0 value as a u64; this
-//! indicates a packed pointer to an Istr elsewhere on the heap. The low 48
-//! bits are the pointer, while the high bits encode the length of the string
-//! being pointed to, so we don't need to follow the pointer for the equality
-//! check in case the lengths are different. Also, in this case, the cached
-//! hash follows directly in the next sector, so we also needn't follow the
-//! pointer when resizing the hash set.
-//!
-
-/// Bucket count
-bcount: usize = 512, // 512
-
-used_bytes: usize,
-total_bytes: usize,
+///
+/// Efficient hash set for short(ish) strings.
+///
+/// Layout: []Bucket
+///
+/// * Bucket: [2]Sector
+///
+/// * Sector: [32]u8 or [4]u64
+///
+/// The parser interns strings up to 255 bytes in length, using the `Istr` heap
+/// representation, which has an 8-byte metadata prefix. Therefore, strings of
+/// up to 24 bytes can fit into a 32-byte sector as an immediate Istr, so the
+/// pointer to the sector becomes the Istr pointer itself.
+///
+/// The 8-byte header of an Istr is a u64 hash, but its lowest 8 bits are also
+/// the length of the string: u64hash = (real_u64hash << 1) | u8length
+///
+/// Note that the empty string is never interned since strings up to 6 bytes in
+/// length are packed directly into a NaN value. Thus, checking if a sector is
+/// empty is a matter of checking its first 8 bytes as a u64 value, which would
+/// hold the Istr header, which cannot be zero, because the lowest 8 bits are
+/// the length of the string.
+///
+/// If the length value is greater than 24, meaning the Istr can't actually fit
+/// here, that means the next 8 bytes of the sector are actually a pointer to
+/// the Istr elsewhere on the heap, and another 16 bytes are unused.
+///
+const std = @import("std");
+
+const Set = @This();
+
+const Bucket = [2][4]u64;
+
+// This is really just the head of the sector, but contains functions to
+// manipulate the entire contents.
+const Sector = *align(8) packed union {
+ hash: u64,
+ meta: packed struct(u64) {
+ len: u8,
+ _: u56,
+ },
+
+ fn bufU64(self: *@This()) [*]u64 {
+ return @ptrCast(self);
+ }
+
+ fn getIstr(self: *@This()) Istr {
+ if (self.meta.len <= 24) {
+ return @ptrCast(self);
+ } else {
+ return @bitCast(self.bufU64()[1]);
+ }
+ }
+
+ pub fn match(self: *@This(), hash: u64, s: []const u8) ?Istr {
+ if (self.hash != hash) {
+ return false;
+ }
+ const istr = self.getIstr();
+ if (std.hash_map.eqlString(s, istr.str())) {
+ return istr;
+ } else {
+ return null;
+ }
+ }
+
+ pub fn insert(
+ self: *@This(),
+ alloc: *Allocator,
+ hash: u64,
+ s: []const u8,
+ ) Istr {
+ self.hash = hash;
+ if (s.len <= 24) {
+ const istr = @ptrCast(self);
+ istr.putStr(s);
+ return istr;
+ } else {
+ const istr = Istr.new(alloc, hash, s);
+ self.bufU64()[1] = @bitCast(istr);
+ return istr;
+ }
+ }
+};
+
+buckets: []Bucket,
+
+const default_bcount = 512;
+
+pub fn init(alloc: *Allocator) !Set {
+ return initCustom(alloc, default_bcount);
+}
+
+pub fn initCustom(alloc: *Allocator, bcount: usize) !Set {
+ return Set{
+ .buckets = try alloc.alloc(Bucket, bcount),
+ };
+}
+
+pub fn deinit(self: *Set, alloc: *Allocator) void {
+ alloc.free(self.buckets);
+}
+
+pub fn add(self: *Set, s: []const u8) Istr {
+ std.debug.assert(s.len < 256);
+
+ const idx_mask = self.buckets.len - 1;
+
+ const len = s.len;
+
+ const h0 = std.hash_map.hashString(s);
+ const hash = (h0 << 1) | len;
+
+ // We resize if we had to walk through 50% of the buckets.
+ const probe_limit = self.buckets.len / 2;
+
+ var probes: usize = 0;
+ var idx = @as(usize, @truncate(hash & idx_mask));
+ var s_idx: usize = undefined;
+ while (true) : (idx = (idx + 1) & idx_mask) {
+ inline for (0..1) |i| {
+ const sec: Sector = @ptrCast(&self.buckets[idx][i]);
+ if (sec.hash == 0) { s_idx = i; break; }
+ if (sec.match(s)) |istr| return istr;
+ }
+ probes += 1;
+ if (probes > probe_limit) {
+ self.resize();
+ return self.add(s);
+ }
+ };
+
+ const sec = &self.buckets[idx][s_idx];
+ return sec.insert(alloc, hash, s);
+}
+
+fn resize() void {
+ @panic("IstrSet resize not implemented.");
+}
diff --git a/src/zisp/value/istr.zig b/src/zisp/value/istr.zig
index 2ca2dad..dec3b09 100644
--- a/src/zisp/value/istr.zig
+++ b/src/zisp/value/istr.zig
@@ -7,27 +7,37 @@ const Value = value.Value;
// Zig API
-/// Pointer to beginning of an interned string with a 1-byte length prefix and
-/// 8-byte hash cache suffix. Aligned to 8 bytes, like all Zisp heap pointers.
-pub const Istr = *align(8) struct {
- fn buf(self: *@This()) [*]u8 {
+/// Pointer to an interned string.
+///
+/// First 64 bits are a cached hash, of which the lowest 8 bits are actually the
+/// length of the string: u64hash = (real_u64hash << 1) | u8length
+pub const Istr = *align(8) packed union {
+ hash: u64,
+ meta: packed struct(u64) {
+ len: u8,
+ _: u56,
+ },
+
+ fn bufU8(self: *@This()) [*]u8 {
return @ptrCast(self);
}
- pub fn len(self: *@This()) u8 {
- return self.buf()[0];
+ pub fn new(alloc: *Allocator, hash: u64, s: []const u8) Istr {
+ const istr: Istr = @ptrCast(alloc.alloc(u8, 8 + s.len));
+ istr.hash = hash;
+ istr.putStr(s);
+ return istr;
}
- pub fn str(self: *@This()) []u8 {
- const end = 1 + self.len();
- return self.buf()[1..end];
+ pub fn putStr(self: *@This(), s: []const u8) void {
+ std.debug.assert(s.len <= 255);
+ @memcpy(istr.bufU8()[8 .. 8 + s.len], s);
}
- pub fn getHash(self: *@This()) u64 {
- // Hash stored in the next naturally aligned u64
- const hash_idx = ((self.len() + 8) & -8) / 8;
- const buf: [*]u64 = @ptrCast(self);
- return buf[hash_idx];
+ pub fn str(self: *@This()) []const u8 {
+ const buf = self.bufU8();
+ const len = self.meta.len;
+ return buf[8 .. 8 + len];
}
};