Improve Istr stuff.

author: Taylan Kammer <taylan.kammer@gmail.com> 2026-05-30 23:04:38 +0200
committer: Taylan Kammer <taylan.kammer@gmail.com> 2026-05-30 23:04:38 +0200
commit: a21c485635f74c6fb86db630110fa4ddebc8c0c0 (patch)
tree: caf5d2bc53d8a3e194c6417d4c9bfe5ffa7a0d11
parent: 237d2e48af8dc5203dbf4c6b2414af6792a56243 (diff)
2 files changed, 163 insertions, 67 deletions
diff --git a/src/zisp/gc/IstrSet.zig b/src/zisp/gc/IstrSet.zig
index c90f5dd..31bc850 100644
--- a/src/zisp/gc/IstrSet.zig
+++ b/src/zisp/gc/IstrSet.zig
@@ -1,53 +1,139 @@
-//!
-//! Efficient hash set for short(ish) strings.
-//!
-//! Layout: []Bucket
-//!
-//! * Bucket: [8]Sector
-//!
-//! * Sector: [8]u8 or u64
-//!
-//! Strategy: Hybrid between open addressing with linear probing, and chaining,
-//! with special markers for indirection; see details below.
-//!
-//! The Zisp parser interns strings up to 255 bytes in length, using the `Istr`
-//! heap representation with a len prefix of one byte and hash cache suffix of
-//! eight bytes.  This means that strings of up to 55 bytes of content can fit
-//! within 64 bytes.  We store these inline in our buckets.  A pointer to that
-//! bucket is then itself the Istr.
-//!
-//! A bucket can contain multiple strings if they fit.  The strings are aligned
-//! at 8 byte "sectors" so a pointer to a sector can also be a valid Istr.
-//!
-//! A sector may appear empty (all 0 bytes) but actually be part of a string;
-//! this is not a problem since strings are length-prefixed.  Linear probing
-//! starts at the beginning of a bucket, and jumps over occupied sectors as
-//! indicated by a string's length prefix.  This can continue across buckets.
-//! When an empty sector is found after jumping over a string, we check if
-//! there's enough space within the bucket to hold the string, otherwise we
-//! advance into the next bucket and keep probing.
-//!
-//! Splitting across buckets is not allowed, since this introduces ambiguity:
-//! Strings can contain ASCII NUL bytes, so if a string ending in a series of
-//! NUL bytes were allowed to spill into the next bucket, it would appear to
-//! begin with an empty sector.  A later probe sequence beginning with that
-//! bucket would then see it as empty and overwrite part of another string.
-//!
-//! Since strings up to 6 bytes are packed into NaN, they can never appear here,
-//! which means the empty string cannot appear, so a length prefix cannot be 0.
-//! Thus there's no ambiguity between empty sector and empty string.
-//!
-//! A sector can begin with a 0 byte, but have a non-0 value as a u64; this
-//! indicates a packed pointer to an Istr elsewhere on the heap.  The low 48
-//! bits are the pointer, while the high bits encode the length of the string
-//! being pointed to, so we don't need to follow the pointer for the equality
-//! check in case the lengths are different.  Also, in this case, the cached
-//! hash follows directly in the next sector, so we also needn't follow the
-//! pointer when resizing the hash set.
-//!
-
-/// Bucket count
-bcount: usize = 512, // 512
-
-used_bytes: usize,
-total_bytes: usize,
+///
+/// Efficient hash set for short(ish) strings.
+///
+/// Layout: []Bucket
+///
+/// * Bucket: [2]Sector
+///
+/// * Sector: [32]u8 or [4]u64
+///
+/// The parser interns strings up to 255 bytes in length, using the `Istr` heap
+/// representation, which has an 8-byte metadata prefix.  Therefore, strings of
+/// up to 24 bytes can fit into a 32-byte sector as an immediate Istr, so the
+/// pointer to the sector becomes the Istr pointer itself.
+///
+/// The 8-byte header of an Istr is a u64 hash, but its lowest 8 bits are also
+/// the length of the string: u64hash = (real_u64hash << 1) | u8length
+///
+/// Note that the empty string is never interned since strings up to 6 bytes in
+/// length are packed directly into a NaN value.  Thus, checking if a sector is
+/// empty is a matter of checking its first 8 bytes as a u64 value, which would
+/// hold the Istr header, which cannot be zero, because the lowest 8 bits are
+/// the length of the string.
+///
+/// If the length value is greater than 24, meaning the Istr can't actually fit
+/// here, that means the next 8 bytes of the sector are actually a pointer to
+/// the Istr elsewhere on the heap, and another 16 bytes are unused.
+///
+const std = @import("std");
+
+const Set = @This();
+
+const Bucket = [2][4]u64;
+
+// This is really just the head of the sector, but contains functions to
+// manipulate the entire contents.
+const Sector = *align(8) packed union {
+    hash: u64,
+    meta: packed struct(u64) {
+        len: u8,
+        _: u56,
+    },
+
+    fn bufU64(self: *@This()) [*]u64 {
+        return @ptrCast(self);
+    }
+
+    fn getIstr(self: *@This()) Istr {
+        if (self.meta.len <= 24) {
+            return @ptrCast(self);
+        } else {
+            return @bitCast(self.bufU64()[1]);
+        }
+    }
+
+    pub fn match(self: *@This(), hash: u64, s: []const u8) ?Istr {
+        if (self.hash != hash) {
+            return false;
+        }
+        const istr = self.getIstr();
+        if (std.hash_map.eqlString(s, istr.str())) {
+            return istr;
+        } else {
+            return null;
+        }
+    }
+
+    pub fn insert(
+        self: *@This(),
+        alloc: *Allocator,
+        hash: u64,
+        s: []const u8,
+    ) Istr {
+        self.hash = hash;
+        if (s.len <= 24) {
+            const istr = @ptrCast(self);
+            istr.putStr(s);
+            return istr;
+        } else {
+            const istr = Istr.new(alloc, hash, s);
+            self.bufU64()[1] = @bitCast(istr);
+            return istr;
+        }
+    }
+};
+
+buckets: []Bucket,
+
+const default_bcount = 512;
+
+pub fn init(alloc: *Allocator) !Set {
+    return initCustom(alloc, default_bcount);
+}
+
+pub fn initCustom(alloc: *Allocator, bcount: usize) !Set {
+    return Set{
+        .buckets = try alloc.alloc(Bucket, bcount),
+    };
+}
+
+pub fn deinit(self: *Set, alloc: *Allocator) void {
+    alloc.free(self.buckets);
+}
+
+pub fn add(self: *Set, s: []const u8) Istr {
+    std.debug.assert(s.len < 256);
+
+    const idx_mask = self.buckets.len - 1;
+
+    const len = s.len;
+
+    const h0 = std.hash_map.hashString(s);
+    const hash = (h0 << 1) | len;
+
+    // We resize if we had to walk through 50% of the buckets.
+    const probe_limit = self.buckets.len / 2;
+
+    var probes: usize = 0;
+    var idx = @as(usize, @truncate(hash & idx_mask));
+    var s_idx: usize = undefined;
+    while (true) : (idx = (idx + 1) & idx_mask) {
+        inline for (0..1) |i| {
+            const sec: Sector = @ptrCast(&self.buckets[idx][i]);
+            if (sec.hash == 0) { s_idx = i; break; }
+            if (sec.match(s)) |istr| return istr;
+        }
+        probes += 1;
+        if (probes > probe_limit) {
+            self.resize();
+            return self.add(s);
+        }
+    };
+
+    const sec = &self.buckets[idx][s_idx];
+    return sec.insert(alloc, hash, s);
+}
+
+fn resize() void {
+    @panic("IstrSet resize not implemented.");
+}
diff --git a/src/zisp/value/istr.zig b/src/zisp/value/istr.zig
index 2ca2dad..dec3b09 100644
--- a/src/zisp/value/istr.zig
+++ b/src/zisp/value/istr.zig
@@ -7,27 +7,37 @@ const Value = value.Value;
 
 // Zig API
 
-/// Pointer to beginning of an interned string with a 1-byte length prefix and
-/// 8-byte hash cache suffix.  Aligned to 8 bytes, like all Zisp heap pointers.
-pub const Istr = *align(8) struct {
-    fn buf(self: *@This()) [*]u8 {
+/// Pointer to an interned string.
+///
+/// First 64 bits are a cached hash, of which the lowest 8 bits are actually the
+/// length of the string: u64hash = (real_u64hash << 1) | u8length
+pub const Istr = *align(8) packed union {
+    hash: u64,
+    meta: packed struct(u64) {
+        len: u8,
+        _: u56,
+    },
+
+    fn bufU8(self: *@This()) [*]u8 {
         return @ptrCast(self);
     }
 
-    pub fn len(self: *@This()) u8 {
-        return self.buf()[0];
+    pub fn new(alloc: *Allocator, hash: u64, s: []const u8) Istr {
+        const istr: Istr = @ptrCast(alloc.alloc(u8, 8 + s.len));
+        istr.hash = hash;
+        istr.putStr(s);
+        return istr;
     }
 
-    pub fn str(self: *@This()) []u8 {
-        const end = 1 + self.len();
-        return self.buf()[1..end];
+    pub fn putStr(self: *@This(), s: []const u8) void {
+        std.debug.assert(s.len <= 255);
+        @memcpy(istr.bufU8()[8 .. 8 + s.len], s);
     }
 
-    pub fn getHash(self: *@This()) u64 {
-        // Hash stored in the next naturally aligned u64
-        const hash_idx = ((self.len() + 8) & -8) / 8;
-        const buf: [*]u64 = @ptrCast(self);
-        return buf[hash_idx];
+    pub fn str(self: *@This()) []const u8 {
+        const buf = self.bufU8();
+        const len = self.meta.len;
+        return buf[8 .. 8 + len];
     }
 };
author	Taylan Kammer <taylan.kammer@gmail.com>	2026-05-30 23:04:38 +0200
committer	Taylan Kammer <taylan.kammer@gmail.com>	2026-05-30 23:04:38 +0200
commit	a21c485635f74c6fb86db630110fa4ddebc8c0c0 (patch)
tree	caf5d2bc53d8a3e194c6417d4c9bfe5ffa7a0d11
parent	237d2e48af8dc5203dbf4c6b2414af6792a56243 (diff)