diff options
| author | Taylan Kammer <taylan.kammer@gmail.com> | 2026-05-30 23:04:38 +0200 |
|---|---|---|
| committer | Taylan Kammer <taylan.kammer@gmail.com> | 2026-05-30 23:04:38 +0200 |
| commit | a21c485635f74c6fb86db630110fa4ddebc8c0c0 (patch) | |
| tree | caf5d2bc53d8a3e194c6417d4c9bfe5ffa7a0d11 | |
| parent | 237d2e48af8dc5203dbf4c6b2414af6792a56243 (diff) | |
Improve Istr stuff.
| -rw-r--r-- | src/zisp/gc/IstrSet.zig | 192 | ||||
| -rw-r--r-- | src/zisp/value/istr.zig | 38 |
2 files changed, 163 insertions, 67 deletions
diff --git a/src/zisp/gc/IstrSet.zig b/src/zisp/gc/IstrSet.zig index c90f5dd..31bc850 100644 --- a/src/zisp/gc/IstrSet.zig +++ b/src/zisp/gc/IstrSet.zig @@ -1,53 +1,139 @@ -//! -//! Efficient hash set for short(ish) strings. -//! -//! Layout: []Bucket -//! -//! * Bucket: [8]Sector -//! -//! * Sector: [8]u8 or u64 -//! -//! Strategy: Hybrid between open addressing with linear probing, and chaining, -//! with special markers for indirection; see details below. -//! -//! The Zisp parser interns strings up to 255 bytes in length, using the `Istr` -//! heap representation with a len prefix of one byte and hash cache suffix of -//! eight bytes. This means that strings of up to 55 bytes of content can fit -//! within 64 bytes. We store these inline in our buckets. A pointer to that -//! bucket is then itself the Istr. -//! -//! A bucket can contain multiple strings if they fit. The strings are aligned -//! at 8 byte "sectors" so a pointer to a sector can also be a valid Istr. -//! -//! A sector may appear empty (all 0 bytes) but actually be part of a string; -//! this is not a problem since strings are length-prefixed. Linear probing -//! starts at the beginning of a bucket, and jumps over occupied sectors as -//! indicated by a string's length prefix. This can continue across buckets. -//! When an empty sector is found after jumping over a string, we check if -//! there's enough space within the bucket to hold the string, otherwise we -//! advance into the next bucket and keep probing. -//! -//! Splitting across buckets is not allowed, since this introduces ambiguity: -//! Strings can contain ASCII NUL bytes, so if a string ending in a series of -//! NUL bytes were allowed to spill into the next bucket, it would appear to -//! begin with an empty sector. A later probe sequence beginning with that -//! bucket would then see it as empty and overwrite part of another string. -//! -//! Since strings up to 6 bytes are packed into NaN, they can never appear here, -//! which means the empty string cannot appear, so a length prefix cannot be 0. -//! Thus there's no ambiguity between empty sector and empty string. -//! -//! A sector can begin with a 0 byte, but have a non-0 value as a u64; this -//! indicates a packed pointer to an Istr elsewhere on the heap. The low 48 -//! bits are the pointer, while the high bits encode the length of the string -//! being pointed to, so we don't need to follow the pointer for the equality -//! check in case the lengths are different. Also, in this case, the cached -//! hash follows directly in the next sector, so we also needn't follow the -//! pointer when resizing the hash set. -//! - -/// Bucket count -bcount: usize = 512, // 512 - -used_bytes: usize, -total_bytes: usize, +/// +/// Efficient hash set for short(ish) strings. +/// +/// Layout: []Bucket +/// +/// * Bucket: [2]Sector +/// +/// * Sector: [32]u8 or [4]u64 +/// +/// The parser interns strings up to 255 bytes in length, using the `Istr` heap +/// representation, which has an 8-byte metadata prefix. Therefore, strings of +/// up to 24 bytes can fit into a 32-byte sector as an immediate Istr, so the +/// pointer to the sector becomes the Istr pointer itself. +/// +/// The 8-byte header of an Istr is a u64 hash, but its lowest 8 bits are also +/// the length of the string: u64hash = (real_u64hash << 1) | u8length +/// +/// Note that the empty string is never interned since strings up to 6 bytes in +/// length are packed directly into a NaN value. Thus, checking if a sector is +/// empty is a matter of checking its first 8 bytes as a u64 value, which would +/// hold the Istr header, which cannot be zero, because the lowest 8 bits are +/// the length of the string. +/// +/// If the length value is greater than 24, meaning the Istr can't actually fit +/// here, that means the next 8 bytes of the sector are actually a pointer to +/// the Istr elsewhere on the heap, and another 16 bytes are unused. +/// +const std = @import("std"); + +const Set = @This(); + +const Bucket = [2][4]u64; + +// This is really just the head of the sector, but contains functions to +// manipulate the entire contents. +const Sector = *align(8) packed union { + hash: u64, + meta: packed struct(u64) { + len: u8, + _: u56, + }, + + fn bufU64(self: *@This()) [*]u64 { + return @ptrCast(self); + } + + fn getIstr(self: *@This()) Istr { + if (self.meta.len <= 24) { + return @ptrCast(self); + } else { + return @bitCast(self.bufU64()[1]); + } + } + + pub fn match(self: *@This(), hash: u64, s: []const u8) ?Istr { + if (self.hash != hash) { + return false; + } + const istr = self.getIstr(); + if (std.hash_map.eqlString(s, istr.str())) { + return istr; + } else { + return null; + } + } + + pub fn insert( + self: *@This(), + alloc: *Allocator, + hash: u64, + s: []const u8, + ) Istr { + self.hash = hash; + if (s.len <= 24) { + const istr = @ptrCast(self); + istr.putStr(s); + return istr; + } else { + const istr = Istr.new(alloc, hash, s); + self.bufU64()[1] = @bitCast(istr); + return istr; + } + } +}; + +buckets: []Bucket, + +const default_bcount = 512; + +pub fn init(alloc: *Allocator) !Set { + return initCustom(alloc, default_bcount); +} + +pub fn initCustom(alloc: *Allocator, bcount: usize) !Set { + return Set{ + .buckets = try alloc.alloc(Bucket, bcount), + }; +} + +pub fn deinit(self: *Set, alloc: *Allocator) void { + alloc.free(self.buckets); +} + +pub fn add(self: *Set, s: []const u8) Istr { + std.debug.assert(s.len < 256); + + const idx_mask = self.buckets.len - 1; + + const len = s.len; + + const h0 = std.hash_map.hashString(s); + const hash = (h0 << 1) | len; + + // We resize if we had to walk through 50% of the buckets. + const probe_limit = self.buckets.len / 2; + + var probes: usize = 0; + var idx = @as(usize, @truncate(hash & idx_mask)); + var s_idx: usize = undefined; + while (true) : (idx = (idx + 1) & idx_mask) { + inline for (0..1) |i| { + const sec: Sector = @ptrCast(&self.buckets[idx][i]); + if (sec.hash == 0) { s_idx = i; break; } + if (sec.match(s)) |istr| return istr; + } + probes += 1; + if (probes > probe_limit) { + self.resize(); + return self.add(s); + } + }; + + const sec = &self.buckets[idx][s_idx]; + return sec.insert(alloc, hash, s); +} + +fn resize() void { + @panic("IstrSet resize not implemented."); +} diff --git a/src/zisp/value/istr.zig b/src/zisp/value/istr.zig index 2ca2dad..dec3b09 100644 --- a/src/zisp/value/istr.zig +++ b/src/zisp/value/istr.zig @@ -7,27 +7,37 @@ const Value = value.Value; // Zig API -/// Pointer to beginning of an interned string with a 1-byte length prefix and -/// 8-byte hash cache suffix. Aligned to 8 bytes, like all Zisp heap pointers. -pub const Istr = *align(8) struct { - fn buf(self: *@This()) [*]u8 { +/// Pointer to an interned string. +/// +/// First 64 bits are a cached hash, of which the lowest 8 bits are actually the +/// length of the string: u64hash = (real_u64hash << 1) | u8length +pub const Istr = *align(8) packed union { + hash: u64, + meta: packed struct(u64) { + len: u8, + _: u56, + }, + + fn bufU8(self: *@This()) [*]u8 { return @ptrCast(self); } - pub fn len(self: *@This()) u8 { - return self.buf()[0]; + pub fn new(alloc: *Allocator, hash: u64, s: []const u8) Istr { + const istr: Istr = @ptrCast(alloc.alloc(u8, 8 + s.len)); + istr.hash = hash; + istr.putStr(s); + return istr; } - pub fn str(self: *@This()) []u8 { - const end = 1 + self.len(); - return self.buf()[1..end]; + pub fn putStr(self: *@This(), s: []const u8) void { + std.debug.assert(s.len <= 255); + @memcpy(istr.bufU8()[8 .. 8 + s.len], s); } - pub fn getHash(self: *@This()) u64 { - // Hash stored in the next naturally aligned u64 - const hash_idx = ((self.len() + 8) & -8) / 8; - const buf: [*]u64 = @ptrCast(self); - return buf[hash_idx]; + pub fn str(self: *@This()) []const u8 { + const buf = self.bufU8(); + const len = self.meta.len; + return buf[8 .. 8 + len]; } }; |
