diff options
| author | Taylan Kammer <taylan.kammer@gmail.com> | 2026-06-06 05:51:16 +0200 |
|---|---|---|
| committer | Taylan Kammer <taylan.kammer@gmail.com> | 2026-06-06 05:51:16 +0200 |
| commit | 23b632b61dee4c489bda4626d6b3ccff548a9ccc (patch) | |
| tree | 934ff685e7038b1d65de86c60da101799d50d7a3 /src | |
| parent | abdc8526797236fd55c70fb4e1a3fef7c4fb23fb (diff) | |
Overhaul and fix IstrSet.
Diffstat (limited to 'src')
| -rw-r--r-- | src/test/strings.zig | 4 | ||||
| -rw-r--r-- | src/zisp/gc/IstrSet.zig | 292 | ||||
| -rw-r--r-- | src/zisp/value/istr.zig | 20 |
3 files changed, 122 insertions, 194 deletions
diff --git a/src/test/strings.zig b/src/test/strings.zig index 3039615..09c8e2a 100644 --- a/src/test/strings.zig +++ b/src/test/strings.zig @@ -12,13 +12,13 @@ const fx = value.fixnum; test "istr" { const s1 = "foo bar baz"; const v1 = try istr.intern(s1); - const v1_len: usize = @intCast(fx.unpack(istr.len(v1))); + const v1_len: usize = @intCast(fx.unpack(istr.getLen(v1))); try testing.expectEqualStrings(s1, istr.assert(v1).str()); try testing.expectEqual(s1.len, v1_len); const s2 = @embedFile("data/string.txt"); const v2 = try istr.intern(s2); - const v2_len: usize = @intCast(fx.unpack(istr.len(v2))); + const v2_len: usize = @intCast(fx.unpack(istr.getLen(v2))); try testing.expectEqualStrings(s2, istr.assert(v2).str()); try testing.expectEqual(s2.len, v2_len); diff --git a/src/zisp/gc/IstrSet.zig b/src/zisp/gc/IstrSet.zig index b4575f6..6e5b95c 100644 --- a/src/zisp/gc/IstrSet.zig +++ b/src/zisp/gc/IstrSet.zig @@ -1,31 +1,3 @@ -//! -//! Efficient hash set for short(ish) strings. -//! -//! Layout: []Bucket -//! -//! * Bucket: [2]Sector -//! -//! * Sector: [32]u8 or [4]u64 -//! -//! The parser interns strings up to 255 bytes in length, using the `Istr` heap -//! representation, which has an 8-byte metadata prefix. Therefore, strings of -//! up to 24 bytes can fit into a 32-byte sector as an immediate Istr, so the -//! pointer to the sector becomes the Istr pointer itself. -//! -//! The 8-byte header of an Istr is a u64 hash, but its lowest 8 bits are also -//! the length of the string: u64hash = (real_u64hash << 8) | u8length -//! -//! Note that the empty string is never interned since strings up to 6 bytes in -//! length are packed directly into a NaN value. Thus, checking if a sector is -//! empty is a matter of checking its first 8 bytes as a u64 value, which would -//! hold the Istr header, which cannot be zero, because the lowest 8 bits are -//! the length of the string. -//! -//! If the length value is greater than 24, meaning the Istr can't actually fit -//! here, that means the next 8 bytes of the sector are actually a pointer to -//! the Istr elsewhere on the heap, and another 16 bytes are unused. -//! - const std = @import("std"); const Alloc = std.mem.Allocator; @@ -37,96 +9,21 @@ const IstrHead = value.istr.IstrHead; const Set = @This(); -const bucket_sectors = 2; -const sector_u64_cnt = 4; - -const sector_size = sector_u64_cnt * 8; - -// Empirically, 32 is the sweet spot; perhaps because it corresponds to 4 KiB; -// this was however tested with strings from /dev/random, which also tended to -// be on the shorter end. TODO: Re-test with various data. -const max_probe_length = 32; - -// Really just a fall-back to ensure we don't fill to 100% because our resize -// logic needs to find at least one empty sector. -const max_fill_percent = 95; - -const Bucket = [bucket_sectors][sector_u64_cnt]u64; - -const SectorPtr = *align(8) SectorHead; - -// This must remain compatible with IstrHead! -const SectorHead = packed union { - hash: u64, - meta: packed struct(u64) { - len: u8, - _: u56, - }, +const max_fill_percent = 80; - fn bufU64(self: SectorPtr) [*]u64 { - return @ptrCast(self); - } - - fn getIstrPtr(self: SectorPtr) IstrPtr { - if (self.meta.len <= sector_size - @sizeOf(SectorHead)) { - return @ptrCast(self); - } else { - return @ptrFromInt(self.bufU64()[1]); - } - } - - pub fn match(self: SectorPtr, hash: u64, s: []const u8) ?IstrPtr { - if (self.hash != hash) { - return null; - } - const istr = self.getIstrPtr(); - if (std.hash_map.eqlString(s, istr.str())) { - return istr; - } else { - return null; - } - } - - pub fn insert( - self: SectorPtr, - alloc: Alloc, - hash: u64, - s: []const u8, - ) !IstrPtr { - self.hash = hash; - if (s.len <= sector_size - @sizeOf(SectorHead)) { - const istr: IstrPtr = @ptrCast(self); - istr.putStr(s); - return istr; - } else { - const istr = try newIstr(alloc, hash, s); - self.bufU64()[1] = @intFromPtr(istr); - return istr; - } - } - - pub fn copy(self: SectorPtr, dest: SectorPtr) void { - const end = sector_u64_cnt; - @memcpy(dest.bufU64()[0..end], self.bufU64()[0..end]); - } - - fn newIstr(alloc: Alloc, hash: u64, s: []const u8) !IstrPtr { - const algn = std.mem.Alignment.of(IstrPtr); - const size = @sizeOf(IstrHead) + s.len; - const ptr = try alloc.alignedAlloc(u8, algn, size); - const istr: IstrPtr = @ptrCast(ptr); - istr.* = .{ .hash = hash }; - istr.putStr(s); - return istr; - } +const Bucket = packed struct(u64) { + ptr: u48, + fp: u16, }; alloc: Alloc, buckets: []Bucket = undefined, -used_sectors: usize = 0, +used_buckets: usize = 0, used_threshold: usize = undefined, +const test_stdlib_impl = false; + const default_bcount = 512; pub fn init(alloc: Alloc) !Set { @@ -134,6 +31,10 @@ pub fn init(alloc: Alloc) !Set { } pub fn initCustom(alloc: Alloc, bcount: usize) !Set { + if (test_stdlib_impl) { + istr_hash_map = .init(alloc); + return Set{ .alloc = alloc }; + } std.debug.assert(@popCount(bcount) == 1); // Must be power of 2. var self = Set{ .alloc = alloc }; try self.allocBuckets(bcount); @@ -143,103 +44,136 @@ pub fn initCustom(alloc: Alloc, bcount: usize) !Set { fn allocBuckets(self: *Set, bcount: usize) !void { self.buckets = try self.alloc.alloc(Bucket, bcount); @memset(self.buckets, std.mem.zeroes(Bucket)); - self.used_threshold = bcount * bucket_sectors * max_fill_percent / 100; + self.used_threshold = bcount * max_fill_percent / 100; } pub fn deinit(self: *Set) void { self.alloc.free(self.buckets); } -fn sector(bs: []Bucket, b_idx: usize, s_idx: usize) SectorPtr { - return @ptrCast(&bs[b_idx][s_idx]); -} - /// Add, or get existing, string. pub fn add(self: *Set, s: []const u8) !IstrPtr { std.debug.assert(s.len < 256); + if (test_stdlib_impl) { + return addStdlib(self.alloc, s); + } + + if (self.used_buckets > self.used_threshold) { + try self.resize(); + } + const idx_mask = self.buckets.len - 1; - const real_hash = std.hash_map.hashString(s); - const hash = (real_hash << 8) | s.len; + const hash = std.hash_map.hashString(s); + const fp: u16 = @truncate(hash); const idx_init = @as(usize, @truncate(hash & idx_mask)); var idx = idx_init; - const s_idx = b: while (true) : (idx = (idx +% 1) & idx_mask) { - inline for (0..bucket_sectors) |si| { - const sec = sector(self.buckets, idx, si); - if (sec.hash == 0) break :b si; - if (sec.match(hash, s)) |istr| return istr; + while (true) : (idx = (idx +% 1) & idx_mask) { + const bucket = self.buckets[idx]; + if (bucket.ptr == 0) break; + if (bucket.fp == fp) { + const istr: IstrPtr = @ptrFromInt(bucket.ptr); + if (std.hash_map.eqlString(s, istr.str())) return istr; } - }; - - if (idx -% idx_init >= max_probe_length) { - try self.resize(); - return self.add(s); } - self.used_sectors += 1; - if (self.used_sectors >= self.used_threshold) { - try self.resize(); - return self.add(s); - } - - const sec = sector(self.buckets, idx, s_idx); - return sec.insert(self.alloc, hash, s); + self.used_buckets += 1; + + const algn = std.mem.Alignment.of(IstrPtr); + const size = @sizeOf(IstrHead) + s.len; + const ptr = try self.alloc.alignedAlloc(u8, algn, size); + const istr: IstrPtr = @ptrCast(ptr); + istr.* = .{ .len = @intCast(s.len) }; + istr.putStr(s); + self.buckets[idx] = .{ + .fp = fp, + .ptr = @intCast(@intFromPtr(istr)), + }; + return istr; } fn resize(self: *Set) !void { - const old_bc = self.buckets.len; - const old_bs = self.buckets; + const old_bcount = self.buckets.len; + const new_bcount = self.buckets.len << 1; - const new_bc = old_bc << 1; + const old_mask = old_bcount - 1; + const new_mask = new_bcount - 1; - try self.allocBuckets(new_bc); - defer self.alloc.free(old_bs); + const old_buckets = self.buckets; + try self.allocBuckets(new_bcount); + defer self.alloc.free(old_buckets); + const new_buckets = self.buckets; - const new_bs = self.buckets; - - const old_mask = old_bc - 1; - const new_mask = new_bc - 1; - - // Make sure we're not in the middle of a cluster - var idx = b: for (old_bs, 0..) |_, idx| { - inline for (0..bucket_sectors) |s_idx| { - const sec = sector(old_bs, idx, s_idx); - if (sec.hash == 0) break :b (idx +% 1) & old_mask; - } + // Make sure we're not in the middle of a cluster; find an empty bucket. + const idx_start = b: for (old_buckets, 0..) |_, idx| { + if (old_buckets[idx].ptr == 0) break :b idx; } else unreachable; // Resize strategy guarantees empty slots - const idx_last = (idx -% 1) & old_mask; - - while (idx != idx_last) : (idx = (idx +% 1) & old_mask) { - // Need to walk destinations sector by sector, since strings grouped in - // one sector in the old buckets may be split across buckets in the new - // array. - - const bucket_size = sector_size * bucket_sectors; - const new_bs_base = @intFromPtr(new_bs.ptr); - - var dest_lo = new_bs_base + (bucket_size * idx); - var dest_hi = new_bs_base + (bucket_size * (idx + old_bc)); - - // Start copying cluster - b: while (idx != idx_last) : (idx = (idx +% 1) & old_mask) { - inline for (0..bucket_sectors) |s_idx| { - const sec = sector(old_bs, idx, s_idx); - if (sec.hash == 0) break :b; - const new_idx = sec.hash & new_mask; - var dest: SectorPtr = undefined; - if (new_idx < old_bc) { - dest = @ptrFromInt(dest_lo); - dest_lo += sector_size; - } else { - dest = @ptrFromInt(dest_hi); - dest_hi += sector_size; - } - sec.copy(dest); + var idx = idx_start; + while (true) { + // +1 at start of first iteration is OK since idx_start is empty. + idx = (idx +% 1) & old_mask; + + if (idx == idx_start) break; + if (old_buckets[idx].ptr == 0) continue; + + var dest_lo = idx; + var dest_hi = idx + old_bcount; + while (true) : (idx = (idx +% 1) & old_mask) { + const bucket = old_buckets[idx]; + if (bucket.ptr == 0) break; + + const istr: IstrPtr = @ptrFromInt(bucket.ptr); + const hash = std.hash_map.hashString(istr.str()); + const new_idx = @as(usize, @truncate(hash & new_mask)); + + if (new_idx < old_bcount) { + new_buckets[dest_lo] = bucket; + dest_lo += 1; + } else { + new_buckets[dest_hi] = bucket; + dest_hi = (dest_hi +% 1) & new_mask; } } + + if (idx == idx_start) break; } } + +// Using stdlib, to compare + +const Map = std.hash_map.StringHashMap(IstrPtr); +const str_context = std.hash_map.StringContext{}; +var istr_hash_map: Map = undefined; + +pub fn addStdlib(alloc: Alloc, str: []const u8) !IstrPtr { + const gop = istr_hash_map.getOrPutAdapted( + str, + str_context, + ) catch @panic("OOM"); + if (gop.found_existing) { + return gop.value_ptr.*; + } + + std.debug.assert(str.len <= std.math.maxInt(u48)); + + const header: IstrHead = .{ .len = @intCast(str.len) }; + + const h_align: std.mem.Alignment = @enumFromInt(@alignOf(IstrPtr)); + const h_size = @sizeOf(IstrHead); + const size = str.len + h_size; + + const mem = alloc.alignedAlloc(u8, h_align, size) catch @panic("OOM"); + + const h_bytes: [h_size]u8 = @bitCast(header); + @memcpy(mem[0..h_size], &h_bytes); + @memcpy(mem[h_size..size], str); + + gop.key_ptr.* = alloc.dupe(u8, str) catch @panic("OOM"); + gop.value_ptr.* = @ptrCast(mem.ptr); + + return @ptrCast(mem.ptr); +} diff --git a/src/zisp/value/istr.zig b/src/zisp/value/istr.zig index 2abfa66..1159cc7 100644 --- a/src/zisp/value/istr.zig +++ b/src/zisp/value/istr.zig @@ -10,18 +10,11 @@ const Value = value.Value; // Zig API -/// Pointer to an interned string. -/// -/// First 64 bits are a cached hash, of which the lowest 8 bits are actually the -/// length of the string: u64hash = (real_u64hash << 8) | u8length +/// Pointer to an interned string. First byte is length. pub const IstrPtr = *align(@alignOf(value.Zptr)) IstrHead; -pub const IstrHead = packed union { - hash: u64, - meta: packed struct(u64) { - len: u8, - _: u56, - }, +pub const IstrHead = packed struct(u8) { + len: u8, fn bufU8(self: *@This()) [*]u8 { return @ptrCast(self); @@ -37,7 +30,8 @@ pub const IstrHead = packed union { pub fn str(self: *@This()) []const u8 { const buf = self.bufU8(); const start = @sizeOf(IstrHead); - return buf[start .. start + self.meta.len]; + const len: usize = self.len; + return buf[start .. start + len]; } }; @@ -86,7 +80,7 @@ pub fn pred(v: Value) Value { return value.boole.pack(check(v) != null); } -pub fn len(v: Value) Value { +pub fn getLen(v: Value) Value { const istr = assert(v); - return value.fixnum.pack(@intCast(istr.meta.len)); + return value.fixnum.pack(@intCast(istr.len)); } |
