summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/test/strings.zig4
-rw-r--r--src/zisp/gc/IstrSet.zig292
-rw-r--r--src/zisp/value/istr.zig20
3 files changed, 122 insertions, 194 deletions
diff --git a/src/test/strings.zig b/src/test/strings.zig
index 3039615..09c8e2a 100644
--- a/src/test/strings.zig
+++ b/src/test/strings.zig
@@ -12,13 +12,13 @@ const fx = value.fixnum;
test "istr" {
const s1 = "foo bar baz";
const v1 = try istr.intern(s1);
- const v1_len: usize = @intCast(fx.unpack(istr.len(v1)));
+ const v1_len: usize = @intCast(fx.unpack(istr.getLen(v1)));
try testing.expectEqualStrings(s1, istr.assert(v1).str());
try testing.expectEqual(s1.len, v1_len);
const s2 = @embedFile("data/string.txt");
const v2 = try istr.intern(s2);
- const v2_len: usize = @intCast(fx.unpack(istr.len(v2)));
+ const v2_len: usize = @intCast(fx.unpack(istr.getLen(v2)));
try testing.expectEqualStrings(s2, istr.assert(v2).str());
try testing.expectEqual(s2.len, v2_len);
diff --git a/src/zisp/gc/IstrSet.zig b/src/zisp/gc/IstrSet.zig
index b4575f6..6e5b95c 100644
--- a/src/zisp/gc/IstrSet.zig
+++ b/src/zisp/gc/IstrSet.zig
@@ -1,31 +1,3 @@
-//!
-//! Efficient hash set for short(ish) strings.
-//!
-//! Layout: []Bucket
-//!
-//! * Bucket: [2]Sector
-//!
-//! * Sector: [32]u8 or [4]u64
-//!
-//! The parser interns strings up to 255 bytes in length, using the `Istr` heap
-//! representation, which has an 8-byte metadata prefix. Therefore, strings of
-//! up to 24 bytes can fit into a 32-byte sector as an immediate Istr, so the
-//! pointer to the sector becomes the Istr pointer itself.
-//!
-//! The 8-byte header of an Istr is a u64 hash, but its lowest 8 bits are also
-//! the length of the string: u64hash = (real_u64hash << 8) | u8length
-//!
-//! Note that the empty string is never interned since strings up to 6 bytes in
-//! length are packed directly into a NaN value. Thus, checking if a sector is
-//! empty is a matter of checking its first 8 bytes as a u64 value, which would
-//! hold the Istr header, which cannot be zero, because the lowest 8 bits are
-//! the length of the string.
-//!
-//! If the length value is greater than 24, meaning the Istr can't actually fit
-//! here, that means the next 8 bytes of the sector are actually a pointer to
-//! the Istr elsewhere on the heap, and another 16 bytes are unused.
-//!
-
const std = @import("std");
const Alloc = std.mem.Allocator;
@@ -37,96 +9,21 @@ const IstrHead = value.istr.IstrHead;
const Set = @This();
-const bucket_sectors = 2;
-const sector_u64_cnt = 4;
-
-const sector_size = sector_u64_cnt * 8;
-
-// Empirically, 32 is the sweet spot; perhaps because it corresponds to 4 KiB;
-// this was however tested with strings from /dev/random, which also tended to
-// be on the shorter end. TODO: Re-test with various data.
-const max_probe_length = 32;
-
-// Really just a fall-back to ensure we don't fill to 100% because our resize
-// logic needs to find at least one empty sector.
-const max_fill_percent = 95;
-
-const Bucket = [bucket_sectors][sector_u64_cnt]u64;
-
-const SectorPtr = *align(8) SectorHead;
-
-// This must remain compatible with IstrHead!
-const SectorHead = packed union {
- hash: u64,
- meta: packed struct(u64) {
- len: u8,
- _: u56,
- },
+const max_fill_percent = 80;
- fn bufU64(self: SectorPtr) [*]u64 {
- return @ptrCast(self);
- }
-
- fn getIstrPtr(self: SectorPtr) IstrPtr {
- if (self.meta.len <= sector_size - @sizeOf(SectorHead)) {
- return @ptrCast(self);
- } else {
- return @ptrFromInt(self.bufU64()[1]);
- }
- }
-
- pub fn match(self: SectorPtr, hash: u64, s: []const u8) ?IstrPtr {
- if (self.hash != hash) {
- return null;
- }
- const istr = self.getIstrPtr();
- if (std.hash_map.eqlString(s, istr.str())) {
- return istr;
- } else {
- return null;
- }
- }
-
- pub fn insert(
- self: SectorPtr,
- alloc: Alloc,
- hash: u64,
- s: []const u8,
- ) !IstrPtr {
- self.hash = hash;
- if (s.len <= sector_size - @sizeOf(SectorHead)) {
- const istr: IstrPtr = @ptrCast(self);
- istr.putStr(s);
- return istr;
- } else {
- const istr = try newIstr(alloc, hash, s);
- self.bufU64()[1] = @intFromPtr(istr);
- return istr;
- }
- }
-
- pub fn copy(self: SectorPtr, dest: SectorPtr) void {
- const end = sector_u64_cnt;
- @memcpy(dest.bufU64()[0..end], self.bufU64()[0..end]);
- }
-
- fn newIstr(alloc: Alloc, hash: u64, s: []const u8) !IstrPtr {
- const algn = std.mem.Alignment.of(IstrPtr);
- const size = @sizeOf(IstrHead) + s.len;
- const ptr = try alloc.alignedAlloc(u8, algn, size);
- const istr: IstrPtr = @ptrCast(ptr);
- istr.* = .{ .hash = hash };
- istr.putStr(s);
- return istr;
- }
+const Bucket = packed struct(u64) {
+ ptr: u48,
+ fp: u16,
};
alloc: Alloc,
buckets: []Bucket = undefined,
-used_sectors: usize = 0,
+used_buckets: usize = 0,
used_threshold: usize = undefined,
+const test_stdlib_impl = false;
+
const default_bcount = 512;
pub fn init(alloc: Alloc) !Set {
@@ -134,6 +31,10 @@ pub fn init(alloc: Alloc) !Set {
}
pub fn initCustom(alloc: Alloc, bcount: usize) !Set {
+ if (test_stdlib_impl) {
+ istr_hash_map = .init(alloc);
+ return Set{ .alloc = alloc };
+ }
std.debug.assert(@popCount(bcount) == 1); // Must be power of 2.
var self = Set{ .alloc = alloc };
try self.allocBuckets(bcount);
@@ -143,103 +44,136 @@ pub fn initCustom(alloc: Alloc, bcount: usize) !Set {
fn allocBuckets(self: *Set, bcount: usize) !void {
self.buckets = try self.alloc.alloc(Bucket, bcount);
@memset(self.buckets, std.mem.zeroes(Bucket));
- self.used_threshold = bcount * bucket_sectors * max_fill_percent / 100;
+ self.used_threshold = bcount * max_fill_percent / 100;
}
pub fn deinit(self: *Set) void {
self.alloc.free(self.buckets);
}
-fn sector(bs: []Bucket, b_idx: usize, s_idx: usize) SectorPtr {
- return @ptrCast(&bs[b_idx][s_idx]);
-}
-
/// Add, or get existing, string.
pub fn add(self: *Set, s: []const u8) !IstrPtr {
std.debug.assert(s.len < 256);
+ if (test_stdlib_impl) {
+ return addStdlib(self.alloc, s);
+ }
+
+ if (self.used_buckets > self.used_threshold) {
+ try self.resize();
+ }
+
const idx_mask = self.buckets.len - 1;
- const real_hash = std.hash_map.hashString(s);
- const hash = (real_hash << 8) | s.len;
+ const hash = std.hash_map.hashString(s);
+ const fp: u16 = @truncate(hash);
const idx_init = @as(usize, @truncate(hash & idx_mask));
var idx = idx_init;
- const s_idx = b: while (true) : (idx = (idx +% 1) & idx_mask) {
- inline for (0..bucket_sectors) |si| {
- const sec = sector(self.buckets, idx, si);
- if (sec.hash == 0) break :b si;
- if (sec.match(hash, s)) |istr| return istr;
+ while (true) : (idx = (idx +% 1) & idx_mask) {
+ const bucket = self.buckets[idx];
+ if (bucket.ptr == 0) break;
+ if (bucket.fp == fp) {
+ const istr: IstrPtr = @ptrFromInt(bucket.ptr);
+ if (std.hash_map.eqlString(s, istr.str())) return istr;
}
- };
-
- if (idx -% idx_init >= max_probe_length) {
- try self.resize();
- return self.add(s);
}
- self.used_sectors += 1;
- if (self.used_sectors >= self.used_threshold) {
- try self.resize();
- return self.add(s);
- }
-
- const sec = sector(self.buckets, idx, s_idx);
- return sec.insert(self.alloc, hash, s);
+ self.used_buckets += 1;
+
+ const algn = std.mem.Alignment.of(IstrPtr);
+ const size = @sizeOf(IstrHead) + s.len;
+ const ptr = try self.alloc.alignedAlloc(u8, algn, size);
+ const istr: IstrPtr = @ptrCast(ptr);
+ istr.* = .{ .len = @intCast(s.len) };
+ istr.putStr(s);
+ self.buckets[idx] = .{
+ .fp = fp,
+ .ptr = @intCast(@intFromPtr(istr)),
+ };
+ return istr;
}
fn resize(self: *Set) !void {
- const old_bc = self.buckets.len;
- const old_bs = self.buckets;
+ const old_bcount = self.buckets.len;
+ const new_bcount = self.buckets.len << 1;
- const new_bc = old_bc << 1;
+ const old_mask = old_bcount - 1;
+ const new_mask = new_bcount - 1;
- try self.allocBuckets(new_bc);
- defer self.alloc.free(old_bs);
+ const old_buckets = self.buckets;
+ try self.allocBuckets(new_bcount);
+ defer self.alloc.free(old_buckets);
+ const new_buckets = self.buckets;
- const new_bs = self.buckets;
-
- const old_mask = old_bc - 1;
- const new_mask = new_bc - 1;
-
- // Make sure we're not in the middle of a cluster
- var idx = b: for (old_bs, 0..) |_, idx| {
- inline for (0..bucket_sectors) |s_idx| {
- const sec = sector(old_bs, idx, s_idx);
- if (sec.hash == 0) break :b (idx +% 1) & old_mask;
- }
+ // Make sure we're not in the middle of a cluster; find an empty bucket.
+ const idx_start = b: for (old_buckets, 0..) |_, idx| {
+ if (old_buckets[idx].ptr == 0) break :b idx;
} else unreachable; // Resize strategy guarantees empty slots
- const idx_last = (idx -% 1) & old_mask;
-
- while (idx != idx_last) : (idx = (idx +% 1) & old_mask) {
- // Need to walk destinations sector by sector, since strings grouped in
- // one sector in the old buckets may be split across buckets in the new
- // array.
-
- const bucket_size = sector_size * bucket_sectors;
- const new_bs_base = @intFromPtr(new_bs.ptr);
-
- var dest_lo = new_bs_base + (bucket_size * idx);
- var dest_hi = new_bs_base + (bucket_size * (idx + old_bc));
-
- // Start copying cluster
- b: while (idx != idx_last) : (idx = (idx +% 1) & old_mask) {
- inline for (0..bucket_sectors) |s_idx| {
- const sec = sector(old_bs, idx, s_idx);
- if (sec.hash == 0) break :b;
- const new_idx = sec.hash & new_mask;
- var dest: SectorPtr = undefined;
- if (new_idx < old_bc) {
- dest = @ptrFromInt(dest_lo);
- dest_lo += sector_size;
- } else {
- dest = @ptrFromInt(dest_hi);
- dest_hi += sector_size;
- }
- sec.copy(dest);
+ var idx = idx_start;
+ while (true) {
+ // +1 at start of first iteration is OK since idx_start is empty.
+ idx = (idx +% 1) & old_mask;
+
+ if (idx == idx_start) break;
+ if (old_buckets[idx].ptr == 0) continue;
+
+ var dest_lo = idx;
+ var dest_hi = idx + old_bcount;
+ while (true) : (idx = (idx +% 1) & old_mask) {
+ const bucket = old_buckets[idx];
+ if (bucket.ptr == 0) break;
+
+ const istr: IstrPtr = @ptrFromInt(bucket.ptr);
+ const hash = std.hash_map.hashString(istr.str());
+ const new_idx = @as(usize, @truncate(hash & new_mask));
+
+ if (new_idx < old_bcount) {
+ new_buckets[dest_lo] = bucket;
+ dest_lo += 1;
+ } else {
+ new_buckets[dest_hi] = bucket;
+ dest_hi = (dest_hi +% 1) & new_mask;
}
}
+
+ if (idx == idx_start) break;
}
}
+
+// Using stdlib, to compare
+
+const Map = std.hash_map.StringHashMap(IstrPtr);
+const str_context = std.hash_map.StringContext{};
+var istr_hash_map: Map = undefined;
+
+pub fn addStdlib(alloc: Alloc, str: []const u8) !IstrPtr {
+ const gop = istr_hash_map.getOrPutAdapted(
+ str,
+ str_context,
+ ) catch @panic("OOM");
+ if (gop.found_existing) {
+ return gop.value_ptr.*;
+ }
+
+ std.debug.assert(str.len <= std.math.maxInt(u48));
+
+ const header: IstrHead = .{ .len = @intCast(str.len) };
+
+ const h_align: std.mem.Alignment = @enumFromInt(@alignOf(IstrPtr));
+ const h_size = @sizeOf(IstrHead);
+ const size = str.len + h_size;
+
+ const mem = alloc.alignedAlloc(u8, h_align, size) catch @panic("OOM");
+
+ const h_bytes: [h_size]u8 = @bitCast(header);
+ @memcpy(mem[0..h_size], &h_bytes);
+ @memcpy(mem[h_size..size], str);
+
+ gop.key_ptr.* = alloc.dupe(u8, str) catch @panic("OOM");
+ gop.value_ptr.* = @ptrCast(mem.ptr);
+
+ return @ptrCast(mem.ptr);
+}
diff --git a/src/zisp/value/istr.zig b/src/zisp/value/istr.zig
index 2abfa66..1159cc7 100644
--- a/src/zisp/value/istr.zig
+++ b/src/zisp/value/istr.zig
@@ -10,18 +10,11 @@ const Value = value.Value;
// Zig API
-/// Pointer to an interned string.
-///
-/// First 64 bits are a cached hash, of which the lowest 8 bits are actually the
-/// length of the string: u64hash = (real_u64hash << 8) | u8length
+/// Pointer to an interned string. First byte is length.
pub const IstrPtr = *align(@alignOf(value.Zptr)) IstrHead;
-pub const IstrHead = packed union {
- hash: u64,
- meta: packed struct(u64) {
- len: u8,
- _: u56,
- },
+pub const IstrHead = packed struct(u8) {
+ len: u8,
fn bufU8(self: *@This()) [*]u8 {
return @ptrCast(self);
@@ -37,7 +30,8 @@ pub const IstrHead = packed union {
pub fn str(self: *@This()) []const u8 {
const buf = self.bufU8();
const start = @sizeOf(IstrHead);
- return buf[start .. start + self.meta.len];
+ const len: usize = self.len;
+ return buf[start .. start + len];
}
};
@@ -86,7 +80,7 @@ pub fn pred(v: Value) Value {
return value.boole.pack(check(v) != null);
}
-pub fn len(v: Value) Value {
+pub fn getLen(v: Value) Value {
const istr = assert(v);
- return value.fixnum.pack(@intCast(istr.meta.len));
+ return value.fixnum.pack(@intCast(istr.len));
}