diff --git a/benches/bench.rs b/benches/bench.rs
index 7103f68bf..de06e3619 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -4,6 +4,7 @@ use criterion::{criterion_group, criterion_main};
 
 mod general_ops;
 mod insert_unique_unchecked;
+mod prefetch;
 mod set_ops;
 mod with_capacity;
 
@@ -11,6 +12,7 @@ criterion_group!(
     benches,
     general_ops::register_benches,
     insert_unique_unchecked::register_benches,
+    prefetch::register_benches,
     set_ops::register_benches,
     with_capacity::register_benches
 );
diff --git a/benches/prefetch.rs b/benches/prefetch.rs
new file mode 100644
index 000000000..16ea5c08b
--- /dev/null
+++ b/benches/prefetch.rs
@@ -0,0 +1,293 @@
+//! Benches for `HashMap::prefetch_get` and `HashMap::prefetch_insert`.
+//!
+//! Two flavors of workload: batch lookups (`prefetch_get`) and batch inserts
+//! (`prefetch_insert`). Each flavor runs against integer keys (`(u64, u64)`)
+//! and heap-allocated `String` keys; the string variant exists because
+//! heap-allocated keys force a pointer dereference to hash, which changes the
+//! cache-miss profile of the prefetch call itself.
+
+use criterion::{BenchmarkId, Criterion, Throughput};
+use hashbrown::{DefaultHashBuilder, HashMap};
+use std::hint::black_box;
+
+// ---------- Shared knobs ----------
+//
+// Table-size sweep covers the in-cache → cache-spilled crossover. The prefetch
+// is a hint that pays off only when the control bytes have spilled out of L2/L3
+// and the caller has independent work to overlap with the fetch, so the small
+// sizes are a sanity check (prefetch should be noise or a slight loss) and the
+// large sizes are where the win materializes.
+const SIZES: &[usize] = &[1 << 12, 1 << 16, 1 << 18, 1 << 20, 1 << 22];
+
+// The number of iterations ahead we issue the prefetch. Eight is a common
+// rule-of-thumb (covers ~one cache-miss-worth of work on modern cores) and
+// matches the abseil prefetch_hash idiom.
+const LOOKAHEAD: usize = 8;
+
+// Query batch size. Large enough that fixed per-iteration overhead is
+// amortized; small enough that the bench finishes in seconds.
+const N_QUERIES: usize = 1 << 16;
+
+// 16-byte key, like a common join-key shape (two u64s).
+type Key = (u64, u64);
+
+// ---------- Integer-key workload ----------
+//
+// Keys are packed inline (16 bytes), so hashing the key never dereferences
+// outside the slice. This isolates the prefetch effect to the table's control
+// + data lines: there's no extra cache miss "behind" the key itself.
+
+fn build_map(n: usize) -> HashMap {
+    let mut m = HashMap::with_capacity_and_hasher(n, DefaultHashBuilder::default());
+    for i in 0..n as u64 {
+        m.insert((i, i.wrapping_mul(0x9E37_79B9_7F4A_7C15)), i);
+    }
+    m
+}
+
+// A cheap PRNG so the lookup order is unpredictable to the hardware prefetcher.
+// `xorshift` is deterministic given the seed; the same query set is generated
+// each invocation so the comparison is apples-to-apples.
+fn xorshift(state: &mut u64) -> u64 {
+    let mut x = *state;
+    x ^= x << 13;
+    x ^= x >> 7;
+    x ^= x << 17;
+    *state = x;
+    x
+}
+
+fn query_keys(n: usize) -> Vec {
+    let mut state = 0x1234_5678_9ABC_DEF0u64;
+    (0..N_QUERIES)
+        .map(|_| {
+            let i = xorshift(&mut state) % n as u64;
+            (i, i.wrapping_mul(0x9E37_79B9_7F4A_7C15))
+        })
+        .collect()
+}
+
+// Miss-heavy query set: every key is drawn from outside the inserted range so
+// every lookup misses. The probe finds an empty control group and never reads
+// the data line. `prefetch_get` hints both the control group and the data
+// bucket, so this is the regime where the data half of the hint is least
+// likely to help and the win has to come from the control bytes.
+fn query_keys_miss(n: usize) -> Vec {
+    let mut state = 0xCAFE_BABE_DEAD_BEEFu64;
+    let offset = (n as u64).saturating_mul(2);
+    (0..N_QUERIES)
+        .map(|_| {
+            let i = offset + (xorshift(&mut state) % n as u64);
+            (i, i.wrapping_mul(0x9E37_79B9_7F4A_7C15))
+        })
+        .collect()
+}
+
+fn lookup_naive(map: &HashMap, keys: &[Key]) -> u64 {
+    let mut sum = 0u64;
+    for k in keys {
+        if let Some(&v) = map.get(k) {
+            sum = sum.wrapping_add(v);
+        }
+    }
+    sum
+}
+
+// The look-ahead pattern: while processing key `i`, prefetch for the key at
+// index `i + LOOKAHEAD`. By the time that iteration arrives, the control group
+// and data bucket it starts on should already be in cache (or in flight).
+fn lookup_prefetched(map: &HashMap, keys: &[Key]) -> u64 {
+    let mut sum = 0u64;
+    for (i, k) in keys.iter().enumerate() {
+        if let Some(next) = keys.get(i + LOOKAHEAD) {
+            map.prefetch_get(next);
+        }
+        if let Some(&v) = map.get(k) {
+            sum = sum.wrapping_add(v);
+        }
+    }
+    sum
+}
+
+// ---------- Heap-string workload ----------
+//
+// Heap-allocated `String` keys are scattered across allocations: each key is a
+// pointer + length, and hashing the key dereferences the pointer to read the
+// bytes. That's an extra cache miss "behind" the key compared to the inline
+// integer keys. Whether prefetch still wins on this workload depends on how
+// much of that extra miss the look-ahead can overlap with the caller's work.
+// Note that the prefetch_get call here doesn't hint the key's heap buffer; it
+// only hints the control group and data bucket the key's *hash* lands on. The
+// key dereference cost is paid at the prefetch site (during `make_hash`), not
+// the lookup site.
+
+fn build_map_string(n: usize) -> HashMap {
+    let mut m = HashMap::with_capacity_and_hasher(n, DefaultHashBuilder::default());
+    for i in 0..n as u64 {
+        m.insert(
+            format!("key-{}-{:016x}", i, i.wrapping_mul(0x9E37_79B9_7F4A_7C15)),
+            i,
+        );
+    }
+    m
+}
+
+fn query_keys_string(n: usize) -> Vec {
+    let mut state = 0x1234_5678_9ABC_DEF0u64;
+    (0..N_QUERIES)
+        .map(|_| {
+            let i = xorshift(&mut state) % n as u64;
+            format!("key-{}-{:016x}", i, i.wrapping_mul(0x9E37_79B9_7F4A_7C15))
+        })
+        .collect()
+}
+
+fn lookup_naive_string(map: &HashMap, keys: &[&str]) -> u64 {
+    let mut sum = 0u64;
+    for k in keys {
+        if let Some(&v) = map.get(*k) {
+            sum = sum.wrapping_add(v);
+        }
+    }
+    sum
+}
+
+fn lookup_prefetched_string(map: &HashMap, keys: &[&str]) -> u64 {
+    let mut sum = 0u64;
+    for (i, k) in keys.iter().enumerate() {
+        if let Some(next) = keys.get(i + LOOKAHEAD) {
+            map.prefetch_get(*next);
+        }
+        if let Some(&v) = map.get(*k) {
+            sum = sum.wrapping_add(v);
+        }
+    }
+    sum
+}
+
+// ---------- Insert workload (prefetch_insert) ----------
+//
+// Inserts hint *both* the control line and the data bucket, since an insert
+// will write to the data position regardless of whether the slot is currently
+// empty. The bench reserves capacity up front so the workload measures the
+// steady-state insert (find-empty-slot + write), not amortized growth.
+
+fn insert_naive(keys: &[Key], capacity: usize) -> u64 {
+    let mut m: HashMap =
+        HashMap::with_capacity_and_hasher(capacity, DefaultHashBuilder::default());
+    let mut sum = 0u64;
+    for (i, &k) in keys.iter().enumerate() {
+        m.insert(k, i as u64);
+        sum = sum.wrapping_add(i as u64);
+    }
+    sum
+}
+
+fn insert_prefetched(keys: &[Key], capacity: usize) -> u64 {
+    let mut m: HashMap =
+        HashMap::with_capacity_and_hasher(capacity, DefaultHashBuilder::default());
+    let mut sum = 0u64;
+    for (i, &k) in keys.iter().enumerate() {
+        if let Some(next) = keys.get(i + LOOKAHEAD) {
+            m.prefetch_insert(next);
+        }
+        m.insert(k, i as u64);
+        sum = sum.wrapping_add(i as u64);
+    }
+    sum
+}
+
+// Unique-insert key set: every key is distinct so each iteration adds a fresh
+// entry. Capacity is reserved up front (in the bench harness) so the workload
+// doesn't include rehash cost.
+fn insert_keys(n: usize) -> Vec {
+    let mut state = 0xDEAD_BEEF_FACE_CAFEu64;
+    (0..n)
+        .map(|_| {
+            let i = xorshift(&mut state);
+            (i, i.wrapping_mul(0x9E37_79B9_7F4A_7C15))
+        })
+        .collect()
+}
+
+// ---------- Bench registration ----------
+
+pub(crate) fn register_benches(c: &mut Criterion) {
+    // Group 1: integer-key batch lookup, queries hit the map (100% hit rate).
+    // Probes find a matching tag and read the data line on every iteration, so
+    // prefetching the data line is useful. This is the regime where
+    // `prefetch_get` hinting both ctrl + data is expected to win and a
+    // ctrl-only hint would be expected to lose.
+    let mut group = c.benchmark_group("batch_lookup");
+    group.throughput(Throughput::Elements(N_QUERIES as u64));
+    for &n in SIZES {
+        let map = build_map(n);
+        let keys = query_keys(n);
+        group.bench_with_input(BenchmarkId::new("naive", n), &n, |b, _| {
+            b.iter(|| black_box(lookup_naive(black_box(&map), black_box(&keys))));
+        });
+        group.bench_with_input(BenchmarkId::new("prefetch_get", n), &n, |b, _| {
+            b.iter(|| black_box(lookup_prefetched(black_box(&map), black_box(&keys))));
+        });
+    }
+    group.finish();
+
+    // Group 1b: integer-key batch lookup, queries miss the map (0% hit rate).
+    // Probes find an empty control group and never read the data line, so the
+    // data half of the `prefetch_get` hint is the part least likely to help
+    // here. This is the regime to watch when deciding whether a ctrl-only
+    // lookup hint would be worth splitting out from the ctrl + data one.
+    let mut group = c.benchmark_group("batch_lookup_miss");
+    group.throughput(Throughput::Elements(N_QUERIES as u64));
+    for &n in SIZES {
+        let map = build_map(n);
+        let keys = query_keys_miss(n);
+        group.bench_with_input(BenchmarkId::new("naive", n), &n, |b, _| {
+            b.iter(|| black_box(lookup_naive(black_box(&map), black_box(&keys))));
+        });
+        group.bench_with_input(BenchmarkId::new("prefetch_get", n), &n, |b, _| {
+            b.iter(|| black_box(lookup_prefetched(black_box(&map), black_box(&keys))));
+        });
+    }
+    group.finish();
+
+    // Group 2: heap-string-key batch lookup (prefetch_get). String keys force a
+    // pointer dereference at hash time, exposing whether the prefetch's
+    // look-ahead overlap survives the extra cache miss on the key buffer.
+    let mut group = c.benchmark_group("batch_lookup_string");
+    group.throughput(Throughput::Elements(N_QUERIES as u64));
+    for &n in SIZES {
+        let map = build_map_string(n);
+        let keys = query_keys_string(n);
+        let key_refs: Vec<&str> = keys.iter().map(String::as_str).collect();
+        group.bench_with_input(BenchmarkId::new("naive", n), &n, |b, _| {
+            b.iter(|| black_box(lookup_naive_string(black_box(&map), black_box(&key_refs))));
+        });
+        group.bench_with_input(BenchmarkId::new("prefetch_get", n), &n, |b, _| {
+            b.iter(|| {
+                black_box(lookup_prefetched_string(
+                    black_box(&map),
+                    black_box(&key_refs),
+                ))
+            });
+        });
+    }
+    group.finish();
+
+    // Group 3: integer-key batch insert (prefetch_insert). Capacity is clamped
+    // to at least the number of inserted keys so even the smallest table sizes
+    // measure steady-state insert cost (find-empty-slot + write), not growth.
+    let mut group = c.benchmark_group("batch_insert");
+    group.throughput(Throughput::Elements(N_QUERIES as u64));
+    let keys = insert_keys(N_QUERIES);
+    for &n in SIZES {
+        let capacity = n.max(keys.len());
+        group.bench_with_input(BenchmarkId::new("naive", n), &n, |b, _| {
+            b.iter(|| black_box(insert_naive(black_box(&keys), capacity)));
+        });
+        group.bench_with_input(BenchmarkId::new("prefetch_insert", n), &n, |b, _| {
+            b.iter(|| black_box(insert_prefetched(black_box(&keys), capacity)));
+        });
+    }
+    group.finish();
+}
diff --git a/src/lib.rs b/src/lib.rs
index effc178a3..04d07354d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -52,6 +52,7 @@ mod macros;
 mod alloc;
 mod control;
 mod hasher;
+mod prefetch;
 mod raw;
 mod util;
 
diff --git a/src/map.rs b/src/map.rs
index 22cafef59..e3d1d982a 100644
--- a/src/map.rs
+++ b/src/map.rs
@@ -1292,6 +1292,71 @@ where
         }
     }
 
+    /// Issues a software prefetch hint for the control bytes and data bucket
+    /// a *lookup* of `k` would touch first.
+    ///
+    /// This hashes `k` and then prefetches both the control-byte group at the
+    /// start of its probe sequence and the corresponding data bucket. The
+    /// method name signals lookup intent; the implementation hints both lines
+    /// because measured bench evidence shows the data prefetch is load-bearing
+    /// for the win on lookup workloads. Use
+    /// [`prefetch_insert`](Self::prefetch_insert) to signal insert intent.
+    ///
+    /// Purely a performance hint with no observable effect. On targets where no
+    /// prefetch intrinsic is available the hint itself compiles to nothing.
+    ///
+    /// It is only worth using when looking up *many* keys in a sequence and the
+    /// map is large enough that the control bytes do not fit in cache: in that
+    /// case you can call `prefetch_get` on a key several iterations ahead of
+    /// the one currently being looked up, so the cache lines it needs are in
+    /// flight before the lookup reaches them. For a single lookup, or a map
+    /// that fits in cache, it does nothing useful.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use hashbrown::HashMap;
+    ///
+    /// let map: HashMap = (0..1000).map(|i| (i, i)).collect();
+    /// let queries: Vec = (0..1000).rev().collect();
+    ///
+    /// let mut sum = 0u64;
+    /// for (i, q) in queries.iter().enumerate() {
+    ///     if let Some(next) = queries.get(i + 8) {
+    ///         map.prefetch_get(next);
+    ///     }
+    ///     if let Some(&v) = map.get(q) {
+    ///         sum += u64::from(v);
+    ///     }
+    /// }
+    /// # let _ = sum;
+    /// ```
+    #[inline]
+    pub fn prefetch_get(&self, k: &Q)
+    where
+        Q: Hash + Equivalent + ?Sized,
+    {
+        let hash = make_hash::(&self.hash_builder, k);
+        self.table.prefetch_get(hash);
+    }
+
+    /// Issues a software prefetch hint for the control bytes and data bucket
+    /// an *insert* of `k` would touch first.
+    ///
+    /// The method name signals insert intent. Currently shares the same
+    /// implementation as [`prefetch_get`](Self::prefetch_get).
+    ///
+    /// Purely a performance hint with no observable effect. On targets where no
+    /// prefetch intrinsic is available the hint itself compiles to nothing.
+    #[inline]
+    pub fn prefetch_insert(&self, k: &Q)
+    where
+        Q: Hash + Equivalent + ?Sized,
+    {
+        let hash = make_hash::(&self.hash_builder, k);
+        self.table.prefetch_insert(hash);
+    }
+
     /// Returns the key-value pair corresponding to the supplied key.
     ///
     /// The supplied key may be any borrowed form of the map's key type, but
@@ -6899,6 +6964,82 @@ mod test_map {
             HashMap::::with_capacity(1).allocation_size() > core::mem::size_of::()
         );
     }
+
+    #[test]
+    fn test_prefetch() {
+        // `prefetch_get` and `prefetch_insert` are hints with no observable
+        // effect; the contract we can test is "calling them never misbehaves
+        // and never disturbs the table", across the interesting shapes: the
+        // empty singleton, a tiny table, a larger one, a ZST-value table, a
+        // fully zero-sized table, present and absent keys, and the look-ahead
+        // pattern from the docs.
+        let empty: HashMap = HashMap::new();
+        empty.prefetch_get(&0);
+        empty.prefetch_get(&12345);
+        empty.prefetch_insert(&0);
+        empty.prefetch_insert(&12345);
+
+        let zst: HashMap = (0..200).map(|i| (i, ())).collect();
+        for i in 0..256 {
+            zst.prefetch_get(&i);
+            zst.prefetch_insert(&i);
+        }
+
+        // `zst` above still has integer keys, so its `(K, V)` buckets are not
+        // zero-sized. A `((), ())` table is the shape whose buckets are
+        // zero-sized, which `prefetch_both` special-cases by skipping the
+        // data-bucket hint.
+        let mut unit: HashMap<(), ()> = HashMap::new();
+        unit.prefetch_get(&());
+        unit.prefetch_insert(&());
+        unit.insert((), ());
+        unit.prefetch_get(&());
+        unit.prefetch_insert(&());
+        assert_eq!(unit.get(&()), Some(&()));
+
+        let mut map: HashMap = HashMap::new();
+        for i in 0..1000u32 {
+            map.insert(i, i.wrapping_mul(7));
+        }
+        for i in 0..2000u32 {
+            map.prefetch_get(&i);
+            map.prefetch_insert(&i);
+        }
+        // The table is still intact and lookups still work after prefetching.
+        for i in 0..1000u32 {
+            assert_eq!(map.get(&i), Some(&i.wrapping_mul(7)));
+        }
+        for i in 1000..2000u32 {
+            assert_eq!(map.get(&i), None);
+        }
+
+        // The look-ahead pattern from the docs (lookup-side).
+        let queries: Vec = (0..1000u32).rev().collect();
+        let mut found = 0;
+        for (i, &q) in queries.iter().enumerate() {
+            if let Some(&next) = queries.get(i + 8) {
+                map.prefetch_get(&next);
+            }
+            if map.get(&q).is_some() {
+                found += 1;
+            }
+        }
+        assert_eq!(found, 1000);
+
+        // The look-ahead pattern on the insert side.
+        let mut bulk: HashMap = HashMap::with_capacity(4096);
+        let inserts: Vec = (0..2000u32).collect();
+        for (i, &k) in inserts.iter().enumerate() {
+            if let Some(&next) = inserts.get(i + 8) {
+                bulk.prefetch_insert(&next);
+            }
+            bulk.insert(k, k);
+        }
+        assert_eq!(bulk.len(), 2000);
+        for i in 0..2000u32 {
+            assert_eq!(bulk.get(&i), Some(&i));
+        }
+    }
 }
 
 #[cfg(all(test, unix, any(feature = "nightly", feature = "allocator-api2")))]
diff --git a/src/prefetch.rs b/src/prefetch.rs
new file mode 100644
index 000000000..12b43b129
--- /dev/null
+++ b/src/prefetch.rs
@@ -0,0 +1,81 @@
+//! Software prefetch hint.
+//!
+//! A prefetch is a *hint* to the CPU that the cache line containing a given
+//! address will be accessed soon, so the memory subsystem can start fetching it
+//! while the core does other work. It is purely advisory: it never reads or
+//! writes memory, never faults (even for an invalid or dangling pointer), and is
+//! a no-op in the Rust abstract machine. Architectures without a stable prefetch
+//! intrinsic simply compile it away.
+//!
+//! Two paths to the underlying hint:
+//!
+//! - **Stable shim (default).** Per-architecture stable intrinsics where they
+//!   exist (`_mm_prefetch::<_MM_HINT_T0>` on x86/x86-64) and a no-op fallback
+//!   elsewhere (aarch64 has no stable prefetch intrinsic).
+//! - **Nightly intrinsic (`nightly` feature).** `core::intrinsics::prefetch_read_data::<_, 3>(ptr)`
+//!   where locality `3` matches the stable shim's `_MM_HINT_T0` ("prefetch into
+//!   all cache levels"). Available across all architectures the compiler
+//!   recognizes. Gated on the `nightly` feature so end users can compare codegen
+//!   against the stable shim on their target.
+
+/// Issues an L1 read prefetch for the cache line containing `ptr`.
+///
+/// This is a hint only. `ptr` does not need to be valid, aligned, or even
+/// non-null; an out-of-bounds or dangling pointer is fine and will not fault.
+/// On stable, targets without a prefetch intrinsic (and Miri runs) get a no-op.
+///
+/// With the `nightly` feature enabled, this routes through
+/// `core::intrinsics::prefetch_read_data::<_, 3>(ptr)`. Locality `3` is the
+/// highest (all cache levels), matching the stable shim's `_MM_HINT_T0` on x86
+/// so the two paths bench apples-to-apples.
+#[inline]
+#[allow(clippy::let_unit_value)]
+pub(crate) fn prefetch_read_l1(ptr: *const u8) {
+    #[cfg(feature = "nightly")]
+    {
+        // `prefetch_read_data` is safe to call: it performs no memory access,
+        // never faults, and accepts any address. Locality `3` (const-generic
+        // on the intrinsic) maps to the highest level (all caches), matching
+        // `_MM_HINT_T0` on x86 so the comparison against the stable shim is
+        // apples-to-apples.
+        core::intrinsics::prefetch_read_data::<_, 3>(ptr);
+    }
+
+    #[cfg(all(
+        not(feature = "nightly"),
+        any(target_arch = "x86", target_arch = "x86_64"),
+        target_feature = "sse",
+        not(miri),
+    ))]
+    {
+        #[cfg(target_arch = "x86")]
+        use core::arch::x86::{_MM_HINT_T0, _mm_prefetch};
+        #[cfg(target_arch = "x86_64")]
+        use core::arch::x86_64::{_MM_HINT_T0, _mm_prefetch};
+
+        // SAFETY: `_mm_prefetch` is a hint instruction; it performs no memory
+        // access, never faults, and accepts any address (the Intel SDM and the
+        // `core::arch` docs both spell this out). The only safety requirement is
+        // that the `sse` target feature is available, which the `cfg` above
+        // guarantees on x86 / x86-64.
+        unsafe {
+            _mm_prefetch::<_MM_HINT_T0>(ptr.cast::());
+        }
+    }
+
+    #[cfg(all(
+        not(feature = "nightly"),
+        not(all(
+            any(target_arch = "x86", target_arch = "x86_64"),
+            target_feature = "sse",
+            not(miri),
+        )),
+    ))]
+    {
+        // No stable prefetch intrinsic on this target (aarch64 has none yet).
+        // The `nightly` feature path covers it via `prefetch_read_data` when
+        // available. Make sure `ptr` is still "used" so this function doesn't
+        // trip an unused-variable lint on the no-op path.
+        let _ = ptr;
+    }
+}
diff --git a/src/raw.rs b/src/raw.rs
index 39f50ef78..7a3012e16 100644
--- a/src/raw.rs
+++ b/src/raw.rs
@@ -1210,6 +1210,41 @@ impl RawTable {
         }
     }
 
+    /// Issues a software prefetch hint for the control-byte group and data
+    /// bucket a *lookup* of `hash` would touch first.
+    ///
+    /// The method name signals lookup intent; the implementation hints both
+    /// lines because measured bench evidence (PR #727) shows the data prefetch
+    /// is load-bearing for the win on lookup workloads. Use
+    /// [`prefetch_insert`](Self::prefetch_insert) to signal insert intent — it
+    /// currently shares the same implementation but the split keeps room for a
+    /// behavioral specialization in a follow-up if a workload supports it.
+    ///
+    /// Purely a performance hint with no observable effect. Most useful when
+    /// looking up many keys in a row: hash and prefetch a key a few iterations
+    /// ahead of the one currently being looked up. On a single lookup, or on
+    /// a table small enough to stay in cache, it does nothing useful (and on
+    /// targets with no available prefetch intrinsic it compiles away entirely).
+    #[inline]
+    pub(crate) fn prefetch_get(&self, hash: u64) {
+        // SAFETY: We use the same `table_layout` that was used to allocate
+        // this table.
+        unsafe { self.table.prefetch_both(hash, Self::TABLE_LAYOUT) }
+    }
+
+    /// Issues a software prefetch hint for the control-byte group and data
+    /// bucket an *insert* of `hash` would touch first.
+    ///
+    /// The method name signals insert intent. Currently shares the same
+    /// implementation as [`prefetch_get`](Self::prefetch_get) — see that
+    /// method's note on the named-split-only design.
+    #[inline]
+    pub(crate) fn prefetch_insert(&self, hash: u64) {
+        // SAFETY: We use the same `table_layout` that was used to allocate
+        // this table.
+        unsafe { self.table.prefetch_both(hash, Self::TABLE_LAYOUT) }
+    }
+
     /// Gets a reference to an element in the table.
     #[inline]
     pub(crate) fn get(&self, hash: u64, eq: impl FnMut(&T) -> bool) -> Option<&T> {
@@ -2454,6 +2489,56 @@ impl RawTableInner {
         }
     }
 
+    /// Issues a software prefetch hint for the control-byte group *and* the
+    /// data bucket at the start of the probe sequence for `hash`.
+    ///
+    /// Used by both `prefetch_get` (lookup-side hint) and `prefetch_insert`
+    /// (insert-side hint) wrappers. The two wrappers share the same underlying
+    /// implementation because measured bench evidence (PR #727, Ryzen 9 9950X,
+    /// hit-heavy AND miss-heavy workloads) shows that the data-line prefetch
+    /// is load-bearing for the win on lookup workloads — skipping the data
+    /// prefetch in the lookup case regresses 18–40% across the size sweep.
+    /// The named-method split (`prefetch_get` vs `prefetch_insert`) expresses
+    /// caller intent without changing behavior; the implementations can
+    /// diverge in a follow-up if a workload surfaces where the trade-off pays.
+    ///
+    /// `table_layout` must be the layout used to allocate this table (so that
+    /// the data-bucket address is computed correctly).
+    ///
+    /// This is a hint only: it performs no memory access, never faults, and is
+    /// a no-op in the abstract machine. On the empty singleton table the
+    /// "addresses" point into / just before the shared empty control array,
+    /// which is fine — prefetching them is still harmless.
+    ///
+    /// # Safety
+    ///
+    /// `table_layout` must match the layout used to allocate this table.
+    /// (No pointer is dereferenced; one is only computed from
+    /// `table_layout.size`, so a mismatched layout prefetches the wrong cache
+    /// line but is never UB. NOTE(review): consider making this a safe `fn`.)
+    #[inline]
+    unsafe fn prefetch_both(&self, hash: u64, table_layout: TableLayout) {
+        let pos = h1(hash) & self.bucket_mask;
+
+        // Control bytes: the group `Group::load` would read first.
+        let ctrl_ptr = self.ctrl.as_ptr().wrapping_add(pos);
+
+        // Data bucket at index `pos`: `data_end - (pos + 1) * size`. `data_end`
+        // is `self.ctrl`, so this is `self.ctrl - (pos + 1) * size`. Use
+        // `wrapping_*` so this can never be UB even for the empty singleton
+        // (where it points just before the shared empty control array).
+        let data_ptr = self
+            .ctrl
+            .as_ptr()
+            .wrapping_sub((pos + 1).wrapping_mul(table_layout.size));
+
+        crate::prefetch::prefetch_read_l1(ctrl_ptr);
+        // Zero-sized elements (`size == 0`) have no data array to prefetch.
+        if table_layout.size != 0 {
+            crate::prefetch::prefetch_read_l1(data_ptr);
+        }
+    }
+
     #[inline]
     unsafe fn record_item_insert_at(&mut self, index: usize, old_ctrl: Tag, new_ctrl: Tag) {
         self.growth_left -= usize::from(old_ctrl.special_is_empty());
diff --git a/src/set.rs b/src/set.rs
index cea1690f1..e1d010f32 100644
--- a/src/set.rs
+++ b/src/set.rs
@@ -854,6 +854,51 @@ where
         self.map.contains_key(value)
     }
 
+    /// Issues a software prefetch hint for the table memory that a lookup of
+    /// `value` would touch first.
+    ///
+    /// This hashes `value` and prefetches both the control-byte group at the
+    /// start of its probe sequence and the corresponding data bucket. The
+    /// method name signals lookup intent; the implementation hints both lines
+    /// because measured bench evidence shows the data prefetch is load-bearing
+    /// for the win on lookup workloads. Use
+    /// [`prefetch_insert`](Self::prefetch_insert) to signal insert intent.
+    ///
+    /// Purely a performance hint with no observable effect. On targets where no
+    /// prefetch intrinsic is available the hint itself compiles to nothing.
+    ///
+    /// It is only worth using when looking up *many* values in a sequence and
+    /// the set is large enough that the control bytes do not fit in cache: in
+    /// that case you can call `prefetch_get` on a value several iterations
+    /// ahead of the one currently being looked up. For a single lookup, or a
+    /// set that fits in cache, it does nothing useful. See
+    /// [`HashMap::prefetch_get`] for an example of the look-ahead pattern.
+    ///
+    /// [`HashMap::prefetch_get`]: crate::HashMap::prefetch_get
+    #[cfg_attr(feature = "inline-more", inline)]
+    pub fn prefetch_get(&self, value: &Q)
+    where
+        Q: Hash + Equivalent + ?Sized,
+    {
+        self.map.prefetch_get(value);
+    }
+
+    /// Issues a software prefetch hint for the control bytes and data bucket
+    /// an *insert* of `value` would touch first.
+    ///
+    /// The method name signals insert intent. Currently shares the same
+    /// implementation as [`prefetch_get`](Self::prefetch_get).
+    ///
+    /// Purely a performance hint with no observable effect. On targets where no
+    /// prefetch intrinsic is available the hint itself compiles to nothing.
+    #[cfg_attr(feature = "inline-more", inline)]
+    pub fn prefetch_insert(&self, value: &Q)
+    where
+        Q: Hash + Equivalent + ?Sized,
+    {
+        self.map.prefetch_insert(value);
+    }
+
     /// Returns a reference to the value in the set, if any, that is equal to the given value.
     ///
     /// The value may be any borrowed form of the set's value type, but
diff --git a/src/table.rs b/src/table.rs
index f50f575f6..70049bab1 100644
--- a/src/table.rs
+++ b/src/table.rs
@@ -229,6 +229,80 @@ where
         self.raw.get(hash, eq)
     }
 
+    /// Issues a software prefetch hint for the control-byte group and data
+    /// bucket a *lookup* of `hash` would touch first.
+    ///
+    /// The method name signals lookup intent; the implementation hints both
+    /// lines because measured bench evidence shows the data prefetch is
+    /// load-bearing for the win on lookup workloads. Use
+    /// [`prefetch_insert`](Self::prefetch_insert) to signal insert intent.
+    ///
+    /// This is purely a performance hint with no observable effect, and it
+    /// compiles to nothing on targets where no prefetch intrinsic is available.
+    ///
+    /// It is only worth using when looking up *many* hashes in a sequence and
+    /// the table is large enough that the control bytes do not fit in cache:
+    /// in that case you can hash a key several iterations ahead of the one
+    /// currently being looked up and call `prefetch_get` on it, so the cache
+    /// lines it needs are in flight before the lookup reaches them. For a
+    /// single lookup, or a table that fits in cache, it does nothing useful.
+    ///
+    /// `hash` must be computed with the same hasher you use for [`find`]; using
+    /// an unrelated hash just prefetches an unrelated (still valid-to-prefetch)
+    /// cache line and wastes the hint.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # #[cfg(feature = "nightly")]
+    /// # fn test() {
+    /// use hashbrown::{HashTable, DefaultHashBuilder};
+    /// use std::hash::BuildHasher;
+    ///
+    /// let s = DefaultHashBuilder::default();
+    /// let mut table: HashTable = HashTable::new();
+    /// for i in 0..1000 {
+    ///     table.insert_unique(s.hash_one(i), i, |&x| s.hash_one(x));
+    /// }
+    ///
+    /// let queries: Vec = (0..1000).rev().collect();
+    /// // Look up `queries`, prefetching 8 iterations ahead.
+    /// let mut found = 0;
+    /// for (i, &q) in queries.iter().enumerate() {
+    ///     if let Some(&next) = queries.get(i + 8) {
+    ///         table.prefetch_get(s.hash_one(next));
+    ///     }
+    ///     if table.find(s.hash_one(q), |&x| x == q).is_some() {
+    ///         found += 1;
+    ///     }
+    /// }
+    /// assert_eq!(found, 1000);
+    /// # }
+    /// # fn main() {
+    /// #     #[cfg(feature = "nightly")]
+    /// #     test()
+    /// # }
+    /// ```
+    ///
+    /// [`find`]: Self::find
+    #[inline]
+    pub fn prefetch_get(&self, hash: u64) {
+        self.raw.prefetch_get(hash);
+    }
+
+    /// Issues a software prefetch hint for the control-byte group and data
+    /// bucket an *insert* of `hash` would touch first.
+    ///
+    /// The method name signals insert intent. Currently shares the same
+    /// implementation as [`prefetch_get`](Self::prefetch_get).
+    ///
+    /// Purely a performance hint with no observable effect; compiles to
+    /// nothing on targets where no prefetch intrinsic is available.
+    #[inline]
+    pub fn prefetch_insert(&self, hash: u64) {
+        self.raw.prefetch_insert(hash);
+    }
+
     /// Returns a mutable reference to an entry in the table with the given hash
     /// and which satisfies the equality function passed.
     ///