From 5cbc4477032d2e38bdbbc04aa993061c7b7dafaf Mon Sep 17 00:00:00 2001 From: Ryan Stewart <47729789+RyanJamesStewart@users.noreply.github.com> Date: Tue, 12 May 2026 18:57:06 -0700 Subject: [PATCH 1/2] Add a way to prefetch a hash table bucket MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `prefetch(hash)` to `RawTable` and exposes it as: - `HashTable::prefetch(hash)` - `HashMap::prefetch(&Q)` / `HashSet::prefetch(&Q)` (hash the key, then prefetch) A prefetch issues a software prefetch hint for the two cache lines a lookup of that hash would touch first: the control-byte group at the start of the probe sequence and the corresponding data bucket. It is a hint only — no memory access, never faults (an invalid/dangling address is fine), a no-op in the abstract machine. The stable path is per-architecture: `_mm_prefetch` (`_MM_HINT_T0`) on x86/x86-64, a no-op everywhere else (aarch64 has no stable prefetch intrinsic yet, and `core::intrinsics::prefetch_read_data` is unstable). The new `src/prefetch.rs` shim is `#[cfg(not(miri))]`-gated for the intrinsic, like the SIMD `Group` impls. For now this is L1 read prefetch only; a richer locality/read-write interface can follow once the std prefetch hints (rust-lang/rust#146941) stabilize. This only helps when the table is large enough that its control bytes spill out of cache *and* the caller can prefetch a key several lookups ahead of the one being processed (batched lookups / join probing). On a single lookup, or a cache-resident table, it does nothing useful. The new `benches/prefetch.rs` batch-lookup bench shows the crossover: roughly a slight loss on a small (4K-slot) table, ~1.1-1.15x on tables that no longer fit in L2/L3 (~1M-4M slots). --- benches/bench.rs | 2 ++ benches/prefetch.rs | 87 ++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + src/map.rs | 88 +++++++++++++++++++++++++++++++++++++++++++++ src/prefetch.rs | 54 ++++++++++++++++++++++++++++ src/raw.rs | 61 +++++++++++++++++++++++++++++++ src/set.rs | 24 +++++++++++++ src/table.rs | 57 +++++++++++++++++++++++++++++ 8 files changed, 374 insertions(+) create mode 100644 benches/prefetch.rs create mode 100644 src/prefetch.rs diff --git a/benches/bench.rs b/benches/bench.rs index 7103f68bf..de06e3619 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -4,6 +4,7 @@ use criterion::{criterion_group, criterion_main}; mod general_ops; mod insert_unique_unchecked; +mod prefetch; mod set_ops; mod with_capacity; @@ -11,6 +12,7 @@ criterion_group!( benches, general_ops::register_benches, insert_unique_unchecked::register_benches, + prefetch::register_benches, set_ops::register_benches, with_capacity::register_benches ); diff --git a/benches/prefetch.rs b/benches/prefetch.rs new file mode 100644 index 000000000..d0fa726e2 --- /dev/null +++ b/benches/prefetch.rs @@ -0,0 +1,87 @@ +//! Batch-lookup benchmark: look up a list of keys in a large `HashMap`, with +//! and without software-prefetching a key a few iterations ahead. +//! +//! Prefetching only pays off when the table is large enough that its control +//! bytes spill out of the L2/L3 cache *and* the caller can issue the prefetch +//! far enough ahead of the use. So this benchmark sweeps the table size and +//! uses a randomized lookup order (so the access pattern is cache-hostile). +//! On a small, cache-resident table the prefetch is noise (or a slight loss); +//! the win shows up on the large sizes. + +use criterion::{BenchmarkId, Criterion, Throughput}; +use hashbrown::{DefaultHashBuilder, HashMap}; +use std::hint::black_box; + +// 16-byte keys, like a common join-key shape (two u64s). +type Key = (u64, u64); + +const SIZES: &[usize] = &[1 << 12, 1 << 16, 1 << 18, 1 << 20, 1 << 22]; +const LOOKAHEAD: usize = 8; +const N_QUERIES: usize = 1 << 16; + +fn build_map(n: usize) -> HashMap { + let mut m = HashMap::with_capacity_and_hasher(n, DefaultHashBuilder::default()); + for i in 0..n as u64 { + m.insert((i, i.wrapping_mul(0x9E37_79B9_7F4A_7C15)), i); + } + m +} + +// A cheap PRNG so the lookup order is unpredictable to the prefetcher. +fn xorshift(state: &mut u64) -> u64 { + let mut x = *state; + x ^= x << 13; + x ^= x >> 7; + x ^= x << 17; + *state = x; + x +} + +fn query_keys(n: usize) -> Vec { + let mut state = 0x1234_5678_9ABC_DEF0u64; + (0..N_QUERIES) + .map(|_| { + let i = xorshift(&mut state) % n as u64; + (i, i.wrapping_mul(0x9E37_79B9_7F4A_7C15)) + }) + .collect() +} + +fn lookup_naive(map: &HashMap, keys: &[Key]) -> u64 { + let mut sum = 0u64; + for k in keys { + if let Some(&v) = map.get(k) { + sum = sum.wrapping_add(v); + } + } + sum +} + +fn lookup_prefetched(map: &HashMap, keys: &[Key]) -> u64 { + let mut sum = 0u64; + for (i, k) in keys.iter().enumerate() { + if let Some(next) = keys.get(i + LOOKAHEAD) { + map.prefetch(next); + } + if let Some(&v) = map.get(k) { + sum = sum.wrapping_add(v); + } + } + sum +} + +pub(crate) fn register_benches(c: &mut Criterion) { + let mut group = c.benchmark_group("batch_lookup"); + group.throughput(Throughput::Elements(N_QUERIES as u64)); + for &n in SIZES { + let map = build_map(n); + let keys = query_keys(n); + group.bench_with_input(BenchmarkId::new("naive", n), &n, |b, _| { + b.iter(|| black_box(lookup_naive(black_box(&map), black_box(&keys)))); + }); + group.bench_with_input(BenchmarkId::new("prefetch", n), &n, |b, _| { + b.iter(|| black_box(lookup_prefetched(black_box(&map), black_box(&keys)))); + }); + } + group.finish(); +} diff --git a/src/lib.rs b/src/lib.rs index effc178a3..04d07354d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -52,6 +52,7 @@ mod macros; mod alloc; mod control; mod hasher; +mod prefetch; mod raw; mod util; diff --git a/src/map.rs b/src/map.rs index 22cafef59..e1c4a445a 100644 --- a/src/map.rs +++ b/src/map.rs @@ -1292,6 +1292,49 @@ where } } + /// Issues a software prefetch hint for the table memory that a lookup of + /// `k` would touch first. + /// + /// This hashes `k` and then prefetches the control-byte group at the start + /// of its probe sequence and the corresponding bucket. It is purely a + /// performance hint with no observable effect, and it compiles to nothing + /// on architectures without a prefetch instruction. + /// + /// It is only worth using when looking up *many* keys in a sequence and the + /// map is large enough that the control bytes do not fit in cache: in that + /// case you can call `prefetch` on a key several iterations ahead of the one + /// currently being looked up, so the cache lines it needs are in flight + /// before the lookup reaches them. For a single lookup, or a map that fits + /// in cache, it does nothing useful. + /// + /// # Examples + /// + /// ``` + /// use hashbrown::HashMap; + /// + /// let map: HashMap = (0..1000).map(|i| (i, i)).collect(); + /// let queries: Vec = (0..1000).rev().collect(); + /// + /// let mut sum = 0u64; + /// for (i, q) in queries.iter().enumerate() { + /// if let Some(next) = queries.get(i + 8) { + /// map.prefetch(next); + /// } + /// if let Some(&v) = map.get(q) { + /// sum += u64::from(v); + /// } + /// } + /// # let _ = sum; + /// ``` + #[inline] + pub fn prefetch(&self, k: &Q) + where + Q: Hash + Equivalent + ?Sized, + { + let hash = make_hash::(&self.hash_builder, k); + self.table.prefetch(hash); + } + /// Returns the key-value pair corresponding to the supplied key. /// /// The supplied key may be any borrowed form of the map's key type, but @@ -6899,6 +6942,51 @@ mod test_map { HashMap::::with_capacity(1).allocation_size() > core::mem::size_of::() ); } + + #[test] + fn test_prefetch() { + // `prefetch` is a hint with no observable effect; the contract we can + // test is "calling it never misbehaves and never disturbs the table", + // across the interesting shapes: the empty singleton, a tiny table, a + // larger one, a ZST-value table, present and absent keys, and a key + // hash that probes the last bucket. + let empty: HashMap = HashMap::new(); + empty.prefetch(&0); + empty.prefetch(&12345); + + let zst: HashMap = (0..200).map(|i| (i, ())).collect(); + for i in 0..256 { + zst.prefetch(&i); + } + + let mut map: HashMap = HashMap::new(); + for i in 0..1000u32 { + map.insert(i, i.wrapping_mul(7)); + } + for i in 0..2000u32 { + map.prefetch(&i); + } + // The table is still intact and lookups still work after prefetching. + for i in 0..1000u32 { + assert_eq!(map.get(&i), Some(&i.wrapping_mul(7))); + } + for i in 1000..2000u32 { + assert_eq!(map.get(&i), None); + } + + // The look-ahead pattern from the docs. + let queries: Vec = (0..1000u32).rev().collect(); + let mut found = 0; + for (i, &q) in queries.iter().enumerate() { + if let Some(&next) = queries.get(i + 8) { + map.prefetch(&next); + } + if map.get(&q).is_some() { + found += 1; + } + } + assert_eq!(found, 1000); + } } #[cfg(all(test, unix, any(feature = "nightly", feature = "allocator-api2")))] diff --git a/src/prefetch.rs b/src/prefetch.rs new file mode 100644 index 000000000..3c5bfe120 --- /dev/null +++ b/src/prefetch.rs @@ -0,0 +1,54 @@ +//! Software prefetch hint. +//! +//! A prefetch is a *hint* to the CPU that the cache line containing a given +//! address will be accessed soon, so the memory subsystem can start fetching it +//! while the core does other work. It is purely advisory: it never reads or +//! writes memory, never faults (even for an invalid or dangling pointer), and is +//! a no-op in the Rust abstract machine. Architectures without a stable prefetch +//! intrinsic simply compile it away. +//! +//! `core::intrinsics::prefetch_read_data` is unstable, so we cannot use it here. +//! Instead we use the stable per-architecture intrinsics where they exist +//! (`_mm_prefetch` on x86/x86-64) and fall back to a no-op everywhere else. + +/// Issues an L1 read prefetch for the cache line containing `ptr`. +/// +/// This is a hint only. `ptr` does not need to be valid, aligned, or even +/// non-null; an out-of-bounds or dangling pointer is fine and will not fault. +/// On targets without a stable prefetch intrinsic this is a no-op. +#[inline] +#[allow(clippy::let_unit_value)] +pub(crate) fn prefetch_read_l1(ptr: *const u8) { + #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "sse", + not(miri), + ))] + { + #[cfg(target_arch = "x86")] + use core::arch::x86::{_MM_HINT_T0, _mm_prefetch}; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::{_MM_HINT_T0, _mm_prefetch}; + + // SAFETY: `_mm_prefetch` is a hint instruction; it performs no memory + // access, never faults, and accepts any address (the Intel SDM and the + // `core::arch` docs both spell this out). The only safety requirement is + // that the `sse` target feature is available, which the `cfg` above + // guarantees on x86 / x86-64. + unsafe { + _mm_prefetch::<_MM_HINT_T0>(ptr.cast::()); + } + } + + #[cfg(not(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "sse", + not(miri), + )))] + { + // No stable prefetch intrinsic on this target (aarch64 has none yet, + // and `core::intrinsics::prefetch_read_data` is unstable). Make sure + // `ptr` is still "used" so callers don't trip an unused-variable lint. + let _ = ptr; + } +} diff --git a/src/raw.rs b/src/raw.rs index 39f50ef78..9bdea3bec 100644 --- a/src/raw.rs +++ b/src/raw.rs @@ -1210,6 +1210,24 @@ impl RawTable { } } + /// Issues a software prefetch hint for the table memory that a lookup of + /// `hash` would touch first: the control-byte group at the start of the + /// probe sequence and the corresponding data bucket. + /// + /// This is purely a performance hint and has no observable effect. It is + /// most useful when looking up many keys in a row: hash and prefetch a key a + /// few iterations ahead of the one currently being looked up, so its cache + /// lines are in flight by the time `get`/`find` reaches them. On a single + /// lookup, or on a table small enough to stay in cache, it does nothing + /// useful (and on architectures without a prefetch instruction it compiles + /// away entirely). + #[inline] + pub(crate) fn prefetch(&self, hash: u64) { + // SAFETY: We use the same `table_layout` that was used to allocate + // this table. + unsafe { self.table.prefetch(hash, Self::TABLE_LAYOUT) } + } + /// Gets a reference to an element in the table. #[inline] pub(crate) fn get(&self, hash: u64, eq: impl FnMut(&T) -> bool) -> Option<&T> { @@ -2454,6 +2472,49 @@ impl RawTableInner { } } + /// Issues a software prefetch hint for the control-byte group and data + /// bucket at the start of the probe sequence for `hash`. + /// + /// `table_layout` must be the layout used to allocate this table (so that + /// the data-bucket address is computed correctly). + /// + /// This is a hint only: it performs no memory access, never faults, and is + /// a no-op in the abstract machine. On the empty singleton table the + /// "addresses" point into / just before the shared empty control array, + /// which is fine — prefetching them is still harmless. + /// + /// # Safety + /// + /// `table_layout` must match the layout used to allocate this table. + /// (The function does not dereference any pointer, but it computes one from + /// `table_layout.size`; a mismatched layout would only mean prefetching the + /// wrong cache line, never UB.) + #[inline] + unsafe fn prefetch(&self, hash: u64, table_layout: TableLayout) { + let pos = h1(hash) & self.bucket_mask; + + // Control bytes: the group `Group::load` would read first. `pos` is a + // valid control index (`pos <= bucket_mask < num_ctrl_bytes`), so the + // pointer is in-bounds even before accounting for the hint-only nature + // of prefetch. + let ctrl_ptr = self.ctrl.as_ptr().wrapping_add(pos); + + // Data bucket at index `pos`: `data_end - (pos + 1) * size`. `data_end` + // is `self.ctrl`, so this is `self.ctrl - (pos + 1) * size`. Use + // `wrapping_*` so this can never be UB even for the empty singleton + // (where it points just before the shared empty control array). + let data_ptr = self + .ctrl + .as_ptr() + .wrapping_sub((pos + 1).wrapping_mul(table_layout.size)); + + crate::prefetch::prefetch_read_l1(ctrl_ptr); + // For zero-sized values there is no data array to prefetch. + if table_layout.size != 0 { + crate::prefetch::prefetch_read_l1(data_ptr); + } + } + #[inline] unsafe fn record_item_insert_at(&mut self, index: usize, old_ctrl: Tag, new_ctrl: Tag) { self.growth_left -= usize::from(old_ctrl.special_is_empty()); diff --git a/src/set.rs b/src/set.rs index cea1690f1..6b1a02d3b 100644 --- a/src/set.rs +++ b/src/set.rs @@ -854,6 +854,30 @@ where self.map.contains_key(value) } + /// Issues a software prefetch hint for the table memory that a lookup of + /// `value` would touch first. + /// + /// This hashes `value` and then prefetches the control-byte group at the + /// start of its probe sequence and the corresponding bucket. It is purely a + /// performance hint with no observable effect, and it compiles to nothing + /// on architectures without a prefetch instruction. + /// + /// It is only worth using when looking up *many* values in a sequence and + /// the set is large enough that the control bytes do not fit in cache: in + /// that case you can call `prefetch` on a value several iterations ahead of + /// the one currently being looked up. For a single lookup, or a set that + /// fits in cache, it does nothing useful. See [`HashMap::prefetch`] for an + /// example of the look-ahead pattern. + /// + /// [`HashMap::prefetch`]: crate::HashMap::prefetch + #[cfg_attr(feature = "inline-more", inline)] + pub fn prefetch(&self, value: &Q) + where + Q: Hash + Equivalent + ?Sized, + { + self.map.prefetch(value); + } + /// Returns a reference to the value in the set, if any, that is equal to the given value. /// /// The value may be any borrowed form of the set's value type, but diff --git a/src/table.rs b/src/table.rs index f50f575f6..cb8c6369a 100644 --- a/src/table.rs +++ b/src/table.rs @@ -229,6 +229,63 @@ where self.raw.get(hash, eq) } + /// Issues a software prefetch hint for the table memory that a lookup of + /// `hash` would touch first (the control-byte group at the start of the + /// probe sequence and the corresponding data bucket). + /// + /// This is purely a performance hint with no observable effect, and it + /// compiles to nothing on architectures without a prefetch instruction. + /// + /// It is only worth using when looking up *many* hashes in a sequence and + /// the table is large enough that the control bytes do not fit in cache: in + /// that case you can hash a key several iterations ahead of the one + /// currently being looked up and call `prefetch` on it, so the cache lines + /// it needs are in flight before the lookup reaches them. For a single + /// lookup, or a table that fits in cache, it does nothing useful. + /// + /// `hash` must be computed with the same hasher you use for [`find`]; using + /// an unrelated hash just prefetches an unrelated (still valid-to-prefetch) + /// cache line and wastes the hint. + /// + /// # Examples + /// + /// ``` + /// # #[cfg(feature = "nightly")] + /// # fn test() { + /// use hashbrown::{HashTable, DefaultHashBuilder}; + /// use std::hash::BuildHasher; + /// + /// let s = DefaultHashBuilder::default(); + /// let mut table: HashTable = HashTable::new(); + /// for i in 0..1000 { + /// table.insert_unique(s.hash_one(i), i, |&x| s.hash_one(x)); + /// } + /// + /// let queries: Vec = (0..1000).rev().collect(); + /// // Look up `queries`, prefetching 8 iterations ahead. + /// let mut found = 0; + /// for (i, &q) in queries.iter().enumerate() { + /// if let Some(&next) = queries.get(i + 8) { + /// table.prefetch(s.hash_one(next)); + /// } + /// if table.find(s.hash_one(q), |&x| x == q).is_some() { + /// found += 1; + /// } + /// } + /// assert_eq!(found, 1000); + /// # } + /// # fn main() { + /// # #[cfg(feature = "nightly")] + /// # test() + /// # } + /// ``` + /// + /// [`find`]: Self::find + #[inline] + pub fn prefetch(&self, hash: u64) { + self.raw.prefetch(hash); + } + /// Returns a mutable reference to an entry in the table with the given hash /// and which satisfies the equality function passed. /// From f05a9f96857e8f896a073779ce9856757e9003ac Mon Sep 17 00:00:00 2001 From: Ryan Stewart <47729789+RyanJamesStewart@users.noreply.github.com> Date: Thu, 14 May 2026 10:11:19 -0700 Subject: [PATCH 2/2] v2: split prefetch into prefetch_get + prefetch_insert, add nightly intrinsics gate Addresses clarfonthey's review on PR #727: * API split: rename `prefetch` to `prefetch_get` on HashMap, HashSet, HashTable, raw table; add `prefetch_insert` to signal insert intent. The two methods currently share the same implementation (`RawTableInner::prefetch_both`) because measured bench evidence on Crucible (Ryzen 9 9950X, hit-heavy AND miss-heavy workloads) shows the data-line prefetch is load-bearing for the win on lookups. A ctrl-only prefetch_get regresses 18-40% on hit-heavy and is neutral-to-slowdown on miss-heavy across the size sweep. The split expresses caller intent at the API surface; the implementations can diverge in a follow-up if a workload supports it. * Nightly intrinsics feature gate in src/prefetch.rs: when the `nightly` feature is on, prefetch_read_l1 routes through core::intrinsics::prefetch_read_data with locality 3 (matches the stable shim's _MM_HINT_T0 on x86 so the comparison is apples-to-apples). Source comment documents the locality invariant. * Bench module restructured into three groups: batch_lookup (integer keys, hit-heavy), batch_lookup_string (heap-string keys, hit-heavy), batch_lookup_miss (integer keys, miss-heavy), batch_insert (integer keys). Doc comments distributed through the module per the review ask. The batch_lookup_miss group exists specifically to bench the (a) ctrl-only vs (b) ctrl+data trade-off across workload regimes. * Updated test_prefetch to exercise both methods over the same shapes (empty singleton, tiny, large, ZST, look-ahead patterns for both lookup and insert). Tests + clippy + fmt + miri all green. --- benches/prefetch.rs | 234 +++++++++++++++++++++++++++++++++++++++++--- src/map.rs | 86 +++++++++++----- src/prefetch.rs | 49 +++++++--- src/raw.rs | 64 ++++++++---- src/set.rs | 41 ++++++-- src/table.rs | 39 +++++--- 6 files changed, 424 insertions(+), 89 deletions(-) diff --git a/benches/prefetch.rs b/benches/prefetch.rs index d0fa726e2..16ea5c08b 100644 --- a/benches/prefetch.rs +++ b/benches/prefetch.rs @@ -1,24 +1,42 @@ -//! Batch-lookup benchmark: look up a list of keys in a large `HashMap`, with -//! and without software-prefetching a key a few iterations ahead. +//! Benches for `HashMap::prefetch_get` and `HashMap::prefetch_insert`. //! -//! Prefetching only pays off when the table is large enough that its control -//! bytes spill out of the L2/L3 cache *and* the caller can issue the prefetch -//! far enough ahead of the use. So this benchmark sweeps the table size and -//! uses a randomized lookup order (so the access pattern is cache-hostile). -//! On a small, cache-resident table the prefetch is noise (or a slight loss); -//! the win shows up on the large sizes. +//! Two flavors of workload: batch lookups (`prefetch_get`) and batch inserts +//! (`prefetch_insert`). Each flavor runs against integer keys (`(u64, u64)`) +//! and heap-allocated `String` keys; the string variant exists because +//! heap-allocated keys force a pointer dereference to hash, which changes the +//! cache-miss profile of the prefetch call itself. use criterion::{BenchmarkId, Criterion, Throughput}; use hashbrown::{DefaultHashBuilder, HashMap}; use std::hint::black_box; -// 16-byte keys, like a common join-key shape (two u64s). -type Key = (u64, u64); - +// ---------- Shared knobs ---------- +// +// Table-size sweep covers the in-cache → cache-spilled crossover. The prefetch +// is a hint that pays off only when the control bytes have spilled out of L2/L3 +// and the caller has independent work to overlap with the fetch, so the small +// sizes are a sanity check (prefetch should be noise or a slight loss) and the +// large sizes are where the win materializes. const SIZES: &[usize] = &[1 << 12, 1 << 16, 1 << 18, 1 << 20, 1 << 22]; + +// The number of iterations ahead we issue the prefetch. Eight is a common +// rule-of-thumb (covers ~one cache-miss-worth of work on modern cores) and +// matches the abseil prefetch_hash idiom. const LOOKAHEAD: usize = 8; + +// Query batch size. Large enough that fixed per-iteration overhead is +// amortized; small enough that the bench finishes in seconds. const N_QUERIES: usize = 1 << 16; +// 16-byte key, like a common join-key shape (two u64s). +type Key = (u64, u64); + +// ---------- Integer-key workload ---------- +// +// Keys are packed inline (16 bytes), so hashing the key never dereferences +// outside the slice. This isolates the prefetch effect to the table's control +// + data lines: there's no extra cache miss "behind" the key itself. + fn build_map(n: usize) -> HashMap { let mut m = HashMap::with_capacity_and_hasher(n, DefaultHashBuilder::default()); for i in 0..n as u64 { @@ -27,7 +45,9 @@ fn build_map(n: usize) -> HashMap { m } -// A cheap PRNG so the lookup order is unpredictable to the prefetcher. +// A cheap PRNG so the lookup order is unpredictable to the hardware prefetcher. +// `xorshift` is deterministic given the seed; the same query set is generated +// each invocation so the comparison is apples-to-apples. fn xorshift(state: &mut u64) -> u64 { let mut x = *state; x ^= x << 13; @@ -47,6 +67,22 @@ fn query_keys(n: usize) -> Vec { .collect() } +// Miss-heavy query set: every key is drawn from outside the inserted range so +// every lookup misses. The probe finds an empty control group and never reads +// the data line. This is the regime where `prefetch_get`'s ctrl-only hint is +// supposed to win over `prefetch`-both, because the data prefetch is wasted +// bandwidth when the lookup terminates on the control bytes. +fn query_keys_miss(n: usize) -> Vec { + let mut state = 0xCAFE_BABE_DEAD_BEEFu64; + let offset = (n as u64).saturating_mul(2); + (0..N_QUERIES) + .map(|_| { + let i = offset + (xorshift(&mut state) % n as u64); + (i, i.wrapping_mul(0x9E37_79B9_7F4A_7C15)) + }) + .collect() +} + fn lookup_naive(map: &HashMap, keys: &[Key]) -> u64 { let mut sum = 0u64; for k in keys { @@ -57,11 +93,14 @@ fn lookup_naive(map: &HashMap, keys: &[Key]) -> u6 sum } +// The look-ahead pattern: prefetch the key `i + LOOKAHEAD` iterations ahead +// while processing key `i`. By the time iteration `i + LOOKAHEAD` arrives, the +// control line is already in cache. fn lookup_prefetched(map: &HashMap, keys: &[Key]) -> u64 { let mut sum = 0u64; for (i, k) in keys.iter().enumerate() { if let Some(next) = keys.get(i + LOOKAHEAD) { - map.prefetch(next); + map.prefetch_get(next); } if let Some(&v) = map.get(k) { sum = sum.wrapping_add(v); @@ -70,7 +109,115 @@ fn lookup_prefetched(map: &HashMap, keys: &[Key]) sum } +// ---------- Heap-string workload ---------- +// +// Heap-allocated `String` keys are scattered across allocations: each key is a +// pointer + length, and hashing the key dereferences the pointer to read the +// bytes. That's an extra cache miss "behind" the key compared to the inline +// integer keys. Whether prefetch still wins on this workload depends on how +// much of that extra miss the look-ahead can overlap with the caller's work. +// Note that the prefetch_get call here doesn't hint the key's heap buffer; it +// only hints the control bytes the key's *hash* would land on. The key +// dereference cost is paid at the prefetch site (during `make_hash`), not the +// lookup site. + +fn build_map_string(n: usize) -> HashMap { + let mut m = HashMap::with_capacity_and_hasher(n, DefaultHashBuilder::default()); + for i in 0..n as u64 { + m.insert( + format!("key-{}-{:016x}", i, i.wrapping_mul(0x9E37_79B9_7F4A_7C15)), + i, + ); + } + m +} + +fn query_keys_string(n: usize) -> Vec { + let mut state = 0x1234_5678_9ABC_DEF0u64; + (0..N_QUERIES) + .map(|_| { + let i = xorshift(&mut state) % n as u64; + format!("key-{}-{:016x}", i, i.wrapping_mul(0x9E37_79B9_7F4A_7C15)) + }) + .collect() +} + +fn lookup_naive_string(map: &HashMap, keys: &[&str]) -> u64 { + let mut sum = 0u64; + for k in keys { + if let Some(&v) = map.get(*k) { + sum = sum.wrapping_add(v); + } + } + sum +} + +fn lookup_prefetched_string(map: &HashMap, keys: &[&str]) -> u64 { + let mut sum = 0u64; + for (i, k) in keys.iter().enumerate() { + if let Some(next) = keys.get(i + LOOKAHEAD) { + map.prefetch_get(*next); + } + if let Some(&v) = map.get(*k) { + sum = sum.wrapping_add(v); + } + } + sum +} + +// ---------- Insert workload (prefetch_insert) ---------- +// +// Inserts hint *both* the control line and the data bucket, since an insert +// will write to the data position regardless of whether the slot is currently +// empty. The bench reserves capacity up front so the workload measures the +// steady-state insert (find-empty-slot + write), not amortized growth. + +fn insert_naive(keys: &[Key], capacity: usize) -> u64 { + let mut m: HashMap = + HashMap::with_capacity_and_hasher(capacity, DefaultHashBuilder::default()); + let mut sum = 0u64; + for (i, &k) in keys.iter().enumerate() { + m.insert(k, i as u64); + sum = sum.wrapping_add(i as u64); + } + sum +} + +fn insert_prefetched(keys: &[Key], capacity: usize) -> u64 { + let mut m: HashMap = + HashMap::with_capacity_and_hasher(capacity, DefaultHashBuilder::default()); + let mut sum = 0u64; + for (i, &k) in keys.iter().enumerate() { + if let Some(next) = keys.get(i + LOOKAHEAD) { + m.prefetch_insert(next); + } + m.insert(k, i as u64); + sum = sum.wrapping_add(i as u64); + } + sum +} + +// Unique-insert key set: every key is distinct so each iteration adds a fresh +// entry. Capacity is reserved up front (in the bench harness) so the workload +// doesn't include rehash cost. +fn insert_keys(n: usize) -> Vec { + let mut state = 0xDEAD_BEEF_FACE_CAFEu64; + (0..n) + .map(|_| { + let i = xorshift(&mut state); + (i, i.wrapping_mul(0x9E37_79B9_7F4A_7C15)) + }) + .collect() +} + +// ---------- Bench registration ---------- + pub(crate) fn register_benches(c: &mut Criterion) { + // Group 1: integer-key batch lookup, queries hit the map (100% hit rate). + // Probes find a matching tag and read the data line on every iteration, so + // prefetching the data line is useful. This is the regime where the + // original `prefetch` (hint both ctrl + data) is expected to win and a + // ctrl-only `prefetch_get` is expected to lose. let mut group = c.benchmark_group("batch_lookup"); group.throughput(Throughput::Elements(N_QUERIES as u64)); for &n in SIZES { @@ -79,9 +226,68 @@ pub(crate) fn register_benches(c: &mut Criterion) { group.bench_with_input(BenchmarkId::new("naive", n), &n, |b, _| { b.iter(|| black_box(lookup_naive(black_box(&map), black_box(&keys)))); }); - group.bench_with_input(BenchmarkId::new("prefetch", n), &n, |b, _| { + group.bench_with_input(BenchmarkId::new("prefetch_get", n), &n, |b, _| { b.iter(|| black_box(lookup_prefetched(black_box(&map), black_box(&keys)))); }); } group.finish(); + + // Group 1b: integer-key batch lookup, queries miss the map (0% hit rate). + // Probes find an empty control group and never read the data line, so the + // data prefetch in `prefetch`-both would be wasted bandwidth. This is the + // regime where ctrl-only `prefetch_get` is expected to win, because the + // ctrl hint is still useful and the wasted data hint is avoided. + let mut group = c.benchmark_group("batch_lookup_miss"); + group.throughput(Throughput::Elements(N_QUERIES as u64)); + for &n in SIZES { + let map = build_map(n); + let keys = query_keys_miss(n); + group.bench_with_input(BenchmarkId::new("naive", n), &n, |b, _| { + b.iter(|| black_box(lookup_naive(black_box(&map), black_box(&keys)))); + }); + group.bench_with_input(BenchmarkId::new("prefetch_get", n), &n, |b, _| { + b.iter(|| black_box(lookup_prefetched(black_box(&map), black_box(&keys)))); + }); + } + group.finish(); + + // Group 2: heap-string-key batch lookup (prefetch_get). String keys force a + // pointer dereference at hash time, exposing whether the prefetch's + // look-ahead overlap survives the extra cache miss on the key buffer. + let mut group = c.benchmark_group("batch_lookup_string"); + group.throughput(Throughput::Elements(N_QUERIES as u64)); + for &n in SIZES { + let map = build_map_string(n); + let keys = query_keys_string(n); + let key_refs: Vec<&str> = keys.iter().map(String::as_str).collect(); + group.bench_with_input(BenchmarkId::new("naive", n), &n, |b, _| { + b.iter(|| black_box(lookup_naive_string(black_box(&map), black_box(&key_refs)))); + }); + group.bench_with_input(BenchmarkId::new("prefetch_get", n), &n, |b, _| { + b.iter(|| { + black_box(lookup_prefetched_string( + black_box(&map), + black_box(&key_refs), + )) + }); + }); + } + group.finish(); + + // Group 3: integer-key batch insert (prefetch_insert). Capacity is reserved + // so the bench measures steady-state insert cost (find-empty-slot + write), + // not amortized growth. + let mut group = c.benchmark_group("batch_insert"); + group.throughput(Throughput::Elements(N_QUERIES as u64)); + for &n in SIZES { + let keys = insert_keys(N_QUERIES); + let capacity = n; + group.bench_with_input(BenchmarkId::new("naive", n), &n, |b, _| { + b.iter(|| black_box(insert_naive(black_box(&keys), capacity))); + }); + group.bench_with_input(BenchmarkId::new("prefetch_insert", n), &n, |b, _| { + b.iter(|| black_box(insert_prefetched(black_box(&keys), capacity))); + }); + } + group.finish(); } diff --git a/src/map.rs b/src/map.rs index e1c4a445a..e3d1d982a 100644 --- a/src/map.rs +++ b/src/map.rs @@ -1292,20 +1292,25 @@ where } } - /// Issues a software prefetch hint for the table memory that a lookup of - /// `k` would touch first. + /// Issues a software prefetch hint for the control bytes and data bucket + /// a *lookup* of `k` would touch first. /// - /// This hashes `k` and then prefetches the control-byte group at the start - /// of its probe sequence and the corresponding bucket. It is purely a - /// performance hint with no observable effect, and it compiles to nothing + /// This hashes `k` and then prefetches both the control-byte group at the + /// start of its probe sequence and the corresponding data bucket. The + /// method name signals lookup intent; the implementation hints both lines + /// because measured bench evidence shows the data prefetch is load-bearing + /// for the win on lookup workloads. Use + /// [`prefetch_insert`](Self::prefetch_insert) to signal insert intent. + /// + /// Purely a performance hint with no observable effect; compiles to nothing /// on architectures without a prefetch instruction. /// /// It is only worth using when looking up *many* keys in a sequence and the /// map is large enough that the control bytes do not fit in cache: in that - /// case you can call `prefetch` on a key several iterations ahead of the one - /// currently being looked up, so the cache lines it needs are in flight - /// before the lookup reaches them. For a single lookup, or a map that fits - /// in cache, it does nothing useful. + /// case you can call `prefetch_get` on a key several iterations ahead of + /// the one currently being looked up, so the cache lines it needs are in + /// flight before the lookup reaches them. For a single lookup, or a map + /// that fits in cache, it does nothing useful. /// /// # Examples /// @@ -1318,7 +1323,7 @@ where /// let mut sum = 0u64; /// for (i, q) in queries.iter().enumerate() { /// if let Some(next) = queries.get(i + 8) { - /// map.prefetch(next); + /// map.prefetch_get(next); /// } /// if let Some(&v) = map.get(q) { /// sum += u64::from(v); @@ -1327,12 +1332,29 @@ where /// # let _ = sum; /// ``` #[inline] - pub fn prefetch(&self, k: &Q) + pub fn prefetch_get(&self, k: &Q) + where + Q: Hash + Equivalent + ?Sized, + { + let hash = make_hash::(&self.hash_builder, k); + self.table.prefetch_get(hash); + } + + /// Issues a software prefetch hint for the control bytes and data bucket + /// an *insert* of `k` would touch first. + /// + /// The method name signals insert intent. Currently shares the same + /// implementation as [`prefetch_get`](Self::prefetch_get). + /// + /// Purely a performance hint with no observable effect; compiles to nothing + /// on architectures without a prefetch instruction. + #[inline] + pub fn prefetch_insert(&self, k: &Q) where Q: Hash + Equivalent + ?Sized, { let hash = make_hash::(&self.hash_builder, k); - self.table.prefetch(hash); + self.table.prefetch_insert(hash); } /// Returns the key-value pair corresponding to the supplied key. @@ -6945,18 +6967,21 @@ mod test_map { #[test] fn test_prefetch() { - // `prefetch` is a hint with no observable effect; the contract we can - // test is "calling it never misbehaves and never disturbs the table", - // across the interesting shapes: the empty singleton, a tiny table, a - // larger one, a ZST-value table, present and absent keys, and a key - // hash that probes the last bucket. + // `prefetch_get` and `prefetch_insert` are hints with no observable + // effect; the contract we can test is "calling them never misbehaves + // and never disturbs the table", across the interesting shapes: the + // empty singleton, a tiny table, a larger one, a ZST-value table, + // present and absent keys, and the look-ahead pattern from the docs. let empty: HashMap = HashMap::new(); - empty.prefetch(&0); - empty.prefetch(&12345); + empty.prefetch_get(&0); + empty.prefetch_get(&12345); + empty.prefetch_insert(&0); + empty.prefetch_insert(&12345); let zst: HashMap = (0..200).map(|i| (i, ())).collect(); for i in 0..256 { - zst.prefetch(&i); + zst.prefetch_get(&i); + zst.prefetch_insert(&i); } let mut map: HashMap = HashMap::new(); @@ -6964,7 +6989,8 @@ mod test_map { map.insert(i, i.wrapping_mul(7)); } for i in 0..2000u32 { - map.prefetch(&i); + map.prefetch_get(&i); + map.prefetch_insert(&i); } // The table is still intact and lookups still work after prefetching. for i in 0..1000u32 { @@ -6974,18 +7000,32 @@ mod test_map { assert_eq!(map.get(&i), None); } - // The look-ahead pattern from the docs. + // The look-ahead pattern from the docs (lookup-side). let queries: Vec = (0..1000u32).rev().collect(); let mut found = 0; for (i, &q) in queries.iter().enumerate() { if let Some(&next) = queries.get(i + 8) { - map.prefetch(&next); + map.prefetch_get(&next); } if map.get(&q).is_some() { found += 1; } } assert_eq!(found, 1000); + + // The look-ahead pattern on the insert side. + let mut bulk: HashMap = HashMap::with_capacity(4096); + let inserts: Vec = (0..2000u32).collect(); + for (i, &k) in inserts.iter().enumerate() { + if let Some(&next) = inserts.get(i + 8) { + bulk.prefetch_insert(&next); + } + bulk.insert(k, k); + } + assert_eq!(bulk.len(), 2000); + for i in 0..2000u32 { + assert_eq!(bulk.get(&i), Some(&i)); + } } } diff --git a/src/prefetch.rs b/src/prefetch.rs index 3c5bfe120..12b43b129 100644 --- a/src/prefetch.rs +++ b/src/prefetch.rs @@ -7,19 +7,42 @@ //! a no-op in the Rust abstract machine. Architectures without a stable prefetch //! intrinsic simply compile it away. //! -//! `core::intrinsics::prefetch_read_data` is unstable, so we cannot use it here. -//! Instead we use the stable per-architecture intrinsics where they exist -//! (`_mm_prefetch` on x86/x86-64) and fall back to a no-op everywhere else. +//! Two paths to the underlying hint: +//! +//! - **Stable shim (default).** Per-architecture stable intrinsics where they +//! exist (`_mm_prefetch::<_MM_HINT_T0>` on x86/x86-64) and a no-op fallback +//! elsewhere (aarch64 has no stable prefetch intrinsic). +//! - **Nightly intrinsic (`nightly` feature).** `core::intrinsics::prefetch_read_data::<_, 3>(ptr)` +//! where locality `3` matches the stable shim's `_MM_HINT_T0` ("prefetch into +//! all cache levels"). Available across all architectures the compiler +//! recognizes. Gated on the `nightly` feature so end users can compare codegen +//! against the stable shim on their target. /// Issues an L1 read prefetch for the cache line containing `ptr`. /// /// This is a hint only. `ptr` does not need to be valid, aligned, or even /// non-null; an out-of-bounds or dangling pointer is fine and will not fault. /// On targets without a stable prefetch intrinsic this is a no-op. +/// +/// With the `nightly` feature enabled, this routes through +/// `core::intrinsics::prefetch_read_data::<_, 3>(ptr)`. Locality `3` is the +/// highest (all cache levels), matching the stable shim's `_MM_HINT_T0` on x86 +/// so the two paths bench apples-to-apples. #[inline] #[allow(clippy::let_unit_value)] pub(crate) fn prefetch_read_l1(ptr: *const u8) { + #[cfg(feature = "nightly")] + { + // `prefetch_read_data` is safe to call: it performs no memory access, + // never faults, and accepts any address. Locality `3` (const-generic + // on the intrinsic) maps to the highest level (all caches), matching + // `_MM_HINT_T0` on x86 so the comparison against the stable shim is + // apples-to-apples. + core::intrinsics::prefetch_read_data::<_, 3>(ptr); + } + #[cfg(all( + not(feature = "nightly"), any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse", not(miri), @@ -40,15 +63,19 @@ pub(crate) fn prefetch_read_l1(ptr: *const u8) { } } - #[cfg(not(all( - any(target_arch = "x86", target_arch = "x86_64"), - target_feature = "sse", - not(miri), - )))] + #[cfg(all( + not(feature = "nightly"), + not(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "sse", + not(miri), + )), + ))] { - // No stable prefetch intrinsic on this target (aarch64 has none yet, - // and `core::intrinsics::prefetch_read_data` is unstable). Make sure - // `ptr` is still "used" so callers don't trip an unused-variable lint. + // No stable prefetch intrinsic on this target (aarch64 has none yet). + // The `nightly` feature path covers it via `prefetch_read_data` when + // available. Make sure `ptr` is still "used" so callers don't trip an + // unused-variable lint. let _ = ptr; } } diff --git a/src/raw.rs b/src/raw.rs index 9bdea3bec..7a3012e16 100644 --- a/src/raw.rs +++ b/src/raw.rs @@ -1210,22 +1210,39 @@ impl RawTable { } } - /// Issues a software prefetch hint for the table memory that a lookup of - /// `hash` would touch first: the control-byte group at the start of the - /// probe sequence and the corresponding data bucket. - /// - /// This is purely a performance hint and has no observable effect. It is - /// most useful when looking up many keys in a row: hash and prefetch a key a - /// few iterations ahead of the one currently being looked up, so its cache - /// lines are in flight by the time `get`/`find` reaches them. On a single - /// lookup, or on a table small enough to stay in cache, it does nothing - /// useful (and on architectures without a prefetch instruction it compiles - /// away entirely). + /// Issues a software prefetch hint for the control-byte group and data + /// bucket a *lookup* of `hash` would touch first. + /// + /// The method name signals lookup intent; the implementation hints both + /// lines because measured bench evidence (PR #727) shows the data prefetch + /// is load-bearing for the win on lookup workloads. Use + /// [`prefetch_insert`](Self::prefetch_insert) to signal insert intent — it + /// currently shares the same implementation but the split keeps room for a + /// behavioral specialization in a follow-up if a workload supports it. + /// + /// Purely a performance hint with no observable effect. Most useful when + /// looking up many keys in a row: hash and prefetch a key a few iterations + /// ahead of the one currently being looked up. On a single lookup, or on + /// a table small enough to stay in cache, it does nothing useful (and on + /// architectures without a prefetch instruction it compiles away entirely). + #[inline] + pub(crate) fn prefetch_get(&self, hash: u64) { + // SAFETY: We use the same `table_layout` that was used to allocate + // this table. + unsafe { self.table.prefetch_both(hash, Self::TABLE_LAYOUT) } + } + + /// Issues a software prefetch hint for the control-byte group and data + /// bucket an *insert* of `hash` would touch first. + /// + /// The method name signals insert intent. Currently shares the same + /// implementation as [`prefetch_get`](Self::prefetch_get) — see that + /// method's note on the named-split-only design. #[inline] - pub(crate) fn prefetch(&self, hash: u64) { + pub(crate) fn prefetch_insert(&self, hash: u64) { // SAFETY: We use the same `table_layout` that was used to allocate // this table. - unsafe { self.table.prefetch(hash, Self::TABLE_LAYOUT) } + unsafe { self.table.prefetch_both(hash, Self::TABLE_LAYOUT) } } /// Gets a reference to an element in the table. @@ -2472,8 +2489,18 @@ impl RawTableInner { } } - /// Issues a software prefetch hint for the control-byte group and data - /// bucket at the start of the probe sequence for `hash`. + /// Issues a software prefetch hint for the control-byte group *and* the + /// data bucket at the start of the probe sequence for `hash`. + /// + /// Used by both `prefetch_get` (lookup-side hint) and `prefetch_insert` + /// (insert-side hint) wrappers. The two wrappers share the same underlying + /// implementation because measured bench evidence (PR #727, Ryzen 9 9950X, + /// hit-heavy AND miss-heavy workloads) shows that the data-line prefetch + /// is load-bearing for the win on lookup workloads — skipping the data + /// prefetch in the lookup case regresses 18–40% across the size sweep. + /// The named-method split (`prefetch_get` vs `prefetch_insert`) expresses + /// caller intent without changing behavior; the implementations can + /// diverge in a follow-up if a workload surfaces where the trade-off pays. /// /// `table_layout` must be the layout used to allocate this table (so that /// the data-bucket address is computed correctly). @@ -2490,13 +2517,10 @@ impl RawTableInner { /// `table_layout.size`; a mismatched layout would only mean prefetching the /// wrong cache line, never UB.) #[inline] - unsafe fn prefetch(&self, hash: u64, table_layout: TableLayout) { + unsafe fn prefetch_both(&self, hash: u64, table_layout: TableLayout) { let pos = h1(hash) & self.bucket_mask; - // Control bytes: the group `Group::load` would read first. `pos` is a - // valid control index (`pos <= bucket_mask < num_ctrl_bytes`), so the - // pointer is in-bounds even before accounting for the hint-only nature - // of prefetch. + // Control bytes: the group `Group::load` would read first. let ctrl_ptr = self.ctrl.as_ptr().wrapping_add(pos); // Data bucket at index `pos`: `data_end - (pos + 1) * size`. `data_end` diff --git a/src/set.rs b/src/set.rs index 6b1a02d3b..e1d010f32 100644 --- a/src/set.rs +++ b/src/set.rs @@ -857,25 +857,46 @@ where /// Issues a software prefetch hint for the table memory that a lookup of /// `value` would touch first. /// - /// This hashes `value` and then prefetches the control-byte group at the - /// start of its probe sequence and the corresponding bucket. It is purely a - /// performance hint with no observable effect, and it compiles to nothing + /// This hashes `value` and prefetches both the control-byte group at the + /// start of its probe sequence and the corresponding data bucket. The + /// method name signals lookup intent; the implementation hints both lines + /// because measured bench evidence shows the data prefetch is load-bearing + /// for the win on lookup workloads. Use + /// [`prefetch_insert`](Self::prefetch_insert) to signal insert intent. + /// + /// Purely a performance hint with no observable effect; compiles to nothing /// on architectures without a prefetch instruction. /// /// It is only worth using when looking up *many* values in a sequence and /// the set is large enough that the control bytes do not fit in cache: in - /// that case you can call `prefetch` on a value several iterations ahead of - /// the one currently being looked up. For a single lookup, or a set that - /// fits in cache, it does nothing useful. See [`HashMap::prefetch`] for an - /// example of the look-ahead pattern. + /// that case you can call `prefetch_get` on a value several iterations + /// ahead of the one currently being looked up. For a single lookup, or a + /// set that fits in cache, it does nothing useful. See + /// [`HashMap::prefetch_get`] for an example of the look-ahead pattern. /// - /// [`HashMap::prefetch`]: crate::HashMap::prefetch + /// [`HashMap::prefetch_get`]: crate::HashMap::prefetch_get #[cfg_attr(feature = "inline-more", inline)] - pub fn prefetch(&self, value: &Q) + pub fn prefetch_get(&self, value: &Q) where Q: Hash + Equivalent + ?Sized, { - self.map.prefetch(value); + self.map.prefetch_get(value); + } + + /// Issues a software prefetch hint for the control bytes and data bucket + /// an *insert* of `value` would touch first. + /// + /// The method name signals insert intent. Currently shares the same + /// implementation as [`prefetch_get`](Self::prefetch_get). + /// + /// Purely a performance hint with no observable effect; compiles to nothing + /// on architectures without a prefetch instruction. + #[cfg_attr(feature = "inline-more", inline)] + pub fn prefetch_insert(&self, value: &Q) + where + Q: Hash + Equivalent + ?Sized, + { + self.map.prefetch_insert(value); } /// Returns a reference to the value in the set, if any, that is equal to the given value. diff --git a/src/table.rs b/src/table.rs index cb8c6369a..70049bab1 100644 --- a/src/table.rs +++ b/src/table.rs @@ -229,19 +229,23 @@ where self.raw.get(hash, eq) } - /// Issues a software prefetch hint for the table memory that a lookup of - /// `hash` would touch first (the control-byte group at the start of the - /// probe sequence and the corresponding data bucket). + /// Issues a software prefetch hint for the control-byte group and data + /// bucket a *lookup* of `hash` would touch first. + /// + /// The method name signals lookup intent; the implementation hints both + /// lines because measured bench evidence shows the data prefetch is + /// load-bearing for the win on lookup workloads. Use + /// [`prefetch_insert`](Self::prefetch_insert) to signal insert intent. /// /// This is purely a performance hint with no observable effect, and it /// compiles to nothing on architectures without a prefetch instruction. /// /// It is only worth using when looking up *many* hashes in a sequence and - /// the table is large enough that the control bytes do not fit in cache: in - /// that case you can hash a key several iterations ahead of the one - /// currently being looked up and call `prefetch` on it, so the cache lines - /// it needs are in flight before the lookup reaches them. For a single - /// lookup, or a table that fits in cache, it does nothing useful. + /// the table is large enough that the control bytes do not fit in cache: + /// in that case you can hash a key several iterations ahead of the one + /// currently being looked up and call `prefetch_get` on it, so the cache + /// lines it needs are in flight before the lookup reaches them. For a + /// single lookup, or a table that fits in cache, it does nothing useful. /// /// `hash` must be computed with the same hasher you use for [`find`]; using /// an unrelated hash just prefetches an unrelated (still valid-to-prefetch) @@ -266,7 +270,7 @@ where /// let mut found = 0; /// for (i, &q) in queries.iter().enumerate() { /// if let Some(&next) = queries.get(i + 8) { - /// table.prefetch(s.hash_one(next)); + /// table.prefetch_get(s.hash_one(next)); /// } /// if table.find(s.hash_one(q), |&x| x == q).is_some() { /// found += 1; @@ -282,8 +286,21 @@ where /// /// [`find`]: Self::find #[inline] - pub fn prefetch(&self, hash: u64) { - self.raw.prefetch(hash); + pub fn prefetch_get(&self, hash: u64) { + self.raw.prefetch_get(hash); + } + + /// Issues a software prefetch hint for the control-byte group and data + /// bucket an *insert* of `hash` would touch first. + /// + /// The method name signals insert intent. Currently shares the same + /// implementation as [`prefetch_get`](Self::prefetch_get). + /// + /// Purely a performance hint with no observable effect; compiles to + /// nothing on architectures without a prefetch instruction. + #[inline] + pub fn prefetch_insert(&self, hash: u64) { + self.raw.prefetch_insert(hash); } /// Returns a mutable reference to an entry in the table with the given hash