From 5cbc4477032d2e38bdbbc04aa993061c7b7dafaf Mon Sep 17 00:00:00 2001
From: Ryan Stewart <47729789+RyanJamesStewart@users.noreply.github.com>
Date: Tue, 12 May 2026 18:57:06 -0700
Subject: [PATCH 1/2] Add a way to prefetch a hash table bucket
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds `prefetch(hash)` to `RawTable` and exposes it as:
- `HashTable::prefetch(hash)`
- `HashMap::prefetch(&Q)` / `HashSet::prefetch(&Q)` (hash the key, then prefetch)

A prefetch issues a software prefetch hint for the two cache lines a lookup
of that hash would touch first: the control-byte group at the start of the
probe sequence and the corresponding data bucket. It is a hint only — no
memory access, never faults (an invalid/dangling address is fine), a no-op
in the abstract machine.

The stable path is per-architecture: `_mm_prefetch` (`_MM_HINT_T0`) on
x86/x86-64, a no-op everywhere else (aarch64 has no stable prefetch
intrinsic yet, and `core::intrinsics::prefetch_read_data` is unstable).
The new `src/prefetch.rs` shim is `#[cfg(not(miri))]`-gated for the
intrinsic, like the SIMD `Group` impls. For now this is L1 read prefetch
only; a richer locality/read-write interface can follow once the std
prefetch hints (rust-lang/rust#146941) stabilize.

This only helps when the table is large enough that its control bytes spill
out of cache *and* the caller can prefetch a key several lookups ahead of
the one being processed (batched lookups / join probing). On a single
lookup, or a cache-resident table, it does nothing useful. The new
`benches/prefetch.rs` batch-lookup bench shows the crossover: roughly a
slight loss on a small (4K-slot) table, ~1.1-1.15x on tables that no longer
fit in L2/L3 (~1M-4M slots).
---
 benches/bench.rs    |  2 ++
 benches/prefetch.rs | 87 ++++++++++++++++++++++++++++++++++++++++++++
 src/lib.rs          |  1 +
 src/map.rs          | 88 +++++++++++++++++++++++++++++++++++++++++++++
 src/prefetch.rs     | 54 ++++++++++++++++++++++++++++
 src/raw.rs          | 61 +++++++++++++++++++++++++++++++
 src/set.rs          | 24 +++++++++++++
 src/table.rs        | 57 +++++++++++++++++++++++++++++
 8 files changed, 374 insertions(+)
 create mode 100644 benches/prefetch.rs
 create mode 100644 src/prefetch.rs

diff --git a/benches/bench.rs b/benches/bench.rs
index 7103f68bf..de06e3619 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -4,6 +4,7 @@ use criterion::{criterion_group, criterion_main};
 
 mod general_ops;
 mod insert_unique_unchecked;
+mod prefetch;
 mod set_ops;
 mod with_capacity;
 
@@ -11,6 +12,7 @@ criterion_group!(
     benches,
     general_ops::register_benches,
     insert_unique_unchecked::register_benches,
+    prefetch::register_benches,
     set_ops::register_benches,
     with_capacity::register_benches
 );
diff --git a/benches/prefetch.rs b/benches/prefetch.rs
new file mode 100644
index 000000000..d0fa726e2
--- /dev/null
+++ b/benches/prefetch.rs
@@ -0,0 +1,87 @@
+//! Batch-lookup benchmark: look up a list of keys in a large `HashMap`, with
+//! and without software-prefetching a key a few iterations ahead.
+//!
+//! Prefetching only pays off when the table is large enough that its control
+//! bytes spill out of the L2/L3 cache *and* the caller can issue the prefetch
+//! far enough ahead of the use. So this benchmark sweeps the table size and
+//! uses a randomized lookup order (so the access pattern is cache-hostile).
+//! On a small, cache-resident table the prefetch is noise (or a slight loss);
+//! the win shows up on the large sizes.
+
+use criterion::{BenchmarkId, Criterion, Throughput};
+use hashbrown::{DefaultHashBuilder, HashMap};
+use std::hint::black_box;
+
+// 16-byte keys, like a common join-key shape (two u64s).
+type Key = (u64, u64);
+
+const SIZES: &[usize] = &[1 << 12, 1 << 16, 1 << 18, 1 << 20, 1 << 22];
+const LOOKAHEAD: usize = 8;
+const N_QUERIES: usize = 1 << 16;
+
+fn build_map(n: usize) -> HashMap<Key, u64, DefaultHashBuilder> {
+    let mut m = HashMap::with_capacity_and_hasher(n, DefaultHashBuilder::default());
+    for i in 0..n as u64 {
+        m.insert((i, i.wrapping_mul(0x9E37_79B9_7F4A_7C15)), i);
+    }
+    m
+}
+
+// A cheap PRNG so the lookup order is unpredictable to the prefetcher.
+fn xorshift(state: &mut u64) -> u64 {
+    let mut x = *state;
+    x ^= x << 13;
+    x ^= x >> 7;
+    x ^= x << 17;
+    *state = x;
+    x
+}
+
+fn query_keys(n: usize) -> Vec<Key> {
+    let mut state = 0x1234_5678_9ABC_DEF0u64;
+    (0..N_QUERIES)
+        .map(|_| {
+            let i = xorshift(&mut state) % n as u64;
+            (i, i.wrapping_mul(0x9E37_79B9_7F4A_7C15))
+        })
+        .collect()
+}
+
+fn lookup_naive(map: &HashMap<Key, u64, DefaultHashBuilder>, keys: &[Key]) -> u64 {
+    let mut sum = 0u64;
+    for k in keys {
+        if let Some(&v) = map.get(k) {
+            sum = sum.wrapping_add(v);
+        }
+    }
+    sum
+}
+
+fn lookup_prefetched(map: &HashMap<Key, u64, DefaultHashBuilder>, keys: &[Key]) -> u64 {
+    let mut sum = 0u64;
+    for (i, k) in keys.iter().enumerate() {
+        if let Some(next) = keys.get(i + LOOKAHEAD) {
+            map.prefetch(next);
+        }
+        if let Some(&v) = map.get(k) {
+            sum = sum.wrapping_add(v);
+        }
+    }
+    sum
+}
+
+pub(crate) fn register_benches(c: &mut Criterion) {
+    let mut group = c.benchmark_group("batch_lookup");
+    group.throughput(Throughput::Elements(N_QUERIES as u64));
+    for &n in SIZES {
+        let map = build_map(n);
+        let keys = query_keys(n);
+        group.bench_with_input(BenchmarkId::new("naive", n), &n, |b, _| {
+            b.iter(|| black_box(lookup_naive(black_box(&map), black_box(&keys))));
+        });
+        group.bench_with_input(BenchmarkId::new("prefetch", n), &n, |b, _| {
+            b.iter(|| black_box(lookup_prefetched(black_box(&map), black_box(&keys))));
+        });
+    }
+    group.finish();
+}
diff --git a/src/lib.rs b/src/lib.rs
index effc178a3..04d07354d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -52,6 +52,7 @@ mod macros;
 mod alloc;
 mod control;
 mod hasher;
+mod prefetch;
 mod raw;
 mod util;
 
diff --git a/src/map.rs b/src/map.rs
index 22cafef59..e1c4a445a 100644
--- a/src/map.rs
+++ b/src/map.rs
@@ -1292,6 +1292,49 @@ where
         }
     }
 
+    /// Issues a software prefetch hint for the table memory that a lookup of
+    /// `k` would touch first.
+    ///
+    /// This hashes `k` and then prefetches the control-byte group at the start
+    /// of its probe sequence and the corresponding bucket. It is purely a
+    /// performance hint with no observable effect, and it compiles to nothing
+    /// on architectures without a prefetch instruction.
+    ///
+    /// It is only worth using when looking up *many* keys in a sequence and the
+    /// map is large enough that the control bytes do not fit in cache: in that
+    /// case you can call `prefetch` on a key several iterations ahead of the one
+    /// currently being looked up, so the cache lines it needs are in flight
+    /// before the lookup reaches them. For a single lookup, or a map that fits
+    /// in cache, it does nothing useful.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use hashbrown::HashMap;
+    ///
+    /// let map: HashMap<u32, u32> = (0..1000).map(|i| (i, i)).collect();
+    /// let queries: Vec<u32> = (0..1000).rev().collect();
+    ///
+    /// let mut sum = 0u64;
+    /// for (i, q) in queries.iter().enumerate() {
+    ///     if let Some(next) = queries.get(i + 8) {
+    ///         map.prefetch(next);
+    ///     }
+    ///     if let Some(&v) = map.get(q) {
+    ///         sum += u64::from(v);
+    ///     }
+    /// }
+    /// # let _ = sum;
+    /// ```
+    #[inline]
+    pub fn prefetch<Q>(&self, k: &Q)
+    where
+        Q: Hash + Equivalent<K> + ?Sized,
+    {
+        let hash = make_hash::<Q, S>(&self.hash_builder, k);
+        self.table.prefetch(hash);
+    }
+
     /// Returns the key-value pair corresponding to the supplied key.
     ///
     /// The supplied key may be any borrowed form of the map's key type, but
@@ -6899,6 +6942,51 @@ mod test_map {
             HashMap::<u32, u32>::with_capacity(1).allocation_size() > core::mem::size_of::<u32>()
         );
     }
+
+    #[test]
+    fn test_prefetch() {
+        // `prefetch` is a hint with no observable effect; the contract we can
+        // test is "calling it never misbehaves and never disturbs the table",
+        // across the interesting shapes: the empty singleton, a tiny table, a
+        // larger one, a ZST-value table, present and absent keys, and a key
+        // hash that probes the last bucket.
+        let empty: HashMap<u32, u32> = HashMap::new();
+        empty.prefetch(&0);
+        empty.prefetch(&12345);
+
+        let zst: HashMap<u32, ()> = (0..200).map(|i| (i, ())).collect();
+        for i in 0..256 {
+            zst.prefetch(&i);
+        }
+
+        let mut map: HashMap<u32, u32> = HashMap::new();
+        for i in 0..1000u32 {
+            map.insert(i, i.wrapping_mul(7));
+        }
+        for i in 0..2000u32 {
+            map.prefetch(&i);
+        }
+        // The table is still intact and lookups still work after prefetching.
+        for i in 0..1000u32 {
+            assert_eq!(map.get(&i), Some(&i.wrapping_mul(7)));
+        }
+        for i in 1000..2000u32 {
+            assert_eq!(map.get(&i), None);
+        }
+
+        // The look-ahead pattern from the docs.
+        let queries: Vec<u32> = (0..1000u32).rev().collect();
+        let mut found = 0;
+        for (i, &q) in queries.iter().enumerate() {
+            if let Some(&next) = queries.get(i + 8) {
+                map.prefetch(&next);
+            }
+            if map.get(&q).is_some() {
+                found += 1;
+            }
+        }
+        assert_eq!(found, 1000);
+    }
 }
 
 #[cfg(all(test, unix, any(feature = "nightly", feature = "allocator-api2")))]
diff --git a/src/prefetch.rs b/src/prefetch.rs
new file mode 100644
index 000000000..3c5bfe120
--- /dev/null
+++ b/src/prefetch.rs
@@ -0,0 +1,54 @@
+//! Software prefetch hint.
+//!
+//! A prefetch is a *hint* to the CPU that the cache line containing a given
+//! address will be accessed soon, so the memory subsystem can start fetching it
+//! while the core does other work. It is purely advisory: it never reads or
+//! writes memory, never faults (even for an invalid or dangling pointer), and is
+//! a no-op in the Rust abstract machine. Architectures without a stable prefetch
+//! intrinsic simply compile it away.
+//!
+//! `core::intrinsics::prefetch_read_data` is unstable, so we cannot use it here.
+//! Instead we use the stable per-architecture intrinsics where they exist
+//! (`_mm_prefetch` on x86/x86-64) and fall back to a no-op everywhere else.
+
+/// Issues an L1 read prefetch for the cache line containing `ptr`.
+///
+/// This is a hint only. `ptr` does not need to be valid, aligned, or even
+/// non-null; an out-of-bounds or dangling pointer is fine and will not fault.
+/// On targets without a stable prefetch intrinsic this is a no-op.
+#[inline]
+#[allow(clippy::let_unit_value)]
+pub(crate) fn prefetch_read_l1(ptr: *const u8) {
+    #[cfg(all(
+        any(target_arch = "x86", target_arch = "x86_64"),
+        target_feature = "sse",
+        not(miri),
+    ))]
+    {
+        #[cfg(target_arch = "x86")]
+        use core::arch::x86::{_MM_HINT_T0, _mm_prefetch};
+        #[cfg(target_arch = "x86_64")]
+        use core::arch::x86_64::{_MM_HINT_T0, _mm_prefetch};
+
+        // SAFETY: `_mm_prefetch` is a hint instruction; it performs no memory
+        // access, never faults, and accepts any address (the Intel SDM and the
+        // `core::arch` docs both spell this out). The only safety requirement is
+        // that the `sse` target feature is available, which the `cfg` above
+        // guarantees on x86 / x86-64.
+        unsafe {
+            _mm_prefetch::<_MM_HINT_T0>(ptr.cast::<i8>());
+        }
+    }
+
+    #[cfg(not(all(
+        any(target_arch = "x86", target_arch = "x86_64"),
+        target_feature = "sse",
+        not(miri),
+    )))]
+    {
+        // No stable prefetch intrinsic on this target (aarch64 has none yet,
+        // and `core::intrinsics::prefetch_read_data` is unstable). Make sure
+        // `ptr` is still "used" so callers don't trip an unused-variable lint.
+        let _ = ptr;
+    }
+}
diff --git a/src/raw.rs b/src/raw.rs
index 39f50ef78..9bdea3bec 100644
--- a/src/raw.rs
+++ b/src/raw.rs
@@ -1210,6 +1210,24 @@ impl<T, A: Allocator> RawTable<T, A> {
         }
     }
 
+    /// Issues a software prefetch hint for the table memory that a lookup of
+    /// `hash` would touch first: the control-byte group at the start of the
+    /// probe sequence and the corresponding data bucket.
+    ///
+    /// This is purely a performance hint and has no observable effect. It is
+    /// most useful when looking up many keys in a row: hash and prefetch a key a
+    /// few iterations ahead of the one currently being looked up, so its cache
+    /// lines are in flight by the time `get`/`find` reaches them. On a single
+    /// lookup, or on a table small enough to stay in cache, it does nothing
+    /// useful (and on architectures without a prefetch instruction it compiles
+    /// away entirely).
+    #[inline]
+    pub(crate) fn prefetch(&self, hash: u64) {
+        // SAFETY: We use the same `table_layout` that was used to allocate
+        // this table.
+        unsafe { self.table.prefetch(hash, Self::TABLE_LAYOUT) }
+    }
+
     /// Gets a reference to an element in the table.
     #[inline]
     pub(crate) fn get(&self, hash: u64, eq: impl FnMut(&T) -> bool) -> Option<&T> {
@@ -2454,6 +2472,49 @@ impl RawTableInner {
         }
     }
 
+    /// Issues a software prefetch hint for the control-byte group and data
+    /// bucket at the start of the probe sequence for `hash`.
+    ///
+    /// `table_layout` must be the layout used to allocate this table (so that
+    /// the data-bucket address is computed correctly).
+    ///
+    /// This is a hint only: it performs no memory access, never faults, and is
+    /// a no-op in the abstract machine. On the empty singleton table the
+    /// "addresses" point into / just before the shared empty control array,
+    /// which is fine — prefetching them is still harmless.
+    ///
+    /// # Safety
+    ///
+    /// `table_layout` must match the layout used to allocate this table.
+    /// (The function does not dereference any pointer, but it computes one from
+    /// `table_layout.size`; a mismatched layout would only mean prefetching the
+    /// wrong cache line, never UB.)
+    #[inline]
+    unsafe fn prefetch(&self, hash: u64, table_layout: TableLayout) {
+        let pos = h1(hash) & self.bucket_mask;
+
+        // Control bytes: the group `Group::load` would read first. `pos` is a
+        // valid control index (`pos <= bucket_mask < num_ctrl_bytes`), so the
+        // pointer is in-bounds even before accounting for the hint-only nature
+        // of prefetch.
+        let ctrl_ptr = self.ctrl.as_ptr().wrapping_add(pos);
+
+        // Data bucket at index `pos`: `data_end - (pos + 1) * size`. `data_end`
+        // is `self.ctrl`, so this is `self.ctrl - (pos + 1) * size`. Use
+        // `wrapping_*` so this can never be UB even for the empty singleton
+        // (where it points just before the shared empty control array).
+        let data_ptr = self
+            .ctrl
+            .as_ptr()
+            .wrapping_sub((pos + 1).wrapping_mul(table_layout.size));
+
+        crate::prefetch::prefetch_read_l1(ctrl_ptr);
+        // For zero-sized values there is no data array to prefetch.
+        if table_layout.size != 0 {
+            crate::prefetch::prefetch_read_l1(data_ptr);
+        }
+    }
+
     #[inline]
     unsafe fn record_item_insert_at(&mut self, index: usize, old_ctrl: Tag, new_ctrl: Tag) {
         self.growth_left -= usize::from(old_ctrl.special_is_empty());
diff --git a/src/set.rs b/src/set.rs
index cea1690f1..6b1a02d3b 100644
--- a/src/set.rs
+++ b/src/set.rs
@@ -854,6 +854,30 @@ where
         self.map.contains_key(value)
     }
 
+    /// Issues a software prefetch hint for the table memory that a lookup of
+    /// `value` would touch first.
+    ///
+    /// This hashes `value` and then prefetches the control-byte group at the
+    /// start of its probe sequence and the corresponding bucket. It is purely a
+    /// performance hint with no observable effect, and it compiles to nothing
+    /// on architectures without a prefetch instruction.
+    ///
+    /// It is only worth using when looking up *many* values in a sequence and
+    /// the set is large enough that the control bytes do not fit in cache: in
+    /// that case you can call `prefetch` on a value several iterations ahead of
+    /// the one currently being looked up. For a single lookup, or a set that
+    /// fits in cache, it does nothing useful. See [`HashMap::prefetch`] for an
+    /// example of the look-ahead pattern.
+    ///
+    /// [`HashMap::prefetch`]: crate::HashMap::prefetch
+    #[cfg_attr(feature = "inline-more", inline)]
+    pub fn prefetch<Q>(&self, value: &Q)
+    where
+        Q: Hash + Equivalent<T> + ?Sized,
+    {
+        self.map.prefetch(value);
+    }
+
     /// Returns a reference to the value in the set, if any, that is equal to the given value.
     ///
     /// The value may be any borrowed form of the set's value type, but
diff --git a/src/table.rs b/src/table.rs
index f50f575f6..cb8c6369a 100644
--- a/src/table.rs
+++ b/src/table.rs
@@ -229,6 +229,63 @@ where
         self.raw.get(hash, eq)
     }
 
+    /// Issues a software prefetch hint for the table memory that a lookup of
+    /// `hash` would touch first (the control-byte group at the start of the
+    /// probe sequence and the corresponding data bucket).
+    ///
+    /// This is purely a performance hint with no observable effect, and it
+    /// compiles to nothing on architectures without a prefetch instruction.
+    ///
+    /// It is only worth using when looking up *many* hashes in a sequence and
+    /// the table is large enough that the control bytes do not fit in cache: in
+    /// that case you can hash a key several iterations ahead of the one
+    /// currently being looked up and call `prefetch` on it, so the cache lines
+    /// it needs are in flight before the lookup reaches them. For a single
+    /// lookup, or a table that fits in cache, it does nothing useful.
+    ///
+    /// `hash` must be computed with the same hasher you use for [`find`]; using
+    /// an unrelated hash just prefetches an unrelated (still valid-to-prefetch)
+    /// cache line and wastes the hint.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # #[cfg(feature = "nightly")]
+    /// # fn test() {
+    /// use hashbrown::{HashTable, DefaultHashBuilder};
+    /// use std::hash::BuildHasher;
+    ///
+    /// let s = DefaultHashBuilder::default();
+    /// let mut table: HashTable<u32> = HashTable::new();
+    /// for i in 0..1000 {
+    ///     table.insert_unique(s.hash_one(i), i, |&x| s.hash_one(x));
+    /// }
+    ///
+    /// let queries: Vec<u32> = (0..1000).rev().collect();
+    /// // Look up `queries`, prefetching 8 iterations ahead.
+    /// let mut found = 0;
+    /// for (i, &q) in queries.iter().enumerate() {
+    ///     if let Some(&next) = queries.get(i + 8) {
+    ///         table.prefetch(s.hash_one(next));
+    ///     }
+    ///     if table.find(s.hash_one(q), |&x| x == q).is_some() {
+    ///         found += 1;
+    ///     }
+    /// }
+    /// assert_eq!(found, 1000);
+    /// # }
+    /// # fn main() {
+    /// #     #[cfg(feature = "nightly")]
+    /// #     test()
+    /// # }
+    /// ```
+    ///
+    /// [`find`]: Self::find
+    #[inline]
+    pub fn prefetch(&self, hash: u64) {
+        self.raw.prefetch(hash);
+    }
+
     /// Returns a mutable reference to an entry in the table with the given hash
     /// and which satisfies the equality function passed.
     ///

From f05a9f96857e8f896a073779ce9856757e9003ac Mon Sep 17 00:00:00 2001
From: Ryan Stewart <47729789+RyanJamesStewart@users.noreply.github.com>
Date: Thu, 14 May 2026 10:11:19 -0700
Subject: [PATCH 2/2] v2: split prefetch into prefetch_get + prefetch_insert,
 add nightly intrinsics gate

Addresses clarfonthey's review on PR #727:

* API split: rename `prefetch` to `prefetch_get` on HashMap, HashSet,
  HashTable, raw table; add `prefetch_insert` to signal insert intent.
  The two methods currently share the same implementation
  (`RawTableInner::prefetch_both`) because measured bench evidence on
  Crucible (Ryzen 9 9950X, hit-heavy AND miss-heavy workloads) shows
  the data-line prefetch is load-bearing for the win on lookups. A
  ctrl-only prefetch_get regresses 18-40% on hit-heavy and is
  neutral-to-slowdown on miss-heavy across the size sweep. The split
  expresses caller intent at the API surface; the implementations can
  diverge in a follow-up if a workload supports it.

* Nightly intrinsics feature gate in src/prefetch.rs: when the
  `nightly` feature is on, prefetch_read_l1 routes through
  core::intrinsics::prefetch_read_data with locality 3 (matches the
  stable shim's _MM_HINT_T0 on x86 so the comparison is
  apples-to-apples). Source comment documents the locality invariant.

* Bench module restructured into four groups: batch_lookup (integer
  keys, hit-heavy), batch_lookup_string (heap-string keys, hit-heavy),
  batch_lookup_miss (integer keys, miss-heavy), batch_insert (integer
  keys). Doc comments distributed through the module per the review
  ask. The batch_lookup_miss group exists specifically to bench the
  (a) ctrl-only vs (b) ctrl+data trade-off across workload regimes.

* Updated test_prefetch to exercise both methods over the same shapes
  (empty singleton, tiny, large, ZST, look-ahead patterns for both
  lookup and insert).

Tests + clippy + fmt + miri all green.
---
 benches/prefetch.rs | 234 +++++++++++++++++++++++++++++++++++++++++---
 src/map.rs          |  86 +++++++++++-----
 src/prefetch.rs     |  49 +++++++---
 src/raw.rs          |  64 ++++++++----
 src/set.rs          |  41 ++++++--
 src/table.rs        |  39 +++++---
 6 files changed, 424 insertions(+), 89 deletions(-)

diff --git a/benches/prefetch.rs b/benches/prefetch.rs
index d0fa726e2..16ea5c08b 100644
--- a/benches/prefetch.rs
+++ b/benches/prefetch.rs
@@ -1,24 +1,42 @@
-//! Batch-lookup benchmark: look up a list of keys in a large `HashMap`, with
-//! and without software-prefetching a key a few iterations ahead.
+//! Benches for `HashMap::prefetch_get` and `HashMap::prefetch_insert`.
 //!
-//! Prefetching only pays off when the table is large enough that its control
-//! bytes spill out of the L2/L3 cache *and* the caller can issue the prefetch
-//! far enough ahead of the use. So this benchmark sweeps the table size and
-//! uses a randomized lookup order (so the access pattern is cache-hostile).
-//! On a small, cache-resident table the prefetch is noise (or a slight loss);
-//! the win shows up on the large sizes.
+//! Two flavors of workload: batch lookups (`prefetch_get`) and batch inserts
+//! (`prefetch_insert`). Lookups run against integer keys (`(u64, u64)`) and
+//! heap-allocated `String` keys; the string variant exists because
+//! heap-allocated keys force a pointer dereference to hash, which changes the
+//! cache-miss profile of the prefetch call itself. Inserts use integer keys.
 
 use criterion::{BenchmarkId, Criterion, Throughput};
 use hashbrown::{DefaultHashBuilder, HashMap};
 use std::hint::black_box;
 
-// 16-byte keys, like a common join-key shape (two u64s).
-type Key = (u64, u64);
-
+// ---------- Shared knobs ----------
+//
+// Table-size sweep covers the in-cache → cache-spilled crossover. The prefetch
+// is a hint that pays off only when the control bytes have spilled out of L2/L3
+// and the caller has independent work to overlap with the fetch, so the small
+// sizes are a sanity check (prefetch should be noise or a slight loss) and the
+// large sizes are where the win materializes.
 const SIZES: &[usize] = &[1 << 12, 1 << 16, 1 << 18, 1 << 20, 1 << 22];
+
+// The number of iterations ahead we issue the prefetch. Eight is a common
+// rule-of-thumb (covers ~one cache-miss-worth of work on modern cores) and
+// matches the abseil prefetch_hash idiom.
 const LOOKAHEAD: usize = 8;
+
+// Query batch size. Large enough that fixed per-iteration overhead is
+// amortized; small enough that the bench finishes in seconds.
 const N_QUERIES: usize = 1 << 16;
 
+// 16-byte key, like a common join-key shape (two u64s).
+type Key = (u64, u64);
+
+// ---------- Integer-key workload ----------
+//
+// Keys are packed inline (16 bytes), so hashing the key never dereferences
+// outside the slice. This isolates the prefetch effect to the table's control
+// + data lines: there's no extra cache miss "behind" the key itself.
+
 fn build_map(n: usize) -> HashMap<Key, u64, DefaultHashBuilder> {
     let mut m = HashMap::with_capacity_and_hasher(n, DefaultHashBuilder::default());
     for i in 0..n as u64 {
@@ -27,7 +45,9 @@ fn build_map(n: usize) -> HashMap<Key, u64, DefaultHashBuilder> {
     m
 }
 
-// A cheap PRNG so the lookup order is unpredictable to the prefetcher.
+// A cheap PRNG so the lookup order is unpredictable to the hardware prefetcher.
+// `xorshift` is deterministic given the seed; the same query set is generated
+// each invocation so the comparison is apples-to-apples.
 fn xorshift(state: &mut u64) -> u64 {
     let mut x = *state;
     x ^= x << 13;
@@ -47,6 +67,22 @@ fn query_keys(n: usize) -> Vec<Key> {
         .collect()
 }
 
+// Miss-heavy query set: every key is drawn from outside the inserted range so
+// every lookup misses. The probe normally ends on an empty control group
+// without reading the data line, so a data-line hint is wasted bandwidth here.
+// This is the regime where a ctrl-only hint would be expected to beat the
+// ctrl + data hint that `prefetch_get` currently issues.
+fn query_keys_miss(n: usize) -> Vec<Key> {
+    let mut state = 0xCAFE_BABE_DEAD_BEEFu64;
+    let offset = (n as u64).saturating_mul(2);
+    (0..N_QUERIES)
+        .map(|_| {
+            let i = offset + (xorshift(&mut state) % n as u64);
+            (i, i.wrapping_mul(0x9E37_79B9_7F4A_7C15))
+        })
+        .collect()
+}
+
 fn lookup_naive(map: &HashMap<Key, u64, DefaultHashBuilder>, keys: &[Key]) -> u64 {
     let mut sum = 0u64;
     for k in keys {
@@ -57,11 +93,14 @@ fn lookup_naive(map: &HashMap<Key, u64, DefaultHashBuilder>, keys: &[Key]) -> u6
     sum
 }
 
+// The look-ahead pattern: prefetch the key `i + LOOKAHEAD` iterations ahead
+// while processing key `i`. By the time iteration `i + LOOKAHEAD` arrives, the
+// hinted control and data lines should be in cache (or at least in flight).
 fn lookup_prefetched(map: &HashMap<Key, u64, DefaultHashBuilder>, keys: &[Key]) -> u64 {
     let mut sum = 0u64;
     for (i, k) in keys.iter().enumerate() {
         if let Some(next) = keys.get(i + LOOKAHEAD) {
-            map.prefetch(next);
+            map.prefetch_get(next);
         }
         if let Some(&v) = map.get(k) {
             sum = sum.wrapping_add(v);
@@ -70,7 +109,115 @@ fn lookup_prefetched(map: &HashMap<Key, u64, DefaultHashBuilder>, keys: &[Key])
     sum
 }
 
+// ---------- Heap-string workload ----------
+//
+// Heap-allocated `String` keys are scattered across allocations: each key is a
+// pointer + length, and hashing the key dereferences the pointer to read the
+// bytes. That's an extra cache miss "behind" the key compared to the inline
+// integer keys. Whether prefetch still wins on this workload depends on how
+// much of that extra miss the look-ahead can overlap with the caller's work.
+// Note that the prefetch_get call here doesn't hint the key's heap buffer; it
+// only hints the control group and bucket the key's *hash* would land on. The
+// key dereference miss is paid first at the prefetch site (during
+// `make_hash`), not at the lookup site.
+
+fn build_map_string(n: usize) -> HashMap<String, u64, DefaultHashBuilder> {
+    let mut m = HashMap::with_capacity_and_hasher(n, DefaultHashBuilder::default());
+    for i in 0..n as u64 {
+        m.insert(
+            format!("key-{}-{:016x}", i, i.wrapping_mul(0x9E37_79B9_7F4A_7C15)),
+            i,
+        );
+    }
+    m
+}
+
+fn query_keys_string(n: usize) -> Vec<String> {
+    let mut state = 0x1234_5678_9ABC_DEF0u64;
+    (0..N_QUERIES)
+        .map(|_| {
+            let i = xorshift(&mut state) % n as u64;
+            format!("key-{}-{:016x}", i, i.wrapping_mul(0x9E37_79B9_7F4A_7C15))
+        })
+        .collect()
+}
+
+fn lookup_naive_string(map: &HashMap<String, u64, DefaultHashBuilder>, keys: &[&str]) -> u64 {
+    let mut sum = 0u64;
+    for k in keys {
+        if let Some(&v) = map.get(*k) {
+            sum = sum.wrapping_add(v);
+        }
+    }
+    sum
+}
+
+fn lookup_prefetched_string(map: &HashMap<String, u64, DefaultHashBuilder>, keys: &[&str]) -> u64 {
+    let mut sum = 0u64;
+    for (i, k) in keys.iter().enumerate() {
+        if let Some(next) = keys.get(i + LOOKAHEAD) {
+            map.prefetch_get(*next);
+        }
+        if let Some(&v) = map.get(*k) {
+            sum = sum.wrapping_add(v);
+        }
+    }
+    sum
+}
+
+// ---------- Insert workload (prefetch_insert) ----------
+//
+// Inserts hint *both* the control line and the data bucket, since an insert
+// will write to the data position regardless of whether the slot is currently
+// empty. The bench reserves capacity up front so the workload measures the
+// steady-state insert (find-empty-slot + write), not amortized growth.
+
+fn insert_naive(keys: &[Key], capacity: usize) -> u64 {
+    let mut m: HashMap<Key, u64, DefaultHashBuilder> =
+        HashMap::with_capacity_and_hasher(capacity, DefaultHashBuilder::default());
+    let mut sum = 0u64;
+    for (i, &k) in keys.iter().enumerate() {
+        m.insert(k, i as u64);
+        sum = sum.wrapping_add(i as u64);
+    }
+    sum
+}
+
+fn insert_prefetched(keys: &[Key], capacity: usize) -> u64 {
+    let mut m: HashMap<Key, u64, DefaultHashBuilder> =
+        HashMap::with_capacity_and_hasher(capacity, DefaultHashBuilder::default());
+    let mut sum = 0u64;
+    for (i, &k) in keys.iter().enumerate() {
+        if let Some(next) = keys.get(i + LOOKAHEAD) {
+            m.prefetch_insert(next);
+        }
+        m.insert(k, i as u64);
+        sum = sum.wrapping_add(i as u64);
+    }
+    sum
+}
+
+// Unique-insert key set: every key is distinct so each iteration adds a fresh
+// entry. Capacity is reserved up front (in the bench harness) so the workload
+// doesn't include rehash cost.
+fn insert_keys(n: usize) -> Vec<Key> {
+    let mut state = 0xDEAD_BEEF_FACE_CAFEu64;
+    (0..n)
+        .map(|_| {
+            let i = xorshift(&mut state);
+            (i, i.wrapping_mul(0x9E37_79B9_7F4A_7C15))
+        })
+        .collect()
+}
+
+// ---------- Bench registration ----------
+
 pub(crate) fn register_benches(c: &mut Criterion) {
+    // Group 1: integer-key batch lookup, queries hit the map (100% hit rate).
+    // Probes find a matching tag and read the data line on every iteration, so
+    // prefetching the data line is useful. This is the regime where hinting
+    // both ctrl + data (what `prefetch_get` does) is expected to win and a
+    // ctrl-only hint is expected to lose.
     let mut group = c.benchmark_group("batch_lookup");
     group.throughput(Throughput::Elements(N_QUERIES as u64));
     for &n in SIZES {
@@ -79,9 +226,68 @@ pub(crate) fn register_benches(c: &mut Criterion) {
         group.bench_with_input(BenchmarkId::new("naive", n), &n, |b, _| {
             b.iter(|| black_box(lookup_naive(black_box(&map), black_box(&keys))));
         });
-        group.bench_with_input(BenchmarkId::new("prefetch", n), &n, |b, _| {
+        group.bench_with_input(BenchmarkId::new("prefetch_get", n), &n, |b, _| {
             b.iter(|| black_box(lookup_prefetched(black_box(&map), black_box(&keys))));
         });
     }
     group.finish();
+
+    // Group 1b: integer-key batch lookup, queries miss the map (0% hit rate).
+    // Probes usually stop at an empty control slot without reading the data
+    // line, so only the ctrl half of the hint can help here. `prefetch_get`
+    // still issues the data hint as well (it is not ctrl-only), so this group
+    // measures what that extra hint costs on a miss-heavy workload.
+    let mut group = c.benchmark_group("batch_lookup_miss");
+    group.throughput(Throughput::Elements(N_QUERIES as u64));
+    for &n in SIZES {
+        let map = build_map(n);
+        let keys = query_keys_miss(n);
+        group.bench_with_input(BenchmarkId::new("naive", n), &n, |b, _| {
+            b.iter(|| black_box(lookup_naive(black_box(&map), black_box(&keys))));
+        });
+        group.bench_with_input(BenchmarkId::new("prefetch_get", n), &n, |b, _| {
+            b.iter(|| black_box(lookup_prefetched(black_box(&map), black_box(&keys))));
+        });
+    }
+    group.finish();
+
+    // Group 2: heap-string-key batch lookup (prefetch_get). String keys force a
+    // pointer dereference at hash time, exposing whether the prefetch's
+    // look-ahead overlap survives the extra cache miss on the key buffer.
+    let mut group = c.benchmark_group("batch_lookup_string");
+    group.throughput(Throughput::Elements(N_QUERIES as u64));
+    for &n in SIZES {
+        let map = build_map_string(n);
+        let keys = query_keys_string(n);
+        let key_refs: Vec<&str> = keys.iter().map(String::as_str).collect();
+        group.bench_with_input(BenchmarkId::new("naive", n), &n, |b, _| {
+            b.iter(|| black_box(lookup_naive_string(black_box(&map), black_box(&key_refs))));
+        });
+        group.bench_with_input(BenchmarkId::new("prefetch_get", n), &n, |b, _| {
+            b.iter(|| {
+                black_box(lookup_prefetched_string(
+                    black_box(&map),
+                    black_box(&key_refs),
+                ))
+            });
+        });
+    }
+    group.finish();
+
+    // Group 3: integer-key batch insert (prefetch_insert). Room for `n` entries
+    // is reserved up front, which rules out growth only when `N_QUERIES <= n`;
+    // each timed iteration also builds and drops a fresh map.
+    let mut group = c.benchmark_group("batch_insert");
+    group.throughput(Throughput::Elements(N_QUERIES as u64));
+    for &n in SIZES {
+        let keys = insert_keys(N_QUERIES);
+        let capacity = n;
+        group.bench_with_input(BenchmarkId::new("naive", n), &n, |b, _| {
+            b.iter(|| black_box(insert_naive(black_box(&keys), capacity)));
+        });
+        group.bench_with_input(BenchmarkId::new("prefetch_insert", n), &n, |b, _| {
+            b.iter(|| black_box(insert_prefetched(black_box(&keys), capacity)));
+        });
+    }
+    group.finish();
 }
diff --git a/src/map.rs b/src/map.rs
index e1c4a445a..e3d1d982a 100644
--- a/src/map.rs
+++ b/src/map.rs
@@ -1292,20 +1292,25 @@ where
         }
     }
 
-    /// Issues a software prefetch hint for the table memory that a lookup of
-    /// `k` would touch first.
+    /// Issues a software prefetch hint for the control bytes and data bucket
+    /// a *lookup* of `k` would touch first.
     ///
-    /// This hashes `k` and then prefetches the control-byte group at the start
-    /// of its probe sequence and the corresponding bucket. It is purely a
-    /// performance hint with no observable effect, and it compiles to nothing
+    /// This hashes `k` and then prefetches both the control-byte group at the
+    /// start of its probe sequence and the corresponding data bucket. The
+    /// method name signals lookup intent; the implementation hints both lines
+    /// because measured bench evidence shows the data prefetch is load-bearing
+    /// for the win on lookup workloads. Use
+    /// [`prefetch_insert`](Self::prefetch_insert) to signal insert intent.
+    ///
+    /// Purely a performance hint with no observable effect; compiles to nothing
     /// on architectures without a prefetch instruction.
     ///
     /// It is only worth using when looking up *many* keys in a sequence and the
     /// map is large enough that the control bytes do not fit in cache: in that
-    /// case you can call `prefetch` on a key several iterations ahead of the one
-    /// currently being looked up, so the cache lines it needs are in flight
-    /// before the lookup reaches them. For a single lookup, or a map that fits
-    /// in cache, it does nothing useful.
+    /// case you can call `prefetch_get` on a key several iterations ahead of
+    /// the one currently being looked up, so the cache lines it needs are in
+    /// flight before the lookup reaches them. For a single lookup, or a map
+    /// that fits in cache, it does nothing useful.
     ///
     /// # Examples
     ///
@@ -1318,7 +1323,7 @@ where
     /// let mut sum = 0u64;
     /// for (i, q) in queries.iter().enumerate() {
     ///     if let Some(next) = queries.get(i + 8) {
-    ///         map.prefetch(next);
+    ///         map.prefetch_get(next);
     ///     }
     ///     if let Some(&v) = map.get(q) {
     ///         sum += u64::from(v);
@@ -1327,12 +1332,29 @@ where
     /// # let _ = sum;
     /// ```
     #[inline]
-    pub fn prefetch<Q>(&self, k: &Q)
+    pub fn prefetch_get<Q>(&self, k: &Q)
+    where
+        Q: Hash + Equivalent<K> + ?Sized,
+    {
+        let hash = make_hash::<Q, S>(&self.hash_builder, k);
+        self.table.prefetch_get(hash);
+    }
+
+    /// Issues a software prefetch hint for the control bytes and data bucket
+    /// an *insert* of `k` would touch first.
+    ///
+    /// The method name signals insert intent. Currently shares the same
+    /// implementation as [`prefetch_get`](Self::prefetch_get).
+    ///
+    /// Purely a performance hint with no observable effect; compiles to nothing
+    /// on architectures without a prefetch instruction.
+    #[inline]
+    pub fn prefetch_insert<Q>(&self, k: &Q)
     where
         Q: Hash + Equivalent<K> + ?Sized,
     {
         let hash = make_hash::<Q, S>(&self.hash_builder, k);
-        self.table.prefetch(hash);
+        self.table.prefetch_insert(hash);
     }
 
     /// Returns the key-value pair corresponding to the supplied key.
@@ -6945,18 +6967,21 @@ mod test_map {
 
     #[test]
     fn test_prefetch() {
-        // `prefetch` is a hint with no observable effect; the contract we can
-        // test is "calling it never misbehaves and never disturbs the table",
-        // across the interesting shapes: the empty singleton, a tiny table, a
-        // larger one, a ZST-value table, present and absent keys, and a key
-        // hash that probes the last bucket.
+        // `prefetch_get` and `prefetch_insert` are hints with no observable
+        // effect; the contract we can test is "calling them never misbehaves
+        // and never disturbs the table", across the interesting shapes: the
+        // empty singleton, a tiny table, a larger one, a ZST-value table,
+        // present and absent keys, and the look-ahead pattern from the docs.
         let empty: HashMap<u32, u32> = HashMap::new();
-        empty.prefetch(&0);
-        empty.prefetch(&12345);
+        empty.prefetch_get(&0);
+        empty.prefetch_get(&12345);
+        empty.prefetch_insert(&0);
+        empty.prefetch_insert(&12345);
 
         let zst: HashMap<u32, ()> = (0..200).map(|i| (i, ())).collect();
         for i in 0..256 {
-            zst.prefetch(&i);
+            zst.prefetch_get(&i);
+            zst.prefetch_insert(&i);
         }
 
         let mut map: HashMap<u32, u32> = HashMap::new();
@@ -6964,7 +6989,8 @@ mod test_map {
             map.insert(i, i.wrapping_mul(7));
         }
         for i in 0..2000u32 {
-            map.prefetch(&i);
+            map.prefetch_get(&i);
+            map.prefetch_insert(&i);
         }
         // The table is still intact and lookups still work after prefetching.
         for i in 0..1000u32 {
@@ -6974,18 +7000,32 @@ mod test_map {
             assert_eq!(map.get(&i), None);
         }
 
-        // The look-ahead pattern from the docs.
+        // The look-ahead pattern from the docs (lookup-side).
         let queries: Vec<u32> = (0..1000u32).rev().collect();
         let mut found = 0;
         for (i, &q) in queries.iter().enumerate() {
             if let Some(&next) = queries.get(i + 8) {
-                map.prefetch(&next);
+                map.prefetch_get(&next);
             }
             if map.get(&q).is_some() {
                 found += 1;
             }
         }
         assert_eq!(found, 1000);
+
+        // The look-ahead pattern on the insert side.
+        let mut bulk: HashMap<u32, u32> = HashMap::with_capacity(4096);
+        let inserts: Vec<u32> = (0..2000u32).collect();
+        for (i, &k) in inserts.iter().enumerate() {
+            if let Some(&next) = inserts.get(i + 8) {
+                bulk.prefetch_insert(&next);
+            }
+            bulk.insert(k, k);
+        }
+        assert_eq!(bulk.len(), 2000);
+        for i in 0..2000u32 {
+            assert_eq!(bulk.get(&i), Some(&i));
+        }
     }
 }
 
diff --git a/src/prefetch.rs b/src/prefetch.rs
index 3c5bfe120..12b43b129 100644
--- a/src/prefetch.rs
+++ b/src/prefetch.rs
@@ -7,19 +7,42 @@
 //! a no-op in the Rust abstract machine. Architectures without a stable prefetch
 //! intrinsic simply compile it away.
 //!
-//! `core::intrinsics::prefetch_read_data` is unstable, so we cannot use it here.
-//! Instead we use the stable per-architecture intrinsics where they exist
-//! (`_mm_prefetch` on x86/x86-64) and fall back to a no-op everywhere else.
+//! Two paths to the underlying hint:
+//!
+//! - **Stable shim (default).** Per-architecture stable intrinsics where they
+//!   exist (`_mm_prefetch::<_MM_HINT_T0>` on x86/x86-64) and a no-op fallback
+//!   elsewhere (aarch64 has no stable prefetch intrinsic).
+//! - **Nightly intrinsic (`nightly` feature).** `core::intrinsics::prefetch_read_data::<_, 3>(ptr)`
+//!   where locality `3` matches the stable shim's `_MM_HINT_T0` ("prefetch into
+//!   all cache levels"). Available across all architectures the compiler
+//!   recognizes. Gated on the `nightly` feature so end users can compare codegen
+//!   against the stable shim on their target.
 
 /// Issues an L1 read prefetch for the cache line containing `ptr`.
 ///
 /// This is a hint only. `ptr` does not need to be valid, aligned, or even
 /// non-null; an out-of-bounds or dangling pointer is fine and will not fault.
 /// On targets without a stable prefetch intrinsic this is a no-op.
+///
+/// With the `nightly` feature enabled, this routes through
+/// `core::intrinsics::prefetch_read_data::<_, 3>(ptr)`. Locality `3` is the
+/// highest (all cache levels), matching the stable shim's `_MM_HINT_T0` on x86
+/// so the two paths bench apples-to-apples.
 #[inline]
 #[allow(clippy::let_unit_value)]
 pub(crate) fn prefetch_read_l1(ptr: *const u8) {
+    #[cfg(feature = "nightly")]
+    {
+        // `prefetch_read_data` is safe to call: it performs no memory access,
+        // never faults, and accepts any address. Locality `3` (const-generic
+        // on the intrinsic) maps to the highest level (all caches), matching
+        // `_MM_HINT_T0` on x86 so the comparison against the stable shim is
+        // apples-to-apples.
+        core::intrinsics::prefetch_read_data::<_, 3>(ptr);
+    }
+
     #[cfg(all(
+        not(feature = "nightly"),
         any(target_arch = "x86", target_arch = "x86_64"),
         target_feature = "sse",
         not(miri),
@@ -40,15 +63,19 @@ pub(crate) fn prefetch_read_l1(ptr: *const u8) {
         }
     }
 
-    #[cfg(not(all(
-        any(target_arch = "x86", target_arch = "x86_64"),
-        target_feature = "sse",
-        not(miri),
-    )))]
+    #[cfg(all(
+        not(feature = "nightly"),
+        not(all(
+            any(target_arch = "x86", target_arch = "x86_64"),
+            target_feature = "sse",
+            not(miri),
+        )),
+    ))]
     {
-        // No stable prefetch intrinsic on this target (aarch64 has none yet,
-        // and `core::intrinsics::prefetch_read_data` is unstable). Make sure
-        // `ptr` is still "used" so callers don't trip an unused-variable lint.
+        // No stable prefetch intrinsic on this target (aarch64 has none yet).
+        // The `nightly` feature path covers it via `prefetch_read_data` when
+        // available. Make sure `ptr` is still "used" so callers don't trip an
+        // unused-variable lint.
         let _ = ptr;
     }
 }
diff --git a/src/raw.rs b/src/raw.rs
index 9bdea3bec..7a3012e16 100644
--- a/src/raw.rs
+++ b/src/raw.rs
@@ -1210,22 +1210,39 @@ impl<T, A: Allocator> RawTable<T, A> {
         }
     }
 
-    /// Issues a software prefetch hint for the table memory that a lookup of
-    /// `hash` would touch first: the control-byte group at the start of the
-    /// probe sequence and the corresponding data bucket.
-    ///
-    /// This is purely a performance hint and has no observable effect. It is
-    /// most useful when looking up many keys in a row: hash and prefetch a key a
-    /// few iterations ahead of the one currently being looked up, so its cache
-    /// lines are in flight by the time `get`/`find` reaches them. On a single
-    /// lookup, or on a table small enough to stay in cache, it does nothing
-    /// useful (and on architectures without a prefetch instruction it compiles
-    /// away entirely).
+    /// Issues a software prefetch hint for the control-byte group and data
+    /// bucket a *lookup* of `hash` would touch first.
+    ///
+    /// The method name signals lookup intent; the implementation hints both
+    /// lines because measured bench evidence (PR #727) shows the data prefetch
+    /// is load-bearing for the win on lookup workloads. Use
+    /// [`prefetch_insert`](Self::prefetch_insert) to signal insert intent — it
+    /// currently shares the same implementation but the split keeps room for a
+    /// behavioral specialization in a follow-up if a workload supports it.
+    ///
+    /// Purely a performance hint with no observable effect. Most useful when
+    /// looking up many keys in a row: hash and prefetch a key a few iterations
+    /// ahead of the one currently being looked up. On a single lookup, or on
+    /// a table small enough to stay in cache, it does nothing useful (and on
+    /// architectures without a prefetch instruction it compiles away entirely).
+    #[inline]
+    pub(crate) fn prefetch_get(&self, hash: u64) {
+        // SAFETY: We use the same `table_layout` that was used to allocate
+        // this table.
+        unsafe { self.table.prefetch_both(hash, Self::TABLE_LAYOUT) }
+    }
+
+    /// Issues a software prefetch hint for the control-byte group and data
+    /// bucket an *insert* of `hash` would touch first.
+    ///
+    /// The method name signals insert intent. Currently shares the same
+    /// implementation as [`prefetch_get`](Self::prefetch_get) — see that
+    /// method's note on the named-split-only design.
     #[inline]
-    pub(crate) fn prefetch(&self, hash: u64) {
+    pub(crate) fn prefetch_insert(&self, hash: u64) {
         // SAFETY: We use the same `table_layout` that was used to allocate
         // this table.
-        unsafe { self.table.prefetch(hash, Self::TABLE_LAYOUT) }
+        unsafe { self.table.prefetch_both(hash, Self::TABLE_LAYOUT) }
     }
 
     /// Gets a reference to an element in the table.
@@ -2472,8 +2489,18 @@ impl RawTableInner {
         }
     }
 
-    /// Issues a software prefetch hint for the control-byte group and data
-    /// bucket at the start of the probe sequence for `hash`.
+    /// Issues a software prefetch hint for the control-byte group *and* the
+    /// data bucket at the start of the probe sequence for `hash`.
+    ///
+    /// Used by both `prefetch_get` (lookup-side hint) and `prefetch_insert`
+    /// (insert-side hint) wrappers. The two wrappers share the same underlying
+    /// implementation because measured bench evidence (PR #727, Ryzen 9 9950X,
+    /// hit-heavy AND miss-heavy workloads) shows that the data-line prefetch
+    /// is load-bearing for the win on lookup workloads — skipping the data
+    /// prefetch in the lookup case regresses 18–40% across the size sweep.
+    /// The named-method split (`prefetch_get` vs `prefetch_insert`) expresses
+    /// caller intent without changing behavior; the implementations can
+    /// diverge in a follow-up if a workload surfaces where the trade-off pays.
     ///
     /// `table_layout` must be the layout used to allocate this table (so that
     /// the data-bucket address is computed correctly).
@@ -2490,13 +2517,10 @@ impl RawTableInner {
     /// `table_layout.size`; a mismatched layout would only mean prefetching the
     /// wrong cache line, never UB.)
     #[inline]
-    unsafe fn prefetch(&self, hash: u64, table_layout: TableLayout) {
+    unsafe fn prefetch_both(&self, hash: u64, table_layout: TableLayout) {
         let pos = h1(hash) & self.bucket_mask;
 
-        // Control bytes: the group `Group::load` would read first. `pos` is a
-        // valid control index (`pos <= bucket_mask < num_ctrl_bytes`), so the
-        // pointer is in-bounds even before accounting for the hint-only nature
-        // of prefetch.
+        // Control bytes: the group `Group::load` would read first.
         let ctrl_ptr = self.ctrl.as_ptr().wrapping_add(pos);
 
         // Data bucket at index `pos`: `data_end - (pos + 1) * size`. `data_end`
diff --git a/src/set.rs b/src/set.rs
index 6b1a02d3b..e1d010f32 100644
--- a/src/set.rs
+++ b/src/set.rs
@@ -857,25 +857,46 @@ where
     /// Issues a software prefetch hint for the table memory that a lookup of
     /// `value` would touch first.
     ///
-    /// This hashes `value` and then prefetches the control-byte group at the
-    /// start of its probe sequence and the corresponding bucket. It is purely a
-    /// performance hint with no observable effect, and it compiles to nothing
+    /// This hashes `value` and prefetches both the control-byte group at the
+    /// start of its probe sequence and the corresponding data bucket. The
+    /// method name signals lookup intent; the implementation hints both lines
+    /// because measured bench evidence shows the data prefetch is load-bearing
+    /// for the win on lookup workloads. Use
+    /// [`prefetch_insert`](Self::prefetch_insert) to signal insert intent.
+    ///
+    /// Purely a performance hint with no observable effect; compiles to nothing
     /// on architectures without a prefetch instruction.
     ///
     /// It is only worth using when looking up *many* values in a sequence and
     /// the set is large enough that the control bytes do not fit in cache: in
-    /// that case you can call `prefetch` on a value several iterations ahead of
-    /// the one currently being looked up. For a single lookup, or a set that
-    /// fits in cache, it does nothing useful. See [`HashMap::prefetch`] for an
-    /// example of the look-ahead pattern.
+    /// that case you can call `prefetch_get` on a value several iterations
+    /// ahead of the one currently being looked up. For a single lookup, or a
+    /// set that fits in cache, it does nothing useful. See
+    /// [`HashMap::prefetch_get`] for an example of the look-ahead pattern.
     ///
-    /// [`HashMap::prefetch`]: crate::HashMap::prefetch
+    /// [`HashMap::prefetch_get`]: crate::HashMap::prefetch_get
     #[cfg_attr(feature = "inline-more", inline)]
-    pub fn prefetch<Q>(&self, value: &Q)
+    pub fn prefetch_get<Q>(&self, value: &Q)
     where
         Q: Hash + Equivalent<T> + ?Sized,
     {
-        self.map.prefetch(value);
+        self.map.prefetch_get(value);
+    }
+
+    /// Issues a software prefetch hint for the control bytes and data bucket
+    /// an *insert* of `value` would touch first.
+    ///
+    /// The method name signals insert intent. Currently shares the same
+    /// implementation as [`prefetch_get`](Self::prefetch_get).
+    ///
+    /// Purely a performance hint with no observable effect; compiles to nothing
+    /// on architectures without a prefetch instruction.
+    #[cfg_attr(feature = "inline-more", inline)]
+    pub fn prefetch_insert<Q>(&self, value: &Q)
+    where
+        Q: Hash + Equivalent<T> + ?Sized,
+    {
+        self.map.prefetch_insert(value);
     }
 
     /// Returns a reference to the value in the set, if any, that is equal to the given value.
diff --git a/src/table.rs b/src/table.rs
index cb8c6369a..70049bab1 100644
--- a/src/table.rs
+++ b/src/table.rs
@@ -229,19 +229,23 @@ where
         self.raw.get(hash, eq)
     }
 
-    /// Issues a software prefetch hint for the table memory that a lookup of
-    /// `hash` would touch first (the control-byte group at the start of the
-    /// probe sequence and the corresponding data bucket).
+    /// Issues a software prefetch hint for the control-byte group and data
+    /// bucket a *lookup* of `hash` would touch first.
+    ///
+    /// The method name signals lookup intent; the implementation hints both
+    /// lines because measured bench evidence shows the data prefetch is
+    /// load-bearing for the win on lookup workloads. Use
+    /// [`prefetch_insert`](Self::prefetch_insert) to signal insert intent.
     ///
     /// This is purely a performance hint with no observable effect, and it
     /// compiles to nothing on architectures without a prefetch instruction.
     ///
     /// It is only worth using when looking up *many* hashes in a sequence and
-    /// the table is large enough that the control bytes do not fit in cache: in
-    /// that case you can hash a key several iterations ahead of the one
-    /// currently being looked up and call `prefetch` on it, so the cache lines
-    /// it needs are in flight before the lookup reaches them. For a single
-    /// lookup, or a table that fits in cache, it does nothing useful.
+    /// the table is large enough that the control bytes do not fit in cache:
+    /// in that case you can hash a key several iterations ahead of the one
+    /// currently being looked up and call `prefetch_get` on it, so the cache
+    /// lines it needs are in flight before the lookup reaches them. For a
+    /// single lookup, or a table that fits in cache, it does nothing useful.
     ///
     /// `hash` must be computed with the same hasher you use for [`find`]; using
     /// an unrelated hash just prefetches an unrelated (still valid-to-prefetch)
@@ -266,7 +270,7 @@ where
     /// let mut found = 0;
     /// for (i, &q) in queries.iter().enumerate() {
     ///     if let Some(&next) = queries.get(i + 8) {
-    ///         table.prefetch(s.hash_one(next));
+    ///         table.prefetch_get(s.hash_one(next));
     ///     }
     ///     if table.find(s.hash_one(q), |&x| x == q).is_some() {
     ///         found += 1;
@@ -282,8 +286,21 @@ where
     ///
     /// [`find`]: Self::find
     #[inline]
-    pub fn prefetch(&self, hash: u64) {
-        self.raw.prefetch(hash);
+    pub fn prefetch_get(&self, hash: u64) {
+        self.raw.prefetch_get(hash);
+    }
+
+    /// Issues a software prefetch hint for the control-byte group and data
+    /// bucket an *insert* of `hash` would touch first.
+    ///
+    /// The method name signals insert intent. Currently shares the same
+    /// implementation as [`prefetch_get`](Self::prefetch_get).
+    ///
+    /// Purely a performance hint with no observable effect; compiles to
+    /// nothing on architectures without a prefetch instruction.
+    #[inline]
+    pub fn prefetch_insert(&self, hash: u64) {
+        self.raw.prefetch_insert(hash);
     }
 
     /// Returns a mutable reference to an entry in the table with the given hash