diff --git a/benches/bench.rs b/benches/bench.rs
index 7103f68bf..de06e3619 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -4,6 +4,7 @@ use criterion::{criterion_group, criterion_main};
 
 mod general_ops;
 mod insert_unique_unchecked;
+mod prefetch;
 mod set_ops;
 mod with_capacity;
 
@@ -11,6 +12,7 @@ criterion_group!(
     benches,
     general_ops::register_benches,
     insert_unique_unchecked::register_benches,
+    prefetch::register_benches,
     set_ops::register_benches,
     with_capacity::register_benches
 );
diff --git a/benches/prefetch.rs b/benches/prefetch.rs
new file mode 100644
index 000000000..16ea5c08b
--- /dev/null
+++ b/benches/prefetch.rs
@@ -0,0 +1,293 @@
+//! Benches for `HashMap::prefetch_get` and `HashMap::prefetch_insert`.
+//!
+//! Two flavors of workload: batch lookups (`prefetch_get`) and batch inserts
+//! (`prefetch_insert`). Each flavor runs against integer keys (`(u64, u64)`)
+//! and heap-allocated `String` keys; the string variant exists because
+//! heap-allocated keys force a pointer dereference to hash, which changes the
+//! cache-miss profile of the prefetch call itself.
+
+use criterion::{BenchmarkId, Criterion, Throughput};
+use hashbrown::{DefaultHashBuilder, HashMap};
+use std::hint::black_box;
+
+// ---------- Shared knobs ----------
+//
+// Table-size sweep covers the in-cache → cache-spilled crossover. The prefetch
+// is a hint that pays off only when the control bytes have spilled out of L2/L3
+// and the caller has independent work to overlap with the fetch, so the small
+// sizes are a sanity check (prefetch should be noise or a slight loss) and the
+// large sizes are where the win materializes.
+const SIZES: &[usize] = &[1 << 12, 1 << 16, 1 << 18, 1 << 20, 1 << 22];
+
+// The number of iterations ahead we issue the prefetch. Eight is a common
+// rule-of-thumb (covers ~one cache-miss-worth of work on modern cores) and
+// matches the abseil prefetch_hash idiom.
+const LOOKAHEAD: usize = 8;
+
+// Query batch size. Large enough that fixed per-iteration overhead is
+// amortized; small enough that the bench finishes in seconds.
+const N_QUERIES: usize = 1 << 16;
+
+// 16-byte key, like a common join-key shape (two u64s).
+type Key = (u64, u64);
+
+// ---------- Integer-key workload ----------
+//
+// Keys are packed inline (16 bytes), so hashing the key never dereferences
+// outside the slice. This isolates the prefetch effect to the table's control
+// + data lines: there's no extra cache miss "behind" the key itself.
+
+fn build_map(n: usize) -> HashMap<Key, u64, DefaultHashBuilder> {
+    // Sized for `n` entries up front; entry `id` is keyed by `(id, id * CONST)`.
+    let mut table = HashMap::with_capacity_and_hasher(n, DefaultHashBuilder::default());
+    let entries = (0..n as u64).map(|id| ((id, id.wrapping_mul(0x9E37_79B9_7F4A_7C15)), id));
+    table.extend(entries);
+    table
+}
+
+// A cheap PRNG so the lookup order is unpredictable to the hardware prefetcher.
+// `xorshift` is deterministic given the seed; the same query set is generated
+// each invocation so the comparison is apples-to-apples.
+fn xorshift(state: &mut u64) -> u64 {
+    // Three xor-with-shifted-self steps: left 13, right 7, left 17.
+    let step1 = *state ^ (*state << 13);
+    let step2 = step1 ^ (step1 >> 7);
+    let next = step2 ^ (step2 << 17);
+    *state = next; // the new state is also the value handed back
+    next
+}
+
+fn query_keys(n: usize) -> Vec<Key> {
+    let mut rng = 0x1234_5678_9ABC_DEF0u64;
+    let mut keys = Vec::with_capacity(N_QUERIES);
+    for _ in 0..N_QUERIES {
+        let id = xorshift(&mut rng) % n as u64; // always inside `0..n`
+        keys.push((id, id.wrapping_mul(0x9E37_79B9_7F4A_7C15)));
+    }
+    keys
+}
+
+// Miss-heavy query set: every key is drawn from outside the inserted range so
+// every lookup misses. A probe that stops at an empty control slot never reads
+// the data line, so this is the regime where the data-bucket half of the hint
+// could be wasted bandwidth. `prefetch_get` currently hints both the control
+// group and the data bucket, so this set measures what that costs on misses.
+fn query_keys_miss(n: usize) -> Vec<Key> {
+    let mut rng = 0xCAFE_BABE_DEAD_BEEFu64;
+    let base = (n as u64).saturating_mul(2); // ids start at `2 * n`, past `0..n`
+    let mut keys = Vec::with_capacity(N_QUERIES);
+    for _ in 0..N_QUERIES {
+        let id = base + xorshift(&mut rng) % n as u64;
+        keys.push((id, id.wrapping_mul(0x9E37_79B9_7F4A_7C15)));
+    }
+    keys
+}
+
+fn lookup_naive(map: &HashMap<Key, u64, DefaultHashBuilder>, keys: &[Key]) -> u64 {
+    let mut sum = 0u64; // baseline (no prefetch): wrapping checksum of the values found
+    for k in keys {
+        if let Some(&v) = map.get(k) {
+            sum = sum.wrapping_add(v); // only hits contribute
+        }
+    }
+    sum // handed back so the caller's `black_box` keeps the lookups live
+}
+
+// The look-ahead pattern: prefetch the key `i + LOOKAHEAD` iterations ahead
+// while processing key `i`, so the control group and data bucket that lookup
+// starts from have been hinted by the time iteration `i + LOOKAHEAD` arrives.
+fn lookup_prefetched(map: &HashMap<Key, u64, DefaultHashBuilder>, keys: &[Key]) -> u64 {
+    let mut sum = 0u64; // wrapping checksum of the values found
+    for (i, k) in keys.iter().enumerate() {
+        if let Some(next) = keys.get(i + LOOKAHEAD) {
+            map.prefetch_get(next); // skipped for the final LOOKAHEAD keys (`get` is `None`)
+        }
+        if let Some(&v) = map.get(k) {
+            sum = sum.wrapping_add(v);
+        }
+    }
+    sum
+}
+
+// ---------- Heap-string workload ----------
+//
+// Heap-allocated `String` keys are scattered across allocations: each key is a
+// pointer + length, and hashing the key dereferences the pointer to read the
+// bytes. That's an extra cache miss "behind" the key compared to the inline
+// integer keys. Whether prefetch still wins on this workload depends on how
+// much of that extra miss the look-ahead can overlap with the caller's work.
+// Note that the prefetch_get call here doesn't hint the key's heap buffer; it
+// only hints the control group and data bucket the key's *hash* lands on. The
+// key dereference cost is paid at the prefetch site (during `make_hash`), not
+// the lookup site.
+
+fn build_map_string(n: usize) -> HashMap<String, u64, DefaultHashBuilder> {
+    let mut table = HashMap::with_capacity_and_hasher(n, DefaultHashBuilder::default());
+    // Every key is a freshly formatted `String`; the value is its source id.
+    for id in 0..n as u64 {
+        let mixed = id.wrapping_mul(0x9E37_79B9_7F4A_7C15);
+        let key = format!("key-{}-{:016x}", id, mixed);
+        table.insert(key, id);
+    }
+    table
+}
+
+fn query_keys_string(n: usize) -> Vec<String> {
+    let mut rng = 0x1234_5678_9ABC_DEF0u64;
+    let mut keys = Vec::with_capacity(N_QUERIES);
+    for _ in 0..N_QUERIES {
+        let id = xorshift(&mut rng) % n as u64; // always inside `0..n`
+        keys.push(format!("key-{}-{:016x}", id, id.wrapping_mul(0x9E37_79B9_7F4A_7C15)));
+    }
+    keys
+}
+
+fn lookup_naive_string(map: &HashMap<String, u64, DefaultHashBuilder>, keys: &[&str]) -> u64 {
+    let mut sum = 0u64; // baseline (no prefetch): wrapping checksum of the values found
+    for k in keys { // `k` is `&&str`; `*k` below is the `&str` handed to `get`
+        if let Some(&v) = map.get(*k) {
+            sum = sum.wrapping_add(v); // only hits contribute
+        }
+    }
+    sum // handed back so the caller's `black_box` keeps the lookups live
+}
+
+fn lookup_prefetched_string(map: &HashMap<String, u64, DefaultHashBuilder>, keys: &[&str]) -> u64 {
+    let mut sum = 0u64; // wrapping checksum of the values found
+    for (i, k) in keys.iter().enumerate() {
+        if let Some(next) = keys.get(i + LOOKAHEAD) {
+            map.prefetch_get(*next); // hint for the key LOOKAHEAD iterations ahead
+        }
+        if let Some(&v) = map.get(*k) {
+            sum = sum.wrapping_add(v); // only hits contribute
+        }
+    }
+    sum
+}
+
+// ---------- Insert workload (prefetch_insert) ----------
+//
+// Inserts hint *both* the control line and the data bucket, since an insert
+// will write to the data position regardless of whether the slot is currently
+// empty. The bench reserves capacity up front so the workload measures the
+// steady-state insert (find-empty-slot + write), not amortized growth.
+
+fn insert_naive(keys: &[Key], capacity: usize) -> u64 {
+    let mut m: HashMap<Key, u64, DefaultHashBuilder> =
+        HashMap::with_capacity_and_hasher(capacity, DefaultHashBuilder::default());
+    let mut sum = 0u64; // checksum of the slice indices only; never reads the map back
+    for (i, &k) in keys.iter().enumerate() {
+        m.insert(k, i as u64); // baseline: plain insert, no prefetch hint
+        sum = sum.wrapping_add(i as u64);
+    }
+    sum // `m` is dropped here, so building and dropping the map is part of each call
+}
+
+fn insert_prefetched(keys: &[Key], capacity: usize) -> u64 {
+    let mut m: HashMap<Key, u64, DefaultHashBuilder> =
+        HashMap::with_capacity_and_hasher(capacity, DefaultHashBuilder::default());
+    let mut sum = 0u64; // checksum of the slice indices only; never reads the map back
+    for (i, &k) in keys.iter().enumerate() {
+        if let Some(next) = keys.get(i + LOOKAHEAD) {
+            m.prefetch_insert(next); // hint for the key inserted LOOKAHEAD iterations later
+        }
+        m.insert(k, i as u64);
+        sum = sum.wrapping_add(i as u64);
+    }
+    sum // `m` is dropped here, so building and dropping the map is part of each call
+}
+
+// Unique-insert key set: every key is distinct so each iteration adds a fresh
+// entry. Capacity is reserved up front (in the bench harness) so the workload
+// doesn't include rehash cost.
+fn insert_keys(n: usize) -> Vec<Key> {
+    let mut rng = 0xDEAD_BEEF_FACE_CAFEu64;
+    let mut keys = Vec::with_capacity(n);
+    for _ in 0..n {
+        let id = xorshift(&mut rng); // raw draw, no reduction modulo a range
+        keys.push((id, id.wrapping_mul(0x9E37_79B9_7F4A_7C15)));
+    }
+    keys
+}
+
+// ---------- Bench registration ----------
+
+pub(crate) fn register_benches(c: &mut Criterion) {
+    // Group 1: integer-key batch lookup, queries hit the map (100% hit rate).
+    // Probes find a matching tag and read the data line on every iteration, so
+    // prefetching the data line is useful. `prefetch_get` hints both the
+    // control group and the data bucket, so this is the regime where the hint
+    // is expected to pay off most once the table no longer fits in cache.
+    let mut group = c.benchmark_group("batch_lookup");
+    group.throughput(Throughput::Elements(N_QUERIES as u64));
+    for &n in SIZES {
+        let map = build_map(n);
+        let keys = query_keys(n);
+        group.bench_with_input(BenchmarkId::new("naive", n), &n, |b, _| {
+            b.iter(|| black_box(lookup_naive(black_box(&map), black_box(&keys))));
+        });
+        group.bench_with_input(BenchmarkId::new("prefetch_get", n), &n, |b, _| {
+            b.iter(|| black_box(lookup_prefetched(black_box(&map), black_box(&keys))));
+        });
+    }
+    group.finish();
+
+    // Group 1b: integer-key batch lookup, queries miss the map (0% hit rate).
+    // A probe that ends on an empty control slot does not need the data line,
+    // so the data-bucket half of the hint can be wasted bandwidth here.
+    // `prefetch_get` still issues it (it hints ctrl + data), so this group
+    // measures the hint's net effect when only the ctrl half can help.
+    let mut group = c.benchmark_group("batch_lookup_miss");
+    group.throughput(Throughput::Elements(N_QUERIES as u64));
+    for &n in SIZES {
+        let map = build_map(n);
+        let keys = query_keys_miss(n);
+        group.bench_with_input(BenchmarkId::new("naive", n), &n, |b, _| {
+            b.iter(|| black_box(lookup_naive(black_box(&map), black_box(&keys))));
+        });
+        group.bench_with_input(BenchmarkId::new("prefetch_get", n), &n, |b, _| {
+            b.iter(|| black_box(lookup_prefetched(black_box(&map), black_box(&keys))));
+        });
+    }
+    group.finish();
+
+    // Group 2: heap-string-key batch lookup (prefetch_get). String keys force a
+    // pointer dereference at hash time, exposing whether the prefetch's
+    // look-ahead overlap survives the extra cache miss on the key buffer.
+    let mut group = c.benchmark_group("batch_lookup_string");
+    group.throughput(Throughput::Elements(N_QUERIES as u64));
+    for &n in SIZES {
+        let map = build_map_string(n);
+        let keys = query_keys_string(n);
+        let key_refs: Vec<&str> = keys.iter().map(String::as_str).collect();
+        group.bench_with_input(BenchmarkId::new("naive", n), &n, |b, _| {
+            b.iter(|| black_box(lookup_naive_string(black_box(&map), black_box(&key_refs))));
+        });
+        group.bench_with_input(BenchmarkId::new("prefetch_get", n), &n, |b, _| {
+            b.iter(|| {
+                black_box(lookup_prefetched_string(
+                    black_box(&map),
+                    black_box(&key_refs),
+                ))
+            });
+        });
+    }
+    group.finish();
+
+    // Group 3: integer-key batch insert (prefetch_insert). Capacity is reserved
+    // for at least every key inserted, so even the smallest `n` measures
+    // steady-state insert cost (find-empty-slot + write), not amortized growth.
+    let mut group = c.benchmark_group("batch_insert");
+    group.throughput(Throughput::Elements(N_QUERIES as u64));
+    let keys = insert_keys(N_QUERIES); // same key set for every size: build it once
+    for &n in SIZES {
+        let capacity = n.max(keys.len());
+        group.bench_with_input(BenchmarkId::new("naive", n), &n, |b, _| {
+            b.iter(|| black_box(insert_naive(black_box(&keys), capacity)));
+        });
+        group.bench_with_input(BenchmarkId::new("prefetch_insert", n), &n, |b, _| {
+            b.iter(|| black_box(insert_prefetched(black_box(&keys), capacity)));
+        });
+    }
+    group.finish();
+}
diff --git a/src/lib.rs b/src/lib.rs
index effc178a3..04d07354d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -52,6 +52,7 @@ mod macros;
 mod alloc;
 mod control;
 mod hasher;
+mod prefetch;
 mod raw;
 mod util;
 
diff --git a/src/map.rs b/src/map.rs
index 22cafef59..e3d1d982a 100644
--- a/src/map.rs
+++ b/src/map.rs
@@ -1292,6 +1292,71 @@ where
         }
     }
 
+    /// Issues a software prefetch hint for the control bytes and data bucket
+    /// a *lookup* of `k` would touch first.
+    ///
+    /// This hashes `k` and then prefetches both the control-byte group at the
+    /// start of its probe sequence and the corresponding data bucket. The
+    /// method name signals lookup intent; the implementation hints both lines
+    /// because measured bench evidence shows the data prefetch is load-bearing
+    /// for the win on lookup workloads. Use
+    /// [`prefetch_insert`](Self::prefetch_insert) to signal insert intent.
+    ///
+    /// Purely a performance hint with no observable effect. Without the `nightly`
+    /// feature it is a no-op on every target except x86/x86-64 with SSE.
+    ///
+    /// It is only worth using when looking up *many* keys in a sequence and the
+    /// map is large enough that the control bytes do not fit in cache: in that
+    /// case you can call `prefetch_get` on a key several iterations ahead of
+    /// the one currently being looked up, so the cache lines it needs are in
+    /// flight before the lookup reaches them. For a single lookup, or a map
+    /// that fits in cache, it does nothing useful.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use hashbrown::HashMap;
+    ///
+    /// let map: HashMap<u32, u32> = (0..1000).map(|i| (i, i)).collect();
+    /// let queries: Vec<u32> = (0..1000).rev().collect();
+    ///
+    /// let mut sum = 0u64;
+    /// for (i, q) in queries.iter().enumerate() {
+    ///     if let Some(next) = queries.get(i + 8) {
+    ///         map.prefetch_get(next);
+    ///     }
+    ///     if let Some(&v) = map.get(q) {
+    ///         sum += u64::from(v);
+    ///     }
+    /// }
+    /// # let _ = sum;
+    /// ```
+    #[inline]
+    pub fn prefetch_get<Q>(&self, k: &Q)
+    where
+        Q: Hash + Equivalent<K> + ?Sized,
+    {
+        let hash = make_hash::<Q, S>(&self.hash_builder, k);
+        self.table.prefetch_get(hash);
+    }
+
+    /// Issues a software prefetch hint for the control bytes and data bucket
+    /// an *insert* of `k` would touch first.
+    ///
+    /// The method name signals insert intent. Currently shares the same
+    /// implementation as [`prefetch_get`](Self::prefetch_get).
+    ///
+    /// Purely a performance hint with no observable effect. Without the `nightly`
+    /// feature it is a no-op on every target except x86/x86-64 with SSE.
+    #[inline]
+    pub fn prefetch_insert<Q>(&self, k: &Q)
+    where
+        Q: Hash + Equivalent<K> + ?Sized,
+    {
+        let hash = make_hash::<Q, S>(&self.hash_builder, k);
+        self.table.prefetch_insert(hash);
+    }
+
     /// Returns the key-value pair corresponding to the supplied key.
     ///
     /// The supplied key may be any borrowed form of the map's key type, but
@@ -6899,6 +6964,69 @@ mod test_map {
             HashMap::<u32, u32>::with_capacity(1).allocation_size() > core::mem::size_of::<u32>()
         );
     }
+
+    #[test]
+    fn test_prefetch() {
+        // `prefetch_get` and `prefetch_insert` are hints with no observable
+        // effect; the contract we can test is "calling them never misbehaves
+        // and never disturbs the table", across the interesting shapes: the
+        // empty singleton, a ZST-value table, a 1000-entry table probed with
+        // present and absent keys, and the look-ahead pattern from the docs.
+        let empty: HashMap<u32, u32> = HashMap::new();
+        empty.prefetch_get(&0);
+        empty.prefetch_get(&12345);
+        empty.prefetch_insert(&0);
+        empty.prefetch_insert(&12345);
+
+        let zst: HashMap<u32, ()> = (0..200).map(|i| (i, ())).collect();
+        for i in 0..256 { // keys 200..256 were never inserted
+            zst.prefetch_get(&i);
+            zst.prefetch_insert(&i);
+        }
+
+        let mut map: HashMap<u32, u32> = HashMap::new();
+        for i in 0..1000u32 {
+            map.insert(i, i.wrapping_mul(7));
+        }
+        for i in 0..2000u32 { // keys 1000..2000 were never inserted
+            map.prefetch_get(&i);
+            map.prefetch_insert(&i);
+        }
+        // The table is still intact and lookups still work after prefetching.
+        for i in 0..1000u32 {
+            assert_eq!(map.get(&i), Some(&i.wrapping_mul(7)));
+        }
+        for i in 1000..2000u32 {
+            assert_eq!(map.get(&i), None);
+        }
+
+        // The look-ahead pattern from the docs (lookup-side).
+        let queries: Vec<u32> = (0..1000u32).rev().collect();
+        let mut found = 0;
+        for (i, &q) in queries.iter().enumerate() {
+            if let Some(&next) = queries.get(i + 8) {
+                map.prefetch_get(&next);
+            }
+            if map.get(&q).is_some() {
+                found += 1;
+            }
+        }
+        assert_eq!(found, 1000);
+
+        // The look-ahead pattern on the insert side.
+        let mut bulk: HashMap<u32, u32> = HashMap::with_capacity(4096);
+        let inserts: Vec<u32> = (0..2000u32).collect();
+        for (i, &k) in inserts.iter().enumerate() {
+            if let Some(&next) = inserts.get(i + 8) {
+                bulk.prefetch_insert(&next);
+            }
+            bulk.insert(k, k);
+        }
+        assert_eq!(bulk.len(), 2000);
+        for i in 0..2000u32 {
+            assert_eq!(bulk.get(&i), Some(&i));
+        }
+    }
 }
 
 #[cfg(all(test, unix, any(feature = "nightly", feature = "allocator-api2")))]
diff --git a/src/prefetch.rs b/src/prefetch.rs
new file mode 100644
index 000000000..12b43b129
--- /dev/null
+++ b/src/prefetch.rs
@@ -0,0 +1,81 @@
+//! Software prefetch hint.
+//!
+//! A prefetch is a *hint* to the CPU that the cache line containing a given
+//! address will be accessed soon, so the memory subsystem can start fetching it
+//! while the core does other work. It is purely advisory: it never reads or
+//! writes memory, never faults (even for an invalid or dangling pointer), and is
+//! a no-op in the Rust abstract machine. Architectures without a stable prefetch
+//! intrinsic simply compile it away.
+//!
+//! Two paths to the underlying hint:
+//!
+//! - **Stable shim (default).** Per-architecture stable intrinsics where they
+//!   exist (`_mm_prefetch::<_MM_HINT_T0>` on x86/x86-64) and a no-op fallback
+//!   elsewhere (aarch64 has no stable prefetch intrinsic).
+//! - **Nightly intrinsic (`nightly` feature).** `core::intrinsics::prefetch_read_data::<_, 3>(ptr)`
+//!   where locality `3` matches the stable shim's `_MM_HINT_T0` ("prefetch into
+//!   all cache levels"). Available across all architectures the compiler
+//!   recognizes. Gated on the `nightly` feature so end users can compare codegen
+//!   against the stable shim on their target.
+
+/// Issues an L1 read prefetch for the cache line containing `ptr`.
+///
+/// This is a hint only. `ptr` does not need to be valid, aligned, or even
+/// non-null; an out-of-bounds or dangling pointer is fine and will not fault.
+/// Without `nightly`, it is a no-op off x86/x86-64 with SSE and under Miri.
+///
+/// With the `nightly` feature enabled, this routes through
+/// `core::intrinsics::prefetch_read_data::<_, 3>(ptr)`. Locality `3` is the
+/// highest (all cache levels), matching the stable shim's `_MM_HINT_T0` on x86
+/// so the two paths bench apples-to-apples.
+#[inline]
+#[allow(clippy::let_unit_value)] // NOTE(review): no `let` here binds `()`; confirm still needed
+pub(crate) fn prefetch_read_l1(ptr: *const u8) {
+    #[cfg(feature = "nightly")]
+    {
+        // `prefetch_read_data` is safe to call: it performs no memory access,
+        // never faults, and accepts any address. Locality `3` (const-generic
+        // on the intrinsic) maps to the highest level (all caches), matching
+        // `_MM_HINT_T0` on x86 so the comparison against the stable shim is
+        // apples-to-apples.
+        core::intrinsics::prefetch_read_data::<_, 3>(ptr);
+    }
+
+    #[cfg(all(
+        not(feature = "nightly"),
+        any(target_arch = "x86", target_arch = "x86_64"),
+        target_feature = "sse",
+        not(miri),
+    ))]
+    {
+        #[cfg(target_arch = "x86")]
+        use core::arch::x86::{_MM_HINT_T0, _mm_prefetch};
+        #[cfg(target_arch = "x86_64")]
+        use core::arch::x86_64::{_MM_HINT_T0, _mm_prefetch};
+
+        // SAFETY: `_mm_prefetch` is a hint instruction; it performs no memory
+        // access, never faults, and accepts any address (the Intel SDM and the
+        // `core::arch` docs both spell this out). The only safety requirement is
+        // that the `sse` target feature is available, which the `cfg` above
+        // guarantees on x86 / x86-64.
+        unsafe {
+            _mm_prefetch::<_MM_HINT_T0>(ptr.cast::<i8>());
+        }
+    }
+
+    #[cfg(all(
+        not(feature = "nightly"),
+        not(all(
+            any(target_arch = "x86", target_arch = "x86_64"),
+            target_feature = "sse",
+            not(miri),
+        )),
+    ))]
+    {
+        // Exact complement of the stable x86 branch above: any other target,
+        // x86 without SSE, or a Miri run. (aarch64 has no stable prefetch
+        // intrinsic yet; the `nightly` path covers it.) Make sure `ptr` is
+        // still "used" so callers don't trip an unused-variable lint.
+        let _ = ptr;
+    }
+}
diff --git a/src/raw.rs b/src/raw.rs
index 39f50ef78..7a3012e16 100644
--- a/src/raw.rs
+++ b/src/raw.rs
@@ -1210,6 +1210,41 @@ impl<T, A: Allocator> RawTable<T, A> {
         }
     }
 
+    /// Issues a software prefetch hint for the control-byte group and data
+    /// bucket a *lookup* of `hash` would touch first.
+    ///
+    /// The method name signals lookup intent; the implementation hints both
+    /// lines because measured bench evidence (PR #727) shows the data prefetch
+    /// is load-bearing for the win on lookup workloads. Use
+    /// [`prefetch_insert`](Self::prefetch_insert) to signal insert intent — it
+    /// currently shares the same implementation but the split keeps room for a
+    /// behavioral specialization in a follow-up if a workload supports it.
+    ///
+    /// Purely a performance hint with no observable effect. Most useful when
+    /// looking up many keys in a row: hash and prefetch a key a few iterations
+    /// ahead of the one currently being looked up. On a single lookup, or on
+    /// a table small enough to stay in cache, it does nothing useful (and where
+    /// `crate::prefetch` has no intrinsic to call it compiles away entirely).
+    #[inline]
+    pub(crate) fn prefetch_get(&self, hash: u64) {
+        // SAFETY: We use the same `table_layout` that was used to allocate
+        // this table, which is the only requirement of `prefetch_both`.
+        unsafe { self.table.prefetch_both(hash, Self::TABLE_LAYOUT) }
+    }
+
+    /// Issues a software prefetch hint for the control-byte group and data
+    /// bucket an *insert* of `hash` would touch first.
+    ///
+    /// The method name signals insert intent. Currently shares the same
+    /// implementation as [`prefetch_get`](Self::prefetch_get) — see that
+    /// method's note on the named-split-only design.
+    #[inline]
+    pub(crate) fn prefetch_insert(&self, hash: u64) {
+        // SAFETY: We use the same `table_layout` that was used to allocate
+        // this table, which is the only requirement of `prefetch_both`.
+        unsafe { self.table.prefetch_both(hash, Self::TABLE_LAYOUT) }
+    }
+
     /// Gets a reference to an element in the table.
     #[inline]
     pub(crate) fn get(&self, hash: u64, eq: impl FnMut(&T) -> bool) -> Option<&T> {
@@ -2454,6 +2489,56 @@ impl RawTableInner {
         }
     }
 
+    /// Hints the CPU to start fetching the two cache lines a probe for `hash`
+    /// begins with: the control-byte group and the data bucket at the first
+    /// probe position.
+    ///
+    /// Both the `prefetch_get` (lookup-side) and `prefetch_insert`
+    /// (insert-side) wrappers forward here. They share one implementation
+    /// because measured bench evidence (PR #727, Ryzen 9 9950X, hit-heavy AND
+    /// miss-heavy workloads) shows that the data-line prefetch is load-bearing
+    /// for the win on lookup workloads — skipping the data prefetch in the
+    /// lookup case regresses 18–40% across the size sweep. The two names only
+    /// express caller intent; the implementations can diverge in a follow-up
+    /// if a workload surfaces where the trade-off pays.
+    ///
+    /// The data-bucket address is derived from `table_layout.size`, so pass
+    /// the layout this table was allocated with.
+    ///
+    /// Nothing is dereferenced: the addresses are built with wrapping pointer
+    /// arithmetic and only handed to the prefetch shim. On the empty singleton
+    /// table they point into / just before the shared empty control array,
+    /// which is still harmless to prefetch.
+    ///
+    /// # Safety
+    ///
+    /// `table_layout` must match the layout used to allocate this table.
+    /// (The function does not dereference any pointer, but it computes one from
+    /// `table_layout.size`; a mismatched layout would only mean prefetching the
+    /// wrong cache line, never UB.)
+    #[inline]
+    unsafe fn prefetch_both(&self, hash: u64, table_layout: TableLayout) {
+        let first_probe = h1(hash) & self.bucket_mask;
+        let ctrl_base = self.ctrl.as_ptr();
+
+        // Control bytes: the group `Group::load` would read first.
+        crate::prefetch::prefetch_read_l1(ctrl_base.wrapping_add(first_probe));
+
+        // For zero-sized values there is no data array to prefetch.
+        let elem_size = table_layout.size;
+        if elem_size == 0 {
+            return;
+        }
+
+        // Data bucket at index `first_probe`: `data_end - (index + 1) * size`,
+        // where `data_end` is `self.ctrl`, i.e. the bucket starts
+        // `bucket_offset` bytes below the control bytes. `wrapping_*` keeps
+        // this free of UB even for the empty singleton (where the result
+        // points just before the shared empty control array).
+        let bucket_offset = (first_probe + 1).wrapping_mul(elem_size);
+        crate::prefetch::prefetch_read_l1(ctrl_base.wrapping_sub(bucket_offset));
+    }
+
     #[inline]
     unsafe fn record_item_insert_at(&mut self, index: usize, old_ctrl: Tag, new_ctrl: Tag) {
         self.growth_left -= usize::from(old_ctrl.special_is_empty());
diff --git a/src/set.rs b/src/set.rs
index cea1690f1..e1d010f32 100644
--- a/src/set.rs
+++ b/src/set.rs
@@ -854,6 +854,51 @@ where
         self.map.contains_key(value)
     }
 
+    /// Issues a software prefetch hint for the table memory that a lookup of
+    /// `value` would touch first.
+    ///
+    /// This hashes `value` and prefetches both the control-byte group at the
+    /// start of its probe sequence and the corresponding data bucket. The
+    /// method name signals lookup intent; the implementation hints both lines
+    /// because measured bench evidence shows the data prefetch is load-bearing
+    /// for the win on lookup workloads. Use
+    /// [`prefetch_insert`](Self::prefetch_insert) to signal insert intent.
+    ///
+    /// Purely a performance hint with no observable effect. Without the `nightly`
+    /// feature it is a no-op on every target except x86/x86-64 with SSE.
+    ///
+    /// It is only worth using when looking up *many* values in a sequence and
+    /// the set is large enough that the control bytes do not fit in cache: in
+    /// that case you can call `prefetch_get` on a value several iterations
+    /// ahead of the one currently being looked up. For a single lookup, or a
+    /// set that fits in cache, it does nothing useful. See
+    /// [`HashMap::prefetch_get`] for an example of the look-ahead pattern.
+    ///
+    /// [`HashMap::prefetch_get`]: crate::HashMap::prefetch_get
+    #[cfg_attr(feature = "inline-more", inline)]
+    pub fn prefetch_get<Q>(&self, value: &Q)
+    where
+        Q: Hash + Equivalent<T> + ?Sized,
+    {
+        self.map.prefetch_get(value);
+    }
+
+    /// Issues a software prefetch hint for the control bytes and data bucket
+    /// an *insert* of `value` would touch first.
+    ///
+    /// The method name signals insert intent. Currently shares the same
+    /// implementation as [`prefetch_get`](Self::prefetch_get).
+    ///
+    /// Purely a performance hint with no observable effect. Without the `nightly`
+    /// feature it is a no-op on every target except x86/x86-64 with SSE.
+    #[cfg_attr(feature = "inline-more", inline)]
+    pub fn prefetch_insert<Q>(&self, value: &Q)
+    where
+        Q: Hash + Equivalent<T> + ?Sized,
+    {
+        self.map.prefetch_insert(value);
+    }
+
     /// Returns a reference to the value in the set, if any, that is equal to the given value.
     ///
     /// The value may be any borrowed form of the set's value type, but
diff --git a/src/table.rs b/src/table.rs
index f50f575f6..70049bab1 100644
--- a/src/table.rs
+++ b/src/table.rs
@@ -229,6 +229,80 @@ where
         self.raw.get(hash, eq)
     }
 
+    /// Issues a software prefetch hint for the control-byte group and data
+    /// bucket a *lookup* of `hash` would touch first.
+    ///
+    /// The method name signals lookup intent; the implementation hints both
+    /// lines because measured bench evidence shows the data prefetch is
+    /// load-bearing for the win on lookup workloads. Use
+    /// [`prefetch_insert`](Self::prefetch_insert) to signal insert intent.
+    ///
+    /// This is purely a performance hint with no observable effect. Without the
+    /// `nightly` feature it is a no-op on every target except x86/x86-64 with SSE.
+    ///
+    /// It is only worth using when looking up *many* hashes in a sequence and
+    /// the table is large enough that the control bytes do not fit in cache:
+    /// in that case you can hash a key several iterations ahead of the one
+    /// currently being looked up and call `prefetch_get` on it, so the cache
+    /// lines it needs are in flight before the lookup reaches them. For a
+    /// single lookup, or a table that fits in cache, it does nothing useful.
+    ///
+    /// `hash` must be computed with the same hasher you use for [`find`]; using
+    /// an unrelated hash just prefetches an unrelated (still valid-to-prefetch)
+    /// cache line and wastes the hint.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # #[cfg(feature = "nightly")]
+    /// # fn test() {
+    /// use hashbrown::{HashTable, DefaultHashBuilder};
+    /// use std::hash::BuildHasher;
+    ///
+    /// let s = DefaultHashBuilder::default();
+    /// let mut table: HashTable<u32> = HashTable::new();
+    /// for i in 0..1000 {
+    ///     table.insert_unique(s.hash_one(i), i, |&x| s.hash_one(x));
+    /// }
+    ///
+    /// let queries: Vec<u32> = (0..1000).rev().collect();
+    /// // Look up `queries`, prefetching 8 iterations ahead.
+    /// let mut found = 0;
+    /// for (i, &q) in queries.iter().enumerate() {
+    ///     if let Some(&next) = queries.get(i + 8) {
+    ///         table.prefetch_get(s.hash_one(next));
+    ///     }
+    ///     if table.find(s.hash_one(q), |&x| x == q).is_some() {
+    ///         found += 1;
+    ///     }
+    /// }
+    /// assert_eq!(found, 1000);
+    /// # }
+    /// # fn main() {
+    /// #     #[cfg(feature = "nightly")]
+    /// #     test()
+    /// # }
+    /// ```
+    ///
+    /// [`find`]: Self::find
+    #[inline]
+    pub fn prefetch_get(&self, hash: u64) {
+        self.raw.prefetch_get(hash);
+    }
+
+    /// Issues a software prefetch hint for the control-byte group and data
+    /// bucket an *insert* of `hash` would touch first.
+    ///
+    /// The method name signals insert intent. Currently shares the same
+    /// implementation as [`prefetch_get`](Self::prefetch_get).
+    ///
+    /// Purely a performance hint with no observable effect. Without the
+    /// `nightly` feature it is a no-op on every target except x86/x86-64 with SSE.
+    #[inline]
+    pub fn prefetch_insert(&self, hash: u64) {
+        self.raw.prefetch_insert(hash);
+    }
+
     /// Returns a mutable reference to an entry in the table with the given hash
     /// and which satisfies the equality function passed.
     ///