Skip to content

Commit eb9caff

Browse files
committed
Adding refcounting to the slim_ptrs feature path. About a 20% gain on most benchmarks, but it should be much better where trie sizes put the most pressure on the cache
1 parent c77aab8 commit eb9caff

4 files changed

Lines changed: 148 additions & 41 deletions

File tree

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ bridge_nodes = [] # Enable the experimental BridgeNode type. Incompatible with
3131
fuzzer = ["dep:rand", "dep:rand_distr"] # Used for creating random paths, tries, and zipper movements
3232
counters = [] # Enable features to inspect performance properties of tries. Mainly useful when tuning performance of the PathMap internals.
3333
graft_root_vals = [] # Enables an experimental change in behavior that causes `graft`, `graft_map`, `make_map`, `take_map`, and `join_map` to treat the value at the focus as part of the operation
34-
slim_ptrs = [] # Enables use of a 64-Byte inter-node pointer type (TrieNodeODRc). Currently does not support refcounting, but this may be fixed
34+
slim_ptrs = [] # Enables use of a 64-Byte inter-node pointer type (TrieNodeODRc)
3535
arena_compact = ["dep:memmap2"]
3636
act_counters = ["arena_compact"] # LP: Question: Why isn't this code enabled by just counters + arena_compact???
3737

src/dense_byte_node.rs

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -49,12 +49,25 @@ pub type DenseByteNode<V> = ByteNode<OrdinaryCoFree<V>>;
4949
/// A ByteNode with insides that **can** be shared across threads
5050
pub type CellByteNode<V> = ByteNode<CellCoFree<V>>;
5151

52-
#[derive(Clone)]
52+
#[repr(C)]
5353
pub struct ByteNode<Cf> {
54+
#[cfg(feature = "slim_ptrs")]
55+
refcnt: std::sync::atomic::AtomicU32,
5456
pub mask: ByteMask,
5557
values: Vec<Cf>,
5658
}
5759

60+
impl<V: Clone + Send + Sync, Cf: CoFree<V=V>> Clone for ByteNode<Cf> {
61+
fn clone(&self) -> Self {
62+
Self {
63+
#[cfg(feature = "slim_ptrs")]
64+
refcnt: std::sync::atomic::AtomicU32::new(1),
65+
mask: self.mask,
66+
values: self.values.clone()
67+
}
68+
}
69+
}
70+
5871
impl<V: Clone + Send + Sync, Cf: CoFree<V=V>> Default for ByteNode<Cf> {
5972
fn default() -> Self {
6073
Self::new()
@@ -77,16 +90,19 @@ impl<V: Clone + Send + Sync, Cf: CoFree<V=V>> Debug for ByteNode<Cf> {
7790
impl<V: Clone + Send + Sync, Cf: CoFree<V=V>> ByteNode<Cf> {
7891
#[inline]
7992
pub fn new() -> Self {
80-
Self {
81-
mask: ByteMask::EMPTY,
82-
values: <_>::default()
83-
}
93+
Self::new_with_fields(ByteMask::EMPTY, <_>::default())
8494
}
8595
#[inline]
8696
pub fn with_capacity(capacity: usize) -> Self {
97+
Self::new_with_fields(ByteMask::EMPTY, Vec::with_capacity(capacity))
98+
}
99+
#[inline]
100+
fn new_with_fields(mask: ByteMask, values: Vec<Cf>) -> Self {
87101
Self {
88-
mask: ByteMask::EMPTY,
89-
values: Vec::with_capacity(capacity),
102+
#[cfg(feature = "slim_ptrs")]
103+
refcnt: std::sync::atomic::AtomicU32::new(1),
104+
mask,
105+
values,
90106
}
91107
}
92108
#[inline]
@@ -1929,7 +1945,7 @@ impl<V: Clone + Send + Sync + Lattice, Cf: CoFree<V=V>, OtherCf: CoFree<V=V>> He
19291945
if is_counter_identity { mask |= COUNTER_IDENT; }
19301946
AlgebraicResult::Identity(mask)
19311947
} else {
1932-
AlgebraicResult::Element(Self{ mask: jm, values: <_>::from(v) })
1948+
AlgebraicResult::Element(Self::new_with_fields(jm, <_>::from(v)))
19331949
}
19341950
}
19351951
}
@@ -2077,7 +2093,7 @@ impl<V: Clone + Send + Sync + Lattice, Cf: CoFree<V=V>, OtherCf: CoFree<V=V>> He
20772093
if is_counter_identity { mask |= COUNTER_IDENT; }
20782094
AlgebraicResult::Identity(mask)
20792095
} else {
2080-
AlgebraicResult::Element(Self{ mask: mm, values: <_>::from(v) })
2096+
AlgebraicResult::Element(Self::new_with_fields(mm, <_>::from(v)))
20812097
}
20822098
}
20832099
}
@@ -2113,14 +2129,11 @@ impl<V: Clone + Send + Sync + Lattice, Cf: CoFree<V=V>, OtherCf: CoFree<V=V>> He
21132129
}
21142130

21152131
unsafe{ v.set_len(c); }
2116-
return Self{ mask: jm, values: <_>::from(v) };
2132+
return Self::new_with_fields(jm, <_>::from(v));
21172133
}
21182134
fn convert(other: ByteNode<OtherCf>) -> Self {
21192135
let values = other.values.into_iter().map(|other_cf| Cf::convert(other_cf)).collect();
2120-
Self {
2121-
mask: other.mask,
2122-
values
2123-
}
2136+
Self::new_with_fields(other.mask, values)
21242137
}
21252138
fn bottom() -> Self {
21262139
Self::default()
@@ -2243,7 +2256,7 @@ impl<V: Clone + Send + Sync, Cf: CoFree<V=V>> ByteNode<Cf> {
22432256
if is_identity {
22442257
AlgebraicResult::Identity(SELF_IDENT)
22452258
} else {
2246-
AlgebraicResult::Element(Self{ mask: mm, values: <_>::from(v) })
2259+
AlgebraicResult::Element(Self::new_with_fields(mm, <_>::from(v)))
22472260
}
22482261
}
22492262
}

src/tiny_node.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -319,7 +319,7 @@ impl<'a, V: Clone + Send + Sync> TrieNode<V> for TinyRefNode<'a, V> {
319319
impl<V: Clone + Send + Sync> TrieNodeDowncast<V> for TinyRefNode<'_, V> {
320320
#[inline]
321321
fn tag(&self) -> usize {
322-
TINY_REF_NODE_TAG
322+
unreachable!()
323323
}
324324
fn as_tagged(&self) -> TaggedNodeRef<V> {
325325
TaggedNodeRef::TinyRefNode(self)

src/trie_node.rs

Lines changed: 118 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -711,8 +711,7 @@ impl<'a, V: Clone + Send + Sync> AbstractNodeRef<'a, V> {
711711
pub(crate) const EMPTY_NODE_TAG: usize = 0;
712712
pub(crate) const DENSE_BYTE_NODE_TAG: usize = 1;
713713
pub(crate) const LINE_LIST_NODE_TAG: usize = 2;
714-
pub(crate) const TINY_REF_NODE_TAG: usize = 3;
715-
pub(crate) const CELL_BYTE_NODE_TAG: usize = 4;
714+
pub(crate) const CELL_BYTE_NODE_TAG: usize = 3;
716715

717716
/// A reference to a node with a concrete type
718717
pub enum TaggedNodeRef<'a, V: Clone + Send + Sync> {
@@ -744,12 +743,11 @@ impl<V: Clone + Send + Sync> Copy for TaggedNodeRef<'_, V> {}
744743
impl<'a, V: Clone + Send + Sync> TaggedNodeRef<'a, V> {
745744
#[cfg(feature = "slim_ptrs")]
746745
#[inline]
747-
unsafe fn from_raw_parts(ptr: *mut usize, tag: usize) -> Self {
746+
unsafe fn from_raw_parts(ptr: *mut core::sync::atomic::AtomicU32, tag: usize) -> Self {
748747
match tag {
749748
EMPTY_NODE_TAG => Self::EmptyNode,
750749
DENSE_BYTE_NODE_TAG => Self::DenseByteNode(&*ptr.cast()),
751750
LINE_LIST_NODE_TAG => Self::LineListNode(&*ptr.cast()),
752-
TINY_REF_NODE_TAG => Self::TinyRefNode(&*ptr.cast()),
753751
CELL_BYTE_NODE_TAG => Self::CellByteNode(&*ptr.cast()),
754752
_ => unreachable_unchecked()
755753
}
@@ -781,7 +779,7 @@ pub enum TaggedNodeRefMut<'a, V: Clone + Send + Sync> {
781779
impl<'a, V: Clone + Send + Sync> TaggedNodeRefMut<'a, V> {
782780
#[cfg(feature = "slim_ptrs")]
783781
#[inline]
784-
unsafe fn from_raw_parts(ptr: *mut usize, tag: usize) -> Self {
782+
unsafe fn from_raw_parts(ptr: *mut core::sync::atomic::AtomicU32, tag: usize) -> Self {
785783
match tag {
786784
DENSE_BYTE_NODE_TAG => Self::DenseByteNode(&mut *ptr.cast()),
787785
LINE_LIST_NODE_TAG => Self::LineListNode(&mut *ptr.cast()),
@@ -1353,24 +1351,118 @@ mod opaque_dyn_rc_trie_node {
13531351
mod opaque_dyn_rc_trie_node {
13541352
use core::marker::PhantomData;
13551353
use core::ptr::NonNull;
1356-
use super::{TaggedNodeRef, TaggedNodeRefMut, TrieNode, EMPTY_NODE_TAG};
1354+
use core::sync::atomic::{AtomicU32, Ordering::Acquire, Ordering::Relaxed, Ordering::Release};
1355+
use crate::dense_byte_node::{DenseByteNode, CellByteNode};
1356+
use crate::line_list_node::LineListNode;
1357+
1358+
use super::{TaggedNodeRef, TaggedNodeRefMut, TrieNode, EMPTY_NODE_TAG, DENSE_BYTE_NODE_TAG, LINE_LIST_NODE_TAG, CELL_BYTE_NODE_TAG};
13571359

13581360
#[cfg(all(not(target_pointer_width="64")))]
13591361
compile_error!("slim_ptrs is only compatible with 64-bit architectures");
13601362

13611363
#[repr(align(8))]
1362-
pub struct TrieNodeODRc<V: Clone + Send + Sync>(NonNull<usize>, PhantomData<V>);
1364+
pub struct TrieNodeODRc<V: Clone + Send + Sync>(NonNull<AtomicU32>, PhantomData<V>);
13631365

13641366
unsafe impl<V: Clone + Send + Sync> Send for TrieNodeODRc<V> {}
13651367
unsafe impl<V: Clone + Send + Sync> Sync for TrieNodeODRc<V> {}
13661368

13671369
impl<V: Clone + Send + Sync> Clone for TrieNodeODRc<V> {
1370+
/// Increases the node refcount. See the implementation of Arc::clone in the stdlib
1371+
#[inline]
13681372
fn clone(&self) -> Self {
1369-
//GOAT, we need to bump the refcount when we have them
1373+
//NOTE: This explanation copied verbatim from the Arc implementation in stdlib
1374+
// -------------------------------------------------------------------------------
1375+
// Using a relaxed ordering is alright here, as knowledge of the
1376+
// original reference prevents other threads from erroneously deleting
1377+
// the object.
1378+
//
1379+
// As explained in the [Boost documentation][1], Increasing the
1380+
// reference counter can always be done with memory_order_relaxed: New
1381+
// references to an object can only be formed from an existing
1382+
// reference, and passing an existing reference from one thread to
1383+
// another must already provide any required synchronization.
1384+
//
1385+
// [1]: (www.boost.org/doc/libs/1_55_0/doc/html/atomic/usage_examples.html)
1386+
let (ptr, _tag) = self.get_raw_parts();
1387+
let _old_size = unsafe{ &*ptr }.fetch_add(1, Relaxed);
1388+
13701389
Self(self.0, PhantomData)
13711390
}
13721391
}
13731392

1393+
/// We want to saturate at this refcount. So if we ever see this value, we stop decrementing
1394+
const MAX_REFCOUNT: u32 = 0x7FFFFFFF;
1395+
1396+
impl<V: Clone + Send + Sync> Drop for TrieNodeODRc<V> {
1397+
/// Decrements the refcount, and deletes the node if the refcount reaches 0
1398+
#[inline]
1399+
fn drop(&mut self) {
1400+
let (ptr, tag) = self.get_raw_parts();
1401+
if tag == EMPTY_NODE_TAG {
1402+
return
1403+
}
1404+
1405+
//NOTE: This explanation copied verbatim from the Arc implementation in stdlib
1406+
// -------------------------------------------------------------------------------
1407+
1408+
// Because `fetch_sub` is already atomic, we do not need to synchronize
1409+
// with other threads unless we are going to delete the object. This
1410+
// same logic applies to the below `fetch_sub` to the `weak` count.
1411+
if unsafe{ &*ptr }.fetch_sub(1, Release) != 1 {
1412+
return;
1413+
}
1414+
1415+
//GOAT, deal with saturation
1416+
// This fence is needed to prevent reordering of use of the data and
1417+
// deletion of the data. Because it is marked `Release`, the decreasing
1418+
// of the reference count synchronizes with this `Acquire` fence. This
1419+
// means that use of the data happens before decreasing the reference
1420+
// count, which happens before this fence, which happens before the
1421+
// deletion of the data.
1422+
//
1423+
// As explained in the [Boost documentation][1],
1424+
//
1425+
// > It is important to enforce any possible access to the object in one
1426+
// > thread (through an existing reference) to *happen before* deleting
1427+
// > the object in a different thread. This is achieved by a "release"
1428+
// > operation after dropping a reference (any access to the object
1429+
// > through this reference must obviously happened before), and an
1430+
// > "acquire" operation before deleting the object.
1431+
//
1432+
// In particular, while the contents of an Arc are usually immutable, it's
1433+
// possible to have interior writes to something like a Mutex<T>. Since a
1434+
// Mutex is not acquired when it is deleted, we can't rely on its
1435+
// synchronization logic to make writes in thread A visible to a destructor
1436+
// running in thread B.
1437+
//
1438+
// Also note that the Acquire fence here could probably be replaced with an
1439+
// Acquire load, which could improve performance in highly-contended
1440+
// situations. See [2].
1441+
//
1442+
// [1]: (www.boost.org/doc/libs/1_55_0/doc/html/atomic/usage_examples.html)
1443+
// [2]: (https://github.com/rust-lang/rust/pull/41714)
1444+
1445+
//The fence from Arc is overkill here, so we'll go with an Acquire load
1446+
//Code from Arc with macro expanded: fence(Acquire);
1447+
let refcount = unsafe{ &*ptr }.load(Acquire);
1448+
debug_assert_eq!(refcount, 0);
1449+
1450+
drop_inner::<V>(ptr, tag);
1451+
}
1452+
}
1453+
1454+
#[inline]
1455+
fn drop_inner<V: Clone + Send + Sync>(ptr: *mut AtomicU32, tag: usize) {
1456+
//Convert the pointer back into a box, so it will drop
1457+
match tag {
1458+
EMPTY_NODE_TAG => {},
1459+
DENSE_BYTE_NODE_TAG => unsafe { let _ = Box::<DenseByteNode<V>>::from_raw(ptr.cast()); },
1460+
LINE_LIST_NODE_TAG => unsafe { let _ = Box::<LineListNode<V>>::from_raw(ptr.cast()); },
1461+
CELL_BYTE_NODE_TAG => unsafe { let _ = Box::<CellByteNode<V>>::from_raw(ptr.cast()); },
1462+
_ => unsafe{ core::hint::unreachable_unchecked() }
1463+
};
1464+
}
1465+
13741466
impl<V> core::fmt::Debug for TrieNodeODRc<V>
13751467
where
13761468
for<'a> &'a dyn TrieNode<V>: core::fmt::Debug,
@@ -1388,9 +1480,6 @@ mod opaque_dyn_rc_trie_node {
13881480
}
13891481
}
13901482

1391-
//GOAT, since we don't have reference counting, and we clone these, they cannot implement Drop
1392-
// Make sure to implement Drop when refcounting is added
1393-
13941483
impl<V: Clone + Send + Sync> TrieNodeODRc<V> {
13951484
#[inline]
13961485
pub(crate) fn new<'odb, T>(node: T) -> Self
@@ -1406,12 +1495,13 @@ mod opaque_dyn_rc_trie_node {
14061495
///
14071496
/// This method is needed because it's impossible to get a mutable reference to the EmptyNode, but WriteZippers
14081497
/// carry a mutable reference to the node at their root
1498+
#[inline]
14091499
pub(crate) fn new_allocated(_child_cnt_capacity: usize, _val_cnt_capacity: usize) -> Self {
14101500
let new_node = crate::line_list_node::LineListNode::new();
14111501
Self::new(new_node)
14121502
}
14131503
#[inline]
1414-
pub(crate) fn get_raw_parts(&self) -> (*mut usize, usize) {
1504+
pub(crate) fn get_raw_parts(&self) -> (*mut AtomicU32, usize) {
14151505
let tag = (self.0.addr().get() >> 59) & 0xF;
14161506
let unpacked_ptr = unpack(self.0.as_ptr(), 0, false, 7);
14171507
(unpacked_ptr, tag)
@@ -1431,11 +1521,6 @@ mod opaque_dyn_rc_trie_node {
14311521
unsafe{ TaggedNodeRef::from_raw_parts(unpacked_ptr, tag) }
14321522
}
14331523
#[inline]
1434-
pub(crate) fn as_tagged_mut(&mut self) -> TaggedNodeRefMut<V> {
1435-
let (unpacked_ptr, tag) = self.get_raw_parts();
1436-
unsafe{ TaggedNodeRefMut::from_raw_parts(unpacked_ptr, tag) }
1437-
}
1438-
#[inline]
14391524
pub(crate) fn borrow(&self) -> &dyn TrieNode<V> {
14401525
self.as_tagged().borrow()
14411526
}
@@ -1451,18 +1536,27 @@ mod opaque_dyn_rc_trie_node {
14511536
}
14521537
#[inline]
14531538
pub(crate) fn refcount(&self) -> usize {
1454-
//GOAT, this needs to return the real number
1455-
2
1539+
let (ptr, _tag) = self.get_raw_parts();
1540+
unsafe{ &*ptr }.load(Acquire) as usize
14561541
}
14571542
#[inline]
14581543
pub(crate) fn make_mut(&mut self) -> &mut (dyn TrieNode<V> + 'static) {
1459-
//GOAT, gotta actually use the refcounts
1544+
let (ptr, _tag) = self.get_raw_parts();
1545+
1546+
if unsafe{ &*ptr }.compare_exchange(1, 0, Acquire, Relaxed).is_err() {
1547+
// Another pointer exists, so we must clone.
1548+
let cloned_node = self.borrow().clone_self();
14601549

1461-
//Path for refcnt > 1, Clone the self node, and replace this ptr
1462-
let cloned_node = self.borrow().clone_self();
1463-
*self = cloned_node;
1550+
//The decrement of the old `self` refcount will happen at this assignment
1551+
*self = cloned_node;
1552+
1553+
} else {
1554+
// We were the sole reference so bump back up the ref count.
1555+
unsafe{ &*ptr }.store(1, Release);
1556+
}
14641557

1465-
//Now return a mutable borrow
1558+
// As with `get_mut()`, the unsafety is ok because our reference was
1559+
// either unique to begin with, or became one upon cloning the contents.
14661560
let (unpacked_ptr, tag) = self.get_raw_parts();
14671561
let tagged: TaggedNodeRefMut<'_, V> = unsafe{ TaggedNodeRefMut::from_raw_parts(unpacked_ptr, tag) };
14681562
unsafe{ core::mem::transmute(tagged.into_dyn()) }

0 commit comments

Comments
 (0)