Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 175 additions & 21 deletions libdd-alloc/src/chain.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
use crate::LinearAllocator;
use crate::{AllocError, Allocator};
use core::alloc::Layout;
use core::cell::UnsafeCell;
use core::cell::{Cell, UnsafeCell};
use core::mem::size_of;
use core::ptr::NonNull;

Expand All @@ -17,11 +17,16 @@ use core::ptr::NonNull;
/// [ChainAllocator] creates a new [LinearAllocator] when the current one
/// doesn't have enough space for the requested allocation, and then links the
/// new [LinearAllocator] to the previous one, creating a chain. This is where
/// its name comes from.
/// its name comes from. Each successful growth doubles the target chunk size
/// up to a cap, so small arenas retain a low initial footprint while larger
/// workloads quickly converge to larger chunks.
pub struct ChainAllocator<A: Allocator + Clone> {
top: UnsafeCell<ChainNodePtr<A>>,
/// The size hint for the linear allocator's chunk.
node_size: usize,
/// The size hint for the next linear allocator chunk.
node_size: Cell<usize>,
/// The maximum size hint used for routine geometric growth. Individual
/// oversized allocations can still request larger chunks.
max_node_size: usize,
allocator: A,
}

Expand Down Expand Up @@ -87,38 +92,89 @@ impl<A: Allocator + Clone> ChainAllocator<A> {
/// is worth it. This is somewhat arbitrarily chosen at the moment.
const MIN_NODE_SIZE: usize = 4 * Self::CHAIN_NODE_OVERHEAD;

/// Creates a new [ChainAllocator]. The `chunk_size_hint` is used as a
/// size hint when creating new chunks of the chain. Note that the
/// Default cap for routine geometric growth. This preserves the historical
/// chunk size used by profiling dictionaries while allowing smaller initial
/// chunks to ramp up quickly.
const DEFAULT_MAX_NODE_SIZE: usize = 1024 * 1024;

const fn normalize_node_size(size: usize) -> usize {
if size < Self::MIN_NODE_SIZE {
Self::MIN_NODE_SIZE
} else {
size
}
}

/// Creates a new [ChainAllocator]. The `chunk_size_hint` is used as the
/// initial size hint for chunks of the chain. Routine growth doubles the
/// chunk size up to at least `Self::DEFAULT_MAX_NODE_SIZE`. Note that the
/// [ChainAllocator] will use some bytes at the beginning of each chunk of
/// the chain. The number of bytes is [Self::CHAIN_NODE_OVERHEAD]. Keep
/// this in mind when sizing your hint if you are trying to be precise,
/// such as making sure a specific object fits.
pub const fn new_in(chunk_size_hint: usize, allocator: A) -> Self {
let initial_node_size = Self::normalize_node_size(chunk_size_hint);
let max_node_size = if initial_node_size < Self::DEFAULT_MAX_NODE_SIZE {
Self::DEFAULT_MAX_NODE_SIZE
} else {
initial_node_size
};
Self::new_capped_in(initial_node_size, max_node_size, allocator)
}

/// Creates a new [ChainAllocator] whose routine growth starts at
/// `chunk_size_hint` and doubles until reaching `max_chunk_size_hint`.
/// Requests larger than the cap are still honored by allocating an
/// oversized chunk for that request.
pub const fn new_capped_in(
chunk_size_hint: usize,
max_chunk_size_hint: usize,
allocator: A,
) -> Self {
let initial_node_size = Self::normalize_node_size(chunk_size_hint);
let max_node_size = if max_chunk_size_hint < initial_node_size {
initial_node_size
} else {
max_chunk_size_hint
};
Self {
top: UnsafeCell::new(ChainNodePtr::new()),
// max is not a const fn, do it manually.
node_size: if chunk_size_hint < Self::MIN_NODE_SIZE {
Self::MIN_NODE_SIZE
} else {
chunk_size_hint
},
node_size: Cell::new(initial_node_size),
max_node_size,
allocator,
}
}

fn next_geometric_node_size(current: usize, max: usize, align: usize) -> usize {
if current >= max {
return max;
}

let Some(doubled) = current.checked_mul(2) else {
return max;
};
let next = if doubled > max { max } else { doubled };

if Layout::from_size_align(next, align).is_ok() {
next
} else {
current
}
}

#[cold]
#[inline(never)]
fn grow(&self, min_size: usize) -> Result<(), AllocError> {
let top = self.top.get();
let chain_layout = Layout::new::<ChainNode<A>>();

let node_size = min_size.max(self.node_size);
let linear = {
let layout = Layout::from_size_align(node_size, chain_layout.align())
.map_err(|_| AllocError)?
.pad_to_align();
LinearAllocator::new_in(layout, self.allocator.clone())?
};
let node_size = min_size.max(self.node_size.get());
let layout = Layout::from_size_align(node_size, chain_layout.align())
.map_err(|_| AllocError)?
.pad_to_align();
let next_node_size =
Self::next_geometric_node_size(layout.size(), self.max_node_size, chain_layout.align());
let linear = LinearAllocator::new_in(layout, self.allocator.clone())?;

// This shouldn't fail.
let chain_node_addr = linear
Expand Down Expand Up @@ -148,6 +204,7 @@ impl<A: Allocator + Clone> ChainAllocator<A> {
// Additionally, references are always temporary for the top, so this
// write will not violate aliasing rules.
unsafe { self.top.get().write(chain_node_ptr) };
self.node_size.set(next_node_size);

Ok(())
}
Expand Down Expand Up @@ -382,6 +439,101 @@ mod tests {
unsafe { allocator.deallocate(ptr.cast(), layout) };
}

#[test]
fn test_size_hint_below_minimum_uses_minimum_node_size() {
let allocator = ChainAllocator::new_capped_in(0, 0, Global);
let layout = Layout::new::<u8>();
let ptr = allocator.allocate(layout).unwrap();
unsafe { allocator.deallocate(ptr.cast(), layout) };

assert!(allocator.reserved_bytes() >= ChainAllocator::<Global>::MIN_NODE_SIZE);
}

#[test]
fn test_geometric_growth() {
let allocator = ChainAllocator::new_in(4096, Global);
let layout = Layout::new::<u8>();

let _ = allocator.allocate(layout).unwrap();
let first_reserved = allocator.reserved_bytes();
fill_to_capacity(&allocator);

let _ = allocator.allocate(layout).unwrap();
let second_reserved = allocator.reserved_bytes();
let second_chunk = second_reserved - first_reserved;
assert!(
second_chunk >= first_reserved * 2,
"second chunk should grow geometrically: first={first_reserved}, second={second_chunk}"
);
fill_to_capacity(&allocator);

let _ = allocator.allocate(layout).unwrap();
let third_reserved = allocator.reserved_bytes();
let third_chunk = third_reserved - second_reserved;
assert!(
third_chunk >= second_chunk * 2,
"third chunk should grow geometrically: second={second_chunk}, third={third_chunk}"
);
}

#[test]
fn test_geometric_growth_clamps_to_cap() {
let allocator = ChainAllocator::new_capped_in(4096, 10 * 1024, Global);
let layout = Layout::new::<u8>();

let _ = allocator.allocate(layout).unwrap();
let first_reserved = allocator.reserved_bytes();
fill_to_capacity(&allocator);

let _ = allocator.allocate(layout).unwrap();
let second_reserved = allocator.reserved_bytes();
fill_to_capacity(&allocator);

let _ = allocator.allocate(layout).unwrap();
let third_reserved = allocator.reserved_bytes();
let third_chunk = third_reserved - second_reserved;

assert!(second_reserved - first_reserved >= first_reserved * 2);
assert!(third_chunk >= 10 * 1024);
assert!(third_chunk < (second_reserved - first_reserved) * 2);
}

#[test]
fn test_capped_growth_honors_initial_size_as_minimum_cap() {
let allocator = ChainAllocator::new_capped_in(8192, 4096, Global);
let layout = Layout::new::<u8>();

let _ = allocator.allocate(layout).unwrap();
let first_reserved = allocator.reserved_bytes();
fill_to_capacity(&allocator);

let _ = allocator.allocate(layout).unwrap();
let second_reserved = allocator.reserved_bytes();
let second_chunk = second_reserved - first_reserved;

assert!(second_chunk >= first_reserved);
assert!(second_chunk < first_reserved * 2);
}

#[test]
fn test_next_geometric_node_size_edge_cases() {
assert_eq!(
4096,
ChainAllocator::<Global>::next_geometric_node_size(8192, 4096, 1)
);
assert_eq!(
usize::MAX,
ChainAllocator::<Global>::next_geometric_node_size(usize::MAX / 2 + 1, usize::MAX, 1)
);

let invalid_layout_size = isize::MAX as usize + 1;
let current = invalid_layout_size / 2 + 1;
assert_eq!(
current,
ChainAllocator::<Global>::next_geometric_node_size(current, invalid_layout_size, 1)
);
}

#[track_caller]
fn fill_to_capacity<A: Allocator + Clone>(allocator: &ChainAllocator<A>) {
let remaining_capacity = allocator.remaining_capacity();
Expand All @@ -401,8 +553,10 @@ mod tests {

let bool_layout = Layout::new::<bool>();

const GROWTH_ITERATIONS: usize = 16;

// test that it fills to capacity a few times.
for _ in 0..100 {
for _ in 0..GROWTH_ITERATIONS {
fill_to_capacity(&allocator);

// This check is theoretically redundant because fill_to_capacity
Expand All @@ -426,7 +580,7 @@ mod tests {
let reserved_bytes = allocator.reserved_bytes();
// The allocations can theoretically be over-allocated, so use >= to
// do the comparison.
assert!(reserved_bytes >= page_size * 100);
assert!(reserved_bytes >= page_size * GROWTH_ITERATIONS);

// Everything is filled to capacity except the last iteration.
let used_bytes = allocator.used_bytes();
Expand Down
11 changes: 7 additions & 4 deletions libdd-profiling/src/collections/string_table/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,13 @@ impl StringTable {
// Keep in mind 32-bit .NET. There is only 2 GiB of virtual memory
// total available to an application, and we're not the application,
// we're just a piece inside it. Additionally, there may be 2 or more
// string tables in memory at a given time. Talk to .NET profiling
// engineers before making this any bigger.
const SIZE_HINT: usize = 4 * 1024 * 1024;
let bytes = ChainAllocator::new_in(SIZE_HINT, VirtualAllocator {});
// string tables in memory at a given time. Larger profiles grow
// geometrically up to the historical 4 MiB chunk size, while common
// profiles fit comfortably below this initial size. Talk to .NET
// profiling engineers before making this any bigger.
const SIZE_HINT: usize = 512 * 1024;
const MAX_SIZE_HINT: usize = 4 * 1024 * 1024;
let bytes = ChainAllocator::new_capped_in(SIZE_HINT, MAX_SIZE_HINT, VirtualAllocator {});

let mut strings = HashSet::with_hasher(Hasher::default());
// It varies by implementation, but frequently I've noticed that the
Expand Down
11 changes: 6 additions & 5 deletions libdd-profiling/src/profiles/collections/parallel/slice_set.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@ use std::ops::Deref;

/// Number of shards used by the parallel slice set and (by extension)
/// the string-specific parallel set. Kept as a constant so tests and
/// related code can refer to the same value.
pub const N_SHARDS: usize = 16;
/// related code can refer to the same value. Four shards keep enough
/// concurrency for low-thread-count profilers while lowering the arena floor.
pub const N_SHARDS: usize = 4;

/// The initial capacities for Rust's hash map (and set) currently go
/// like this: 3, 7, 14, 28. We want to avoid some of the smaller sizes so
Expand Down Expand Up @@ -64,9 +65,9 @@ impl<T: Copy + hash::Hash + Eq + 'static> ParallelSliceSet<T> {
pub const fn select_shard(hash: u64) -> usize {
// Use lower bits for shard selection to avoid interfering with
// Swiss tables' internal SIMD comparisons that use upper 7 bits.
// Using 4 bits provides resilience against hash function deficiencies
// and optimal scaling for low thread counts.
(hash & 0b1111) as usize
// N_SHARDS is a power of two, so this masks the lower log2(N_SHARDS)
// bits and keeps the shard index in bounds.
(hash as usize) & (N_SHARDS - 1)
}

/// Tries to create a new parallel slice set.
Expand Down
11 changes: 9 additions & 2 deletions libdd-profiling/src/profiles/collections/set.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,10 @@ pub struct Set<T: Hash + Eq + 'static> {
}

impl<T: Eq + Hash + 'static> Set<T> {
pub const SIZE_HINT: usize = 1024 * 1024;
// Keep the per-shard arena small; larger dictionaries grow
// geometrically up to the historical 1 MiB chunk size.
pub const SIZE_HINT: usize = 64 * 1024;
pub const MAX_SIZE_HINT: usize = 1024 * 1024;

pub fn try_new() -> Result<Self, SetError> {
Self::try_with_capacity(SET_MIN_CAPACITY)
Expand Down Expand Up @@ -146,7 +149,11 @@ impl<T: Hash + Eq + 'static> Drop for Set<T> {

impl<T: Hash + Eq + 'static> Set<T> {
pub(crate) fn try_with_capacity(capacity: usize) -> Result<Self, SetError> {
let arena = ChainAllocator::new_in(Self::SIZE_HINT, VirtualAllocator {});
let arena = ChainAllocator::new_capped_in(
Self::SIZE_HINT,
Self::MAX_SIZE_HINT,
VirtualAllocator {},
);
let mut table = HashTable::new();

// SAFETY: new empty table cannot require rehash, callback unreachable.
Expand Down
11 changes: 9 additions & 2 deletions libdd-profiling/src/profiles/collections/slice_set.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,17 @@ pub struct SliceSet<T: Copy + Hash + Eq + 'static> {
}

impl<T: Copy + Hash + Eq + 'static> SliceSet<T> {
const SIZE_HINT: usize = 1024 * 1024;
// Keep the per-shard arena small; larger dictionaries grow
// geometrically up to the historical 1 MiB chunk size.
const SIZE_HINT: usize = 64 * 1024;
const MAX_SIZE_HINT: usize = 1024 * 1024;

pub fn try_with_capacity(capacity: usize) -> Result<Self, SetError> {
let arena = ChainAllocator::new_in(Self::SIZE_HINT, VirtualAllocator {});
let arena = ChainAllocator::new_capped_in(
Self::SIZE_HINT,
Self::MAX_SIZE_HINT,
VirtualAllocator {},
);

let mut slices = HashTable::new();
// SAFETY: we just made the empty hash table, so there's nothing that
Expand Down
Loading
Loading