From 477c1f49cd60241d1a7e58d484f4e864d12ce75a Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Wed, 27 May 2026 15:25:17 -0400 Subject: [PATCH] perf(profiling): reduce profiler arena memory footprint --- libdd-alloc/src/chain.rs | 196 ++++++++++++++++-- .../src/collections/string_table/mod.rs | 11 +- .../collections/parallel/slice_set.rs | 11 +- .../src/profiles/collections/set.rs | 11 +- .../src/profiles/collections/slice_set.rs | 11 +- .../profiles/datatypes/profiles_dictionary.rs | 115 ++++++++++ 6 files changed, 321 insertions(+), 34 deletions(-) diff --git a/libdd-alloc/src/chain.rs b/libdd-alloc/src/chain.rs index f73b9d4cee..b6feca0c46 100644 --- a/libdd-alloc/src/chain.rs +++ b/libdd-alloc/src/chain.rs @@ -4,7 +4,7 @@ use crate::LinearAllocator; use crate::{AllocError, Allocator}; use core::alloc::Layout; -use core::cell::UnsafeCell; +use core::cell::{Cell, UnsafeCell}; use core::mem::size_of; use core::ptr::NonNull; @@ -17,11 +17,16 @@ use core::ptr::NonNull; /// [ChainAllocator] creates a new [LinearAllocator] when the current one /// doesn't have enough space for the requested allocation, and then links the /// new [LinearAllocator] to the previous one, creating a chain. This is where -/// its name comes from. +/// its name comes from. Each successful growth doubles the target chunk size +/// up to a cap, so small arenas retain a low initial footprint while larger +/// workloads quickly converge to larger chunks. pub struct ChainAllocator { top: UnsafeCell>, - /// The size hint for the linear allocator's chunk. - node_size: usize, + /// The size hint for the next linear allocator chunk. + node_size: Cell, + /// The maximum size hint used for routine geometric growth. Individual + /// oversized allocations can still request larger chunks. + max_node_size: usize, allocator: A, } @@ -87,38 +92,89 @@ impl ChainAllocator { /// is worth it. This is somewhat arbitrarily chosen at the moment. const MIN_NODE_SIZE: usize = 4 * Self::CHAIN_NODE_OVERHEAD; - /// Creates a new [ChainAllocator]. The `chunk_size_hint` is used as a - /// size hint when creating new chunks of the chain. Note that the + /// Default cap for routine geometric growth. This preserves the historical + /// chunk size used by profiling dictionaries while allowing smaller initial + /// chunks to ramp up quickly. + const DEFAULT_MAX_NODE_SIZE: usize = 1024 * 1024; + + const fn normalize_node_size(size: usize) -> usize { + if size < Self::MIN_NODE_SIZE { + Self::MIN_NODE_SIZE + } else { + size + } + } + + /// Creates a new [ChainAllocator]. The `chunk_size_hint` is used as the + /// initial size hint for chunks of the chain. Routine growth doubles the + /// chunk size up to at least `Self::DEFAULT_MAX_NODE_SIZE`. Note that the /// [ChainAllocator] will use some bytes at the beginning of each chunk of /// the chain. The number of bytes is [Self::CHAIN_NODE_OVERHEAD]. Keep /// this in mind when sizing your hint if you are trying to be precise, /// such as making sure a specific object fits. pub const fn new_in(chunk_size_hint: usize, allocator: A) -> Self { + let initial_node_size = Self::normalize_node_size(chunk_size_hint); + let max_node_size = if initial_node_size < Self::DEFAULT_MAX_NODE_SIZE { + Self::DEFAULT_MAX_NODE_SIZE + } else { + initial_node_size + }; + Self::new_capped_in(initial_node_size, max_node_size, allocator) + } + + /// Creates a new [ChainAllocator] whose routine growth starts at + /// `chunk_size_hint` and doubles until reaching `max_chunk_size_hint`. + /// Requests larger than the cap are still honored by allocating an + /// oversized chunk for that request. + pub const fn new_capped_in( + chunk_size_hint: usize, + max_chunk_size_hint: usize, + allocator: A, + ) -> Self { + let initial_node_size = Self::normalize_node_size(chunk_size_hint); + let max_node_size = if max_chunk_size_hint < initial_node_size { + initial_node_size + } else { + max_chunk_size_hint + }; Self { top: UnsafeCell::new(ChainNodePtr::new()), - // max is not a const fn, do it manually. - node_size: if chunk_size_hint < Self::MIN_NODE_SIZE { - Self::MIN_NODE_SIZE - } else { - chunk_size_hint - }, + node_size: Cell::new(initial_node_size), + max_node_size, allocator, } } + fn next_geometric_node_size(current: usize, max: usize, align: usize) -> usize { + if current >= max { + return max; + } + + let Some(doubled) = current.checked_mul(2) else { + return max; + }; + let next = if doubled > max { max } else { doubled }; + + if Layout::from_size_align(next, align).is_ok() { + next + } else { + current + } + } + #[cold] #[inline(never)] fn grow(&self, min_size: usize) -> Result<(), AllocError> { let top = self.top.get(); let chain_layout = Layout::new::>(); - let node_size = min_size.max(self.node_size); - let linear = { - let layout = Layout::from_size_align(node_size, chain_layout.align()) - .map_err(|_| AllocError)? - .pad_to_align(); - LinearAllocator::new_in(layout, self.allocator.clone())? - }; + let node_size = min_size.max(self.node_size.get()); + let layout = Layout::from_size_align(node_size, chain_layout.align()) + .map_err(|_| AllocError)? + .pad_to_align(); + let next_node_size = + Self::next_geometric_node_size(layout.size(), self.max_node_size, chain_layout.align()); + let linear = LinearAllocator::new_in(layout, self.allocator.clone())?; // This shouldn't fail. let chain_node_addr = linear @@ -148,6 +204,7 @@ impl ChainAllocator { // Additionally, references are always temporary for the top, so this // write will not violate aliasing rules. unsafe { self.top.get().write(chain_node_ptr) }; + self.node_size.set(next_node_size); Ok(()) } @@ -382,6 +439,101 @@ mod tests { unsafe { allocator.deallocate(ptr.cast(), layout) }; } + #[test] + fn test_size_hint_below_minimum_uses_minimum_node_size() { + let allocator = ChainAllocator::new_capped_in(0, 0, Global); + let layout = Layout::new::(); + let ptr = allocator.allocate(layout).unwrap(); + unsafe { allocator.deallocate(ptr.cast(), layout) }; + + assert!(allocator.reserved_bytes() >= ChainAllocator::::MIN_NODE_SIZE); + } + + #[test] + fn test_geometric_growth() { + let allocator = ChainAllocator::new_in(4096, Global); + let layout = Layout::new::(); + + let _ = allocator.allocate(layout).unwrap(); + let first_reserved = allocator.reserved_bytes(); + fill_to_capacity(&allocator); + + let _ = allocator.allocate(layout).unwrap(); + let second_reserved = allocator.reserved_bytes(); + let second_chunk = second_reserved - first_reserved; + assert!( + second_chunk >= first_reserved * 2, + "second chunk should grow geometrically: first={first_reserved}, second={second_chunk}" + ); + fill_to_capacity(&allocator); + + let _ = allocator.allocate(layout).unwrap(); + let third_reserved = allocator.reserved_bytes(); + let third_chunk = third_reserved - second_reserved; + assert!( + third_chunk >= second_chunk * 2, + "third chunk should grow geometrically: second={second_chunk}, third={third_chunk}" + ); + } + + #[test] + fn test_geometric_growth_clamps_to_cap() { + let allocator = ChainAllocator::new_capped_in(4096, 10 * 1024, Global); + let layout = Layout::new::(); + + let _ = allocator.allocate(layout).unwrap(); + let first_reserved = allocator.reserved_bytes(); + fill_to_capacity(&allocator); + + let _ = allocator.allocate(layout).unwrap(); + let second_reserved = allocator.reserved_bytes(); + fill_to_capacity(&allocator); + + let _ = allocator.allocate(layout).unwrap(); + let third_reserved = allocator.reserved_bytes(); + let third_chunk = third_reserved - second_reserved; + + assert!(second_reserved - first_reserved >= first_reserved * 2); + assert!(third_chunk >= 10 * 1024); + assert!(third_chunk < (second_reserved - first_reserved) * 2); + } + + #[test] + fn test_capped_growth_honors_initial_size_as_minimum_cap() { + let allocator = ChainAllocator::new_capped_in(8192, 4096, Global); + let layout = Layout::new::(); + + let _ = allocator.allocate(layout).unwrap(); + let first_reserved = allocator.reserved_bytes(); + fill_to_capacity(&allocator); + + let _ = allocator.allocate(layout).unwrap(); + let second_reserved = allocator.reserved_bytes(); + let second_chunk = second_reserved - first_reserved; + + assert!(second_chunk >= first_reserved); + assert!(second_chunk < first_reserved * 2); + } + + #[test] + fn test_next_geometric_node_size_edge_cases() { + assert_eq!( + 4096, + ChainAllocator::::next_geometric_node_size(8192, 4096, 1) + ); + assert_eq!( + usize::MAX, + ChainAllocator::::next_geometric_node_size(usize::MAX / 2 + 1, usize::MAX, 1) + ); + + let invalid_layout_size = isize::MAX as usize + 1; + let current = invalid_layout_size / 2 + 1; + assert_eq!( + current, + ChainAllocator::::next_geometric_node_size(current, invalid_layout_size, 1) + ); + } + #[track_caller] fn fill_to_capacity(allocator: &ChainAllocator) { let remaining_capacity = allocator.remaining_capacity(); @@ -401,8 +553,10 @@ mod tests { let bool_layout = Layout::new::(); + const GROWTH_ITERATIONS: usize = 16; + // test that it fills to capacity a few times. - for _ in 0..100 { + for _ in 0..GROWTH_ITERATIONS { fill_to_capacity(&allocator); // This check is theoretically redundant because fill_to_capacity @@ -426,7 +580,7 @@ mod tests { let reserved_bytes = allocator.reserved_bytes(); // The allocations can theoretically be over-allocated, so use >= to // do the comparison. - assert!(reserved_bytes >= page_size * 100); + assert!(reserved_bytes >= page_size * GROWTH_ITERATIONS); // Everything is filled to capacity except the last iteration. let used_bytes = allocator.used_bytes(); diff --git a/libdd-profiling/src/collections/string_table/mod.rs b/libdd-profiling/src/collections/string_table/mod.rs index 1dc71aceef..c28691960e 100644 --- a/libdd-profiling/src/collections/string_table/mod.rs +++ b/libdd-profiling/src/collections/string_table/mod.rs @@ -83,10 +83,13 @@ impl StringTable { // Keep in mind 32-bit .NET. There is only 2 GiB of virtual memory // total available to an application, and we're not the application, // we're just a piece inside it. Additionally, there may be 2 or more - // string tables in memory at a given time. Talk to .NET profiling - // engineers before making this any bigger. - const SIZE_HINT: usize = 4 * 1024 * 1024; - let bytes = ChainAllocator::new_in(SIZE_HINT, VirtualAllocator {}); + // string tables in memory at a given time. Larger profiles grow + // geometrically up to the historical 4 MiB chunk size, while common + // profiles fit comfortably below this initial size. Talk to .NET + // profiling engineers before making this any bigger. + const SIZE_HINT: usize = 512 * 1024; + const MAX_SIZE_HINT: usize = 4 * 1024 * 1024; + let bytes = ChainAllocator::new_capped_in(SIZE_HINT, MAX_SIZE_HINT, VirtualAllocator {}); let mut strings = HashSet::with_hasher(Hasher::default()); // It varies by implementation, but frequently I've noticed that the diff --git a/libdd-profiling/src/profiles/collections/parallel/slice_set.rs b/libdd-profiling/src/profiles/collections/parallel/slice_set.rs index 0fedce7915..b3f3ab936e 100644 --- a/libdd-profiling/src/profiles/collections/parallel/slice_set.rs +++ b/libdd-profiling/src/profiles/collections/parallel/slice_set.rs @@ -9,8 +9,9 @@ use std::ops::Deref; /// Number of shards used by the parallel slice set and (by extension) /// the string-specific parallel set. Kept as a constant so tests and -/// related code can refer to the same value. -pub const N_SHARDS: usize = 16; +/// related code can refer to the same value. Four shards keep enough +/// concurrency for low-thread-count profilers while lowering the arena floor. +pub const N_SHARDS: usize = 4; /// The initial capacities for Rust's hash map (and set) currently go /// like this: 3, 7, 14, 28. We want to avoid some of the smaller sizes so @@ -64,9 +65,9 @@ impl ParallelSliceSet { pub const fn select_shard(hash: u64) -> usize { // Use lower bits for shard selection to avoid interfering with // Swiss tables' internal SIMD comparisons that use upper 7 bits. - // Using 4 bits provides resilience against hash function deficiencies - // and optimal scaling for low thread counts. - (hash & 0b1111) as usize + // N_SHARDS is a power of two, so this masks the lower log2(N_SHARDS) + // bits and keeps the shard index in bounds. + (hash as usize) & (N_SHARDS - 1) } /// Tries to create a new parallel slice set. diff --git a/libdd-profiling/src/profiles/collections/set.rs b/libdd-profiling/src/profiles/collections/set.rs index 6504f0c0f0..0b277a7d32 100644 --- a/libdd-profiling/src/profiles/collections/set.rs +++ b/libdd-profiling/src/profiles/collections/set.rs @@ -61,7 +61,10 @@ pub struct Set { } impl Set { - pub const SIZE_HINT: usize = 1024 * 1024; + // Keep the per-shard arena small; larger dictionaries grow + // geometrically up to the historical 1 MiB chunk size. + pub const SIZE_HINT: usize = 64 * 1024; + pub const MAX_SIZE_HINT: usize = 1024 * 1024; pub fn try_new() -> Result { Self::try_with_capacity(SET_MIN_CAPACITY) @@ -146,7 +149,11 @@ impl Drop for Set { impl Set { pub(crate) fn try_with_capacity(capacity: usize) -> Result { - let arena = ChainAllocator::new_in(Self::SIZE_HINT, VirtualAllocator {}); + let arena = ChainAllocator::new_capped_in( + Self::SIZE_HINT, + Self::MAX_SIZE_HINT, + VirtualAllocator {}, + ); let mut table = HashTable::new(); // SAFETY: new empty table cannot require rehash, callback unreachable. diff --git a/libdd-profiling/src/profiles/collections/slice_set.rs b/libdd-profiling/src/profiles/collections/slice_set.rs index e2aa3636f7..886337ca51 100644 --- a/libdd-profiling/src/profiles/collections/slice_set.rs +++ b/libdd-profiling/src/profiles/collections/slice_set.rs @@ -23,10 +23,17 @@ pub struct SliceSet { } impl SliceSet { - const SIZE_HINT: usize = 1024 * 1024; + // Keep the per-shard arena small; larger dictionaries grow + // geometrically up to the historical 1 MiB chunk size. + const SIZE_HINT: usize = 64 * 1024; + const MAX_SIZE_HINT: usize = 1024 * 1024; pub fn try_with_capacity(capacity: usize) -> Result { - let arena = ChainAllocator::new_in(Self::SIZE_HINT, VirtualAllocator {}); + let arena = ChainAllocator::new_capped_in( + Self::SIZE_HINT, + Self::MAX_SIZE_HINT, + VirtualAllocator {}, + ); let mut slices = HashTable::new(); // SAFETY: we just made the empty hash table, so there's nothing that diff --git a/libdd-profiling/src/profiles/datatypes/profiles_dictionary.rs b/libdd-profiling/src/profiles/datatypes/profiles_dictionary.rs index cddf616202..5bc7bdfe3c 100644 --- a/libdd-profiling/src/profiles/datatypes/profiles_dictionary.rs +++ b/libdd-profiling/src/profiles/datatypes/profiles_dictionary.rs @@ -123,6 +123,34 @@ mod tests { } } + fn string_arena_reserved_bytes(dict: &ProfilesDictionary) -> usize { + dict.strings + .inner + .arc + .shards + .iter() + .map(|shard| shard.read().arena.reserved_bytes()) + .sum() + } + + fn function_arena_reserved_bytes(dict: &ProfilesDictionary) -> usize { + dict.functions + .storage + .shards + .iter() + .map(|shard| shard.read().arena.reserved_bytes()) + .sum() + } + + fn mapping_arena_reserved_bytes(dict: &ProfilesDictionary) -> usize { + dict.mappings + .storage + .shards + .iter() + .map(|shard| shard.read().arena.reserved_bytes()) + .sum() + } + #[test] fn get_str_round_trip() { let dict = ProfilesDictionary::try_new().unwrap(); @@ -163,6 +191,93 @@ mod tests { assert!(got.file_name.is_empty()); } + #[test] + fn initial_dictionary_arena_floor_is_small() { + const SMALL_ARENA_HINT: usize = 64 * 1024; + + let dict = ProfilesDictionary::try_new().unwrap(); + + assert!(string_arena_reserved_bytes(&dict) <= 4 * SMALL_ARENA_HINT); + assert!(function_arena_reserved_bytes(&dict) <= 4 * SMALL_ARENA_HINT); + assert!(mapping_arena_reserved_bytes(&dict) <= 2 * SMALL_ARENA_HINT); + } + + #[test] + fn dictionary_grows_past_initial_arenas_and_preserves_handles() { + let dict = ProfilesDictionary::try_new().unwrap(); + let initial_string_reserved = string_arena_reserved_bytes(&dict); + let initial_function_reserved = function_arena_reserved_bytes(&dict); + let initial_mapping_reserved = mapping_arena_reserved_bytes(&dict); + + let mut string_ids = Vec::new(); + for i in 0..1024 { + let string = format!("profile-dictionary-growth-{i:04}-{}", "x".repeat(512)); + string_ids.push(dict.try_insert_str2(&string).unwrap()); + } + let first_string_id = string_ids[0]; + assert_string_value( + &dict, + first_string_id, + &format!("profile-dictionary-growth-0000-{}", "x".repeat(512)), + ); + assert_eq!( + StringRef::from(first_string_id), + StringRef::from( + dict.try_insert_str2(&format!( + "profile-dictionary-growth-0000-{}", + "x".repeat(512) + )) + .unwrap() + ) + ); + + let make_function = |i: usize| Function2 { + name: string_ids[i % string_ids.len()], + system_name: string_ids[(i / string_ids.len()) % string_ids.len()], + file_name: string_ids[(i.wrapping_mul(31) + i / string_ids.len()) % string_ids.len()], + }; + let first_function = make_function(0); + let first_function_id = dict.try_insert_function2(first_function).unwrap(); + for i in 1..12_000 { + dict.try_insert_function2(make_function(i)).unwrap(); + } + assert_eq!( + first_function_id.0, + dict.try_insert_function2(first_function).unwrap().0 + ); + let first_function_read = unsafe { dict.get_func(first_function_id) }; + assert_string_id_eq(first_function_read.name, first_function.name); + assert_string_id_eq(first_function_read.system_name, first_function.system_name); + assert_string_id_eq(first_function_read.file_name, first_function.file_name); + + let make_mapping = |i: usize| Mapping2 { + memory_start: i as u64, + memory_limit: i as u64 + 4096, + file_offset: i as u64 * 16, + filename: string_ids[i % string_ids.len()], + build_id: string_ids[(i * 17) % string_ids.len()], + }; + let first_mapping = make_mapping(0); + let first_mapping_id = dict.try_insert_mapping2(first_mapping).unwrap(); + for i in 1..4096 { + dict.try_insert_mapping2(make_mapping(i)).unwrap(); + } + assert_eq!( + first_mapping_id.0, + dict.try_insert_mapping2(first_mapping).unwrap().0 + ); + let first_mapping_read = unsafe { first_mapping_id.read().unwrap() }; + assert_eq!(first_mapping_read.memory_start, first_mapping.memory_start); + assert_eq!(first_mapping_read.memory_limit, first_mapping.memory_limit); + assert_eq!(first_mapping_read.file_offset, first_mapping.file_offset); + assert_string_id_eq(first_mapping_read.filename, first_mapping.filename); + assert_string_id_eq(first_mapping_read.build_id, first_mapping.build_id); + + assert!(string_arena_reserved_bytes(&dict) > initial_string_reserved); + assert!(function_arena_reserved_bytes(&dict) > initial_function_reserved); + assert!(mapping_arena_reserved_bytes(&dict) > initial_mapping_reserved); + } + proptest! { #![proptest_config(ProptestConfig { cases: if cfg!(miri) { 8 } else { 64 },