From fb58d4fc74e114e0ab700a5d7716f7945f2a9e50 Mon Sep 17 00:00:00 2001 From: Yann Hamdaoui Date: Fri, 22 May 2026 10:46:39 +0200 Subject: [PATCH 1/9] feat!(span): introduce VecMap datastructure --- libdd-trace-utils/src/span/mod.rs | 1 + libdd-trace-utils/src/span/vec_map.rs | 260 ++++++++++++++++++++++++++ 2 files changed, 261 insertions(+) create mode 100644 libdd-trace-utils/src/span/vec_map.rs diff --git a/libdd-trace-utils/src/span/mod.rs b/libdd-trace-utils/src/span/mod.rs index e6358dfc7a..980b630c18 100644 --- a/libdd-trace-utils/src/span/mod.rs +++ b/libdd-trace-utils/src/span/mod.rs @@ -4,6 +4,7 @@ pub mod trace_utils; pub mod v04; pub mod v05; +pub mod vec_map; use crate::msgpack_decoder::decode::buffer::read_string_ref_nomut; use crate::msgpack_decoder::decode::error::DecodeError; diff --git a/libdd-trace-utils/src/span/vec_map.rs b/libdd-trace-utils/src/span/vec_map.rs new file mode 100644 index 0000000000..6bacc4e5ca --- /dev/null +++ b/libdd-trace-utils/src/span/vec_map.rs @@ -0,0 +1,260 @@ +// Copyright 2026-Present Datadog, Inc. https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 + +//! This module defines a associative map datastructure for spans data (meta, metrics, etc.) backed +//! by a vector. Spans are mostly allocated and constructed, and more rarely read or mutated. +//! [VecMap] is thus optimized for insertion (which is just `Vec::push`), without any hashing +//! involved. Fetching and removing a value is, on the other hand, linear time in the size of the +//! map. + +use serde::ser::{Serialize, SerializeMap, Serializer}; +use std::borrow::Borrow; +use std::collections::HashSet; +use std::hash::Hash; + +/// A Vec-backed map that provides HashMap-like lookup by key. +/// +/// # Duplicates +/// +/// Duplicates are tolerated: [VecMap::insert] always appends, and [VecMap::get]/[VecMap::get_mut] +/// return the *last* matching entry so that later writes shadow earlier ones. 
This optimizes for +/// fast insert and construction (that might happen on the client's application hot path), avoiding +/// a linear scan on each insert (or a potential costly full re-hashing with a hashmap). +/// Additionally, while overriding a metric or a meta definitively happen, it's assumed to be rare +/// enough so such that the size penalty of duplication is expected to be reasonable. +/// +/// **Important**: note that only [VecMap::get] and [VecMap::get_mut] are duplicate-aware, so to +/// speak. [Vec::len], [Vec::iter], and others just delegates to the underlying `Vec`, and won't +/// deduplicate. +/// +/// Explicit deduplication is currently being done automatically and on-the-fly during +/// serialization. If needed, in the future, we might trigger deduplication on other events, for +/// example at insertion if the size is bigger than a threshold. +#[derive(Clone, Debug, PartialEq, Default)] +pub struct VecMap(Vec<(K, V)>); + +impl VecMap { + #[must_use] + #[inline] + pub fn new() -> Self { + VecMap(Vec::new()) + } + + #[must_use] + #[inline] + pub fn with_capacity(capacity: usize) -> Self { + VecMap(Vec::with_capacity(capacity)) + } + + #[inline] + pub fn insert(&mut self, key: K, value: V) { + self.0.push((key, value)); + } + + #[inline] + pub fn get(&self, key: &Q) -> Option<&V> + where + K: Borrow, + Q: PartialEq + ?Sized, + { + self.0 + .iter() + .rev() + .find(|(k, _)| k.borrow() == key) + .map(|(_, v)| v) + } + + #[inline] + pub fn get_mut(&mut self, key: &Q) -> Option<&mut V> + where + K: Borrow, + Q: PartialEq + ?Sized, + { + self.0 + .iter_mut() + .rev() + .find(|(k, _)| (*k).borrow() == key) + .map(|(_, v)| v) + } + + #[inline] + pub fn contains_key(&self, key: &Q) -> bool + where + K: Borrow, + Q: PartialEq + ?Sized, + { + self.0.iter().any(|(k, _)| k.borrow() == key) + } + + /// Remove all entries matching this key from the map. 
This method use [Vec::retain], and is + /// thus potentially costly (like any removal in a vector-like datastructure). + // Note: we might implement a tombstone or option-based deletion later, if removal is a bit too + // costly. + #[inline] + pub fn remove_slow(&mut self, key: &Q) + where + K: Borrow, + Q: PartialEq + ?Sized, + { + self.0.retain(|(k, _)| k.borrow() != key); + } + + /// Iterate over the element, including duplicate entries. + #[inline] + pub fn iter(&self) -> std::slice::Iter<'_, (K, V)> { + self.0.iter() + } + + /// Iterate mutably over the elements, including duplicate entries. + #[inline] + pub fn iter_mut(&mut self) -> std::slice::IterMut<'_, (K, V)> { + self.0.iter_mut() + } + + /// Return the length of the underlying vector, thus including duplicate entries. + #[inline] + pub fn len(&self) -> usize { + self.0.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } +} + +impl From> for VecMap { + fn from(vec: Vec<(K, V)>) -> Self { + VecMap(vec) + } +} + +impl FromIterator<(K, V)> for VecMap { + fn from_iter>(iter: I) -> Self { + VecMap(iter.into_iter().collect()) + } +} + +impl IntoIterator for VecMap { + type Item = (K, V); + type IntoIter = std::vec::IntoIter<(K, V)>; + + fn into_iter(self) -> Self::IntoIter { + self.0.into_iter() + } +} + +impl<'a, K, V> IntoIterator for &'a VecMap { + type Item = &'a (K, V); + type IntoIter = std::slice::Iter<'a, (K, V)>; + + fn into_iter(self) -> Self::IntoIter { + self.0.iter() + } +} + +impl<'a, K, V> IntoIterator for &'a mut VecMap { + type Item = &'a mut (K, V); + type IntoIter = std::slice::IterMut<'a, (K, V)>; + + fn into_iter(self) -> Self::IntoIter { + self.0.iter_mut() + } +} + +impl Extend<(K, V)> for VecMap { + fn extend>(&mut self, iter: I) { + self.0.extend(iter); + } +} + +impl Serialize for VecMap { + fn serialize(&self, serializer: S) -> Result { + let mut map = serializer.serialize_map(Some(self.0.len()))?; + let mut seen = HashSet::with_capacity(self.len()); + 
for (k, v) in self.0.iter().rev() { + if seen.insert(k) { + map.serialize_entry(k, v)?; + } + } + map.end() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn get_returns_last_inserted() { + let mut m = VecMap::new(); + m.insert("a", 1); + m.insert("a", 2); + assert_eq!(m.get("a"), Some(&2)); + } + + #[test] + fn get_mut_returns_last_inserted() { + let mut m = VecMap::new(); + m.insert("a", 1); + m.insert("a", 2); + *m.get_mut("a").unwrap() = 42; + assert_eq!(m.get("a"), Some(&42)); + // First entry unchanged + assert_eq!(m.iter().next().unwrap().1, 1); + } + + #[test] + fn remove_removes_all_occurrences() { + let mut m = VecMap::new(); + m.insert("a", 1); + m.insert("b", 2); + m.insert("a", 3); + m.remove_slow("a"); + assert_eq!(m.get("a"), None); + assert!(!m.contains_key("a")); + assert_eq!(m.len(), 1); + } + + #[test] + fn contains_key_works() { + let mut m = VecMap::new(); + assert!(!m.contains_key("x")); + m.insert("x", 10); + assert!(m.contains_key("x")); + } + + #[test] + fn from_iterator() { + let m: VecMap<&str, i32> = vec![("a", 1), ("b", 2)].into_iter().collect(); + assert_eq!(m.len(), 2); + assert_eq!(m.get("b"), Some(&2)); + } + + #[test] + fn into_iter_consuming() { + let mut m = VecMap::new(); + m.insert("a", 1); + m.insert("b", 2); + let pairs: Vec<_> = m.into_iter().collect(); + assert_eq!(pairs, vec![("a", 1), ("b", 2)]); + } + + #[test] + fn serialize_deduplicates_keeping_last() { + let mut m = VecMap::new(); + m.insert("a", 0); + m.insert("b", 0); + m.insert("b", 1); + m.insert("a", 1); + m.insert("a", 3); + m.insert("b", 2); + + let serialized: serde_json::Value = serde_json::to_value(&m).unwrap(); + let obj = serialized.as_object().unwrap(); + + assert_eq!(obj.len(), 2); + assert_eq!(obj.get("a").unwrap(), 3); + assert_eq!(obj.get("b").unwrap(), 2); + } +} From 4d24a575809b3f5b78d31444c4ae2ed1947ef511 Mon Sep 17 00:00:00 2001 From: Yann Hamdaoui Date: Fri, 22 May 2026 12:49:12 +0200 Subject: [PATCH 2/9] chore: minor 
improvements --- libdd-trace-utils/src/span/vec_map.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libdd-trace-utils/src/span/vec_map.rs b/libdd-trace-utils/src/span/vec_map.rs index 6bacc4e5ca..5fd042765a 100644 --- a/libdd-trace-utils/src/span/vec_map.rs +++ b/libdd-trace-utils/src/span/vec_map.rs @@ -20,7 +20,7 @@ use std::hash::Hash; /// return the *last* matching entry so that later writes shadow earlier ones. This optimizes for /// fast insert and construction (that might happen on the client's application hot path), avoiding /// a linear scan on each insert (or a potential costly full re-hashing with a hashmap). -/// Additionally, while overriding a metric or a meta definitively happen, it's assumed to be rare +/// Additionally, while overriding a metric or a meta definitively happens, it's assumed to be rare /// enough so such that the size penalty of duplication is expected to be reasonable. /// /// **Important**: note that only [VecMap::get] and [VecMap::get_mut] are duplicate-aware, so to @@ -55,7 +55,7 @@ impl VecMap { pub fn get(&self, key: &Q) -> Option<&V> where K: Borrow, - Q: PartialEq + ?Sized, + Q: ?Sized + PartialEq, { self.0 .iter() @@ -68,7 +68,7 @@ impl VecMap { pub fn get_mut(&mut self, key: &Q) -> Option<&mut V> where K: Borrow, - Q: PartialEq + ?Sized, + Q: ?Sized + PartialEq, { self.0 .iter_mut() @@ -81,12 +81,12 @@ impl VecMap { pub fn contains_key(&self, key: &Q) -> bool where K: Borrow, - Q: PartialEq + ?Sized, + Q: ?Sized + PartialEq, { self.0.iter().any(|(k, _)| k.borrow() == key) } - /// Remove all entries matching this key from the map. This method use [Vec::retain], and is + /// Remove all entries matching this key from the map. This method uses [Vec::retain], and is /// thus potentially costly (like any removal in a vector-like datastructure). // Note: we might implement a tombstone or option-based deletion later, if removal is a bit too // costly. 
@@ -94,7 +94,7 @@ impl VecMap { pub fn remove_slow(&mut self, key: &Q) where K: Borrow, - Q: PartialEq + ?Sized, + Q: ?Sized + PartialEq, { self.0.retain(|(k, _)| k.borrow() != key); } From 8b24795e6cddc3a4a64d4c3768f00fc662e9dafe Mon Sep 17 00:00:00 2001 From: Yann Hamdaoui Date: Fri, 22 May 2026 12:50:49 +0200 Subject: [PATCH 3/9] fix: do not provide wrong size to serializer --- libdd-trace-utils/src/span/vec_map.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdd-trace-utils/src/span/vec_map.rs b/libdd-trace-utils/src/span/vec_map.rs index 5fd042765a..7f1c1e120b 100644 --- a/libdd-trace-utils/src/span/vec_map.rs +++ b/libdd-trace-utils/src/span/vec_map.rs @@ -170,7 +170,7 @@ impl Extend<(K, V)> for VecMap { impl Serialize for VecMap { fn serialize(&self, serializer: S) -> Result { - let mut map = serializer.serialize_map(Some(self.0.len()))?; + let mut map = serializer.serialize_map(None)?; let mut seen = HashSet::with_capacity(self.len()); for (k, v) in self.0.iter().rev() { if seen.insert(k) { From 3141a05ebe0a11e783e6e17416c6cc9ce3127833 Mon Sep 17 00:00:00 2001 From: Yann Hamdaoui Date: Tue, 26 May 2026 10:47:29 +0200 Subject: [PATCH 4/9] doc: fix typo (Vec -> VecMap) Co-authored-by: Jules Wiriath <53870805+Aaalibaba42@users.noreply.github.com> --- libdd-trace-utils/src/span/vec_map.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdd-trace-utils/src/span/vec_map.rs b/libdd-trace-utils/src/span/vec_map.rs index 7f1c1e120b..826a8efbcc 100644 --- a/libdd-trace-utils/src/span/vec_map.rs +++ b/libdd-trace-utils/src/span/vec_map.rs @@ -24,7 +24,7 @@ use std::hash::Hash; /// enough so such that the size penalty of duplication is expected to be reasonable. /// /// **Important**: note that only [VecMap::get] and [VecMap::get_mut] are duplicate-aware, so to -/// speak. [Vec::len], [Vec::iter], and others just delegates to the underlying `Vec`, and won't +/// speak. 
[VecMap::len], [VecMap::iter], and others just delegates to the underlying `Vec`, and won't /// deduplicate. /// /// Explicit deduplication is currently being done automatically and on-the-fly during From 0437af55ca3646923ea88a44eb99df8da5326025 Mon Sep 17 00:00:00 2001 From: Yann Hamdaoui Date: Tue, 26 May 2026 16:14:02 +0200 Subject: [PATCH 5/9] fix: avoid unsized deserialisation --- libdd-trace-utils/src/span/vec_map.rs | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/libdd-trace-utils/src/span/vec_map.rs b/libdd-trace-utils/src/span/vec_map.rs index 826a8efbcc..4ba1770975 100644 --- a/libdd-trace-utils/src/span/vec_map.rs +++ b/libdd-trace-utils/src/span/vec_map.rs @@ -9,7 +9,7 @@ use serde::ser::{Serialize, SerializeMap, Serializer}; use std::borrow::Borrow; -use std::collections::HashSet; +use std::collections::HashMap; use std::hash::Hash; /// A Vec-backed map that provides HashMap-like lookup by key. @@ -24,8 +24,8 @@ use std::hash::Hash; /// enough so such that the size penalty of duplication is expected to be reasonable. /// /// **Important**: note that only [VecMap::get] and [VecMap::get_mut] are duplicate-aware, so to -/// speak. [VecMap::len], [VecMap::iter], and others just delegates to the underlying `Vec`, and won't -/// deduplicate. +/// speak. [VecMap::len], [VecMap::iter], and others just delegates to the underlying `Vec`, and +/// won't deduplicate. /// /// Explicit deduplication is currently being done automatically and on-the-fly during /// serialization. 
If needed, in the future, we might trigger deduplication on other events, for @@ -170,14 +170,16 @@ impl Extend<(K, V)> for VecMap { impl Serialize for VecMap { fn serialize(&self, serializer: S) -> Result { - let mut map = serializer.serialize_map(None)?; - let mut seen = HashSet::with_capacity(self.len()); - for (k, v) in self.0.iter().rev() { - if seen.insert(k) { - map.serialize_entry(k, v)?; - } - } - map.end() + // We pre-compute the deduped map. If deduplication is done on the fly during serialization, + // we can't provide a length up front to the serializer, and the current one (rmp) will + // allocate an intermediate buffer defensively. + self.0 + .iter() + .map(|(k, v)| (k, v)) + // Since the iterator is sized, `collect()` should pre-allocate with the right capacity + // directly. + .collect::>() + .serialize(serializer) } } From f047caecb1da2d8b114a2c278e417b7ccc5ed244 Mon Sep 17 00:00:00 2001 From: Yann Hamdaoui Date: Tue, 26 May 2026 16:16:28 +0200 Subject: [PATCH 6/9] style: remove unused import --- libdd-trace-utils/src/span/vec_map.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdd-trace-utils/src/span/vec_map.rs b/libdd-trace-utils/src/span/vec_map.rs index 4ba1770975..f66994edc1 100644 --- a/libdd-trace-utils/src/span/vec_map.rs +++ b/libdd-trace-utils/src/span/vec_map.rs @@ -7,7 +7,7 @@ //! involved. Fetching and removing a value is, on the other hand, linear time in the size of the //! map. 
-use serde::ser::{Serialize, SerializeMap, Serializer}; +use serde::ser::{Serialize, Serializer}; use std::borrow::Borrow; use std::collections::HashMap; use std::hash::Hash; From 33a91d29e7368eb36c26be4c3b63c7d442656fc1 Mon Sep 17 00:00:00 2001 From: Yann Hamdaoui Date: Wed, 27 May 2026 10:30:38 +0200 Subject: [PATCH 7/9] doc: improve code comments --- libdd-trace-utils/src/span/vec_map.rs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/libdd-trace-utils/src/span/vec_map.rs b/libdd-trace-utils/src/span/vec_map.rs index f66994edc1..61d9c14518 100644 --- a/libdd-trace-utils/src/span/vec_map.rs +++ b/libdd-trace-utils/src/span/vec_map.rs @@ -5,7 +5,8 @@ //! by a vector. Spans are mostly allocated and constructed, and more rarely read or mutated. //! [VecMap] is thus optimized for insertion (which is just `Vec::push`), without any hashing //! involved. Fetching and removing a value is, on the other hand, linear time in the size of the -//! map. +//! map. However, since meta and metrics are expected to be typically small (20ish elements or +//! less), linear scan is usually still competitive with hashmap's `get`. use serde::ser::{Serialize, Serializer}; use std::borrow::Borrow; @@ -18,8 +19,8 @@ use std::hash::Hash; /// /// Duplicates are tolerated: [VecMap::insert] always appends, and [VecMap::get]/[VecMap::get_mut] /// return the *last* matching entry so that later writes shadow earlier ones. This optimizes for -/// fast insert and construction (that might happen on the client's application hot path), avoiding -/// a linear scan on each insert (or a potential costly full re-hashing with a hashmap). +/// fast insertion and construction (that might happen on the client's application hot path), +/// avoiding a linear scan on each insert, or a potential full re-hashing with a hashmap. 
/// Additionally, while overriding a metric or a meta definitively happens, it's assumed to be rare /// enough so such that the size penalty of duplication is expected to be reasonable. /// @@ -170,14 +171,14 @@ impl Extend<(K, V)> for VecMap { impl Serialize for VecMap { fn serialize(&self, serializer: S) -> Result { - // We pre-compute the deduped map. If deduplication is done on the fly during serialization, - // we can't provide a length up front to the serializer, and the current one (rmp) will - // allocate an intermediate buffer defensively. + // We pre-compute the deduped map. If deduplication were done on the fly during + // serialization, we couldn't provide a length up front to the serializer, and the current + // one (rmp) will allocate an intermediate buffer defensively. self.0 .iter() .map(|(k, v)| (k, v)) // Since the iterator is sized, `collect()` should pre-allocate with the right capacity - // directly. + // in one shot. .collect::>() .serialize(serializer) } From 7e38b233e8a1b336796d493b7ca6a31ab3c94bd3 Mon Sep 17 00:00:00 2001 From: Yann Hamdaoui Date: Wed, 27 May 2026 13:53:41 +0200 Subject: [PATCH 8/9] feat: in-place deduplication --- libdd-trace-utils/src/span/vec_map.rs | 259 +++++++++++++++++++++++--- 1 file changed, 228 insertions(+), 31 deletions(-) diff --git a/libdd-trace-utils/src/span/vec_map.rs b/libdd-trace-utils/src/span/vec_map.rs index 61d9c14518..4e53996ebe 100644 --- a/libdd-trace-utils/src/span/vec_map.rs +++ b/libdd-trace-utils/src/span/vec_map.rs @@ -10,7 +10,7 @@ use serde::ser::{Serialize, Serializer}; use std::borrow::Borrow; -use std::collections::HashMap; +use std::collections::HashSet; use std::hash::Hash; /// A Vec-backed map that provides HashMap-like lookup by key. @@ -28,28 +28,56 @@ use std::hash::Hash; /// speak. [VecMap::len], [VecMap::iter], and others just delegates to the underlying `Vec`, and /// won't deduplicate. 
/// -/// Explicit deduplication is currently being done automatically and on-the-fly during -/// serialization. If needed, in the future, we might trigger deduplication on other events, for -/// example at insertion if the size is bigger than a threshold. -#[derive(Clone, Debug, PartialEq, Default)] -pub struct VecMap(Vec<(K, V)>); +/// Explicit deduplication is currently being done on-demand by [VecMap::dedup]. An internal flag is +/// used to avoid undue deduplication (see [VecMap::dedup]). `VecMap` is automatically deduped +/// before serialization. +/// +/// In the future, we could trigger deduplication on other events, for example at insertion if the +/// size is bigger than a threshold (and we haven't deduped for `x` operations). +#[derive(Clone, Debug, PartialEq)] +pub struct VecMap { + data: Vec<(K, V)>, + /// Deduped is a flag that is set after entry deduplication. It is dirtied (set to `false`) + /// when any modification is performed (`deduped == false` doesn't imply there are actual + /// duplicates, just than there might be). This is useful to avoid performing deduplication + /// several times in the export pipeline. + deduped: bool, +} + +impl Default for VecMap { + fn default() -> Self { + Self { + data: Default::default(), + deduped: false, + } + } +} impl VecMap { #[must_use] #[inline] pub fn new() -> Self { - VecMap(Vec::new()) + Self::default() + } + + /// Dirty the `dedup` flag after a mutation that could introduce duplicates. 
+ fn dirty(&mut self) { + self.deduped = false; } #[must_use] #[inline] pub fn with_capacity(capacity: usize) -> Self { - VecMap(Vec::with_capacity(capacity)) + VecMap { + data: Vec::with_capacity(capacity), + deduped: false, + } } #[inline] pub fn insert(&mut self, key: K, value: V) { - self.0.push((key, value)); + self.data.push((key, value)); + self.dirty(); } #[inline] @@ -58,7 +86,7 @@ impl VecMap { K: Borrow, Q: ?Sized + PartialEq, { - self.0 + self.data .iter() .rev() .find(|(k, _)| k.borrow() == key) @@ -71,7 +99,7 @@ impl VecMap { K: Borrow, Q: ?Sized + PartialEq, { - self.0 + self.data .iter_mut() .rev() .find(|(k, _)| (*k).borrow() == key) @@ -84,7 +112,7 @@ impl VecMap { K: Borrow, Q: ?Sized + PartialEq, { - self.0.iter().any(|(k, _)| k.borrow() == key) + self.data.iter().any(|(k, _)| k.borrow() == key) } /// Remove all entries matching this key from the map. This method uses [Vec::retain], and is @@ -97,42 +125,83 @@ impl VecMap { K: Borrow, Q: ?Sized + PartialEq, { - self.0.retain(|(k, _)| k.borrow() != key); + self.data.retain(|(k, _)| k.borrow() != key); } /// Iterate over the element, including duplicate entries. #[inline] pub fn iter(&self) -> std::slice::Iter<'_, (K, V)> { - self.0.iter() + self.data.iter() } /// Iterate mutably over the elements, including duplicate entries. #[inline] pub fn iter_mut(&mut self) -> std::slice::IterMut<'_, (K, V)> { - self.0.iter_mut() + self.dirty(); + self.data.iter_mut() } /// Return the length of the underlying vector, thus including duplicate entries. #[inline] pub fn len(&self) -> usize { - self.0.len() + self.data.len() } #[inline] pub fn is_empty(&self) -> bool { - self.0.is_empty() + self.data.is_empty() + } + + /// Return `true` if the map hasn't been extended since the last call to [Self::dedup], + /// guaranteeing that the underlying vector doesn't have any duplicate key. + /// + /// If `is_deduped` returns `false`, the map may have duplicate keys. 
+ #[inline] + pub fn is_deduped(&self) -> bool { + self.deduped + } +} + +impl VecMap { + /// Remove entries with a duplicate key, only keeping the last one. After this, a flag is set + /// internally, such that as long as the map isn't extended or mutably iterated, the next + /// [Self::dedup] doesn't perform the work again. + pub fn dedup(&mut self) { + if self.deduped { + return; + } + + // Since we're going to shuffle elements around, it's not easy to keep references to keys in + // the deduping set. The simplest is to clone them. + let mut seen = HashSet::with_capacity(self.len()); + + self.data.reverse(); + self.data.retain(|(k, _)| seen.insert(k.clone())); + self.deduped = true; } } impl From> for VecMap { - fn from(vec: Vec<(K, V)>) -> Self { - VecMap(vec) + fn from(data: Vec<(K, V)>) -> Self { + Self { + data, + deduped: false, + } + } +} + +impl From> for Vec<(K, V)> { + fn from(value: VecMap) -> Self { + value.data } } impl FromIterator<(K, V)> for VecMap { fn from_iter>(iter: I) -> Self { - VecMap(iter.into_iter().collect()) + Self { + data: iter.into_iter().collect(), + deduped: false, + } } } @@ -141,7 +210,7 @@ impl IntoIterator for VecMap { type IntoIter = std::vec::IntoIter<(K, V)>; fn into_iter(self) -> Self::IntoIter { - self.0.into_iter() + self.data.into_iter() } } @@ -150,7 +219,7 @@ impl<'a, K, V> IntoIterator for &'a VecMap { type IntoIter = std::slice::Iter<'a, (K, V)>; fn into_iter(self) -> Self::IntoIter { - self.0.iter() + self.data.iter() } } @@ -159,28 +228,44 @@ impl<'a, K, V> IntoIterator for &'a mut VecMap { type IntoIter = std::slice::IterMut<'a, (K, V)>; fn into_iter(self) -> Self::IntoIter { - self.0.iter_mut() + self.data.iter_mut() } } impl Extend<(K, V)> for VecMap { fn extend>(&mut self, iter: I) { - self.0.extend(iter); + self.dirty(); + self.data.extend(iter); } } impl Serialize for VecMap { fn serialize(&self, serializer: S) -> Result { + use serde::ser::SerializeMap; + use std::collections::HashMap; + // We pre-compute 
the deduped map. If deduplication were done on the fly during // serialization, we couldn't provide a length up front to the serializer, and the current // one (rmp) will allocate an intermediate buffer defensively. - self.0 - .iter() - .map(|(k, v)| (k, v)) - // Since the iterator is sized, `collect()` should pre-allocate with the right capacity - // in one shot. - .collect::>() - .serialize(serializer) + if self.deduped { + let mut map_ser = serializer.serialize_map(Some(self.len()))?; + + for (k, v) in self { + map_ser.serialize_entry(k, v)?; + } + + map_ser.end() + } else { + // Note: using `dedup` would need an additional `clone()` of the whole map here. We can + // use references instead. + self.data + .iter() + .map(|(k, v)| (k, v)) + // Since the iterator is sized, `collect()` should pre-allocate with the right + // capacity in one shot. + .collect::>() + .serialize(serializer) + } } } @@ -243,6 +328,118 @@ mod tests { assert_eq!(pairs, vec![("a", 1), ("b", 2)]); } + #[test] + fn is_deduped_false_initially() { + let m: VecMap<&str, i32> = VecMap::new(); + assert!(!m.is_deduped()); + } + + #[test] + fn is_deduped_false_after_from() { + let m: VecMap<&str, i32> = vec![("a", 1)].into(); + assert!(!m.is_deduped()); + } + + #[test] + fn is_deduped_false_after_collect() { + let m: VecMap<&str, i32> = vec![("a", 1)].into_iter().collect(); + assert!(!m.is_deduped()); + } + + #[test] + fn dedup_sets_flag() { + let mut m = VecMap::new(); + m.insert("a", 1); + assert!(!m.is_deduped()); + m.dedup(); + assert!(m.is_deduped()); + } + + #[test] + fn dedup_on_empty_map() { + let mut m: VecMap = VecMap::new(); + m.dedup(); + assert!(m.is_deduped()); + assert!(m.is_empty()); + } + + #[test] + fn dedup_no_duplicates() { + let mut m = VecMap::new(); + m.insert("a", 1); + m.insert("b", 2); + m.insert("c", 3); + m.dedup(); + assert_eq!(m.len(), 3); + assert_eq!(m.get("a"), Some(&1)); + assert_eq!(m.get("b"), Some(&2)); + assert_eq!(m.get("c"), Some(&3)); + } + + #[test] + fn 
dedup_keeps_last_value() { + let mut m = VecMap::new(); + m.insert("a", 1); + m.insert("b", 10); + m.insert("a", 2); + m.insert("a", 3); + m.insert("b", 20); + m.dedup(); + assert_eq!(m.len(), 2); + assert_eq!(m.get("a"), Some(&3)); + assert_eq!(m.get("b"), Some(&20)); + } + + #[test] + fn dedup_is_idempotent() { + let mut m = VecMap::new(); + m.insert("a", 1); + m.insert("a", 2); + m.dedup(); + assert!(m.is_deduped()); + assert_eq!(m.len(), 1); + m.dedup(); + assert!(m.is_deduped()); + assert_eq!(m.len(), 1); + assert_eq!(m.get("a"), Some(&2)); + } + + #[test] + fn insert_dirties_dedup_flag() { + let mut m = VecMap::new(); + m.insert("a", 1); + m.dedup(); + assert!(m.is_deduped()); + + m.insert("b", 2); + assert!(!m.is_deduped()); + } + + #[test] + fn extend_dirties_dedup_flag() { + let mut m = VecMap::new(); + m.insert("a", 1); + m.dedup(); + assert!(m.is_deduped()); + + m.extend(vec![("b", 2)]); + assert!(!m.is_deduped()); + } + + #[test] + fn iter_mut_dirties_dedup_flag() { + let mut m = VecMap::new(); + m.insert("a", 1); + m.dedup(); + assert!(m.is_deduped()); + + for (_, v) in m.iter_mut() { + *v += 1; + } + + assert!(!m.is_deduped()); + } + #[test] fn serialize_deduplicates_keeping_last() { let mut m = VecMap::new(); From f502f564bf09b1a688df62be152548b5791f2d8d Mon Sep 17 00:00:00 2001 From: Yann Hamdaoui Date: Wed, 27 May 2026 16:43:26 +0200 Subject: [PATCH 9/9] fix: add missing dirty() + comment improvements --- libdd-trace-utils/src/span/vec_map.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/libdd-trace-utils/src/span/vec_map.rs b/libdd-trace-utils/src/span/vec_map.rs index 4e53996ebe..9d2b5391ae 100644 --- a/libdd-trace-utils/src/span/vec_map.rs +++ b/libdd-trace-utils/src/span/vec_map.rs @@ -34,13 +34,19 @@ use std::hash::Hash; /// /// In the future, we could trigger deduplication on other events, for example at insertion if the /// size is bigger than a threshold (and we haven't deduped for `x` operations). 
+/// +/// # Ordering +/// +/// As this is a map, iteration order is not defined nor guaranteed. In practice, iteration follows +/// insertion order, but [Self::dedup] will reverse the underlying vector. #[derive(Clone, Debug, PartialEq)] pub struct VecMap { data: Vec<(K, V)>, /// Deduped is a flag that is set after entry deduplication. It is dirtied (set to `false`) - /// when any modification is performed (`deduped == false` doesn't imply there are actual - /// duplicates, just than there might be). This is useful to avoid performing deduplication - /// several times in the export pipeline. + /// when any modification that could create duplicates is performed (`deduped == false` + /// doesn't imply there are actual duplicates, just that there might be). This is useful to + /// avoid performing deduplication several times in a row, for example in the export + /// pipeline. deduped: bool, } @@ -228,6 +234,8 @@ impl<'a, K, V> IntoIterator for &'a mut VecMap { type IntoIter = std::slice::IterMut<'a, (K, V)>; fn into_iter(self) -> Self::IntoIter { + // Since we iterate on keys as well, they can be modified, and introduce duplicates. + self.dirty(); self.data.iter_mut() } }