Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions datasketches/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ frequencies = []
hll = []
tdigest = []
theta = []
tuple = ["theta"]

[dev-dependencies]
googletest = { workspace = true }
Expand Down
10 changes: 10 additions & 0 deletions datasketches/src/codec/decode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,16 @@ impl SketchSlice<'_> {
self.slice.set_position(pos + n);
}

/// Returns the not-yet-read portion of the underlying slice.
///
/// Useful for handing the remaining bytes to a variable-length decoder that reports how many
/// bytes it consumed; pair it with [`advance`](Self::advance).
pub fn remaining(&self) -> &[u8] {
let buf = self.slice.get_ref();
let pos = (self.slice.position() as usize).min(buf.len());
&buf[pos..]
}

/// Reads exactly `buf.len()` bytes from the slice into `buf`.
pub fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> {
self.slice.read_exact(buf)
Expand Down
9 changes: 9 additions & 0 deletions datasketches/src/codec/family.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,15 @@ impl Family {
max_pre_longs: 1,
};

/// Tuple Sketch for cardinality estimation with per-key summaries.
///
/// Only compiled when the `tuple` Cargo feature is enabled.
#[cfg(feature = "tuple")]
pub const TUPLE: Family = Family {
// Numeric identifier of this family.
id: 9,
// Name of this family.
name: "TUPLE",
// NOTE(review): presumably the inclusive range of preamble sizes (in 8-byte longs) accepted
// for this family — confirm against the `Family` field documentation.
min_pre_longs: 1,
max_pre_longs: 3,
};

/// The Frequency family of sketches.
#[cfg(feature = "frequencies")]
pub const FREQUENCY: Family = Family {
Expand Down
2 changes: 2 additions & 0 deletions datasketches/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ pub mod hll;
pub mod tdigest;
#[cfg(feature = "theta")]
pub mod theta;
#[cfg(feature = "tuple")]
pub mod tuple;

// common modules
pub mod codec;
Expand Down
11 changes: 3 additions & 8 deletions datasketches/src/theta/hash_table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,7 @@ use crate::theta::HASH_TABLE_REBUILD_THRESHOLD;
use crate::theta::HASH_TABLE_RESIZE_THRESHOLD;
use crate::theta::MAX_THETA;
use crate::theta::MIN_LG_K;

/// Stride hash bits (7 bits for stride calculation)
const STRIDE_HASH_BITS: u8 = 7;

/// Stride mask
const STRIDE_MASK: u64 = (1 << STRIDE_HASH_BITS) - 1;
use crate::theta::STRIDE_MASK;

/// Specific hash table for theta sketch
///
Expand Down Expand Up @@ -391,7 +386,7 @@ impl ThetaHashTable {
/// Compute initial lg_size for hash table based on target lg_size, minimum lg_size, and resize
/// factor. Make sure `lg_target = lg_init + n * lg_resize_factor`, where `n` is an integer and
/// `lg_init >= lg_min`
fn starting_sub_multiple(lg_target: u8, lg_min: u8, lg_resize_factor: u8) -> u8 {
pub(crate) fn starting_sub_multiple(lg_target: u8, lg_min: u8, lg_resize_factor: u8) -> u8 {
if lg_target <= lg_min {
lg_min
} else if lg_resize_factor == 0 {
Expand All @@ -402,7 +397,7 @@ fn starting_sub_multiple(lg_target: u8, lg_min: u8, lg_resize_factor: u8) -> u8
}

/// Compute initial theta for hash table based on sampling probability.
fn starting_theta_from_sampling_probability(sampling_probability: f32) -> u64 {
pub(crate) fn starting_theta_from_sampling_probability(sampling_probability: f32) -> u64 {
if sampling_probability < 1.0 {
(MAX_THETA as f64 * sampling_probability as f64) as u64
} else {
Expand Down
22 changes: 16 additions & 6 deletions datasketches/src/theta/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,21 +45,31 @@ mod intersection;
mod serialization;
mod sketch;

// These helpers are re-exported only for the Tuple sketch, which reuses the Theta hash-table
// sizing.
#[cfg(feature = "tuple")]
pub(crate) use self::hash_table::starting_sub_multiple;
#[cfg(feature = "tuple")]
pub(crate) use self::hash_table::starting_theta_from_sampling_probability;
pub use self::intersection::ThetaIntersection;
pub use self::sketch::CompactThetaSketch;
pub use self::sketch::ThetaSketch;
pub use self::sketch::ThetaSketchBuilder;
pub use self::sketch::ThetaSketchView;

/// Maximum theta value (signed max for compatibility with Java)
const MAX_THETA: u64 = i64::MAX as u64;
pub(crate) const MAX_THETA: u64 = i64::MAX as u64;
/// Minimum log2 of K
const MIN_LG_K: u8 = 5;
pub(crate) const MIN_LG_K: u8 = 5;
/// Maximum log2 of K
const MAX_LG_K: u8 = 26;
pub(crate) const MAX_LG_K: u8 = 26;
/// Default log2 of K
const DEFAULT_LG_K: u8 = 12;
pub(crate) const DEFAULT_LG_K: u8 = 12;
/// Resize threshold (0.5 = 50% load factor)
const HASH_TABLE_RESIZE_THRESHOLD: f64 = 0.5;
pub(crate) const HASH_TABLE_RESIZE_THRESHOLD: f64 = 0.5;
/// Rebuild threshold (15/16 = 93.75% load factor)
const HASH_TABLE_REBUILD_THRESHOLD: f64 = 15.0 / 16.0;
pub(crate) const HASH_TABLE_REBUILD_THRESHOLD: f64 = 15.0 / 16.0;
/// Stride hash bits (7 bits for stride calculation)
pub(crate) const STRIDE_HASH_BITS: u8 = 7;
/// Stride mask
pub(crate) const STRIDE_MASK: u64 = (1 << STRIDE_HASH_BITS) - 1;
Loading