Skip to content

Commit 253de6a

Browse files
authored
Merge pull request #29 from apache/expand_docs
Expand docs
2 parents 5bffb0b + 7e77363 commit 253de6a

29 files changed

Lines changed: 681 additions & 106 deletions

datasketches/KernelFunction.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,12 @@
2424
# Each implementation must extend the KernelFunction class
2525
# and define the __call__ method
2626

27-
# Implements a basic Gaussian Kernel
2827
class GaussianKernel(KernelFunction):
28+
'''Implements a basic Gaussian kernel
29+
30+
:param bandwidth: The kernel bandwidth, default 1.0
31+
:type bandwidth: float
32+
'''
2933
def __init__(self, bandwidth: float=1.0):
3034
KernelFunction.__init__(self)
3135
self._bw = bandwidth

datasketches/PySerDe.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,12 @@
3232
# returns a tuple with the newly reconstructed object and the
3333
# total number of bytes beyond the offset read from the input data.
3434

35-
# Implements a simple string-encoding scheme where a string is
36-
# written as <num_bytes> <string_contents>, with no null termination.
37-
# This format allows pre-allocating each string, at the cost of
38-
# additional storage. Using this format, the serialized string consumes
39-
# 4 + len(item) bytes.
4035
class PyStringsSerDe(PyObjectSerDe):
36+
'''Implements a simple string-encoding scheme where a string is
37+
written as ``<num_bytes> <string_contents>``, with no null termination.
38+
This format allows pre-allocating each string, at the cost of
39+
additional storage. Using this format, the serialized string consumes
40+
``4 + len(item)`` bytes.'''
4141
def get_size(self, item):
4242
return int(4 + len(item))
4343

@@ -54,9 +54,9 @@ def from_bytes(self, data: bytes, offset: int):
5454
str = data[offset+4:offset+4+num_chars].decode()
5555
return (str, 4+num_chars)
5656

57-
# Implements an integer encoding scheme where each integer is written
58-
# as a 32-bit (4 byte) little-endian value.
5957
class PyIntsSerDe(PyObjectSerDe):
58+
'''Implements an integer encoding scheme where each integer is written
59+
as a 32-bit (4 byte) little-endian value.'''
6060
def get_size(self, item):
6161
return int(4)
6262

@@ -68,9 +68,9 @@ def from_bytes(self, data: bytes, offset: int):
6868
return (val, 4)
6969

7070

71-
# Implements an integer encoding scheme where each integer is written
72-
# as a 64-bit (8 byte) little-endian value.
7371
class PyLongsSerDe(PyObjectSerDe):
72+
'''Implements an integer encoding scheme where each integer is written
73+
as a 64-bit (8 byte) little-endian value.'''
7474
def get_size(self, item):
7575
return int(8)
7676

@@ -82,9 +82,9 @@ def from_bytes(self, data: bytes, offset: int):
8282
return (val, 8)
8383

8484

85-
# Implements a floating point encoding scheme where each value is written
86-
# as a 32-bit floating point value.
8785
class PyFloatsSerDe(PyObjectSerDe):
86+
'''Implements a floating point encoding scheme where each value is written
87+
as a 32-bit floating point value.'''
8888
def get_size(self, item):
8989
return int(4)
9090

@@ -96,9 +96,9 @@ def from_bytes(self, data: bytes, offset: int):
9696
return (val, 4)
9797

9898

99-
# Implements a floating point encoding scheme where each value is written
100-
# as a 64-bit floating point value.
10199
class PyDoublesSerDe(PyObjectSerDe):
100+
'''Implements a floating point encoding scheme where each value is written
101+
as a 64-bit floating point value.'''
102102
def get_size(self, item):
103103
return int(8)
104104

datasketches/TuplePolicy.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,17 @@
2222
# This file provides an example Python Tuple Policy implementation.
2323
#
2424
# Each implementation must extend the PyTuplePolicy class and define
25-
# two methods:
25+
# the following methods:
2626
# * create_summary() returns a new Summary object
2727
# * update_summary(summary, update) applies the relevant policy to update the
2828
# provided summary with the data in update.
2929
# * __call__ may be similar to update_summary but allows a different
3030
# implementation for set operations (union and intersection)
3131

32-
# Implements an accumulator summary policy, where new values are
33-
# added to the existing value.
3432
class AccumulatorPolicy(TuplePolicy):
33+
'''Implements an accumulator summary policy, where new values
34+
are added to the existing value.'''
35+
3536
def __init__(self):
3637
TuplePolicy.__init__(self)
3738

@@ -47,8 +48,9 @@ def __call__(self, summary: int, update: int) -> int:
4748
return summary
4849

4950

50-
# Implements a MAX rule, where the largest integer value is always kept
5151
class MaxIntPolicy(TuplePolicy):
52+
'''Implements a MAX rule, where the largest integer value is always kept.'''
53+
5254
def __init__(self):
5355
TuplePolicy.__init__(self)
5456

@@ -62,8 +64,9 @@ def __call__(self, summary: int, update: int) -> int:
6264
return max(summary, update)
6365

6466

65-
# Implements a MIN rule, where the smallest integer value is always kept
6667
class MinIntPolicy(TuplePolicy):
68+
'''Implements a MIN rule, where the smallest integer value is always kept.'''
69+
6770
def __init__(self):
6871
TuplePolicy.__init__(self)
6972

docs/source/count_min_sketch.rst

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ heavy hitters.
1717
:members:
1818
:undoc-members:
1919
:exclude-members: deserialize, suggest_num_buckets, suggest_num_hashes
20-
:member-order: groupwise
2120

2221
.. rubric:: Static Methods:
2322

docs/source/cpc.rst

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,20 @@ For additional security this sketch can be configured with a user-specified hash
1313
.. autoclass:: _datasketches.cpc_sketch
1414
:members:
1515
:undoc-members:
16-
:exclude-members: deserialize,
17-
:member-order: groupwise
16+
:exclude-members: deserialize
1817

1918
.. rubric:: Static Methods:
2019

2120
.. automethod:: deserialize
2221

2322
.. rubric:: Non-static Methods:
2423

24+
.. automethod:: __init__
25+
26+
27+
.. autoclass:: _datasketches.cpc_union
28+
:members:
29+
:undoc-members:
30+
:exclude-members: deserialize
31+
32+
.. automethod:: __init__

docs/source/density_sketch.rst

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
Density Sketch
22
--------------
3+
4+
.. currentmodule:: datasketches
5+
36
Builds a coreset from the given set of input points.
47
Provides density estimate at a given point.
58

@@ -9,9 +12,17 @@ https://proceedings.mlr.press/v99/karnin19a/karnin19a.pdf
912

1013
Inspired by the following implementation: https://github.com/edoliberty/streaming-quantiles/blob/f688c8161a25582457b0a09deb4630a81406293b/gde.py
1114

12-
.. autoclass:: datasketches.density_sketch
15+
Requires the use of a :class:`KernelFunction` to compute the distance between two vectors.
16+
17+
.. autoclass:: density_sketch
1318
:members:
1419
:undoc-members:
15-
16-
.. autoclass:: datasketches.GaussianKernel
17-
:members:
20+
:exclude-members: deserialize
21+
22+
.. rubric:: Static Methods:
23+
24+
.. automethod:: deserialize
25+
26+
.. rubric:: Non-static Methods:
27+
28+
.. automethod:: __init__

docs/source/ebpps.rst

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
Exact and Bounded, Probability Proportional to Size (EBPPS) Sampling
2+
---------------------------------------------------------------------
3+
4+
.. currentmodule:: datasketches
5+
6+
An EBPPS sketch produces a random sample of data from a stream of items, ensuring that the probability
7+
of including an item is always exactly equal to the item's size. The size of an item is defined as its
8+
weight relative to the total weight of all items seen so far by the sketch. In contrast to VarOpt sampling,
9+
this sketch may return fewer than `k` items in order to keep the probability of including an item strictly
10+
proportional to its size.
11+
12+
This sketch is based on: B. Hentschel, P. J. Haas, Y. Tian
13+
"Exact PPS Sampling with Bounded Sample Size",
14+
Information Processing Letters, 2023.
15+
16+
EBPPS sampling is related to reservoir sampling, but handles unequal item weights.
17+
Feeding the sketch items with a uniform weight value will produce a sample equivalent to reservoir sampling.
18+
19+
.. note::
20+
Serializing and deserializing this sketch requires the use of a :class:`PyObjectSerDe`.
21+
22+
.. autoclass:: ebpps_sketch
23+
:members:
24+
:undoc-members:
25+
:exclude-members: deserialize
26+
27+
.. rubric:: Static Methods:
28+
29+
.. automethod:: deserialize
30+
31+
.. rubric:: Non-static Methods:
32+
33+
.. automethod:: __init__

docs/source/frequent_items.rst

Lines changed: 53 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
Frequent Items
22
--------------
33

4-
This sketch is useful for tracking approximate frequencies of items of type `<T>` with optional associated counts `(<T> item, int count)`
5-
that are members of a multiset of such items.
4+
.. currentmodule:: datasketches
5+
6+
This sketch is useful for tracking approximate frequencies of items (``object`` or ``string``) with optional associated
7+
integer counts that are members of a multiset of such items.
68
The true frequency of an item is defined to be the sum of associated counts.
79

810
This implementation provides the following capabilities:
@@ -16,38 +18,38 @@ This implementation provides the following capabilities:
1618

1719
**Space Usage**
1820

19-
The sketch is initialized with a maximum map size, `maxMapSize`, that specifies the maximum physical length of the internal hash map of the form `(<T> item, int count)`.
20-
The maximum map size is always a power of 2, defined through the variables `lg_max_map_size`.
21+
The sketch is initialized with a maximum map size, ``maxMapSize``, that specifies the maximum physical length of the internal hash map of the form ``(object item, int count)``.
22+
The maximum map size is always a power of 2, defined through the variable ``lg_max_map_size``.
2123

2224
The hash map starts at a very small size (8 entries) and grows as needed up to the specified maximum map size.
2325

24-
Excluding external space required for the item objects, the internal memory space usage of this sketch is `18 * mapSize bytes` (assuming 8 bytes for each reference),
26+
Excluding external space required for the item objects, the internal memory space usage of this sketch is ``18 * mapSize`` bytes (assuming 8 bytes for each reference),
2527
plus a small constant number of additional bytes.
26-
The internal memory space usage of this sketch will never exceed `18 * maxMapSize` bytes, plus a small constant number of additional bytes.
28+
The internal memory space usage of this sketch will never exceed ``18 * maxMapSize`` bytes, plus a small constant number of additional bytes.
2729

2830
**Maximum Capacity of the Sketch**
2931

30-
The `LOAD_FACTOR` for the hash map is internally set at :math:`75\%`, which means at any time the map capacity of `(item, count)` pairs is `mapCap = 0.75 * mapSize`.
31-
The maximum capacity of `(item, count)`` pairs of the sketch is `maxMapCap = 0.75 * maxMapSize`.
32+
The ``LOAD_FACTOR`` for the hash map is internally set at :math:`75\%`, which means at any time the map capacity of ``(item, count)`` pairs is ``mapCap = 0.75 * mapSize``.
33+
The maximum capacity of ``(item, count)`` pairs of the sketch is ``maxMapCap = 0.75 * maxMapSize``.
3234

33-
**Updating the sketch with `(item, count)` pairs**
35+
**Updating the sketch with (item, count) pairs**
3436

35-
If the item is found in the hash map, the mapped count field (the "counter") is incremented by the incoming count; otherwise, a new counter `"(item, count) pair"` is created.
37+
If the item is found in the hash map, the mapped count field (the "counter") is incremented by the incoming count; otherwise, a new counter ``(item, count)`` pair is created.
3638
If the number of tracked counters reaches the maximum capacity of the hash map, the sketch decrements all of the counters (by an approximately computed median)
3739
and removes any non-positive counters.
3840

3941
**Accuracy**
4042

41-
If fewer than `0.75 * maxMapSize` different items are inserted into the sketch, the estimated frequencies returned by the sketch will be exact.
43+
If fewer than ``0.75 * maxMapSize`` different items are inserted into the sketch, the estimated frequencies returned by the sketch will be exact.
4244

4345
The logic of the frequent items sketch is such that the stored counts and true counts are never too different.
4446
More specifically, for any item, the sketch can return an estimate of the true frequency of item, along with upper and lower bounds on the frequency (that hold deterministically).
4547

4648
For this implementation and for a specific active item, it is guaranteed that the true frequency will be between the Upper Bound (UB) and the Lower Bound (LB) computed for that item.
47-
Specifically, `(UB- LB) ≤ W * epsilon`, where :math:`W` denotes the sum of all item counts, and :math:`epsilon = 3.5/M`, where :math:`epsilon = M` is the maxMapSize.
49+
Specifically, ``(UB - LB) ≤ W * epsilon``, where :math:`W` denotes the sum of all item counts, and :math:`\epsilon = 3.5/M`, where :math:`M` is the maxMapSize.
4850

4951
This is a worst-case guarantee that applies to arbitrary inputs.
50-
For inputs typically seen in practice, `(UB-LB)` is usually much smaller.
52+
For inputs typically seen in practice, ``(UB-LB)`` is usually much smaller.
5153

5254
**Background**
5355

@@ -63,12 +65,45 @@ Variants of it were discovered and rediscovered and redesigned several times ove
6365
For speed, we do employ some randomization that introduces a small probability that our proof of the worst-case bound might not apply to a given run.
6466
However, we have ensured that this probability is extremely small.
6567
For example, if the stream causes one table purge (rebuild), our proof of the worst-case bound applies with a probability of at least ``1 - 1E-14``.
66-
If the stream causes `1E9` purges, our proof applies with a probability of at least `1 - 1E-5`.
68+
If the stream causes ``1E9`` purges, our proof applies with a probability of at least ``1 - 1E-5``.
69+
70+
There are two flavors of Frequent Items Sketches, one with generic items (objects) and another specific to strings.
71+
The string version is a legacy name from before the library supported generic objects and is retained
72+
only for backwards compatibility.
73+
74+
.. note::
75+
The :class:`frequent_items_sketch` uses an input object's ``__hash__`` and ``__eq__`` methods.
76+
77+
.. note::
78+
Serializing and deserializing the :class:`frequent_items_sketch` requires the use of a :class:`PyObjectSerDe`.
79+
80+
.. autoclass:: frequent_items_error_type
81+
82+
.. autoattribute:: NO_FALSE_POSITIVES
83+
:annotation: : Returns only true positives but may miss some heavy hitters.
84+
85+
.. autoattribute:: NO_FALSE_NEGATIVES
86+
:annotation: : Does not miss any heavy hitters but may return false positives.
87+
88+
89+
.. autoclass:: frequent_items_sketch
90+
:members:
91+
:undoc-members:
92+
:exclude-members: deserialize, get_epsilon_for_lg_size, get_apriori_error
93+
:member-order: groupwise
6794

68-
Parameter: <T> The type of item to be tracked by this sketch
95+
.. rubric:: Static Methods:
6996

97+
.. automethod:: deserialize
98+
.. automethod:: get_epsilon_for_lg_size
99+
.. automethod:: get_apriori_error
70100

71-
.. autoclass:: _datasketches.frequent_items_sketch
101+
.. rubric:: Non-static Methods:
102+
103+
.. automethod:: __init__
104+
105+
106+
.. autoclass:: frequent_strings_sketch
72107
:members:
73108
:undoc-members:
74109
:exclude-members: deserialize, get_epsilon_for_lg_size, get_apriori_error
@@ -81,3 +116,5 @@ Parameter: <T> The type of item to be tracked by this sketch
81116
.. automethod:: get_apriori_error
82117

83118
.. rubric:: Non-static Methods:
119+
120+
.. automethod:: __init__

docs/source/hyper_log_log.rst

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,26 +7,49 @@ If the ONLY use case for sketching is counting uniques and merging, the HLL sket
77
This implementation offers three different types of HLL sketch, each with different trade-offs with accuracy, space and performance.
88
These types are specified with the target_hll_type parameter.
99

10-
In terms of accuracy, all three types, for the same lg_config_k, have the same error distribution as a function of n, the number of unique values fed to the sketch.
11-
The configuration parameter `lg_config_k` is the log-base-2 of `K`, where `K` is the number of buckets or slots for the sketch.
10+
In terms of accuracy, all three types, for the same ``lg_config_k``, have the same error distribution as a function of ``n``, the number of unique values fed to the sketch.
11+
The configuration parameter ``lg_config_k`` is the log-base-2 of ``k``, where ``k`` is the number of buckets or slots for the sketch.
1212

13-
During warmup, when the sketch has only received a small number of unique items (up to about 10% of `K`), this implementation leverages a new class of estimator algorithms with significantly better accuracy.
13+
During warmup, when the sketch has only received a small number of unique items (up to about 10% of ``k``), this implementation leverages a new class of estimator algorithms with significantly better accuracy.
14+
15+
16+
.. autoclass:: _datasketches.tgt_hll_type
17+
18+
.. autoattribute:: HLL_4
19+
:annotation: : 4 bits per entry
20+
21+
.. autoattribute:: HLL_6
22+
:annotation: : 6 bits per entry
23+
24+
.. autoattribute:: HLL_8
25+
:annotation: : 8 bits per entry
1426

15-
This sketch also offers the capability of operating off-heap.
16-
Given a WritableMemory object created by the user, the sketch will perform all of its updates and internal phase transitions in that object, which can actually reside either on-heap or off-heap based on how it is configured.
17-
In large systems that must update and merge many millions of sketches, having the sketch operate off-heap avoids the serialization and deserialization costs of moving sketches to and from off-heap memory-mapped files, for example, and eliminates big garbage collection delays.
1827

1928
.. autoclass:: _datasketches.hll_sketch
2029
:members:
2130
:undoc-members:
2231
:exclude-members: deserialize, get_max_updatable_serialization_bytes, get_rel_err
2332

24-
:member-order: groupwise
25-
2633
.. rubric:: Static Methods:
2734

2835
.. automethod:: deserialize
2936
.. automethod:: get_max_updatable_serialization_bytes
3037
.. automethod:: get_rel_err
3138

3239
.. rubric:: Non-static Methods:
40+
41+
.. automethod:: __init__
42+
43+
.. autoclass:: _datasketches.hll_union
44+
:members:
45+
:undoc-members:
46+
:exclude-members: get_rel_err
47+
48+
.. rubric:: Static Methods:
49+
50+
.. automethod:: get_rel_err
51+
52+
.. rubric:: Non-static Methods:
53+
54+
.. automethod:: __init__
55+

0 commit comments

Comments
 (0)