Skip to content

Commit f812afa

Browse files
authored
Merge pull request #48 from apache/cpp_version_bump
Cpp version bump
2 parents 834f0c4 + 5830303 commit f812afa

8 files changed

Lines changed: 231 additions & 4 deletions

File tree

CMakeLists.txt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ target_sources(python
108108
src/density_wrapper.cpp
109109
src/ks_wrapper.cpp
110110
src/count_wrapper.cpp
111+
src/tdigest_wrapper.cpp
111112
src/vector_of_kll.cpp
112113
src/py_serde.cpp
113114
)
@@ -116,7 +117,7 @@ cmake_policy(SET CMP0097 NEW)
116117
include(ExternalProject)
117118
ExternalProject_Add(datasketches
118119
GIT_REPOSITORY https://github.com/apache/datasketches-cpp.git
119-
GIT_TAG 5.0.2
120+
GIT_TAG 5.1.0
120121
GIT_SHALLOW true
121122
GIT_SUBMODULES ""
122123
INSTALL_DIR /tmp/datasketches
@@ -126,7 +127,7 @@ ExternalProject_Get_property(datasketches INSTALL_DIR)
126127
set(datasketches_INSTALL_DIR ${INSTALL_DIR})
127128
message("Source dir of datasketches = ${datasketches_INSTALL_DIR}")
128129
message("Numpy include dir(s): ${Python_NumPy_INCLUDE_DIRS}")
129-
target_include_directories(python
130+
target_include_directories(python
130131
PRIVATE
131132
${datasketches_INSTALL_DIR}/include/DataSketches
132133
${Python_NumPy_INCLUDE_DIRS}

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,6 @@ def build_extension(self, ext):
104104
# may need to add all source paths for sdist packages w/o MANIFEST.in
105105
ext_modules=[CMakeExtension('datasketches','.')],
106106
cmdclass={'build_ext': CMakeBuild},
107-
install_requires=['numpy'],
107+
install_requires=['numpy < 2.0'],
108108
zip_safe=False
109109
)

src/datasketches.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ void init_req(nb::module_& m);
3939
void init_quantiles(nb::module_& m);
4040
void init_count_min(nb::module_& m);
4141
void init_density(nb::module_& m);
42+
void init_tdigest(nb::module_& m);
4243
void init_vector_of_kll(nb::module_& m);
4344

4445
// supporting objects
@@ -70,6 +71,7 @@ NB_MODULE(_datasketches, m) {
7071
init_quantiles(m);
7172
init_count_min(m);
7273
init_density(m);
74+
init_tdigest(m);
7375
init_vector_of_kll(m);
7476

7577
init_kolmogorov_smirnov(m);

src/tdigest_wrapper.cpp

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include <vector>
21+
#include <stdexcept>
22+
#include <algorithm> // should ultimately be in tdigest.hpp
23+
24+
#include <nanobind/nanobind.h>
25+
#include <nanobind/make_iterator.h>
26+
#include <nanobind/stl/string.h>
27+
#include <nanobind/ndarray.h>
28+
29+
#include "tdigest.hpp"
30+
#include "quantile_conditional.hpp"
31+
32+
namespace nb = nanobind;
33+
34+
template<typename T>
35+
void bind_tdigest(nb::module_ &m, const char* name) {
36+
using namespace datasketches;
37+
38+
auto tdigest_class = nb::class_<tdigest<T>>(m, name)
39+
.def(nb::init<uint16_t>(), nb::arg("k")=tdigest<T>::DEFAULT_K,
40+
"Creates a tdigest instance with the given value of k.\n\n"
41+
":param k: Controls the size/accuracy trade-off of the sketch. Default is 200.\n"
42+
":type k: int, optional"
43+
)
44+
.def("__copy__", [](const tdigest<T>& sk) { return tdigest<T>(sk); })
45+
.def("update", (void(tdigest<T>::*)(T)) &tdigest<T>::update, nb::arg("item"),
46+
"Updates the sketch with the given value")
47+
.def("merge", (void(tdigest<T>::*)(tdigest<T>&)) &tdigest<T>::merge, nb::arg("sketch"),
48+
"Merges the provided sketch into this one")
49+
.def("__str__", [](const tdigest<T>& sk) { return sk.to_string(); },
50+
"Produces a string summary of the sketch")
51+
.def("to_string", &tdigest<T>::to_string, nb::arg("print_centroids")=false,
52+
"Produces a string summary of the sketch")
53+
.def("is_empty", &tdigest<T>::is_empty,
54+
"Returns True if the sketch is empty, otherwise False")
55+
.def_prop_ro("k", &tdigest<T>::get_k,
56+
"The configured parameter k")
57+
.def("get_total_weight", &tdigest<T>::get_total_weight,
58+
"The total weight processed by the sketch")
59+
.def("compress", &tdigest<T>::compress,
60+
"Process buffered values and merge centroids, if necesssary")
61+
.def("get_min_value", &tdigest<T>::get_min_value,
62+
"Returns the minimum value from the stream. If empty, throws a RuntimeError")
63+
.def("get_max_value", &tdigest<T>::get_max_value,
64+
"Returns the maximum value from the stream. If empty, throws a RuntimeError")
65+
.def("get_rank", &tdigest<T>::get_rank, nb::arg("value"),
66+
"Computes the approximate normalized rank of the given value")
67+
.def("get_quantile", &tdigest<T>::get_quantile, nb::arg("rank"),
68+
"Returns an approximation to the data value "
69+
"associated with the given rank in a hypothetical sorted "
70+
"version of the input stream so far.\n")
71+
.def("get_serialized_size_bytes", &tdigest<T>::get_serialized_size_bytes,
72+
nb::arg("with_buffer")=false,
73+
"Returns the size of the serialized sketch, in bytes")
74+
;
75+
76+
add_serialization<T>(tdigest_class);
77+
add_vector_update<T>(tdigest_class);
78+
}
79+
80+
void init_tdigest(nb::module_ &m) {
81+
bind_tdigest<float>(m, "tdigest_float");
82+
bind_tdigest<double>(m, "tdigest_double");
83+
}

src/tuple_wrapper.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include <nanobind/make_iterator.h>
2424
#include <nanobind/intrusive/counter.h>
2525
#include <nanobind/stl/array.h>
26+
#include <nanobind/stl/function.h>
2627
#include <nanobind/stl/string.h>
2728

2829
#include "py_serde.hpp"
@@ -133,6 +134,14 @@ void init_tuple(nb::module_ &m) {
133134
}, nb::arg("serde"),
134135
"Serializes the sketch into a bytes object"
135136
)
137+
.def("filter",
138+
[](const py_compact_tuple& sk, const std::function<bool(const nb::object&)> func) {
139+
return sk.filter(func);
140+
}, nb::arg("predicate"),
141+
"Produces a compact_tuple_sketch from the given sketch by applying a predicate to "
142+
"the summary in each entry.\n\n"
143+
":param predicate: A function returning true or value evaluated on each tuple summary\n"
144+
":return: A compact_tuple_sketch with the selected entries\n:rtype: :class:`compact_tuple_sketch`")
136145
.def_static(
137146
"deserialize",
138147
[](const nb::bytes& bytes, py_object_serde& serde, uint64_t seed) {
@@ -169,6 +178,14 @@ void init_tuple(nb::module_ &m) {
169178
"Returns a compacted form of the sketch, optionally sorting it")
170179
.def("trim", &py_update_tuple::trim, "Removes retained entries in excess of the nominal size k (if any)")
171180
.def("reset", &py_update_tuple::reset, "Resets the sketch to the initial empty state")
181+
.def("filter",
182+
[](const py_update_tuple& sk, const std::function<bool(const nb::object&)> func) {
183+
return sk.filter(func);
184+
}, nb::arg("predicate"),
185+
"Produces a compact_tuple_sketch from the given sketch by applying a predicate to "
186+
"the summary in each entry.\n\n"
187+
":param predicate: A function returning true or value evaluated on each tuple summary\n"
188+
":return: A compact_tuple_sketch with the selected entries\n:rtype: :class:`compact_tuple_sketch`")
172189
;
173190

174191
nb::class_<py_tuple_union>(m, "tuple_union")

tests/tdigest_test.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
import unittest
19+
from datasketches import tdigest_float, tdigest_double
20+
import numpy as np
21+
22+
class TdigestTest(unittest.TestCase):
23+
def test_tdigest_double_example(self):
24+
n = 2 ** 20
25+
26+
# create a tdigest and inject ~1 million N(0,1) points, both using a vector
27+
# update as well as a single value
28+
td = tdigest_double()
29+
td.update(np.random.normal(size=n-1))
30+
td.update(0.0)
31+
32+
# 0 should be near the median
33+
self.assertAlmostEqual(0.5, td.get_rank(0.0), delta=0.1)
34+
35+
# the median should be near 0
36+
self.assertAlmostEqual(0.0, td.get_quantile(0.5), delta=0.1)
37+
38+
# note that with t-digest, while it typically performs quite well in practice,
39+
# we do not have any sort of theoretical guarantees on the error bounds
40+
# or even an estimate of what bounds we may expect.
41+
42+
# we also track the min/max independently from the rest of the data
43+
# which lets us know the full observed data range
44+
self.assertLessEqual(td.get_min_value(), td.get_quantile(0.01))
45+
self.assertLessEqual(0.0, td.get_rank(td.get_min_value()))
46+
self.assertGreaterEqual(td.get_max_value(), td.get_quantile(0.99))
47+
self.assertGreaterEqual(1.0, td.get_rank(td.get_max_value()))
48+
49+
# and a few basic queries about the sketch
50+
self.assertFalse(td.is_empty())
51+
self.assertEqual(td.get_total_weight(), n)
52+
53+
# we can define a new tdiget with a different distribution, then merge them
54+
td2 = tdigest_double()
55+
td2.update(np.random.normal(loc=2.0, size=n))
56+
td.merge(td2)
57+
58+
# the new median should be near 1.0, and 1.0 should be near the median although
59+
# the error distribution is not well-characterized so we allow generous margins
60+
self.assertAlmostEqual(0.5, td.get_rank(1.0), delta=0.2)
61+
self.assertAlmostEqual(1.0, td.get_quantile(0.5), delta=0.2)
62+
self.assertEqual(td.get_total_weight(), 2 * n)
63+
64+
# finally, can serialize and deserialize the sketch
65+
td_bytes = td.serialize()
66+
new_td = tdigest_double.deserialize(td_bytes)
67+
self.assertEqual(td.get_total_weight(), new_td.get_total_weight())
68+
self.assertEqual(td.get_min_value(), new_td.get_min_value())
69+
self.assertEqual(td.get_max_value(), new_td.get_max_value())
70+
self.assertEqual(td.get_quantile(0.7), new_td.get_quantile(0.7))
71+
self.assertEqual(td.get_rank(0.0), new_td.get_rank(0.0))
72+
73+
74+
# the same tests as above, but with tdigest_float
75+
def test_tdigest_float_example(self):
76+
n = 2 ** 20
77+
td = tdigest_float()
78+
td.update(np.random.normal(size=n-1))
79+
td.update(0.0)
80+
81+
self.assertAlmostEqual(0.5, td.get_rank(0.0), delta=0.1)
82+
self.assertAlmostEqual(0.0, td.get_quantile(0.5), delta=0.1)
83+
84+
self.assertLessEqual(td.get_min_value(), td.get_quantile(0.01))
85+
self.assertLessEqual(0.0, td.get_rank(td.get_min_value()))
86+
self.assertGreaterEqual(td.get_max_value(), td.get_quantile(0.99))
87+
self.assertGreaterEqual(1.0, td.get_rank(td.get_max_value()))
88+
89+
self.assertFalse(td.is_empty())
90+
self.assertEqual(td.get_total_weight(), n)
91+
92+
td2 = tdigest_float()
93+
td2.update(np.random.normal(loc=2.0, size=n))
94+
td.merge(td2)
95+
96+
self.assertAlmostEqual(0.5, td.get_rank(1.0), delta=0.2)
97+
self.assertAlmostEqual(1.0, td.get_quantile(0.5), delta=0.2)
98+
self.assertEqual(td.get_total_weight(), 2 * n)
99+
100+
td_bytes = td.serialize()
101+
new_td = tdigest_float.deserialize(td_bytes)
102+
self.assertEqual(td.get_total_weight(), new_td.get_total_weight())
103+
self.assertEqual(td.get_min_value(), new_td.get_min_value())
104+
self.assertEqual(td.get_max_value(), new_td.get_max_value())
105+
self.assertEqual(td.get_quantile(0.7), new_td.get_quantile(0.7))
106+
self.assertEqual(td.get_rank(0.0), new_td.get_rank(0.0))

tests/tuple_test.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,24 @@ def test_tuple_basic_example(self):
8787
self.assertTrue(sk.is_empty())
8888
self.assertEqual(sk.num_retained, 0)
8989

90+
def test_tuple_filter(self):
91+
lgk = 12 # 2^k = 4096 rows in the table
92+
n = 1 << 18 # ~256k unique values
93+
94+
# filtering lets us apply a predicate to the sketch, producing a new
95+
# compact sketch using the entries matching the predicate.
96+
sk = update_tuple_sketch(AccumulatorPolicy(), lgk)
97+
for ii in range(0, n):
98+
sk.update(ii, ii)
99+
100+
# we can filter by a predicate, whether a lambda or a defined function
101+
# for instance, using 0.5*n will return a compact_tuple_sketch with
102+
# approximately half the entries.
103+
result = sk.filter(lambda x: x < (0.5 * n))
104+
self.assertAlmostEqual(result.get_estimate(), 0.5 * n, delta=0.01 * n)
105+
self.assertLess(result.get_lower_bound(1), 0.5 * n)
106+
self.assertGreater(result.get_upper_bound(1), 0.5 * n)
107+
90108
def test_tuple_set_operations(self):
91109
lgk = 12 # 2^k = 4096 rows in the table
92110
n = 1 << 18 # ~256k unique values

tox.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,6 @@ isolated_build = true
2121

2222
[testenv]
2323
deps = pytest
24-
numpy
24+
numpy < 2.0
2525
changedir = tests
2626
commands = pytest

0 commit comments

Comments
 (0)