Merge pull request #48 from apache/cpp_version_bump

jmalkin · web-flow · commit f812afadc292 · 2024-08-21T09:36:00.000-07:00
Cpp version bump
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -108,6 +108,7 @@ target_sources(python
     src/density_wrapper.cpp
     src/ks_wrapper.cpp
     src/count_wrapper.cpp
+    src/tdigest_wrapper.cpp
     src/vector_of_kll.cpp
     src/py_serde.cpp
 )
@@ -116,7 +117,7 @@ cmake_policy(SET CMP0097 NEW)
 include(ExternalProject)
 ExternalProject_Add(datasketches
   GIT_REPOSITORY https://github.com/apache/datasketches-cpp.git
-  GIT_TAG 5.0.2
+  GIT_TAG 5.1.0
   GIT_SHALLOW true
   GIT_SUBMODULES ""
   INSTALL_DIR /tmp/datasketches
@@ -126,7 +127,7 @@ ExternalProject_Get_property(datasketches INSTALL_DIR)
 set(datasketches_INSTALL_DIR ${INSTALL_DIR})
 message("Source dir of datasketches = ${datasketches_INSTALL_DIR}")
 message("Numpy include dir(s): ${Python_NumPy_INCLUDE_DIRS}")
-target_include_directories(python 
+target_include_directories(python
   PRIVATE
     ${datasketches_INSTALL_DIR}/include/DataSketches
     ${Python_NumPy_INCLUDE_DIRS}
diff --git a/setup.py b/setup.py
@@ -104,6 +104,6 @@ def build_extension(self, ext):
     # may need to add all source paths for sdist packages w/o MANIFEST.in
     ext_modules=[CMakeExtension('datasketches','.')],
     cmdclass={'build_ext': CMakeBuild},
-    install_requires=['numpy'],
+    install_requires=['numpy < 2.0'],
     zip_safe=False
 )
diff --git a/src/datasketches.cpp b/src/datasketches.cpp
@@ -39,6 +39,7 @@ void init_req(nb::module_& m);
 void init_quantiles(nb::module_& m);
 void init_count_min(nb::module_& m);
 void init_density(nb::module_& m);
+void init_tdigest(nb::module_& m);
 void init_vector_of_kll(nb::module_& m);
 
 // supporting objects
@@ -70,6 +71,7 @@ NB_MODULE(_datasketches, m) {
   init_quantiles(m);
   init_count_min(m);
   init_density(m);
+  init_tdigest(m);
   init_vector_of_kll(m);
 
   init_kolmogorov_smirnov(m);
diff --git a/src/tdigest_wrapper.cpp b/src/tdigest_wrapper.cpp
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <vector>
+#include <stdexcept>
+#include <algorithm> // should ultimately be in tdigest.hpp
+
+#include <nanobind/nanobind.h>
+#include <nanobind/make_iterator.h>
+#include <nanobind/stl/string.h>
+#include <nanobind/ndarray.h>
+
+#include "tdigest.hpp"
+#include "quantile_conditional.hpp"
+
+namespace nb = nanobind;
+
+// Registers a Python class called `name` on module `m` that wraps
+// datasketches::tdigest<T>.
+//
+// The method chain exposes construction, copying, single-value update, merge,
+// string output and the rank/quantile queries. Serialization and vector-update
+// methods are then attached by the add_serialization / add_vector_update
+// helpers, which are not defined in this file.
+//
+// T is the value type stored by the sketch (instantiated for float and double
+// by init_tdigest).
+template<typename T>
+void bind_tdigest(nb::module_ &m, const char* name) {
+  using namespace datasketches;
+
+  auto tdigest_class = nb::class_<tdigest<T>>(m, name)
+    .def(nb::init<uint16_t>(), nb::arg("k")=tdigest<T>::DEFAULT_K,
+         "Creates a tdigest instance with the given value of k.\n\n"
+         ":param k: Controls the size/accuracy trade-off of the sketch. Default is 200.\n"
+         ":type k: int, optional"
+    )
+    .def("__copy__", [](const tdigest<T>& sk) { return tdigest<T>(sk); })
+    // the explicit member-pointer casts name the exact signature being bound
+    // NOTE(review): these are C-style casts; a static_cast would reject a signature mismatch at compile time
+    .def("update", (void(tdigest<T>::*)(T)) &tdigest<T>::update, nb::arg("item"),
+         "Updates the sketch with the given value")
+    .def("merge", (void(tdigest<T>::*)(tdigest<T>&)) &tdigest<T>::merge, nb::arg("sketch"),
+         "Merges the provided sketch into this one")
+    // __str__ always uses to_string()'s default arguments; to_string exposes print_centroids
+    .def("__str__", [](const tdigest<T>& sk) { return sk.to_string(); },
+         "Produces a string summary of the sketch")
+    .def("to_string", &tdigest<T>::to_string, nb::arg("print_centroids")=false,
+         "Produces a string summary of the sketch")
+    .def("is_empty", &tdigest<T>::is_empty,
+         "Returns True if the sketch is empty, otherwise False")
+    .def_prop_ro("k", &tdigest<T>::get_k,
+         "The configured parameter k")
+    .def("get_total_weight", &tdigest<T>::get_total_weight,
+         "The total weight processed by the sketch")
+    .def("compress", &tdigest<T>::compress,
+         "Process buffered values and merge centroids, if necessary")
+    .def("get_min_value", &tdigest<T>::get_min_value,
+         "Returns the minimum value from the stream. If empty, throws a RuntimeError")
+    .def("get_max_value", &tdigest<T>::get_max_value,
+         "Returns the maximum value from the stream. If empty, throws a RuntimeError")
+    .def("get_rank", &tdigest<T>::get_rank, nb::arg("value"),
+         "Computes the approximate normalized rank of the given value")
+    .def("get_quantile", &tdigest<T>::get_quantile, nb::arg("rank"),
+         "Returns an approximation to the data value "
+         "associated with the given rank in a hypothetical sorted "
+         "version of the input stream so far.\n")
+    .def("get_serialized_size_bytes", &tdigest<T>::get_serialized_size_bytes,
+         nb::arg("with_buffer")=false,
+         "Returns the size of the serialized sketch, in bytes")
+    ;
+
+  // attach the shared serialization and vector-update bindings to the class
+  add_serialization<T>(tdigest_class);
+  add_vector_update<T>(tdigest_class);
+}
+
+// Registers the t-digest bindings on module `m`, one Python class per
+// supported value type: "tdigest_float" and "tdigest_double".
+void init_tdigest(nb::module_ &m) {
+  bind_tdigest<float>(m, "tdigest_float");
+  bind_tdigest<double>(m, "tdigest_double");
+}
diff --git a/src/tuple_wrapper.cpp b/src/tuple_wrapper.cpp
@@ -23,6 +23,7 @@
 #include <nanobind/make_iterator.h>
 #include <nanobind/intrusive/counter.h>
 #include <nanobind/stl/array.h>
+#include <nanobind/stl/function.h>
 #include <nanobind/stl/string.h>
 
 #include "py_serde.hpp"
@@ -133,6 +134,14 @@ void init_tuple(nb::module_ &m) {
         }, nb::arg("serde"),
         "Serializes the sketch into a bytes object"
     )
+    // filter(predicate): forwards a Python callable to the sketch's filter().
+    // The std::function is taken by const reference so it is not copied again on the way in.
+    .def("filter",
+         [](const py_compact_tuple& sk, const std::function<bool(const nb::object&)>& func) {
+           return sk.filter(func);
+         }, nb::arg("predicate"),
+         "Produces a compact_tuple_sketch from the given sketch by applying a predicate to "
+         "the summary in each entry.\n\n"
+         ":param predicate: A function returning true or false evaluated on each tuple summary\n"
+         ":return: A compact_tuple_sketch with the selected entries\n:rtype: :class:`compact_tuple_sketch`")
     .def_static(
         "deserialize",
         [](const nb::bytes& bytes, py_object_serde& serde, uint64_t seed) {
@@ -169,6 +178,14 @@ void init_tuple(nb::module_ &m) {
          "Returns a compacted form of the sketch, optionally sorting it")
     .def("trim", &py_update_tuple::trim, "Removes retained entries in excess of the nominal size k (if any)")
     .def("reset", &py_update_tuple::reset, "Resets the sketch to the initial empty state")
+    // filter(predicate): forwards a Python callable to the sketch's filter().
+    // The std::function is taken by const reference so it is not copied again on the way in.
+    .def("filter",
+         [](const py_update_tuple& sk, const std::function<bool(const nb::object&)>& func) {
+           return sk.filter(func);
+         }, nb::arg("predicate"),
+         "Produces a compact_tuple_sketch from the given sketch by applying a predicate to "
+         "the summary in each entry.\n\n"
+         ":param predicate: A function returning true or false evaluated on each tuple summary\n"
+         ":return: A compact_tuple_sketch with the selected entries\n:rtype: :class:`compact_tuple_sketch`")
   ;
 
   nb::class_<py_tuple_union>(m, "tuple_union")
diff --git a/tests/tdigest_test.py b/tests/tdigest_test.py
@@ -0,0 +1,106 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import unittest
+from datasketches import tdigest_float, tdigest_double
+import numpy as np
+
+class TdigestTest(unittest.TestCase):
+    def test_tdigest_double_example(self):
+      n = 2 ** 20
+
+      # create a tdigest and inject ~1 million N(0,1) points, both using a vector
+      # update as well as a single value
+      td = tdigest_double()
+      td.update(np.random.normal(size=n-1))
+      td.update(0.0)
+
+      # 0 should be near the median
+      self.assertAlmostEqual(0.5, td.get_rank(0.0), delta=0.1)
+
+      # the median should be near 0
+      self.assertAlmostEqual(0.0, td.get_quantile(0.5), delta=0.1)
+
+      # note that with t-digest, while it typically performs quite well in practice,
+      # we do not have any sort of theoretical guarantees on the error bounds
+      # or even an estimate of what bounds we may expect.
+
+      # we also track the min/max independently from the rest of the data
+      # which lets us know the full observed data range
+      self.assertLessEqual(td.get_min_value(), td.get_quantile(0.01))
+      self.assertLessEqual(0.0, td.get_rank(td.get_min_value()))
+      self.assertGreaterEqual(td.get_max_value(), td.get_quantile(0.99))
+      self.assertGreaterEqual(1.0, td.get_rank(td.get_max_value()))
+
+      # and a few basic queries about the sketch
+      self.assertFalse(td.is_empty())
+      self.assertEqual(td.get_total_weight(), n)
+
+      # we can define a new tdiget with a different distribution, then merge them
+      td2 = tdigest_double()
+      td2.update(np.random.normal(loc=2.0, size=n))
+      td.merge(td2)
+
+      # the new median should be near 1.0, and 1.0 should be near the median although
+      # the error distribution is not well-characterized so we allow generous margins
+      self.assertAlmostEqual(0.5, td.get_rank(1.0), delta=0.2)
+      self.assertAlmostEqual(1.0, td.get_quantile(0.5), delta=0.2)
+      self.assertEqual(td.get_total_weight(), 2 * n)
+
+      # finally, can serialize and deserialize the sketch
+      td_bytes = td.serialize()
+      new_td = tdigest_double.deserialize(td_bytes)
+      self.assertEqual(td.get_total_weight(), new_td.get_total_weight())
+      self.assertEqual(td.get_min_value(), new_td.get_min_value())
+      self.assertEqual(td.get_max_value(), new_td.get_max_value())
+      self.assertEqual(td.get_quantile(0.7), new_td.get_quantile(0.7))
+      self.assertEqual(td.get_rank(0.0), new_td.get_rank(0.0))
+
+
+    # the same tests as above, but with tdigest_float
+    def test_tdigest_float_example(self):
+      n = 2 ** 20
+      td = tdigest_float()
+      td.update(np.random.normal(size=n-1))
+      td.update(0.0)
+
+      self.assertAlmostEqual(0.5, td.get_rank(0.0), delta=0.1)
+      self.assertAlmostEqual(0.0, td.get_quantile(0.5), delta=0.1)
+
+      self.assertLessEqual(td.get_min_value(), td.get_quantile(0.01))
+      self.assertLessEqual(0.0, td.get_rank(td.get_min_value()))
+      self.assertGreaterEqual(td.get_max_value(), td.get_quantile(0.99))
+      self.assertGreaterEqual(1.0, td.get_rank(td.get_max_value()))
+
+      self.assertFalse(td.is_empty())
+      self.assertEqual(td.get_total_weight(), n)
+
+      td2 = tdigest_float()
+      td2.update(np.random.normal(loc=2.0, size=n))
+      td.merge(td2)
+
+      self.assertAlmostEqual(0.5, td.get_rank(1.0), delta=0.2)
+      self.assertAlmostEqual(1.0, td.get_quantile(0.5), delta=0.2)
+      self.assertEqual(td.get_total_weight(), 2 * n)
+
+      td_bytes = td.serialize()
+      new_td = tdigest_float.deserialize(td_bytes)
+      self.assertEqual(td.get_total_weight(), new_td.get_total_weight())
+      self.assertEqual(td.get_min_value(), new_td.get_min_value())
+      self.assertEqual(td.get_max_value(), new_td.get_max_value())
+      self.assertEqual(td.get_quantile(0.7), new_td.get_quantile(0.7))
+      self.assertEqual(td.get_rank(0.0), new_td.get_rank(0.0))
diff --git a/tests/tuple_test.py b/tests/tuple_test.py
@@ -87,6 +87,24 @@ def test_tuple_basic_example(self):
         self.assertTrue(sk.is_empty())
         self.assertEqual(sk.num_retained, 0)
 
+    def test_tuple_filter(self):
+        lgk = 12    # 2^k = 4096 rows in the table
+        n = 1 << 18 # ~256k unique values
+
+        # filtering lets us apply a predicate to the sketch, producing a new
+        # compact sketch using the entries matching the predicate.
+        sk = update_tuple_sketch(AccumulatorPolicy(), lgk)
+        for ii in range(0, n):
+          sk.update(ii, ii)
+
+        # we can filter by a predicate, whether a lambda or a defined function
+        # for instance, using 0.5*n will return a compact_tuple_sketch with
+        # approximately half the entries.
+        result = sk.filter(lambda x: x < (0.5 * n))
+        self.assertAlmostEqual(result.get_estimate(), 0.5 * n, delta=0.01 * n)
+        self.assertLess(result.get_lower_bound(1), 0.5 * n)
+        self.assertGreater(result.get_upper_bound(1), 0.5 * n)
+
     def test_tuple_set_operations(self):
         lgk = 12    # 2^k = 4096 rows in the table
         n = 1 << 18 # ~256k unique values
diff --git a/tox.ini b/tox.ini
@@ -21,6 +21,6 @@ isolated_build = true
 
 [testenv]
 deps = pytest
-       numpy
+       numpy < 2.0
 changedir = tests
 commands = pytest

Original file line number	Diff line number	Diff line change
`@@ -104,6 +104,6 @@ def build_extension(self, ext):`
`104`	`104`	`# may need to add all source paths for sdist packages w/o MANIFEST.in`
`105`	`105`	`ext_modules=[CMakeExtension('datasketches','.')],`
`106`	`106`	`cmdclass={'build_ext': CMakeBuild},`
`107`		`- install_requires=['numpy'],`
	`107`	`+ install_requires=['numpy < 2.0'],`
`108`	`108`	`zip_safe=False`
`109`	`109`	`)`