Skip to content

Commit 1a23698

Browse files
authored
Merge pull request #476 from proost/feat-aos-tuple-sketch
feat: AoS tuple sketch
2 parents f546262 + 7617df4 commit 1a23698

8 files changed

+1177
-4
lines changed

tuple/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,4 +54,6 @@ install(FILES
5454
include/array_tuple_intersection_impl.hpp
5555
include/array_tuple_a_not_b.hpp
5656
include/array_tuple_a_not_b_impl.hpp
57+
include/array_of_strings_sketch.hpp
58+
include/array_of_strings_sketch_impl.hpp
5759
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#ifndef ARRAY_OF_STRINGS_SKETCH_HPP_
21+
#define ARRAY_OF_STRINGS_SKETCH_HPP_
22+
23+
#include <memory>
24+
#include <string>
25+
26+
#include "array_tuple_sketch.hpp"
27+
#include "xxhash64.h"
28+
29+
namespace datasketches {
30+
31+
// Summary type used throughout this header: `array` instantiated with std::string.
// NOTE(review): `array` is not declared in this header; presumably it comes from
// array_tuple_sketch.hpp - confirm.
using array_of_strings = array<std::string>;
32+
33+
// Default update policy for an array of strings.
// Declares create(), which returns an array_of_strings by value, and two
// update() overloads that take the input by const reference or by const pointer.
// NOTE(review): these are non-template members declared in a header; their
// definitions are not visible here and must be inline wherever they live
// (presumably array_of_strings_sketch_impl.hpp) to avoid multiple-definition
// link errors - confirm.
34+
class default_array_of_strings_update_policy {
35+
public:
36+
default_array_of_strings_update_policy() = default;
37+
38+
// Returns an array_of_strings by value; presumably the initial summary for a
// new entry - confirm in the implementation.
array_of_strings create() const;
39+
40+
// Takes `array` by non-const reference and `input` by const reference.
void update(array_of_strings& array, const array_of_strings& input) const;
41+
42+
// Overload taking `input` by const pointer.
// NOTE(review): handling of a null `input` is not visible here - confirm.
void update(array_of_strings& array, const array_of_strings* input) const;
43+
};
44+
45+
/**
46+
* Serializer/deserializer for an array of strings.
47+
*
48+
* Requirements:
49+
* - Array size must be <= 127.
50+
*
51+
* This serde does not perform UTF-8 validation. Callers must ensure strings
52+
* are valid UTF-8 before serialization to guarantee interoperability with
53+
* Java, Go, and Rust implementations.
54+
*/
55+
template<typename Allocator = std::allocator<array_of_strings>>
56+
struct default_array_of_strings_serde {
57+
// The user-supplied allocator rebound to array_of_strings.
using summary_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<array_of_strings>;
58+
59+
// Constructs the serde from an allocator instance; a default-constructed
// Allocator is used when none is given.
explicit default_array_of_strings_serde(const Allocator& allocator = Allocator());
60+
61+
// Stream overloads: operate on `num` items through `os` / `is`.
void serialize(std::ostream& os, const array_of_strings* items, unsigned num) const;
62+
void deserialize(std::istream& is, array_of_strings* items, unsigned num) const;
63+
// Buffer overloads: operate on `num` items at `ptr`, given `capacity`.
// NOTE(review): the size_t result is presumably the number of bytes
// written / read - confirm in the implementation.
size_t serialize(void* ptr, size_t capacity, const array_of_strings* items, unsigned num) const;
64+
size_t deserialize(const void* ptr, size_t capacity, array_of_strings* items, unsigned num) const;
65+
// Returns a size for a single item; presumably its serialized size in bytes - confirm.
size_t size_of_item(const array_of_strings& item) const;
66+
67+
private:
68+
summary_allocator summary_allocator_;
69+
// NOTE(review): presumably enforces the "array size <= 127" requirement
// stated in the struct documentation - confirm in the implementation.
static void check_num_nodes(uint8_t num_nodes);
70+
// Returns a 32-bit byte count computed from `item`; exact layout is defined
// in the implementation (not visible here).
static uint32_t compute_total_bytes(const array_of_strings& item);
71+
};
72+
73+
/**
74+
* Hashes an array of strings using ArrayOfStrings-compatible hashing.
* @param key array of strings to hash
* @return 64-bit hash value
* NOTE(review): this is a non-template free function declared in a header;
* its definition (not visible here) must be inline to avoid
* multiple-definition link errors - confirm in the implementation.
75+
*/
76+
uint64_t hash_array_of_strings_key(const array_of_strings& key);
77+
78+
/**
79+
* Extended class of compact_tuple_sketch for array of strings.
80+
*
81+
* Requirements:
82+
* - Array size must be <= 127.
83+
*
84+
* UTF-8 compatibility:
85+
* Serialized sketches are intended to be language and platform independent.
86+
* Other implementations (Java, Go, Rust) enforce UTF-8 encoding for strings.
87+
* This C++ implementation does not validate UTF-8; it is the caller's
88+
* responsibility to ensure all strings are valid UTF-8 before calling update().
89+
* Non-UTF-8 strings may serialize successfully but will fail to deserialize
90+
* in other language implementations.
91+
*/
92+
template<typename Allocator = std::allocator<array_of_strings>>
93+
class compact_array_of_strings_tuple_sketch:
94+
public compact_tuple_sketch<array_of_strings, Allocator> {
95+
public:
96+
// Shorthand for the base class this sketch extends.
using Base = compact_tuple_sketch<array_of_strings, Allocator>;
97+
using vector_bytes = typename Base::vector_bytes;
98+
// Makes the base class serialize overloads visible in this class.
using Base::serialize;
99+
100+
/**
101+
* Converting constructor (a constructor template is never a copy constructor).
102+
* Constructs a compact sketch from another sketch (update or compact)
103+
* @tparam Sketch type of the source sketch
* @param sketch sketch to be constructed from
104+
* @param ordered if true make the resulting sketch ordered (default: true)
* NOTE(review): this constructor is not marked explicit and Sketch is
* unconstrained, so it is a candidate for implicit conversion from any type.
105+
*/
106+
template<typename Sketch>
107+
compact_array_of_strings_tuple_sketch(const Sketch& sketch, bool ordered = true);
108+
109+
/**
110+
* This method deserializes a sketch from a given stream.
* @tparam SerDe serializer/deserializer type, default_array_of_strings_serde<Allocator> by default
111+
* @param is input stream
112+
* @param seed the seed for the hash function that was used to create the sketch
113+
* @param sd instance of a SerDe
114+
* @param allocator instance of an Allocator
115+
* @return an instance of the sketch
116+
*/
117+
template<typename SerDe = default_array_of_strings_serde<Allocator>>
118+
static compact_array_of_strings_tuple_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED,
119+
const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());
120+
121+
/**
122+
* This method deserializes a sketch from a given array of bytes.
* @tparam SerDe serializer/deserializer type, default_array_of_strings_serde<Allocator> by default
123+
* @param bytes pointer to the array of bytes
124+
* @param size the size of the array
125+
* @param seed the seed for the hash function that was used to create the sketch
126+
* @param sd instance of a SerDe
127+
* @param allocator instance of an Allocator
128+
* @return an instance of the sketch
129+
*/
130+
template<typename SerDe = default_array_of_strings_serde<Allocator>>
131+
static compact_array_of_strings_tuple_sketch deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED,
132+
const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());
133+
134+
private:
135+
// Constructs this sketch from an rvalue of the base sketch type.
// NOTE(review): presumably used by the deserialize() factories to wrap the
// result produced by Base - confirm in the implementation.
explicit compact_array_of_strings_tuple_sketch(Base&& base);
136+
};
137+
138+
/**
139+
* Convenience alias for update_tuple_sketch for array of strings.
* array_of_strings is passed as both the first and the second template
* argument of update_tuple_sketch.
* Note that the alias takes (Allocator, Policy) while update_tuple_sketch
* receives them in the order (..., Policy, Allocator).
* @tparam Allocator allocator type, std::allocator<array_of_strings> by default
* @tparam Policy update policy type, default_array_of_strings_update_policy by default
140+
*/
141+
template<typename Allocator = std::allocator<array_of_strings>,
142+
typename Policy = default_array_of_strings_update_policy>
143+
using update_array_of_strings_tuple_sketch = update_tuple_sketch<
144+
array_of_strings,
145+
array_of_strings,
146+
Policy,
147+
Allocator
148+
>;
149+
150+
/**
151+
* Converts an array of strings tuple sketch to a compact sketch (ordered or unordered).
* @tparam Allocator allocator type shared by the input and the resulting sketch
* @tparam Policy update policy type of the input sketch
152+
* @param sketch input sketch
153+
* @param ordered optional flag to specify if an ordered sketch should be produced (default: true)
154+
* @return compact array of strings sketch
155+
*/
156+
template<typename Allocator = std::allocator<array_of_strings>, typename Policy = default_array_of_strings_update_policy>
157+
compact_array_of_strings_tuple_sketch<Allocator> compact_array_of_strings_sketch(
158+
const update_array_of_strings_tuple_sketch<Allocator, Policy>& sketch, bool ordered = true);
159+
160+
} /* namespace datasketches */
161+
162+
#include "array_of_strings_sketch_impl.hpp"
163+
164+
#endif

0 commit comments

Comments
 (0)