|
| 1 | +/* |
| 2 | + * Licensed to the Apache Software Foundation (ASF) under one |
| 3 | + * or more contributor license agreements. See the NOTICE file |
| 4 | + * distributed with this work for additional information |
| 5 | + * regarding copyright ownership. The ASF licenses this file |
| 6 | + * to you under the Apache License, Version 2.0 (the |
| 7 | + * "License"); you may not use this file except in compliance |
| 8 | + * with the License. You may obtain a copy of the License at |
| 9 | + * |
| 10 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | + * |
| 12 | + * Unless required by applicable law or agreed to in writing, |
| 13 | + * software distributed under the License is distributed on an |
| 14 | + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 15 | + * KIND, either express or implied. See the License for the |
| 16 | + * specific language governing permissions and limitations |
| 17 | + * under the License. |
| 18 | + */ |
| 19 | + |
| 20 | +#ifndef ARRAY_OF_STRINGS_SKETCH_HPP_ |
| 21 | +#define ARRAY_OF_STRINGS_SKETCH_HPP_ |
| 22 | + |
| 23 | +#include <memory> |
| 24 | +#include <string> |
| 25 | + |
| 26 | +#include "array_tuple_sketch.hpp" |
| 27 | +#include "xxhash64.h" |
| 28 | + |
| 29 | +namespace datasketches { |
| 30 | + |
| 31 | +using array_of_strings = array<std::string>; |
| 32 | + |
| 33 | +// default update policy for an array of strings |
| 34 | +class default_array_of_strings_update_policy { |
| 35 | +public: |
| 36 | + default_array_of_strings_update_policy() = default; |
| 37 | + |
| 38 | + array_of_strings create() const; |
| 39 | + |
| 40 | + void update(array_of_strings& array, const array_of_strings& input) const; |
| 41 | + |
| 42 | + void update(array_of_strings& array, const array_of_strings* input) const; |
| 43 | +}; |
| 44 | + |
| 45 | +/** |
| 46 | + * Serializer/deserializer for an array of strings. |
| 47 | + * |
| 48 | + * Requirements: |
| 49 | + * - Array size must be <= 127. |
| 50 | + * |
| 51 | + * This serde does not perform UTF-8 validation. Callers must ensure strings |
| 52 | + * are valid UTF-8 before serialization to guarantee interoperability with |
| 53 | + * Java, Go, and Rust implementations. |
| 54 | + */ |
| 55 | +template<typename Allocator = std::allocator<array_of_strings>> |
| 56 | +struct default_array_of_strings_serde { |
| 57 | + using summary_allocator = typename std::allocator_traits<Allocator>::template rebind_alloc<array_of_strings>; |
| 58 | + |
| 59 | + explicit default_array_of_strings_serde(const Allocator& allocator = Allocator()); |
| 60 | + |
| 61 | + void serialize(std::ostream& os, const array_of_strings* items, unsigned num) const; |
| 62 | + void deserialize(std::istream& is, array_of_strings* items, unsigned num) const; |
| 63 | + size_t serialize(void* ptr, size_t capacity, const array_of_strings* items, unsigned num) const; |
| 64 | + size_t deserialize(const void* ptr, size_t capacity, array_of_strings* items, unsigned num) const; |
| 65 | + size_t size_of_item(const array_of_strings& item) const; |
| 66 | + |
| 67 | +private: |
| 68 | + summary_allocator summary_allocator_; |
| 69 | + static void check_num_nodes(uint8_t num_nodes); |
| 70 | + static uint32_t compute_total_bytes(const array_of_strings& item); |
| 71 | +}; |
| 72 | + |
| 73 | +/** |
| 74 | + * Hashes an array of strings using ArrayOfStrings-compatible hashing. |
| 75 | + */ |
| 76 | +uint64_t hash_array_of_strings_key(const array_of_strings& key); |
| 77 | + |
| 78 | +/** |
| 79 | + * Extended class of compact_tuple_sketch for array of strings. |
| 80 | + * |
| 81 | + * Requirements: |
| 82 | + * - Array size must be <= 127. |
| 83 | + * |
| 84 | + * UTF-8 compatibility: |
| 85 | + * Serialized sketches are intended to be language and platform independent. |
| 86 | + * Other implementations (Java, Go, Rust) enforce UTF-8 encoding for strings. |
| 87 | + * This C++ implementation does not validate UTF-8; it is the caller's |
| 88 | + * responsibility to ensure all strings are valid UTF-8 before calling update(). |
| 89 | + * Non-UTF-8 strings may serialize successfully but will fail to deserialize |
| 90 | + * in other language implementations. |
| 91 | + */ |
| 92 | +template<typename Allocator = std::allocator<array_of_strings>> |
| 93 | +class compact_array_of_strings_tuple_sketch: |
| 94 | + public compact_tuple_sketch<array_of_strings, Allocator> { |
| 95 | +public: |
| 96 | + using Base = compact_tuple_sketch<array_of_strings, Allocator>; |
| 97 | + using vector_bytes = typename Base::vector_bytes; |
| 98 | + using Base::serialize; |
| 99 | + |
| 100 | + /** |
| 101 | + * Copy constructor. |
| 102 | + * Constructs a compact sketch from another sketch (update or compact) |
| 103 | + * @param other sketch to be constructed from |
| 104 | + * @param ordered if true make the resulting sketch ordered |
| 105 | + */ |
| 106 | + template<typename Sketch> |
| 107 | + compact_array_of_strings_tuple_sketch(const Sketch& sketch, bool ordered = true); |
| 108 | + |
| 109 | + /** |
| 110 | + * This method deserializes a sketch from a given stream. |
| 111 | + * @param is input stream |
| 112 | + * @param seed the seed for the hash function that was used to create the sketch |
| 113 | + * @param sd instance of a SerDe |
| 114 | + * @param allocator instance of an Allocator |
| 115 | + * @return an instance of the sketch |
| 116 | + */ |
| 117 | + template<typename SerDe = default_array_of_strings_serde<Allocator>> |
| 118 | + static compact_array_of_strings_tuple_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, |
| 119 | + const SerDe& sd = SerDe(), const Allocator& allocator = Allocator()); |
| 120 | + |
| 121 | + /** |
| 122 | + * This method deserializes a sketch from a given array of bytes. |
| 123 | + * @param bytes pointer to the array of bytes |
| 124 | + * @param size the size of the array |
| 125 | + * @param seed the seed for the hash function that was used to create the sketch |
| 126 | + * @param sd instance of a SerDe |
| 127 | + * @param allocator instance of an Allocator |
| 128 | + * @return an instance of the sketch |
| 129 | + */ |
| 130 | + template<typename SerDe = default_array_of_strings_serde<Allocator>> |
| 131 | + static compact_array_of_strings_tuple_sketch deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, |
| 132 | + const SerDe& sd = SerDe(), const Allocator& allocator = Allocator()); |
| 133 | + |
| 134 | +private: |
| 135 | + explicit compact_array_of_strings_tuple_sketch(Base&& base); |
| 136 | +}; |
| 137 | + |
| 138 | +/** |
| 139 | + * Convenience alias for update_tuple_sketch for array of strings |
| 140 | + */ |
| 141 | +template<typename Allocator = std::allocator<array_of_strings>, |
| 142 | + typename Policy = default_array_of_strings_update_policy> |
| 143 | +using update_array_of_strings_tuple_sketch = update_tuple_sketch< |
| 144 | + array_of_strings, |
| 145 | + array_of_strings, |
| 146 | + Policy, |
| 147 | + Allocator |
| 148 | +>; |
| 149 | + |
| 150 | +/** |
| 151 | + * Converts an array of strings tuple sketch to a compact sketch (ordered or unordered). |
| 152 | + * @param sketch input sketch |
| 153 | + * @param ordered optional flag to specify if an ordered sketch should be produced |
| 154 | + * @return compact array of strings sketch |
| 155 | + */ |
| 156 | +template<typename Allocator = std::allocator<array_of_strings>, typename Policy = default_array_of_strings_update_policy> |
| 157 | +compact_array_of_strings_tuple_sketch<Allocator> compact_array_of_strings_sketch( |
| 158 | + const update_array_of_strings_tuple_sketch<Allocator, Policy>& sketch, bool ordered = true); |
| 159 | + |
| 160 | +} /* namespace datasketches */ |
| 161 | + |
| 162 | +#include "array_of_strings_sketch_impl.hpp" |
| 163 | + |
| 164 | +#endif |
0 commit comments