|
24 | 24 | #include <memory> |
25 | 25 | #include "ByteRLE.hh" |
26 | 26 | #include "ColumnWriter.hh" |
| 27 | +#include "Dictionary.hh" |
27 | 28 | #include "RLE.hh" |
28 | 29 | #include "Statistics.hh" |
29 | 30 | #include "Timezone.hh" |
30 | 31 | #include "Utils.hh" |
31 | 32 |
|
32 | | -#include <sparsehash/dense_hash_map> |
33 | | - |
34 | 33 | namespace orc { |
35 | 34 | StreamsFactory::~StreamsFactory() { |
36 | 35 | // PASS |
@@ -927,104 +926,6 @@ namespace orc { |
927 | 926 | ColumnWriter::finishStreams(); |
928 | 927 | dataStream_->finishStream(); |
929 | 928 | } |
930 | | - |
931 | | - /** |
932 | | - * Implementation of increasing sorted string dictionary |
933 | | - */ |
934 | | - class SortedStringDictionary { |
935 | | - public: |
936 | | - struct DictEntry { |
937 | | - DictEntry(const char* str, size_t len) : data(std::make_unique<std::string>(str, len)) {} |
938 | | - |
939 | | - std::unique_ptr<std::string> data; |
940 | | - }; |
941 | | - |
942 | | - SortedStringDictionary() : totalLength_(0) { |
943 | | - /// Need to set empty key otherwise dense_hash_map will not work correctly |
944 | | - keyToIndex_.set_empty_key(std::string_view{}); |
945 | | - } |
946 | | - |
947 | | - // insert a new string into dictionary, return its insertion order |
948 | | - size_t insert(const char* str, size_t len); |
949 | | - |
950 | | - // write dictionary data & length to output buffer |
951 | | - void flush(AppendOnlyBufferedStream* dataStream, RleEncoder* lengthEncoder) const; |
952 | | - |
953 | | - // get dict entries in insertion order |
954 | | - const std::vector<DictEntry>& getEntriesInInsertionOrder() const; |
955 | | - |
956 | | - // return count of entries |
957 | | - size_t size() const; |
958 | | - |
959 | | - // return total length of strings in the dictionary |
960 | | - uint64_t length() const; |
961 | | - |
962 | | - void clear(); |
963 | | - |
964 | | - private: |
965 | | - // store dictionary entries in insertion order |
966 | | - mutable std::vector<DictEntry> flatDict_; |
967 | | - |
968 | | - // map from string to its insertion order index |
969 | | - google::dense_hash_map<std::string_view, size_t> keyToIndex_; |
970 | | - uint64_t totalLength_; |
971 | | - |
972 | | - // use friend class here to avoid being bothered by const function calls |
973 | | - friend class StringColumnWriter; |
974 | | - friend class CharColumnWriter; |
975 | | - friend class VarCharColumnWriter; |
976 | | - // store indexes of insertion order in the dictionary for not-null rows |
977 | | - std::vector<int64_t> idxInDictBuffer_; |
978 | | - }; |
979 | | - |
980 | | - // insert a new string into dictionary, return its insertion order |
981 | | - size_t SortedStringDictionary::insert(const char* str, size_t len) { |
982 | | - size_t index = flatDict_.size(); |
983 | | - |
984 | | - auto it = keyToIndex_.find(std::string_view{str, len}); |
985 | | - if (it != keyToIndex_.end()) { |
986 | | - return it->second; |
987 | | - } else { |
988 | | - flatDict_.emplace_back(str, len); |
989 | | - totalLength_ += len; |
990 | | - |
991 | | - const auto& lastEntry = flatDict_.back(); |
992 | | - keyToIndex_.emplace(std::string_view{lastEntry.data->data(), lastEntry.data->size()}, index); |
993 | | - return index; |
994 | | - } |
995 | | - } |
996 | | - |
997 | | - // write dictionary data & length to output buffer |
998 | | - void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream, |
999 | | - RleEncoder* lengthEncoder) const { |
1000 | | - for (const auto& entry : flatDict_) { |
1001 | | - dataStream->write(entry.data->data(), entry.data->size()); |
1002 | | - lengthEncoder->write(static_cast<int64_t>(entry.data->size())); |
1003 | | - } |
1004 | | - } |
1005 | | - |
1006 | | - // get dict entries in insertion order |
1007 | | - const std::vector<SortedStringDictionary::DictEntry>& |
1008 | | - SortedStringDictionary::getEntriesInInsertionOrder() const { |
1009 | | - return flatDict_; |
1010 | | - } |
1011 | | - |
1012 | | - // return count of entries |
1013 | | - size_t SortedStringDictionary::size() const { |
1014 | | - return flatDict_.size(); |
1015 | | - } |
1016 | | - |
1017 | | - // return total length of strings in the dictionary |
1018 | | - uint64_t SortedStringDictionary::length() const { |
1019 | | - return totalLength_; |
1020 | | - } |
1021 | | - |
1022 | | - void SortedStringDictionary::clear() { |
1023 | | - totalLength_ = 0; |
1024 | | - keyToIndex_.clear(); |
1025 | | - flatDict_.clear(); |
1026 | | - } |
1027 | | - |
1028 | 929 | class StringColumnWriter : public ColumnWriter { |
1029 | 930 | public: |
1030 | 931 | StringColumnWriter(const Type& type, const StreamsFactory& factory, |
@@ -1324,6 +1225,9 @@ namespace orc { |
1324 | 1225 | // flush dictionary data & length streams |
1325 | 1226 | dictionary.flush(dictStream.get(), dictLengthEncoder.get()); |
1326 | 1227 |
|
| 1228 | + // convert index from insertion order to dictionary order |
| 1229 | + dictionary.reorder(dictionary.idxInDictBuffer_); |
| 1230 | + |
1327 | 1231 | // write data sequences |
1328 | 1232 | int64_t* data = dictionary.idxInDictBuffer_.data(); |
1329 | 1233 | if (enableIndex) { |
@@ -1367,14 +1271,15 @@ namespace orc { |
1367 | 1271 | } |
1368 | 1272 |
|
1369 | 1273 | // get dictionary entries in insertion order |
1370 | | - const auto& entries = dictionary.getEntriesInInsertionOrder(); |
| 1274 | + std::vector<const SortedStringDictionary::DictEntry*> entries; |
| 1275 | + dictionary.getEntriesInInsertionOrder(entries); |
1371 | 1276 |
|
1372 | 1277 | // store each length of the data into a vector |
1373 | 1278 | for (uint64_t i = 0; i != dictionary.idxInDictBuffer_.size(); ++i) { |
1374 | 1279 | // write one row data in direct encoding |
1375 | 1280 | const auto& dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer_[i])]; |
1376 | | - directDataStream->write(dictEntry.data->data(), dictEntry.data->size()); |
1377 | | - directLengthEncoder->write(static_cast<int64_t>(dictEntry.data->size())); |
| 1281 | + directDataStream->write(dictEntry->data->data(), dictEntry->data->size()); |
| 1282 | + directLengthEncoder->write(static_cast<int64_t>(dictEntry->data->size())); |
1378 | 1283 | } |
1379 | 1284 |
|
1380 | 1285 | deleteDictStreams(); |
|
0 commit comments