Skip to content

Commit e2953d8

Browse files
Support a config option to store decimals as integers when writing Parquet
1 parent a801dc4 commit e2953d8

3 files changed

Lines changed: 87 additions & 1 deletion

File tree

velox/dwio/parquet/tests/writer/ParquetWriterTest.cpp

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -682,6 +682,57 @@ TEST_F(ParquetWriterTest, updateWriterOptionsFromHiveConfig) {
682682
TimestampPrecision::kMilliseconds);
683683
}
684684

685+
// Checks how WriterOptions::processConfigs resolves storeDecimalAsInteger from
// the Hive connector config, the session properties, and a value that was
// already assigned on the options object.
TEST_F(ParquetWriterTest, storeDecimalAsIntegerFromConfig) {
  using ConfigMap = std::unordered_map<std::string, std::string>;

  // Builds the two configs from the given entries, assigns 'preset' to a fresh
  // WriterOptions, runs processConfigs, and hands back the resulting option.
  const auto resolve = [](ConfigMap connectorEntries,
                          ConfigMap sessionEntries,
                          std::optional<bool> preset = std::nullopt) {
    const config::ConfigBase connectorConfig(std::move(connectorEntries));
    const config::ConfigBase sessionConfig(std::move(sessionEntries));
    parquet::WriterOptions writerOptions;
    writerOptions.storeDecimalAsInteger = preset;
    writerOptions.processConfigs(connectorConfig, sessionConfig);
    return writerOptions.storeDecimalAsInteger;
  };

  // Connector config entry holding the given raw value.
  const auto connectorEntry = [](const char* rawValue) {
    return ConfigMap{
        {parquet::WriterOptions::kParquetHiveConnectorStoreDecimalAsInteger,
         rawValue}};
  };

  // Connector config says "false", session is empty.
  {
    const auto resolved = resolve(connectorEntry("false"), ConfigMap{});
    ASSERT_TRUE(resolved.has_value());
    ASSERT_FALSE(resolved.value());
  }

  // Connector config says "true", session is empty.
  {
    const auto resolved = resolve(connectorEntry("true"), ConfigMap{});
    ASSERT_TRUE(resolved.has_value());
    ASSERT_TRUE(resolved.value());
  }

  // Session says "false" while the connector config says "true": the session
  // value is the one that ends up on the options.
  {
    ConfigMap sessionEntries = {
        {parquet::WriterOptions::kParquetSessionStoreDecimalAsInteger,
         "false"}};
    const auto resolved =
        resolve(connectorEntry("true"), std::move(sessionEntries));
    ASSERT_TRUE(resolved.has_value());
    ASSERT_FALSE(resolved.value());
  }

  // A value already set on the options is left untouched by processConfigs.
  {
    const auto resolved = resolve(connectorEntry("false"), ConfigMap{}, true);
    ASSERT_TRUE(resolved.value());
  }
}
735+
685736
#ifdef VELOX_ENABLE_PARQUET
686737
DEBUG_ONLY_TEST_F(ParquetWriterTest, timestampUnitAndTimeZone) {
687738
SCOPED_TESTVALUE_SET(

velox/dwio/parquet/writer/Writer.cpp

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,11 @@ std::shared_ptr<WriterProperties> getArrowParquetWriterOptions(
161161
properties = properties->maxRowGroupLength(
162162
static_cast<int64_t>(flushPolicy->rowsInRowGroup()));
163163
properties = properties->codecOptions(options.codecOptions);
164-
properties = properties->enableStoreDecimalAsInteger();
164+
if (options.storeDecimalAsInteger.value_or(true)) {
165+
properties = properties->enableStoreDecimalAsInteger();
166+
} else {
167+
properties = properties->disableStoreDecimalAsInteger();
168+
}
165169
if (options.useParquetDataPageV2.value_or(false)) {
166170
properties = properties->dataPageVersion(arrow::ParquetDataPageVersion::V2);
167171
} else {
@@ -349,6 +353,20 @@ std::optional<int64_t> toParquetBatchSize(
349353
}
350354
}
351355

356+
std::optional<bool> isParquetStoreDecimalAsInteger(
357+
const config::ConfigBase& config,
358+
const char* configKey) {
359+
try {
360+
if (const auto storeAsInt = config.get<bool>(configKey)) {
361+
return storeAsInt.value();
362+
}
363+
} catch (const folly::ConversionError& e) {
364+
VELOX_USER_FAIL(
365+
"Invalid parquet writer store decimal as integer option: {}", e.what());
366+
}
367+
return std::nullopt;
368+
}
369+
352370
} // namespace
353371

354372
Writer::Writer(
@@ -639,6 +657,16 @@ void WriterOptions::processConfigs(
639657
kParquetEnableDictionary, connectorConfig));
640658
}
641659

660+
// Resolve storeDecimalAsInteger only when it has not been set explicitly on
// the options. The session property takes precedence; the connector config
// is consulted only when the session property yields no value.
if (!storeDecimalAsInteger) {
  // Look up and parse the session property exactly once instead of once for
  // the has_value() check and again for the result.
  storeDecimalAsInteger = isParquetStoreDecimalAsInteger(
      session, kParquetSessionStoreDecimalAsInteger);
  if (!storeDecimalAsInteger) {
    storeDecimalAsInteger = isParquetStoreDecimalAsInteger(
        connectorConfig, kParquetHiveConnectorStoreDecimalAsInteger);
  }
}
669+
642670
if (!dictionaryPageSizeLimit) {
643671
dictionaryPageSizeLimit =
644672
toParquetPageSize(session.getWithFallback<std::string>(

velox/dwio/parquet/writer/Writer.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,9 @@ struct WriterOptions : public dwio::common::WriterOptions, public WriterConfig {
134134
std::optional<int64_t> dataPageSize;
135135
std::optional<int64_t> dictionaryPageSizeLimit;
136136
std::optional<bool> enableDictionary;
137+
/// If unset, Writer uses true (INT32/INT64 for short DECIMAL); false forces
138+
/// FIXED_LEN_BYTE_ARRAY for those precisions.
139+
std::optional<bool> storeDecimalAsInteger;
137140
std::optional<bool> useParquetDataPageV2;
138141
std::optional<std::string> createdBy;
139142

@@ -163,6 +166,10 @@ struct WriterOptions : public dwio::common::WriterOptions, public WriterConfig {
163166
"hive.parquet.writer.batch_size";
164167
static constexpr const char* kParquetCreatedBy =
165168
"hive.parquet.writer.created_by";
169+
static constexpr const char* kParquetSessionStoreDecimalAsInteger =
170+
"hive.parquet.writer.store_decimal_as_integer";
171+
static constexpr const char* kParquetHiveConnectorStoreDecimalAsInteger =
172+
"hive.parquet.writer.store-decimal-as-integer";
166173
static constexpr const char* kParquetMaxTargetFileSize =
167174
"max_target_file_size";
168175
// Serde parameter keys for timestamp settings. These can be set via

0 commit comments

Comments
 (0)