diff --git a/.gitignore b/.gitignore index edb37019d8fd2d..a93b4957c14bf7 100644 --- a/.gitignore +++ b/.gitignore @@ -151,3 +151,4 @@ compile_commands.json .github .worktrees/ +.worktree_initialized diff --git a/be/src/core/data_type_serde/data_type_datetimev2_serde.cpp b/be/src/core/data_type_serde/data_type_datetimev2_serde.cpp index 92a5106b4815a8..ce0599080c6b2a 100644 --- a/be/src/core/data_type_serde/data_type_datetimev2_serde.cpp +++ b/be/src/core/data_type_serde/data_type_datetimev2_serde.cpp @@ -28,6 +28,7 @@ #include "core/data_type/data_type_decimal.h" #include "core/data_type/data_type_number.h" #include "core/data_type/primitive_type.h" +#include "core/data_type_serde/decoded_column_view.h" #include "core/types.h" #include "core/value/vdatetime_value.h" #include "exprs/function/cast/cast_to_datetimev2_impl.hpp" @@ -451,6 +452,34 @@ Status DataTypeDateTimeV2SerDe::read_column_from_arrow(IColumn& column, return Status::OK(); } +Status DataTypeDateTimeV2SerDe::read_column_from_decoded_values( + IColumn& column, const DecodedColumnView& view) const { + if (view.value_kind != DecodedValueKind::INT64) { + return Status::NotSupported("DATETIMEV2 decoded reader expects INT64 source"); + } + if (view.values == nullptr && view.row_count > 0) { + return Status::Corruption("Decoded value buffer is null for {}", column.get_name()); + } + auto& data = assert_cast(column).get_data(); + const auto* values = reinterpret_cast(view.values); + static const cctz::time_zone utc_time_zone = cctz::utc_time_zone(); + const int64_t second_mask = view.time_unit == DecodedTimeUnit::MILLIS ? 
1000 : 1000000; + for (int64_t row = 0; row < view.row_count; ++row) { + int64_t epoch_seconds = values[row] / second_mask; + int64_t sub_second = values[row] % second_mask; + if (sub_second < 0) { + sub_second += second_mask; + --epoch_seconds; + } + const int32_t microsecond = static_cast(sub_second * (1000000 / second_mask)); + DateV2Value datetime_value; + datetime_value.from_unixtime(epoch_seconds, utc_time_zone); + datetime_value.set_microsecond(static_cast(microsecond)); + data.push_back(datetime_value); + } + return Status::OK(); +} + Status DataTypeDateTimeV2SerDe::write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& result, int64_t row_idx, bool col_const, diff --git a/be/src/core/data_type_serde/data_type_datetimev2_serde.h b/be/src/core/data_type_serde/data_type_datetimev2_serde.h index 0389432a621730..34d0373eba1c34 100644 --- a/be/src/core/data_type_serde/data_type_datetimev2_serde.h +++ b/be/src/core/data_type_serde/data_type_datetimev2_serde.h @@ -88,6 +88,8 @@ class DataTypeDateTimeV2SerDe : public DataTypeNumberSerDe 0) { + return Status::Corruption("Decoded value buffer is null for {}", column.get_name()); + } + auto& data = assert_cast(column).get_data(); + const auto* values = reinterpret_cast(view.values); + for (int64_t row = 0; row < view.row_count; ++row) { + DateV2Value date_v2; + date_v2.get_date_from_daynr(values[row] + date_threshold); + data.push_back(date_v2); + } + return Status::OK(); +} + Status DataTypeDateV2SerDe::write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& result, int64_t row_idx, bool col_const, diff --git a/be/src/core/data_type_serde/data_type_datev2_serde.h b/be/src/core/data_type_serde/data_type_datev2_serde.h index 0375f9be4b4b23..ff985d61345d5a 100644 --- a/be/src/core/data_type_serde/data_type_datev2_serde.h +++ b/be/src/core/data_type_serde/data_type_datev2_serde.h @@ -86,6 +86,8 @@ class DataTypeDateV2SerDe : public DataTypeNumberSerDe +NativeType 
decode_big_endian_signed_integer(const uint8_t* data, int length) { + using UnsignedNativeType = + std::conditional_t, unsigned __int128, + std::make_unsigned_t>; + UnsignedNativeType value = data != nullptr && length > 0 && (data[0] & 0x80) != 0 + ? static_cast(-1) + : 0; + for (int i = 0; i < length; ++i) { + value = static_cast((value << 8) | data[i]); + } + return static_cast(value); +} + +template +typename PrimitiveTypeTraits::CppType read_decimal_decoded_value(const DecodedColumnView& view, + int64_t row) { + using FieldType = typename PrimitiveTypeTraits::CppType; + if (view.value_kind == DecodedValueKind::INT32) { + const auto* values = reinterpret_cast(view.values); + return FieldType {static_cast(values[row])}; + } + if (view.value_kind == DecodedValueKind::INT64) { + const auto* values = reinterpret_cast(view.values); + return FieldType {static_cast(values[row])}; + } + const auto& value = (*view.binary_values)[row]; + const auto length = view.value_kind == DecodedValueKind::FIXED_BINARY + ? 
view.fixed_length + : cast_set(value.size); + return FieldType { + static_cast(decode_big_endian_signed_integer( + reinterpret_cast(value.data), length))}; +} + +template +Status read_decimal_decoded_values(IColumn& column, const DecodedColumnView& view) { + auto& data = assert_cast&>(column).get_data(); + for (int64_t row = 0; row < view.row_count; ++row) { + data.push_back(read_decimal_decoded_value(view, row)); + } + return Status::OK(); +} + +} // namespace template Status DataTypeDecimalSerDe::from_string_batch(const ColumnString& str, ColumnNullable& column, @@ -381,6 +429,22 @@ Status DataTypeDecimalSerDe::read_column_from_arrow(IColumn& column, return Status::OK(); } +template +Status DataTypeDecimalSerDe::read_column_from_decoded_values( + IColumn& column, const DecodedColumnView& view) const { + if constexpr (T == TYPE_DECIMAL32 || T == TYPE_DECIMAL64 || T == TYPE_DECIMAL128I || + T == TYPE_DECIMAL256) { + if (view.value_kind == DecodedValueKind::INT32 || + view.value_kind == DecodedValueKind::INT64 || + view.value_kind == DecodedValueKind::BINARY || + view.value_kind == DecodedValueKind::FIXED_BINARY) { + return read_decimal_decoded_values(column, view); + } + } + return Status::NotSupported("Unsupported decoded values for {} from source kind {}", get_name(), + static_cast(view.value_kind)); +} + template Status DataTypeDecimalSerDe::write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& result, diff --git a/be/src/core/data_type_serde/data_type_decimal_serde.h b/be/src/core/data_type_serde/data_type_decimal_serde.h index 0185672e024718..089835a21be955 100644 --- a/be/src/core/data_type_serde/data_type_decimal_serde.h +++ b/be/src/core/data_type_serde/data_type_decimal_serde.h @@ -107,6 +107,8 @@ class DataTypeDecimalSerDe : public DataTypeSerDe { const cctz::time_zone& ctz) const override; Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, int64_t end, const cctz::time_zone& ctz) const 
override; + Status read_column_from_decoded_values(IColumn& column, + const DecodedColumnView& view) const override; Status write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& row_buffer, int64_t row_idx, bool col_const, const FormatOptions& options) const override; diff --git a/be/src/core/data_type_serde/data_type_nullable_serde.cpp b/be/src/core/data_type_serde/data_type_nullable_serde.cpp index a93f8d6126c7d5..b02c8606332b92 100644 --- a/be/src/core/data_type_serde/data_type_nullable_serde.cpp +++ b/be/src/core/data_type_serde/data_type_nullable_serde.cpp @@ -22,7 +22,7 @@ #include #include -#include +#include #include "core/assert_cast.h" #include "core/column/column.h" @@ -31,6 +31,7 @@ #include "core/column/column_vector.h" #include "core/data_type_serde/data_type_serde.h" #include "core/data_type_serde/data_type_string_serde.h" +#include "core/data_type_serde/decoded_column_view.h" #include "exprs/function/cast/cast_base.h" #include "format/transformer/vcsv_transformer.h" #include "util/jsonb_document.h" @@ -350,6 +351,23 @@ Status DataTypeNullableSerDe::read_column_from_arrow(IColumn& column, ctz); } +Status DataTypeNullableSerDe::read_column_from_decoded_values(IColumn& column, + const DecodedColumnView& view) const { + auto& nullable_column = assert_cast(column); + auto& null_map = nullable_column.get_null_map_data(); + const auto old_size = null_map.size(); + null_map.resize(null_map.size() + view.row_count); + if (view.null_map != nullptr) { + // TODO: skip if no null in map + auto* dst = null_map.data() + old_size; + memcpy(dst, view.null_map, view.row_count); + } + DecodedColumnView nested_view = view; + nested_view.null_map = nullptr; + return nested_serde->read_column_from_decoded_values(nullable_column.get_nested_column(), + nested_view); +} + bool DataTypeNullableSerDe::write_column_to_mysql_text(const IColumn& column, BufferWritable& bw, int64_t row_idx, const FormatOptions& options) const { diff --git 
a/be/src/core/data_type_serde/data_type_nullable_serde.h b/be/src/core/data_type_serde/data_type_nullable_serde.h index cfb4e1e3bca198..376f3692dc1814 100644 --- a/be/src/core/data_type_serde/data_type_nullable_serde.h +++ b/be/src/core/data_type_serde/data_type_nullable_serde.h @@ -86,6 +86,8 @@ class DataTypeNullableSerDe : public DataTypeSerDe { const cctz::time_zone& ctz) const override; Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, int64_t end, const cctz::time_zone& ctz) const override; + Status read_column_from_decoded_values(IColumn& column, + const DecodedColumnView& view) const override; Status write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& row_buffer, int64_t row_idx, bool col_const, const FormatOptions& options) const override; diff --git a/be/src/core/data_type_serde/data_type_number_serde.cpp b/be/src/core/data_type_serde/data_type_number_serde.cpp index 39e9c0726c498a..6cd30449083f23 100644 --- a/be/src/core/data_type_serde/data_type_number_serde.cpp +++ b/be/src/core/data_type_serde/data_type_number_serde.cpp @@ -27,6 +27,7 @@ #include "core/data_type/define_primitive_type.h" #include "core/data_type/primitive_type.h" #include "core/data_type_serde/data_type_serde.h" +#include "core/data_type_serde/decoded_column_view.h" #include "core/packed_int128.h" #include "core/types.h" #include "core/value/timestamptz_value.h" @@ -42,6 +43,29 @@ #include "util/to_string.h" namespace doris { +namespace { + +template +const NativeType* decoded_values_as(const DecodedColumnView& view) { + return reinterpret_cast(view.values); +} + +template +Status read_number_decoded_values(IColumn& column, const DecodedColumnView& view) { + if (view.values == nullptr && view.row_count > 0) { + return Status::Corruption("Decoded value buffer is null for {}", column.get_name()); + } + auto& data = + assert_cast::ColumnType&>(column).get_data(); + const auto* values = decoded_values_as(view); + for 
(int64_t row = 0; row < view.row_count; ++row) { + using DorisCppType = typename PrimitiveTypeTraits::CppType; + data.push_back(static_cast(values[row])); + } + return Status::OK(); +} + +} // namespace // Type map的基本结构 template struct TypeMap { @@ -156,6 +180,34 @@ Status DataTypeNumberSerDe::write_column_to_arrow(const IColumn& column, cons return Status::OK(); } +template +Status DataTypeNumberSerDe::read_column_from_decoded_values( + IColumn& column, const DecodedColumnView& view) const { + if constexpr (T == TYPE_BOOLEAN) { + if (view.value_kind == DecodedValueKind::BOOL) { + return read_number_decoded_values(column, view); + } + } else if constexpr (T == TYPE_INT) { + if (view.value_kind == DecodedValueKind::INT32) { + return read_number_decoded_values(column, view); + } + } else if constexpr (T == TYPE_BIGINT) { + if (view.value_kind == DecodedValueKind::INT64) { + return read_number_decoded_values(column, view); + } + } else if constexpr (T == TYPE_FLOAT) { + if (view.value_kind == DecodedValueKind::FLOAT) { + return read_number_decoded_values(column, view); + } + } else if constexpr (T == TYPE_DOUBLE) { + if (view.value_kind == DecodedValueKind::DOUBLE) { + return read_number_decoded_values(column, view); + } + } + return Status::NotSupported("Unsupported decoded values for {} from source kind {}", get_name(), + static_cast(view.value_kind)); +} + template Status DataTypeNumberSerDe::deserialize_one_cell_from_json(IColumn& column, Slice& slice, const FormatOptions& options) const { diff --git a/be/src/core/data_type_serde/data_type_number_serde.h b/be/src/core/data_type_serde/data_type_number_serde.h index b57f9f9d21298d..0e0a3acfc1aed7 100644 --- a/be/src/core/data_type_serde/data_type_number_serde.h +++ b/be/src/core/data_type_serde/data_type_number_serde.h @@ -117,6 +117,9 @@ class DataTypeNumberSerDe : public DataTypeSerDe { Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, int64_t end, const 
cctz::time_zone& ctz) const override; + Status read_column_from_decoded_values(IColumn& column, + const DecodedColumnView& view) const override; + Status write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& row_buffer, int64_t row_idx, bool col_const, const FormatOptions& options) const override; diff --git a/be/src/core/data_type_serde/data_type_serde.cpp b/be/src/core/data_type_serde/data_type_serde.cpp index ac688ae6c307a3..b6a49524887087 100644 --- a/be/src/core/data_type_serde/data_type_serde.cpp +++ b/be/src/core/data_type_serde/data_type_serde.cpp @@ -34,6 +34,12 @@ namespace doris { DataTypeSerDe::~DataTypeSerDe() = default; +Status DataTypeSerDe::read_column_from_decoded_values(IColumn& column, + const DecodedColumnView& view) const { + return Status::NotSupported("read_column_from_decoded_values is not supported for {}", + get_name()); +} + DataTypeSerDeSPtrs create_data_type_serdes(const DataTypes& types) { DataTypeSerDeSPtrs serdes; serdes.reserve(types.size()); diff --git a/be/src/core/data_type_serde/data_type_serde.h b/be/src/core/data_type_serde/data_type_serde.h index 7c007c6558ddf3..07ad5d5d1b02d6 100644 --- a/be/src/core/data_type_serde/data_type_serde.h +++ b/be/src/core/data_type_serde/data_type_serde.h @@ -27,6 +27,7 @@ #include "common/cast_set.h" #include "common/status.h" #include "core/column/column_nullable.h" +#include "core/data_type_serde/decoded_column_view.h" #include "core/field.h" #include "core/string_buffer.hpp" #include "core/types.h" @@ -485,6 +486,12 @@ class DataTypeSerDe { int64_t start, int64_t end, const cctz::time_zone& ctz) const = 0; + // Read already decoded column values into a Doris column. The input view is format-neutral: + // file readers translate their decoder output into DecodedColumnView, while SerDe owns + // the Doris-type-specific materialization into IColumn. 
+ virtual Status read_column_from_decoded_values(IColumn& column, + const DecodedColumnView& view) const; + // ORC serializer virtual Status write_column_to_orc(const std::string& timezone, const IColumn& column, const NullMap* null_map, diff --git a/be/src/core/data_type_serde/data_type_string_serde.cpp b/be/src/core/data_type_serde/data_type_string_serde.cpp index b7a59b3c07e42a..0a9a5cd7dabc04 100644 --- a/be/src/core/data_type_serde/data_type_string_serde.cpp +++ b/be/src/core/data_type_serde/data_type_string_serde.cpp @@ -19,11 +19,28 @@ #include "core/column/column_string.h" #include "core/data_type/define_primitive_type.h" +#include "core/data_type_serde/decoded_column_view.h" #include "util/jsonb_document_cast.h" #include "util/jsonb_utils.h" #include "util/jsonb_writer.h" namespace doris { +namespace { + +template +Status read_string_decoded_values(IColumn& column, const DecodedColumnView& view) { + if (view.binary_values == nullptr && view.row_count > 0) { + return Status::Corruption("Decoded binary values are null for {}", column.get_name()); + } + auto& string_column = assert_cast(column); + for (int64_t row = 0; row < view.row_count; ++row) { + const auto& value = (*view.binary_values)[row]; + string_column.insert_data(value.data, value.size); + } + return Status::OK(); +} + +} // namespace template Status DataTypeStringSerDeBase::serialize_column_to_json(const IColumn& column, @@ -313,6 +330,17 @@ Status DataTypeStringSerDeBase::read_column_from_arrow( return Status::OK(); } +template +Status DataTypeStringSerDeBase::read_column_from_decoded_values( + IColumn& column, const DecodedColumnView& view) const { + if (view.value_kind != DecodedValueKind::BINARY && + view.value_kind != DecodedValueKind::FIXED_BINARY) { + return Status::NotSupported("Unsupported decoded values for {} from source kind {}", + get_name(), static_cast(view.value_kind)); + } + return read_string_decoded_values(column, view); +} + template Status 
DataTypeStringSerDeBase::write_column_to_orc( const std::string& timezone, const IColumn& column, const NullMap* null_map, diff --git a/be/src/core/data_type_serde/data_type_string_serde.h b/be/src/core/data_type_serde/data_type_string_serde.h index 79c8450835d39c..81b80eab4a5cbf 100644 --- a/be/src/core/data_type_serde/data_type_string_serde.h +++ b/be/src/core/data_type_serde/data_type_string_serde.h @@ -203,6 +203,9 @@ class DataTypeStringSerDeBase : public DataTypeSerDe { Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, int64_t end, const cctz::time_zone& ctz) const override; + Status read_column_from_decoded_values(IColumn& column, + const DecodedColumnView& view) const override; + Status write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& result, int64_t row_idx, bool col_const, const FormatOptions& options) const override { diff --git a/be/src/core/data_type_serde/data_type_time_serde.cpp b/be/src/core/data_type_serde/data_type_time_serde.cpp index e57fd08a271339..a40a8d217c9bd4 100644 --- a/be/src/core/data_type_serde/data_type_time_serde.cpp +++ b/be/src/core/data_type_serde/data_type_time_serde.cpp @@ -20,11 +20,38 @@ #include "core/data_type/data_type_decimal.h" #include "core/data_type/data_type_number.h" #include "core/data_type/primitive_type.h" +#include "core/data_type_serde/decoded_column_view.h" #include "core/value/time_value.h" #include "exprs/function/cast/cast_base.h" #include "exprs/function/cast/cast_to_time_impl.hpp" namespace doris { +namespace { + +TimeValue::TimeType read_time_decoded_value(const DecodedColumnView& view, int64_t row) { + int64_t micros = 0; + if (view.value_kind == DecodedValueKind::INT32) { + const auto* values = reinterpret_cast(view.values); + micros = static_cast(values[row]) * 1000; + } else { + const auto* values = reinterpret_cast(view.values); + micros = values[row]; + if (view.time_unit == DecodedTimeUnit::MILLIS) { + micros *= 1000; + } else if 
(view.time_unit == DecodedTimeUnit::NANOS) { + micros /= 1000; + } + } + const bool negative = micros < 0; + const int64_t abs_micros = std::abs(micros); + return TimeValue::make_time( + abs_micros / TimeValue::ONE_HOUR_MICROSECONDS, + (abs_micros % TimeValue::ONE_HOUR_MICROSECONDS) / TimeValue::ONE_MINUTE_MICROSECONDS, + (abs_micros % TimeValue::ONE_MINUTE_MICROSECONDS) / TimeValue::ONE_SECOND_MICROSECONDS, + abs_micros % TimeValue::ONE_SECOND_MICROSECONDS, negative); +} + +} // namespace Status DataTypeTimeV2SerDe::write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& result, @@ -145,6 +172,21 @@ Status DataTypeTimeV2SerDe::from_string_strict_mode(StringRef& str, IColumn& col return Status::OK(); } +Status DataTypeTimeV2SerDe::read_column_from_decoded_values(IColumn& column, + const DecodedColumnView& view) const { + if (view.value_kind != DecodedValueKind::INT32 && view.value_kind != DecodedValueKind::INT64) { + return Status::NotSupported("TIMEV2 decoded reader expects INT32 or INT64 source"); + } + if (view.values == nullptr && view.row_count > 0) { + return Status::Corruption("Decoded value buffer is null for {}", column.get_name()); + } + auto& data = assert_cast(column).get_data(); + for (int64_t row = 0; row < view.row_count; ++row) { + data.push_back(read_time_decoded_value(view, row)); + } + return Status::OK(); +} + template Status DataTypeTimeV2SerDe::from_int_batch(const typename IntDataType::ColumnType& int_col, ColumnNullable& target_col) const { diff --git a/be/src/core/data_type_serde/data_type_time_serde.h b/be/src/core/data_type_serde/data_type_time_serde.h index db703616b497cf..e3fccf379c913a 100644 --- a/be/src/core/data_type_serde/data_type_time_serde.h +++ b/be/src/core/data_type_serde/data_type_time_serde.h @@ -67,6 +67,8 @@ class DataTypeTimeV2SerDe : public DataTypeNumberSerDe Status from_decimal_strict_mode_batch(const typename DecimalDataType::ColumnType& decimal_col, IColumn& target_col) const; + Status 
read_column_from_decoded_values(IColumn& column, + const DecodedColumnView& view) const override; int get_scale() const override { return _scale; } protected: diff --git a/be/src/core/data_type_serde/decoded_column_view.h b/be/src/core/data_type_serde/decoded_column_view.h new file mode 100644 index 00000000000000..9b0b14b17c777d --- /dev/null +++ b/be/src/core/data_type_serde/decoded_column_view.h @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "common/status.h" +#include "core/string_ref.h" + +namespace doris { + +class IColumn; + +// 已解码 column batch 的物理值来源类型。 +// 该枚举只描述通用内存布局,不包含 Parquet/ORC/Arrow 等格式专有类型。 +enum class DecodedValueKind { + BOOL, + INT32, + INT64, + FLOAT, + DOUBLE, + BINARY, + FIXED_BINARY, +}; + +enum class DecodedTimeUnit { + UNKNOWN, + MILLIS, + MICROS, + NANOS, +}; + +struct DecodedColumnView { + DecodedValueKind value_kind = DecodedValueKind::INT32; + DecodedTimeUnit time_unit = DecodedTimeUnit::UNKNOWN; + int64_t row_count = 0; + int decimal_precision = -1; + int decimal_scale = -1; + int fixed_length = -1; + const uint8_t* values = nullptr; + const uint8_t* null_map = nullptr; + const std::vector* binary_values = nullptr; +}; + +} // namespace doris diff --git a/be/src/exec/scan/file_scanner.cpp b/be/src/exec/scan/file_scanner.cpp index 5f1d248c1e1f4d..0ba7266456e427 100644 --- a/be/src/exec/scan/file_scanner.cpp +++ b/be/src/exec/scan/file_scanner.cpp @@ -1791,7 +1791,6 @@ Status FileScanner::_init_expr_ctxes() { if (is_file_slot) { _is_file_slot.emplace(slot_id); _file_slot_descs.emplace_back(it->second); - _file_col_names.push_back(it->second->col_name()); } _column_descs.push_back(col_desc); diff --git a/be/src/exec/scan/file_scanner.h b/be/src/exec/scan/file_scanner.h index cd4066ec987ad8..34f59cdee320a7 100644 --- a/be/src/exec/scan/file_scanner.h +++ b/be/src/exec/scan/file_scanner.h @@ -133,8 +133,6 @@ class FileScanner : public Scanner { bool _cur_reader_eof = false; // File source slot descriptors std::vector _file_slot_descs; - // col names from _file_slot_descs - std::vector _file_col_names; // Unified column descriptors for init_reader (includes file, partition, missing, synthesized cols) std::vector _column_descs; @@ -147,6 +145,7 @@ class FileScanner : public Scanner { // dest slot name to index in _dest_vexpr_ctx; std::unordered_map _dest_slot_name_to_idx; // col name to default value expr + // TODO: only 
used by json reader. Could we delete this? std::unordered_map _col_default_value_ctx; // the map values of dest slot id to src slot desc // if there is not key of dest slot id in dest_sid_to_src_sid_without_trans, it will be set to nullptr @@ -190,10 +189,9 @@ class FileScanner : public Scanner { std::unique_ptr _file_cache_statistics; std::unique_ptr _file_reader_stats; - std::unique_ptr _io_ctx; + std::shared_ptr _io_ctx; // Whether to fill partition columns from path, default is true. - bool _fill_partition_from_path = true; std::unordered_map> _partition_col_descs; std::unordered_map _partition_value_is_null; diff --git a/be/src/exprs/vliteral.cpp b/be/src/exprs/vliteral.cpp index 551839f699e2e6..9b93d7097274ee 100644 --- a/be/src/exprs/vliteral.cpp +++ b/be/src/exprs/vliteral.cpp @@ -37,12 +37,6 @@ namespace doris { class VExprContext; -void VLiteral::init(const TExprNode& node) { - Field field; - field = _data_type->get_field(node); - _column_ptr = _data_type->create_column_const(1, field); -} - Status VLiteral::prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) { RETURN_IF_ERROR_OR_PREPARED(VExpr::prepare(state, desc, context)); return Status::OK(); diff --git a/be/src/exprs/vliteral.h b/be/src/exprs/vliteral.h index b1b8e89157d420..e5a4c7a5f3dbc4 100644 --- a/be/src/exprs/vliteral.h +++ b/be/src/exprs/vliteral.h @@ -39,7 +39,9 @@ class VLiteral : public VExpr { VLiteral(const TExprNode& node, bool should_init = true) : VExpr(node), _expr_name(_data_type->get_name()) { if (should_init) { - init(node); + Field field; + field = _data_type->get_field(node); + _column_ptr = _data_type->create_column_const(1, field); } } @@ -69,11 +71,9 @@ class VLiteral : public VExpr { uint64_t get_digest(uint64_t seed) const override; protected: + VLiteral(const DataTypePtr& type) : VExpr(type, false) {} ColumnPtr _column_ptr; std::string _expr_name; - -private: - void init(const TExprNode& node); }; } // namespace doris diff --git 
a/be/src/exprs/vslot_ref.h b/be/src/exprs/vslot_ref.h index 21b5735753b83d..a3b849a87138bd 100644 --- a/be/src/exprs/vslot_ref.h +++ b/be/src/exprs/vslot_ref.h @@ -31,7 +31,7 @@ class TExprNode; class Block; class VExprContext; -class VSlotRef MOCK_REMOVE(final) : public VExpr { +class VSlotRef : public VExpr { ENABLE_FACTORY_CREATOR(VSlotRef); public: @@ -67,12 +67,18 @@ class VSlotRef MOCK_REMOVE(final) : public VExpr { column_ids.insert(_column_id); } - MOCK_FUNCTION const std::string& column_name() const { return *_column_name; } + virtual const std::string& column_name() const { return *_column_name; } uint64_t get_digest(uint64_t seed) const override; double execute_cost() const override { return 0.0; } +protected: + VSlotRef(int slot_id, int column_id, int column_uniq_id) + : _slot_id(slot_id), _column_id(column_id), _column_uniq_id(column_uniq_id) { + _node_type = TExprNodeType::SLOT_REF; + } + private: int _slot_id; int _column_id; diff --git a/be/src/format/csv/csv_reader.cpp b/be/src/format/csv/csv_reader.cpp index 539132c7c9f003..4231b8eb20c8e5 100644 --- a/be/src/format/csv/csv_reader.cpp +++ b/be/src/format/csv/csv_reader.cpp @@ -638,7 +638,7 @@ Status CsvReader::_create_file_reader(bool need_schema) { } else { _file_description.mtime = _range.__isset.modification_time ? 
_range.modification_time : 0; io::FileReaderOptions reader_options = - FileFactory::get_reader_options(_state, _file_description); + FileFactory::get_reader_options(_state->query_options(), _file_description); io::FileReaderSPtr file_reader; if (_io_ctx_holder) { file_reader = DORIS_TRY(io::DelegateReader::create_file_reader( diff --git a/be/src/format/json/new_json_reader.cpp b/be/src/format/json/new_json_reader.cpp index da141437fcf200..89992105cb87fd 100644 --- a/be/src/format/json/new_json_reader.cpp +++ b/be/src/format/json/new_json_reader.cpp @@ -478,7 +478,7 @@ Status NewJsonReader::_open_file_reader(bool need_schema) { } else { _file_description.mtime = _range.__isset.modification_time ? _range.modification_time : 0; io::FileReaderOptions reader_options = - FileFactory::get_reader_options(_state, _file_description); + FileFactory::get_reader_options(_state->query_options(), _file_description); io::FileReaderSPtr file_reader; if (_io_ctx_holder) { file_reader = DORIS_TRY(io::DelegateReader::create_file_reader( diff --git a/be/src/format/native/native_reader.cpp b/be/src/format/native/native_reader.cpp index 565bab20231125..32fb7d660ad97b 100644 --- a/be/src/format/native/native_reader.cpp +++ b/be/src/format/native/native_reader.cpp @@ -125,7 +125,7 @@ Status NativeReader::init_reader() { } io::FileReaderOptions reader_options = - FileFactory::get_reader_options(_state, file_description); + FileFactory::get_reader_options(_state->query_options(), file_description); auto reader_res = io::DelegateReader::create_file_reader( _profile, system_properties, file_description, reader_options, io::DelegateReader::AccessMode::RANDOM, _io_ctx); diff --git a/be/src/format/new_parquet/column_reader.cpp b/be/src/format/new_parquet/column_reader.cpp new file mode 100644 index 00000000000000..143cddd831aec9 --- /dev/null +++ b/be/src/format/new_parquet/column_reader.cpp @@ -0,0 +1,1548 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor 
license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/new_parquet/column_reader.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "core/column/column.h" +#include "core/column/column_array.h" +#include "core/column/column_map.h" +#include "core/column/column_nullable.h" +#include "core/column/column_struct.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_array.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_number.h" +#include "core/data_type/data_type_struct.h" +#include "core/data_type_serde/decoded_column_view.h" +#include "format/new_parquet/parquet_column_schema.h" +#include "format/reader/file_reader.h" + +namespace doris::parquet { +namespace { + +constexpr int64_t NESTED_READ_BATCH_ROWS = 4096; + +struct NestedScalarBatch { + int64_t records_read = 0; + int64_t levels_written = 0; + int64_t values_written = 0; + std::vector def_levels; + std::vector rep_levels; + std::vector value_indices; + MutableColumnPtr values_column; + + bool empty() const { return levels_written == 0; } +}; + +struct NestedScalarOverflow { + NestedScalarBatch batch; + + bool empty() const { return batch.empty(); } + void clear() { batch = NestedScalarBatch(); } 
+}; + +class ScalarColumnReader final : public ParquetColumnReader { +public: + ScalarColumnReader(int parquet_leaf_column_id, const ::parquet::ColumnDescriptor* descriptor, + ParquetTypeDescriptor type_descriptor, DataTypePtr type, std::string name, + std::shared_ptr<::parquet::internal::RecordReader> record_reader) + : _file_column_id(parquet_leaf_column_id), + _parquet_leaf_column_id(parquet_leaf_column_id), + _descriptor(descriptor), + _type_descriptor(std::move(type_descriptor)), + _type(std::move(type)), + _name(std::move(name)), + _record_reader(std::move(record_reader)) {} + + int file_column_id() const override { return _file_column_id; } + int parquet_leaf_column_id() const override { return _parquet_leaf_column_id; } + const DataTypePtr& type() const override { return _type; } + const std::string& name() const override { return _name; } + + Status read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) override; + Status skip(int64_t rows) override; + + const ::parquet::ColumnDescriptor* descriptor() const { return _descriptor; } + const std::shared_ptr<::parquet::internal::RecordReader>& record_reader() const { + return _record_reader; + } + const ParquetTypeDescriptor& type_descriptor() const { return _type_descriptor; } + +private: + int _file_column_id = -1; + int _parquet_leaf_column_id = -1; + const ::parquet::ColumnDescriptor* _descriptor = nullptr; + ParquetTypeDescriptor _type_descriptor; + DataTypePtr _type; + std::string _name; + std::shared_ptr<::parquet::internal::RecordReader> _record_reader; +}; + +class StructColumnReader final : public ParquetColumnReader { +public: + StructColumnReader(const ParquetColumnSchema& schema, DataTypePtr type, + std::vector> children) + : _field_id(schema.top_level_field_id), + _nullable_definition_level(schema.nullable_definition_level), + _type(std::move(type)), + _name(schema.name), + _children(std::move(children)) {} + + int file_column_id() const override { return _field_id; } + int 
parquet_leaf_column_id() const override { return -1; } + const DataTypePtr& type() const override { return _type; } + const std::string& name() const override { return _name; } + + Status read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) override; + Status skip(int64_t rows) override; + +private: + int _field_id = -1; + int16_t _nullable_definition_level = 0; + DataTypePtr _type; + std::string _name; + std::vector> _children; +}; + +class ListColumnReader final : public ParquetColumnReader { +public: + ListColumnReader(const ParquetColumnSchema& schema, DataTypePtr type, + std::unique_ptr element_reader) + : _field_id(schema.top_level_field_id), + _nullable_definition_level(schema.nullable_definition_level), + _repeated_repetition_level(schema.repeated_repetition_level), + _type(std::move(type)), + _name(schema.name), + _element_reader(std::move(element_reader)) {} + + int file_column_id() const override { return _field_id; } + int parquet_leaf_column_id() const override { return -1; } + const DataTypePtr& type() const override { return _type; } + const std::string& name() const override { return _name; } + + Status read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) override; + Status skip(int64_t rows) override; + +private: + int _field_id = -1; + int16_t _nullable_definition_level = 0; + int16_t _repeated_repetition_level = 0; + DataTypePtr _type; + std::string _name; + std::unique_ptr _element_reader; + NestedScalarOverflow _element_overflow; +}; + +class RowPositionColumnReader final : public ParquetColumnReader { +public: + explicit RowPositionColumnReader(int64_t row_group_first_row) + : _row_group_first_row(row_group_first_row) {} + + int file_column_id() const override { + return ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID; + } + int parquet_leaf_column_id() const override { return -1; } + const DataTypePtr& type() const override { return _type; } + const std::string& name() const override { return _name; } + + Status 
read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) override { + if (column.get() == nullptr || rows_read == nullptr) { + return Status::InvalidArgument("Invalid parquet row position read result pointer"); + } + if (rows < 0) { + return Status::InvalidArgument("Invalid parquet row position read rows {}", rows); + } + auto* vector_column = assert_cast(column.get()); + auto& data = vector_column->get_data(); + const auto old_size = data.size(); + data.resize(old_size + rows); + for (int64_t row = 0; row < rows; ++row) { + data[old_size + row] = _row_group_first_row + _next_row_position + row; + } + _next_row_position += rows; + *rows_read = rows; + return Status::OK(); + } + + Status skip(int64_t rows) override { + if (rows <= 0) { + return Status::OK(); + } + _next_row_position += rows; + return Status::OK(); + } + +private: + int64_t _row_group_first_row = 0; + int64_t _next_row_position = 0; + DataTypePtr _type = std::make_shared(); + std::string _name = ParquetColumnReaderFactory::ROW_POSITION_COLUMN_NAME; +}; + +class MapColumnReader final : public ParquetColumnReader { +public: + MapColumnReader(const ParquetColumnSchema& schema, DataTypePtr type, + std::unique_ptr key_reader, + std::unique_ptr value_reader) + : _field_id(schema.top_level_field_id), + _nullable_definition_level(schema.nullable_definition_level), + _repeated_repetition_level(schema.repeated_repetition_level), + _type(std::move(type)), + _name(schema.name), + _key_reader(std::move(key_reader)), + _value_reader(std::move(value_reader)) {} + + int file_column_id() const override { return _field_id; } + int parquet_leaf_column_id() const override { return -1; } + const DataTypePtr& type() const override { return _type; } + const std::string& name() const override { return _name; } + + Status read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) override; + Status skip(int64_t rows) override; + +private: + int _field_id = -1; + int16_t _nullable_definition_level = 0; + int16_t 
_repeated_repetition_level = 0; + DataTypePtr _type; + std::string _name; + std::unique_ptr _key_reader; + std::unique_ptr _value_reader; + NestedScalarOverflow _key_overflow; + NestedScalarOverflow _value_overflow; +}; + +Status read_records(ScalarColumnReader& column_reader, int64_t batch_rows, + ::parquet::internal::RecordReader** record_reader, int64_t* rows_read) { + auto reader = column_reader.record_reader(); + if (reader == nullptr) { + return Status::InternalError("Parquet record reader is not initialized for column {}", + column_reader.name()); + } + + int64_t records_read = 0; + try { + reader->Reset(); + reader->Reserve(batch_rows); + records_read = reader->ReadRecords(batch_rows); + } catch (const ::parquet::ParquetException& e) { + return Status::Corruption("Failed to read parquet records for column {}: {}", + column_reader.name(), e.what()); + } catch (const std::exception& e) { + return Status::InternalError("Failed to read parquet records for column {}: {}", + column_reader.name(), e.what()); + } + if (records_read < 0 || records_read > batch_rows) { + return Status::Corruption("Invalid parquet record read result for column {}: {}", + column_reader.name(), records_read); + } + *record_reader = reader.get(); + *rows_read = records_read; + return Status::OK(); +} + +struct RowRange { + int64_t start = 0; + int64_t length = 0; +}; + +std::vector selection_to_ranges(const SelectionVector& selection, + uint16_t selected_rows) { + std::vector ranges; + if (selected_rows == 0) { + return ranges; + } + + int64_t range_start = selection.get_index(0); + int64_t previous = selection.get_index(0); + for (uint16_t selection_idx = 1; selection_idx < selected_rows; ++selection_idx) { + const int64_t current = selection.get_index(selection_idx); + DCHECK_GT(current, previous); + if (current == previous + 1) { + previous = current; + continue; + } + ranges.push_back(RowRange {range_start, previous - range_start + 1}); + range_start = current; + previous = current; 
+ } + ranges.push_back(RowRange {range_start, previous - range_start + 1}); + return ranges; +} + +DecodedTimeUnit decoded_time_unit(ParquetTimeUnit time_unit) { + switch (time_unit) { + case ParquetTimeUnit::MILLIS: + return DecodedTimeUnit::MILLIS; + case ParquetTimeUnit::MICROS: + return DecodedTimeUnit::MICROS; + case ParquetTimeUnit::NANOS: + return DecodedTimeUnit::NANOS; + case ParquetTimeUnit::UNKNOWN: + default: + return DecodedTimeUnit::UNKNOWN; + } +} + +DecodedValueKind decoded_value_kind(const ParquetTypeDescriptor& type_descriptor) { + switch (type_descriptor.physical_type) { + case ::parquet::Type::BOOLEAN: + return DecodedValueKind::BOOL; + case ::parquet::Type::INT32: + return DecodedValueKind::INT32; + case ::parquet::Type::INT64: + return DecodedValueKind::INT64; + case ::parquet::Type::FLOAT: + return DecodedValueKind::FLOAT; + case ::parquet::Type::DOUBLE: + return DecodedValueKind::DOUBLE; + case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: + return DecodedValueKind::FIXED_BINARY; + case ::parquet::Type::BYTE_ARRAY: + default: + return DecodedValueKind::BINARY; + } +} + +Status build_null_map(const ScalarColumnReader& column_reader, + ::parquet::internal::RecordReader& record_reader, int64_t records_read, + NullMap* null_map) { + if (column_reader.descriptor()->max_definition_level() == 0) { + return Status::OK(); + } + if (record_reader.read_dense_for_nullable()) { + return Status::NotSupported( + "Dense nullable parquet record reader is not supported for column {}", + column_reader.name()); + } + auto* def_levels = record_reader.def_levels(); + if (def_levels == nullptr && records_read > 0) { + return Status::Corruption( + "Parquet record reader returned null definition levels for nullable column {}", + column_reader.name()); + } + const int16_t max_definition_level = column_reader.descriptor()->max_definition_level(); + null_map->resize(records_read); + auto* __restrict dst = null_map->data(); + const auto* __restrict src = def_levels; + for 
(int64_t record_idx = 0; record_idx < records_read; ++record_idx) { + dst[record_idx] = src[record_idx] != max_definition_level; + } + return Status::OK(); +} + +Status get_binary_chunks(const ScalarColumnReader& column_reader, + ::parquet::internal::RecordReader& record_reader, + std::vector>* chunks) { + auto* binary_reader = dynamic_cast<::parquet::internal::BinaryRecordReader*>(&record_reader); + if (binary_reader == nullptr) { + return Status::InternalError("Parquet binary record reader is not available for column {}", + column_reader.name()); + } + *chunks = binary_reader->GetBuilderChunks(); + return Status::OK(); +} + +Status build_binary_values(const ScalarColumnReader& column_reader, + const std::vector>& chunks, + int64_t records_read, std::vector* binary_values) { + binary_values->reserve(records_read); + for (const auto& chunk : chunks) { + if (chunk == nullptr) { + return Status::Corruption( + "Parquet binary record reader returned null chunk for column {}", + column_reader.name()); + } + if (auto* binary_array = dynamic_cast<::arrow::BinaryArray*>(chunk.get())) { + for (int64_t row_idx = 0; row_idx < binary_array->length(); ++row_idx) { + if (binary_array->IsNull(row_idx)) { + binary_values->emplace_back(static_cast(nullptr), 0); + continue; + } + int32_t length = 0; + const uint8_t* value = binary_array->GetValue(row_idx, &length); + binary_values->emplace_back(reinterpret_cast(value), length); + } + } else if (auto* fixed_array = dynamic_cast<::arrow::FixedSizeBinaryArray*>(chunk.get())) { + for (int64_t row_idx = 0; row_idx < fixed_array->length(); ++row_idx) { + if (fixed_array->IsNull(row_idx)) { + binary_values->emplace_back(static_cast(nullptr), 0); + continue; + } + binary_values->emplace_back( + reinterpret_cast(fixed_array->GetValue(row_idx)), + fixed_array->byte_width()); + } + } else { + return Status::InternalError("Unexpected Arrow binary array type for column {}", + column_reader.name()); + } + } + if (binary_values->size() != 
static_cast(records_read)) { + return Status::Corruption( + "Invalid parquet binary record read result for column {}: rows={}, records={}", + column_reader.name(), binary_values->size(), records_read); + } + return Status::OK(); +} + +Status append_scalar_values(const ScalarColumnReader& column_reader, + ::parquet::internal::RecordReader& record_reader, int64_t row_count, + const NullMap* null_map, MutableColumnPtr& column) { + std::vector binary_values; + std::vector> binary_chunks; + DecodedColumnView view; + view.value_kind = decoded_value_kind(column_reader.type_descriptor()); + view.time_unit = decoded_time_unit(column_reader.type_descriptor().time_unit); + view.row_count = row_count; + view.decimal_precision = column_reader.type_descriptor().decimal_precision; + view.decimal_scale = column_reader.type_descriptor().decimal_scale; + view.fixed_length = column_reader.type_descriptor().fixed_length; + view.null_map = null_map == nullptr || null_map->empty() ? nullptr : null_map->data(); + if (view.value_kind == DecodedValueKind::BINARY || + view.value_kind == DecodedValueKind::FIXED_BINARY) { + RETURN_IF_ERROR(get_binary_chunks(column_reader, record_reader, &binary_chunks)); + RETURN_IF_ERROR( + build_binary_values(column_reader, binary_chunks, row_count, &binary_values)); + view.binary_values = &binary_values; + } else { + view.values = record_reader.values(); + } + + RETURN_IF_ERROR( + column_reader.type()->get_serde()->read_column_from_decoded_values(*column, view)); + return Status::OK(); +} + +Status read_nested_scalar_batch(ScalarColumnReader& column_reader, int64_t batch_rows, + int16_t value_slot_definition_level, NestedScalarBatch* batch) { + if (batch == nullptr) { + return Status::InvalidArgument("Nested scalar batch is null for column {}", + column_reader.name()); + } + *batch = NestedScalarBatch(); + + ::parquet::internal::RecordReader* record_reader = nullptr; + RETURN_IF_ERROR(read_records(column_reader, batch_rows, &record_reader, 
&batch->records_read)); + if (column_reader.type()->is_nullable() && record_reader->read_dense_for_nullable()) { + return Status::NotSupported( + "Dense nullable parquet nested reader is not supported for column {}", + column_reader.name()); + } + batch->levels_written = record_reader->levels_written(); + batch->values_written = record_reader->values_written(); + if (batch->levels_written == 0 && batch->records_read > 0 && + batch->values_written == batch->records_read && + column_reader.descriptor()->max_definition_level() == 0 && + column_reader.descriptor()->max_repetition_level() == 0) { + batch->levels_written = batch->records_read; + } + if (batch->levels_written < batch->records_read || batch->values_written < 0 || + batch->values_written > batch->levels_written) { + return Status::Corruption( + "Invalid nested parquet read result for column {}: rows={}, levels={}, values={}", + column_reader.name(), batch->records_read, batch->levels_written, + batch->values_written); + } + if (batch->levels_written == 0) { + return Status::OK(); + } + + auto* def_levels = record_reader->def_levels(); + if (def_levels == nullptr && column_reader.descriptor()->max_definition_level() > 0) { + return Status::Corruption( + "Nested parquet reader returned null definition levels for column {}", + column_reader.name()); + } + batch->def_levels.resize(static_cast(batch->levels_written)); + if (column_reader.descriptor()->max_definition_level() == 0 || def_levels == nullptr) { + std::fill(batch->def_levels.begin(), batch->def_levels.end(), + column_reader.descriptor()->max_definition_level()); + } else { + std::copy(def_levels, def_levels + batch->levels_written, batch->def_levels.begin()); + } + + auto* rep_levels = record_reader->rep_levels(); + if (rep_levels == nullptr && column_reader.descriptor()->max_repetition_level() > 0) { + return Status::Corruption( + "Nested parquet reader returned null repetition levels for column {}", + column_reader.name()); + } + 
batch->rep_levels.resize(static_cast(batch->levels_written)); + if (column_reader.descriptor()->max_repetition_level() == 0 || rep_levels == nullptr) { + std::fill(batch->rep_levels.begin(), batch->rep_levels.end(), 0); + } else { + std::copy(rep_levels, rep_levels + batch->levels_written, batch->rep_levels.begin()); + } + + batch->value_indices.resize(static_cast(batch->levels_written), -1); + int64_t value_idx = 0; + const int16_t max_definition_level = column_reader.descriptor()->max_definition_level(); + NullMap value_null_map; + for (int64_t level_idx = 0; level_idx < batch->levels_written; ++level_idx) { + if (batch->def_levels[level_idx] >= value_slot_definition_level) { + if (value_idx >= batch->values_written) { + return Status::Corruption( + "Nested parquet reader returned fewer values than definition levels for " + "column {}", + column_reader.name()); + } + batch->value_indices[level_idx] = value_idx++; + if (column_reader.type()->is_nullable()) { + value_null_map.push_back(batch->def_levels[level_idx] != max_definition_level); + } + } + } + if (value_idx != batch->values_written) { + return Status::Corruption( + "Nested parquet reader returned extra values for column {}: consumed={}, values={}", + column_reader.name(), value_idx, batch->values_written); + } + if (column_reader.type()->is_nullable() && + value_null_map.size() != static_cast(batch->values_written)) { + return Status::Corruption("Invalid nested parquet null map for column {}", + column_reader.name()); + } + + batch->values_column = column_reader.type()->create_column(); + if (batch->values_written > 0) { + const NullMap* null_map = value_null_map.empty() ? 
nullptr : &value_null_map; + RETURN_IF_ERROR(append_scalar_values(column_reader, *record_reader, batch->values_written, + null_map, batch->values_column)); + } + return Status::OK(); +} + +void move_nested_scalar_tail(const NestedScalarBatch& src, int64_t start_level, + NestedScalarOverflow* overflow) { + DORIS_CHECK(overflow != nullptr); + if (start_level >= src.levels_written) { + overflow->clear(); + return; + } + + NestedScalarBatch dst; + dst.records_read = 0; + dst.levels_written = src.levels_written - start_level; + dst.def_levels.assign(src.def_levels.begin() + start_level, src.def_levels.end()); + dst.rep_levels.assign(src.rep_levels.begin() + start_level, src.rep_levels.end()); + dst.value_indices.resize(static_cast(dst.levels_written), -1); + dst.values_column = src.values_column->clone_empty(); + + for (int64_t level_idx = start_level; level_idx < src.levels_written; ++level_idx) { + const int64_t value_idx = src.value_indices[level_idx]; + if (value_idx < 0) { + continue; + } + dst.value_indices[static_cast(level_idx - start_level)] = dst.values_written; + dst.values_column->insert_from(*src.values_column, static_cast(value_idx)); + dst.values_written++; + } + overflow->batch = std::move(dst); +} + +Status append_scalar_batch_value(const ScalarColumnReader& column_reader, + const NestedScalarBatch& batch, int64_t level_idx, + MutableColumnPtr& column) { + const int64_t value_idx = batch.value_indices[level_idx]; + if (value_idx < 0) { + return Status::Corruption("Nested parquet value is absent for column {}", + column_reader.name()); + } + column->insert_from(*batch.values_column, static_cast(value_idx)); + return Status::OK(); +} + +bool supports_nested_scalar_record_reader(const ParquetColumnSchema& column_schema) { + if (supports_record_reader(column_schema.type_descriptor)) { + return true; + } + const auto& type_descriptor = column_schema.type_descriptor; + if (type_descriptor.extra_type_info != ParquetExtraTypeInfo::NONE || + 
type_descriptor.is_decimal || type_descriptor.is_timestamp || + type_descriptor.is_string_like) { + return false; + } + if (type_descriptor.converted_type != ::parquet::ConvertedType::NONE && + type_descriptor.converted_type != ::parquet::ConvertedType::UNDEFINED) { + return false; + } + switch (type_descriptor.physical_type) { + case ::parquet::Type::BOOLEAN: + case ::parquet::Type::INT32: + case ::parquet::Type::INT64: + case ::parquet::Type::FLOAT: + case ::parquet::Type::DOUBLE: + return true; + default: + return false; + } +} + +ColumnArray* array_column_from_output(MutableColumnPtr& column) { + if (auto* nullable_column = check_and_get_column(*column)) { + return assert_cast(&nullable_column->get_nested_column()); + } + return assert_cast(column.get()); +} + +ColumnMap* map_column_from_output(MutableColumnPtr& column) { + if (auto* nullable_column = check_and_get_column(*column)) { + return assert_cast(&nullable_column->get_nested_column()); + } + return assert_cast(column.get()); +} + +ColumnStruct* struct_column_from_output(MutableColumnPtr& column) { + if (auto* nullable_column = check_and_get_column(*column)) { + return assert_cast(&nullable_column->get_nested_column()); + } + return assert_cast(column.get()); +} + +NullMap* null_map_from_nullable_output(MutableColumnPtr& column) { + if (auto* nullable_column = check_and_get_column(*column)) { + return &nullable_column->get_null_map_data(); + } + return nullptr; +} + +void append_offsets(ColumnArray::Offsets64& offsets, const std::vector& entry_counts) { + offsets.reserve(offsets.size() + entry_counts.size()); + uint64_t current_offset = offsets.empty() ? 
0 : offsets.back(); + for (const auto entry_count : entry_counts) { + current_offset += entry_count; + offsets.push_back(current_offset); + } +} + +void append_parent_nulls(NullMap* dst, const NullMap& src) { + if (dst == nullptr) { + return; + } + dst->insert(src.begin(), src.end()); +} + +template +Status assemble_repeated_levels(ScalarColumnReader& driver_reader, int16_t repeated_level, + int16_t value_slot_definition_level, int64_t rows, + NestedScalarOverflow* overflow, Sink& sink, int64_t* rows_read) { + if (overflow == nullptr || rows_read == nullptr) { + return Status::InvalidArgument("Invalid repeated level assembler arguments for column {}", + driver_reader.name()); + } + *rows_read = 0; + while (*rows_read < rows) { + NestedScalarBatch read_batch; + NestedScalarBatch* batch = nullptr; + bool from_overflow = false; + if (!overflow->empty()) { + batch = &overflow->batch; + from_overflow = true; + } else { + const int64_t batch_rows = std::max(rows - *rows_read, NESTED_READ_BATCH_ROWS); + RETURN_IF_ERROR(read_nested_scalar_batch(driver_reader, batch_rows, + value_slot_definition_level, &read_batch)); + if (read_batch.empty()) { + break; + } + batch = &read_batch; + } + RETURN_IF_ERROR(sink.start_batch(*batch)); + + int64_t level_idx = 0; + while (level_idx < batch->levels_written) { + const bool starts_parent = batch->rep_levels[level_idx] < repeated_level; + if (starts_parent && *rows_read >= rows) { + move_nested_scalar_tail(*batch, level_idx, overflow); + return Status::OK(); + } + if (starts_parent) { + RETURN_IF_ERROR(sink.start_parent(*batch, level_idx)); + ++*rows_read; + } else { + if (*rows_read == 0) { + return Status::Corruption( + "Repeated parquet stream starts with repeated level for column {}", + driver_reader.name()); + } + RETURN_IF_ERROR(sink.append_repeated(*batch, level_idx)); + } + ++level_idx; + } + + if (from_overflow) { + overflow->clear(); + } + } + return Status::OK(); +} + +} // namespace + +Status ScalarColumnReader::read(int64_t 
rows, MutableColumnPtr& column, int64_t* rows_read) { + if (column.get() == nullptr || rows_read == nullptr) { + return Status::InvalidArgument("Invalid parquet column read result pointer for column {}", + _name); + } + if (_record_reader == nullptr) { + return Status::InternalError("Parquet record reader is not initialized for column {}", + _name); + } + ::parquet::internal::RecordReader* record_reader = nullptr; + RETURN_IF_ERROR(read_records(*this, rows, &record_reader, rows_read)); + if (record_reader->values_written() != *rows_read) { + return Status::Corruption( + "Invalid parquet record read result for column {}: values={}, records={}", _name, + record_reader->values_written(), *rows_read); + } + + NullMap null_map; + RETURN_IF_ERROR(build_null_map(*this, *record_reader, *rows_read, &null_map)); + + RETURN_IF_ERROR(append_scalar_values(*this, *record_reader, *rows_read, &null_map, column)); + return Status::OK(); +} + +Status ScalarColumnReader::skip(int64_t rows) { + if (rows <= 0) { + return Status::OK(); + } + + if (_record_reader == nullptr) { + return Status::InternalError("Parquet record reader is not initialized for column {}", + _name); + } + int64_t skipped_rows = 0; + try { + _record_reader->Reset(); + while (skipped_rows < rows) { + const int64_t skipped = _record_reader->SkipRecords(rows - skipped_rows); + if (skipped <= 0) { + return Status::Corruption( + "Failed to skip parquet records for column {}: skipped {} of {} rows", + _name, skipped_rows, rows); + } + skipped_rows += skipped; + } + } catch (const ::parquet::ParquetException& e) { + return Status::Corruption("Failed to skip parquet records for column {}: {}", _name, + e.what()); + } catch (const std::exception& e) { + return Status::InternalError("Failed to skip parquet records for column {}: {}", _name, + e.what()); + } + return Status::OK(); +} + +Status StructColumnReader::read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) { + if (column.get() == nullptr || rows_read == 
nullptr) { + return Status::InvalidArgument("Invalid parquet struct read result pointer for column {}", + _name); + } + if (_children.empty()) { + column->resize(static_cast(rows)); + *rows_read = rows; + return Status::OK(); + } + + auto* struct_column = struct_column_from_output(column); + DORIS_CHECK(struct_column != nullptr); + auto* parent_null_map = null_map_from_nullable_output(column); + DCHECK_EQ(struct_column->get_columns().size(), _children.size()); + + std::vector scalar_children; + scalar_children.reserve(_children.size()); + bool all_scalar_children = true; + for (const auto& child_reader : _children) { + DORIS_CHECK(child_reader != nullptr); + auto* scalar_child = dynamic_cast(child_reader.get()); + if (scalar_child == nullptr) { + all_scalar_children = false; + break; + } + scalar_children.push_back(scalar_child); + } + if (all_scalar_children) { + std::vector child_batches(scalar_children.size()); + int64_t expected_rows = -1; + for (size_t child_idx = 0; child_idx < scalar_children.size(); ++child_idx) { + RETURN_IF_ERROR(read_nested_scalar_batch(*scalar_children[child_idx], rows, 0, + &child_batches[child_idx])); + if (expected_rows < 0) { + expected_rows = child_batches[child_idx].records_read; + } else if (child_batches[child_idx].records_read != expected_rows) { + return Status::Corruption( + "Parquet struct children returned different row counts in column {}: {} " + "vs {}", + _name, expected_rows, child_batches[child_idx].records_read); + } + if (child_batches[child_idx].levels_written != child_batches[child_idx].records_read) { + return Status::Corruption( + "Parquet struct child {} returned repeated levels in column {}", + scalar_children[child_idx]->name(), _name); + } + } + + if (expected_rows <= 0) { + *rows_read = 0; + return Status::OK(); + } + + std::vector child_columns; + child_columns.reserve(scalar_children.size()); + for (size_t child_idx = 0; child_idx < scalar_children.size(); ++child_idx) { + 
child_columns.push_back(struct_column->get_column_ptr(child_idx)->assume_mutable()); + } + + NullMap parent_nulls; + parent_nulls.reserve(static_cast(expected_rows)); + for (int64_t row_idx = 0; row_idx < expected_rows; ++row_idx) { + const bool parent_is_null = + child_batches[0].def_levels[row_idx] < _nullable_definition_level; + parent_nulls.push_back(parent_is_null); + for (size_t child_idx = 1; child_idx < child_batches.size(); ++child_idx) { + const bool child_parent_is_null = + child_batches[child_idx].def_levels[row_idx] < _nullable_definition_level; + if (child_parent_is_null != parent_is_null) { + return Status::Corruption( + "Parquet struct children returned different null parent shape in " + "column {}", + _name); + } + } + for (size_t child_idx = 0; child_idx < scalar_children.size(); ++child_idx) { + if (parent_is_null) { + child_columns[child_idx]->insert_default(); + } else { + if (!scalar_children[child_idx]->type()->is_nullable() && + child_batches[child_idx].def_levels[row_idx] != + scalar_children[child_idx]->descriptor()->max_definition_level()) { + return Status::Corruption( + "Parquet STRUCT column {} contains null for non-nullable child {}", + _name, scalar_children[child_idx]->name()); + } + RETURN_IF_ERROR(append_scalar_batch_value(*scalar_children[child_idx], + child_batches[child_idx], row_idx, + child_columns[child_idx])); + } + } + } + for (size_t child_idx = 0; child_idx < child_columns.size(); ++child_idx) { + struct_column->get_column_ptr(child_idx) = std::move(child_columns[child_idx]); + } + if (parent_null_map == nullptr) { + for (const auto parent_is_null : parent_nulls) { + if (parent_is_null) { + return Status::Corruption( + "Parquet STRUCT column {} contains null for non-nullable struct", + _name); + } + } + } else { + append_parent_nulls(parent_null_map, parent_nulls); + } + *rows_read = expected_rows; + return Status::OK(); + } + + if (parent_null_map != nullptr) { + return Status::NotSupported( + "Current parquet nullable 
STRUCT reader only supports scalar children for column " + "{}", + _name); + } + + int64_t expected_rows = -1; + size_t child_idx = 0; + for (auto& child_reader : _children) { + DORIS_CHECK(child_reader != nullptr); + int64_t child_rows = 0; + auto child_column = struct_column->get_column_ptr(child_idx)->assume_mutable(); + RETURN_IF_ERROR(child_reader->read(rows, child_column, &child_rows)); + if (expected_rows < 0) { + expected_rows = child_rows; + } else if (child_rows != expected_rows) { + return Status::Corruption( + "Parquet struct children returned different row counts in column {}: {} vs {}", + _name, expected_rows, child_rows); + } + struct_column->get_column_ptr(child_idx) = std::move(child_column); + child_idx++; + } + + *rows_read = std::max(expected_rows, 0); + return Status::OK(); +} + +Status StructColumnReader::skip(int64_t rows) { + if (rows <= 0) { + return Status::OK(); + } + for (auto& child_reader : _children) { + RETURN_IF_ERROR(child_reader->skip(rows)); + } + return Status::OK(); +} + +Status ListColumnReader::read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) { + if (column.get() == nullptr || rows_read == nullptr) { + return Status::InvalidArgument("Invalid parquet list read result pointer for column {}", + _name); + } + if (_element_reader == nullptr) { + return Status::InternalError("Parquet list element reader is not initialized for column {}", + _name); + } + auto* element_reader = dynamic_cast(_element_reader.get()); + if (element_reader == nullptr) { + return Status::NotSupported( + "Current parquet LIST reader only supports scalar elements for column {}", _name); + } + auto* array_column = array_column_from_output(column); + DORIS_CHECK(array_column != nullptr); + auto* parent_null_map = null_map_from_nullable_output(column); + auto nested_column = array_column->get_data_ptr()->assume_mutable(); + std::vector entry_counts; + NullMap parent_nulls; + const int16_t element_slot_definition_level = 
_nullable_definition_level + 1; + const int16_t element_max_definition_level = + element_reader->descriptor()->max_definition_level(); + + struct ListSink { + ListColumnReader* self = nullptr; + ScalarColumnReader* element_reader = nullptr; + MutableColumnPtr* nested_column = nullptr; + std::vector* entry_counts = nullptr; + NullMap* parent_nulls = nullptr; + int16_t element_max_definition_level = 0; + + Status start_batch(const NestedScalarBatch&) { return Status::OK(); } + + Status start_parent(const NestedScalarBatch& batch, int64_t level_idx) { + const int16_t def_level = batch.def_levels[level_idx]; + if (def_level < self->_nullable_definition_level) { + if (!self->_type->is_nullable()) { + return Status::Corruption( + "Parquet LIST column {} contains null for non-nullable list", + self->_name); + } + entry_counts->push_back(0); + parent_nulls->push_back(1); + return Status::OK(); + } + entry_counts->push_back(0); + parent_nulls->push_back(0); + if (def_level == self->_nullable_definition_level) { + return Status::OK(); + } + return append_element(batch, level_idx); + } + + Status append_repeated(const NestedScalarBatch& batch, int64_t level_idx) { + if (entry_counts->empty()) { + return Status::Corruption("Invalid repeated LIST level for column {}", self->_name); + } + return append_element(batch, level_idx); + } + + Status append_element(const NestedScalarBatch& batch, int64_t level_idx) { + const int16_t def_level = batch.def_levels[level_idx]; + if (def_level == element_max_definition_level) { + RETURN_IF_ERROR(append_scalar_batch_value(*element_reader, batch, level_idx, + *nested_column)); + } else { + if (!element_reader->type()->is_nullable()) { + return Status::Corruption( + "Parquet LIST column {} contains null for non-nullable element", + self->_name); + } + (*nested_column)->insert_default(); + } + ++entry_counts->back(); + return Status::OK(); + } + }; + + ListSink sink {this, element_reader, &nested_column, + &entry_counts, &parent_nulls, 
element_max_definition_level}; + RETURN_IF_ERROR(assemble_repeated_levels(*element_reader, _repeated_repetition_level, + element_slot_definition_level, rows, + &_element_overflow, sink, rows_read)); + + array_column->get_data_ptr() = std::move(nested_column); + append_offsets(array_column->get_offsets(), entry_counts); + append_parent_nulls(parent_null_map, parent_nulls); + return Status::OK(); +} + +Status ListColumnReader::skip(int64_t rows) { + if (rows <= 0) { + return Status::OK(); + } + auto* element_reader = dynamic_cast(_element_reader.get()); + if (element_reader == nullptr) { + return Status::NotSupported( + "Current parquet LIST reader only supports scalar elements for column {}", _name); + } + struct SkipSink { + Status start_batch(const NestedScalarBatch&) { return Status::OK(); } + Status start_parent(const NestedScalarBatch&, int64_t) { return Status::OK(); } + Status append_repeated(const NestedScalarBatch&, int64_t) { return Status::OK(); } + }; + SkipSink sink; + int64_t rows_read = 0; + RETURN_IF_ERROR(assemble_repeated_levels(*element_reader, _repeated_repetition_level, + _nullable_definition_level + 1, rows, + &_element_overflow, sink, &rows_read)); + if (rows_read != rows) { + return Status::Corruption("Failed to skip parquet LIST column {}: skipped {} of {} rows", + _name, rows_read, rows); + } + return Status::OK(); +} + +Status MapColumnReader::read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) { + if (column.get() == nullptr || rows_read == nullptr) { + return Status::InvalidArgument("Invalid parquet map read result pointer for column {}", + _name); + } + if (_key_reader == nullptr || _value_reader == nullptr) { + return Status::InternalError("Parquet map child reader is not initialized for column {}", + _name); + } + auto* key_reader = dynamic_cast(_key_reader.get()); + auto* value_reader = dynamic_cast(_value_reader.get()); + if (key_reader == nullptr || value_reader == nullptr) { + return Status::NotSupported( + "Current 
parquet MAP reader only supports scalar key/value for column {}", _name); + } + + auto* map_column = map_column_from_output(column); + DORIS_CHECK(map_column != nullptr); + auto* parent_null_map = null_map_from_nullable_output(column); + auto key_column = map_column->get_keys_ptr()->assume_mutable(); + auto value_column = map_column->get_values_ptr()->assume_mutable(); + std::vector entry_counts; + NullMap parent_nulls; + const int16_t entry_definition_level = _nullable_definition_level + 1; + const int16_t key_max_definition_level = key_reader->descriptor()->max_definition_level(); + const int16_t value_max_definition_level = value_reader->descriptor()->max_definition_level(); + + struct MapSink { + MapColumnReader* self = nullptr; + ScalarColumnReader* key_reader = nullptr; + ScalarColumnReader* value_reader = nullptr; + MutableColumnPtr* key_column = nullptr; + MutableColumnPtr* value_column = nullptr; + std::vector* entry_counts = nullptr; + NullMap* parent_nulls = nullptr; + int16_t key_max_definition_level = 0; + int16_t value_max_definition_level = 0; + + Status read_value_batch(int64_t batch_rows, NestedScalarBatch* out_value_batch) { + if (!self->_value_overflow.empty()) { + *out_value_batch = std::move(self->_value_overflow.batch); + self->_value_overflow.clear(); + return Status::OK(); + } + return read_nested_scalar_batch(*value_reader, batch_rows, + self->_nullable_definition_level + 1, out_value_batch); + } + + Status validate_value_alignment(const NestedScalarBatch& key_batch, + const NestedScalarBatch& candidate_value_batch) { + if (candidate_value_batch.records_read != key_batch.records_read || + candidate_value_batch.levels_written != key_batch.levels_written) { + return Status::Corruption( + "Parquet MAP key/value levels are not aligned for column {}: key rows={}, " + "key levels={}, value rows={}, value levels={}", + self->_name, key_batch.records_read, key_batch.levels_written, + candidate_value_batch.records_read, 
candidate_value_batch.levels_written); + } + for (int64_t level_idx = 0; level_idx < key_batch.levels_written; ++level_idx) { + if (candidate_value_batch.rep_levels[level_idx] != + key_batch.rep_levels[level_idx]) { + return Status::Corruption( + "Parquet MAP key/value repetition levels are not aligned for column {}", + self->_name); + } + } + return Status::OK(); + } + + Status start_batch(const NestedScalarBatch& key_batch) { + RETURN_IF_ERROR(read_value_batch(key_batch.records_read, &value_batch)); + RETURN_IF_ERROR(validate_value_alignment(key_batch, value_batch)); + return Status::OK(); + } + + Status start_parent(const NestedScalarBatch& key_batch, int64_t level_idx) { + const int16_t def_level = key_batch.def_levels[level_idx]; + if (def_level < self->_nullable_definition_level) { + if (!self->_type->is_nullable()) { + return Status::Corruption( + "Parquet MAP column {} contains null for non-nullable map", + self->_name); + } + entry_counts->push_back(0); + parent_nulls->push_back(1); + return Status::OK(); + } + entry_counts->push_back(0); + parent_nulls->push_back(0); + if (def_level == self->_nullable_definition_level) { + return Status::OK(); + } + return append_entry(key_batch, level_idx); + } + + Status append_repeated(const NestedScalarBatch& key_batch, int64_t level_idx) { + if (entry_counts->empty()) { + return Status::Corruption("Invalid repeated MAP level for column {}", self->_name); + } + return append_entry(key_batch, level_idx); + } + + Status append_entry(const NestedScalarBatch& key_batch, int64_t level_idx) { + if (key_batch.def_levels[level_idx] != key_max_definition_level) { + return Status::Corruption("Parquet MAP column {} contains null map key", + self->_name); + } + RETURN_IF_ERROR( + append_scalar_batch_value(*key_reader, key_batch, level_idx, *key_column)); + if (value_batch.def_levels[level_idx] == value_max_definition_level) { + RETURN_IF_ERROR(append_scalar_batch_value(*value_reader, value_batch, level_idx, + *value_column)); + } 
else { + if (!value_reader->type()->is_nullable()) { + return Status::Corruption( + "Parquet MAP column {} contains null for non-nullable value", + self->_name); + } + (*value_column)->insert_default(); + } + ++entry_counts->back(); + return Status::OK(); + } + + NestedScalarBatch value_batch; + }; + + MapSink sink; + sink.self = this; + sink.key_reader = key_reader; + sink.value_reader = value_reader; + sink.key_column = &key_column; + sink.value_column = &value_column; + sink.entry_counts = &entry_counts; + sink.parent_nulls = &parent_nulls; + sink.key_max_definition_level = key_max_definition_level; + sink.value_max_definition_level = value_max_definition_level; + RETURN_IF_ERROR(assemble_repeated_levels(*key_reader, _repeated_repetition_level, + entry_definition_level, rows, &_key_overflow, sink, + rows_read)); + if (!_key_overflow.empty()) { + move_nested_scalar_tail( + sink.value_batch, + sink.value_batch.levels_written - _key_overflow.batch.levels_written, + &_value_overflow); + } + + map_column->get_keys_ptr() = std::move(key_column); + map_column->get_values_ptr() = std::move(value_column); + append_offsets(map_column->get_offsets(), entry_counts); + append_parent_nulls(parent_null_map, parent_nulls); + return Status::OK(); +} + +Status MapColumnReader::skip(int64_t rows) { + if (rows <= 0) { + return Status::OK(); + } + DORIS_CHECK(_key_reader != nullptr); + DORIS_CHECK(_value_reader != nullptr); + auto* key_reader = dynamic_cast(_key_reader.get()); + auto* value_reader = dynamic_cast(_value_reader.get()); + if (key_reader == nullptr || value_reader == nullptr) { + return Status::NotSupported( + "Current parquet MAP reader only supports scalar key/value for column {}", _name); + } + struct SkipSink { + MapColumnReader* self = nullptr; + ScalarColumnReader* value_reader = nullptr; + + Status read_value_batch(int64_t batch_rows, NestedScalarBatch* out_value_batch) { + if (!self->_value_overflow.empty()) { + *out_value_batch = 
std::move(self->_value_overflow.batch); + self->_value_overflow.clear(); + return Status::OK(); + } + return read_nested_scalar_batch(*value_reader, batch_rows, + self->_nullable_definition_level + 1, out_value_batch); + } + + Status validate_value_alignment(const NestedScalarBatch& key_batch, + const NestedScalarBatch& candidate_value_batch) { + if (candidate_value_batch.records_read != key_batch.records_read || + candidate_value_batch.levels_written != key_batch.levels_written) { + return Status::Corruption( + "Parquet MAP key/value levels are not aligned for column {} while " + "skipping", + self->_name); + } + for (int64_t level_idx = 0; level_idx < key_batch.levels_written; ++level_idx) { + if (candidate_value_batch.rep_levels[level_idx] != + key_batch.rep_levels[level_idx]) { + return Status::Corruption( + "Parquet MAP key/value repetition levels are not aligned for column {}", + self->_name); + } + } + return Status::OK(); + } + + Status start_batch(const NestedScalarBatch& key_batch) { + RETURN_IF_ERROR(read_value_batch(key_batch.records_read, &value_batch)); + RETURN_IF_ERROR(validate_value_alignment(key_batch, value_batch)); + return Status::OK(); + } + + Status start_parent(const NestedScalarBatch&, int64_t) { return Status::OK(); } + + Status append_repeated(const NestedScalarBatch&, int64_t) { return Status::OK(); } + + NestedScalarBatch value_batch; + }; + SkipSink sink; + sink.self = this; + sink.value_reader = value_reader; + int64_t rows_read = 0; + RETURN_IF_ERROR(assemble_repeated_levels(*key_reader, _repeated_repetition_level, + _nullable_definition_level + 1, rows, &_key_overflow, + sink, &rows_read)); + if (!_key_overflow.empty()) { + move_nested_scalar_tail( + sink.value_batch, + sink.value_batch.levels_written - _key_overflow.batch.levels_written, + &_value_overflow); + } + if (rows_read != rows) { + return Status::Corruption("Failed to skip parquet MAP column {}: skipped {} of {} rows", + _name, rows_read, rows); + } + return Status::OK(); 
+} + +Status ParquetColumnReader::skip(int64_t rows) { + return Status::NotSupported("Parquet column skip is not implemented, rows={}", rows); +} + +Status ParquetColumnReader::select(const SelectionVector& sel, uint16_t selected_rows, + int64_t batch_rows, MutableColumnPtr& column) { + if (column.get() == nullptr) { + return Status::InvalidArgument("Parquet selected read result is null for column {}", + name()); + } + RETURN_IF_ERROR(sel.verify(selected_rows, batch_rows)); + + const auto ranges = selection_to_ranges(sel, selected_rows); + int64_t cursor = 0; + for (const auto& range : ranges) { + if (range.start < cursor || range.start + range.length > batch_rows) { + return Status::InvalidArgument("Invalid parquet selection range [{}, {}) for column {}", + range.start, range.start + range.length, name()); + } + RETURN_IF_ERROR(skip(range.start - cursor)); + + int64_t range_rows_read = 0; + RETURN_IF_ERROR(read(range.length, column, &range_rows_read)); + if (range_rows_read != range.length) { + return Status::Corruption( + "Parquet selected read returned {} rows, expected {} rows for column {}", + range_rows_read, range.length, name()); + } + cursor = range.start + range.length; + } + RETURN_IF_ERROR(skip(batch_rows - cursor)); + return Status::OK(); +} + +ParquetColumnReaderFactory::ParquetColumnReaderFactory( + std::shared_ptr<::parquet::RowGroupReader> row_group, int num_leaf_columns) + : _row_group(std::move(row_group)), + _record_readers(static_cast(num_leaf_columns)) {} + +reader::SchemaField ParquetColumnReaderFactory::row_position_schema_field() { + reader::SchemaField field; + field.id = ROW_POSITION_COLUMN_ID; + field.name = ROW_POSITION_COLUMN_NAME; + field.type = std::make_shared(); + field.column_type = reader::ColumnType::ROW_NUMBER; + return field; +} + +std::unique_ptr ParquetColumnReaderFactory::create_row_position_column_reader( + int64_t row_group_first_row) const { + return std::make_unique(row_group_first_row); +} + +Status 
ParquetColumnReaderFactory::create_scalar_reader( + int parquet_leaf_column_id, const ParquetTypeDescriptor& type_descriptor, + const ::parquet::ColumnDescriptor* descriptor, DataTypePtr type, std::string name, + std::shared_ptr<::parquet::internal::RecordReader> record_reader, + std::unique_ptr* reader) const { + if (reader == nullptr) { + return Status::InvalidArgument("reader is null"); + } + if (descriptor == nullptr || type == nullptr || record_reader == nullptr) { + return Status::InvalidArgument("Invalid parquet column reader arguments for column {}", + name); + } + *reader = std::make_unique(parquet_leaf_column_id, descriptor, + type_descriptor, std::move(type), + std::move(name), std::move(record_reader)); + return Status::OK(); +} + +Status ParquetColumnReaderFactory::create_scalar_column_reader( + const ParquetColumnSchema& column_schema, + std::unique_ptr* reader) const { + if (reader == nullptr) { + return Status::InvalidArgument("reader is null"); + } + if (column_schema.leaf_column_id < 0 || + column_schema.leaf_column_id >= static_cast(_record_readers.size())) { + return Status::InvalidArgument("Invalid parquet leaf column id {} for column {}", + column_schema.leaf_column_id, column_schema.name); + } + if (!supports_record_reader(column_schema.type_descriptor)) { + return Status::NotSupported( + "Current parquet reader only supports primitive columns without repetition; " + "column {} is not supported", + column_schema.name); + } + if (column_schema.descriptor == nullptr || + column_schema.descriptor->max_repetition_level() != 0 || + column_schema.descriptor->max_definition_level() > 1) { + return Status::NotSupported( + "Current parquet scalar reader only supports flat primitive columns; column {} is " + "not supported", + column_schema.name); + } + std::shared_ptr<::parquet::internal::RecordReader> record_reader; + RETURN_IF_ERROR(get_record_reader(column_schema.leaf_column_id, column_schema.descriptor, + column_schema.name, &record_reader)); + 
return create_scalar_reader(column_schema.leaf_column_id, column_schema.type_descriptor, + column_schema.descriptor, column_schema.type, column_schema.name, + std::move(record_reader), reader); +} + +Status ParquetColumnReaderFactory::create_nested_scalar_column_reader( + const ParquetColumnSchema& column_schema, + std::unique_ptr* reader) const { + if (reader == nullptr) { + return Status::InvalidArgument("reader is null"); + } + if (column_schema.kind != ParquetColumnSchemaKind::PRIMITIVE) { + return Status::InvalidArgument("Parquet nested scalar reader requires primitive column {}", + column_schema.name); + } + if (column_schema.leaf_column_id < 0 || + column_schema.leaf_column_id >= static_cast(_record_readers.size())) { + return Status::InvalidArgument("Invalid parquet leaf column id {} for column {}", + column_schema.leaf_column_id, column_schema.name); + } + if (!supports_nested_scalar_record_reader(column_schema)) { + return Status::NotSupported( + "Current parquet nested scalar reader does not support column {}", + column_schema.name); + } + std::shared_ptr<::parquet::internal::RecordReader> record_reader; + RETURN_IF_ERROR(get_record_reader(column_schema.leaf_column_id, column_schema.descriptor, + column_schema.name, &record_reader)); + return create_scalar_reader(column_schema.leaf_column_id, column_schema.type_descriptor, + column_schema.descriptor, column_schema.type, column_schema.name, + std::move(record_reader), reader); +} + +Status ParquetColumnReaderFactory::get_record_reader( + int leaf_column_id, const ::parquet::ColumnDescriptor* descriptor, const std::string& name, + std::shared_ptr<::parquet::internal::RecordReader>* reader) const { + if (reader == nullptr) { + return Status::InvalidArgument("reader is null"); + } + if (_row_group == nullptr) { + return Status::InternalError("Parquet row group reader is not initialized for column {}", + name); + } + if (leaf_column_id < 0 || leaf_column_id >= static_cast(_record_readers.size())) { + return 
Status::InvalidArgument("Invalid parquet leaf column id {} for column {}", + leaf_column_id, name); + } + if (descriptor == nullptr) { + return Status::InvalidArgument("Parquet column descriptor is null for column {}", name); + } + if (_record_readers[leaf_column_id] == nullptr) { + try { + _record_readers[leaf_column_id] = + _row_group->RecordReader(leaf_column_id, /*read_dictionary=*/false); + } catch (const ::parquet::ParquetException& e) { + return Status::Corruption("Failed to create parquet record reader for column {}: {}", + name, e.what()); + } catch (const std::exception& e) { + return Status::InternalError("Failed to create parquet record reader for column {}: {}", + name, e.what()); + } + } + if (_record_readers[leaf_column_id] == nullptr) { + return Status::Corruption("Failed to create parquet record reader for column {}", name); + } + *reader = _record_readers[leaf_column_id]; + return Status::OK(); +} + +Status ParquetColumnReaderFactory::create_struct_column_reader( + const ParquetColumnSchema& column_schema, const reader::FieldProjection* projection, + std::unique_ptr* reader) const { + if (reader == nullptr) { + return Status::InvalidArgument("reader is null"); + } + std::vector> child_readers; + child_readers.reserve(column_schema.children.size()); + DataTypes projected_child_types; + Strings projected_child_names; + for (size_t child_idx = 0; child_idx < column_schema.children.size(); ++child_idx) { + const auto& child_schema = column_schema.children[child_idx]; + const reader::FieldProjection* child_projection = nullptr; + if (projection != nullptr && !projection->project_all_children) { + auto it = std::find_if(projection->children.begin(), projection->children.end(), + [&](const reader::FieldProjection& child) { + return child.file_path == child_schema->file_path; + }); + if (it == projection->children.end()) { + continue; + } + child_projection = &*it; + } + std::unique_ptr child_reader; + if (child_schema->kind == 
ParquetColumnSchemaKind::PRIMITIVE) { + RETURN_IF_ERROR(create_nested_scalar_column_reader(*child_schema, &child_reader)); + } else { + RETURN_IF_ERROR(create(*child_schema, child_projection, &child_reader)); + } + projected_child_types.push_back(child_reader->type()); + projected_child_names.push_back(child_reader->name()); + child_readers.push_back(std::move(child_reader)); + } + if (child_readers.empty() && !column_schema.children.empty()) { + return Status::NotSupported("Parquet STRUCT projection for column {} contains no children", + column_schema.name); + } + DataTypePtr type = column_schema.type; + if (projection != nullptr && !projection->project_all_children) { + type = std::make_shared(projected_child_types, projected_child_names); + if (column_schema.type != nullptr && column_schema.type->is_nullable()) { + type = make_nullable(type); + } + } + *reader = std::make_unique(column_schema, std::move(type), + std::move(child_readers)); + return Status::OK(); +} + +Status ParquetColumnReaderFactory::create_list_column_reader( + const ParquetColumnSchema& column_schema, const reader::FieldProjection* projection, + std::unique_ptr* reader) const { + if (reader == nullptr) { + return Status::InvalidArgument("reader is null"); + } + if (projection != nullptr && !projection->project_all_children) { + return Status::NotSupported("Parquet LIST projection is not implemented for column {}", + column_schema.name); + } + if (column_schema.children.size() != 1) { + return Status::NotSupported("Unsupported parquet LIST layout for column {}", + column_schema.name); + } + std::unique_ptr element_reader; + RETURN_IF_ERROR( + create_nested_scalar_column_reader(*column_schema.children[0], &element_reader)); + *reader = std::make_unique(column_schema, column_schema.type, + std::move(element_reader)); + return Status::OK(); +} + +Status ParquetColumnReaderFactory::create_map_column_reader( + const ParquetColumnSchema& column_schema, const reader::FieldProjection* projection, + 
std::unique_ptr* reader) const { + if (reader == nullptr) { + return Status::InvalidArgument("reader is null"); + } + if (projection != nullptr && !projection->project_all_children) { + return Status::NotSupported("Parquet MAP projection is not implemented for column {}", + column_schema.name); + } + if (column_schema.children.size() != 1 || column_schema.children[0]->children.size() != 2) { + return Status::NotSupported("Unsupported parquet MAP layout for column {}", + column_schema.name); + } + const auto& key_value_schema = *column_schema.children[0]; + std::unique_ptr key_reader; + RETURN_IF_ERROR(create_nested_scalar_column_reader(*key_value_schema.children[0], &key_reader)); + std::unique_ptr value_reader; + RETURN_IF_ERROR( + create_nested_scalar_column_reader(*key_value_schema.children[1], &value_reader)); + *reader = std::make_unique(column_schema, column_schema.type, + std::move(key_reader), std::move(value_reader)); + return Status::OK(); +} + +Status ParquetColumnReaderFactory::create(const ParquetColumnSchema& column_schema, + const reader::FieldProjection* projection, + std::unique_ptr* reader) const { + if (reader == nullptr) { + return Status::InvalidArgument("reader is null"); + } + switch (column_schema.kind) { + case ParquetColumnSchemaKind::PRIMITIVE: + return create_scalar_column_reader(column_schema, reader); + case ParquetColumnSchemaKind::STRUCT: + return create_struct_column_reader(column_schema, projection, reader); + case ParquetColumnSchemaKind::LIST: + return create_list_column_reader(column_schema, projection, reader); + case ParquetColumnSchemaKind::MAP: + return create_map_column_reader(column_schema, projection, reader); + } + return Status::NotSupported("Unsupported parquet column schema kind for column {}", + column_schema.name); +} + +} // namespace doris::parquet diff --git a/be/src/format/new_parquet/column_reader.h b/be/src/format/new_parquet/column_reader.h new file mode 100644 index 00000000000000..62400d739cad8b --- 
/dev/null +++ b/be/src/format/new_parquet/column_reader.h @@ -0,0 +1,147 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "common/status.h" +#include "core/data_type/data_type.h" +#include "format/new_parquet/parquet_type.h" +#include "format/new_parquet/selection_vector.h" + +namespace parquet { +class ColumnDescriptor; +class RowGroupReader; + +namespace internal { +class RecordReader; +} // namespace internal +} // namespace parquet + +namespace doris { +class IColumn; + +namespace reader { +struct FieldProjection; +struct SchemaField; +} // namespace reader + +namespace parquet { +struct ParquetColumnSchema; + +// Doris 的 Parquet column reader 抽象。 +// 该类包装 Arrow Parquet RecordReader,负责将 file-local Parquet leaf column 读取成 +// Doris-owned column。它不理解 Iceberg/global schema,也不处理 table-level +// cast/default/generated/partition 语义。 +class ParquetColumnReader { +public: + virtual ~ParquetColumnReader() = default; + + // FileReader 暴露给上层 scan request 的 file-local column id。 + // 对 top-level primitive 列,它通常等于 Parquet leaf column id;对 struct/list/map + // 这类复杂列,它表示 file schema tree 中的逻辑字段 id。 + virtual int file_column_id() const = 0; + + // Parquet 
文件内部的 leaf column id,用于访问 RowGroupReader::RecordReader、 + // ColumnChunk metadata、statistics/page index 等 Parquet 物理列结构。 + // 只有 primitive leaf reader 有有效值;复杂列 reader 没有单一 leaf column,返回 -1。 + virtual int parquet_leaf_column_id() const = 0; + + virtual const DataTypePtr& type() const = 0; + virtual const std::string& name() const = 0; + + // 读取一个 file-local column batch。 + virtual Status read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) = 0; + + // 跳过指定行数。这里必须使用 row-level skip,不能退回到 value-level Skip。 + virtual Status skip(int64_t rows); + + // 按 selection 读取当前 batch 中需要输出的行,并在末尾跳过 batch 内剩余行。 + // 该方法只允许通过 skip + read 推进 reader 游标,不允许退化为整批 read + filter。 + virtual Status select(const SelectionVector& sel, uint16_t selected_rows, int64_t batch_rows, + MutableColumnPtr& column); +}; + +// Parquet column reader 工厂。 +// 工厂绑定当前 row group,并根据 file-local schema tree 创建 Doris 自己的 column +// reader。Arrow internal RecordReader 的创建和缓存必须封装在这里,避免泄露到 +// ParquetReader 主流程。后续 reader options、Dremel assembler、延时物化 cache/skip +// 策略都应挂在该工厂上下文里,而不是继续扩展自由函数参数。 +class ParquetColumnReaderFactory { +public: + ParquetColumnReaderFactory(std::shared_ptr<::parquet::RowGroupReader> row_group, + int num_leaf_columns); + + static constexpr int ROW_POSITION_COLUMN_ID = -10001; + static constexpr const char* ROW_POSITION_COLUMN_NAME = "__parquet_row_position"; + + static reader::SchemaField row_position_schema_field(); + + // 根据 file-local schema tree 创建 column reader。复杂类型会在这里递归创建 + // children。该入口只理解 Parquet file schema,不处理 table/global schema。 + Status create(const ParquetColumnSchema& column_schema, + const reader::FieldProjection* projection, + std::unique_ptr* reader) const; + + Status create(const ParquetColumnSchema& column_schema, + std::unique_ptr* reader) const { + return create(column_schema, nullptr, reader); + } + + std::unique_ptr create_row_position_column_reader( + int64_t row_group_first_row) const; + +private: + Status create_scalar_column_reader(const 
ParquetColumnSchema& column_schema, + std::unique_ptr* reader) const; + + Status create_nested_scalar_column_reader(const ParquetColumnSchema& column_schema, + std::unique_ptr* reader) const; + + Status create_struct_column_reader(const ParquetColumnSchema& column_schema, + const reader::FieldProjection* projection, + std::unique_ptr* reader) const; + + Status create_list_column_reader(const ParquetColumnSchema& column_schema, + const reader::FieldProjection* projection, + std::unique_ptr* reader) const; + + Status create_map_column_reader(const ParquetColumnSchema& column_schema, + const reader::FieldProjection* projection, + std::unique_ptr* reader) const; + + Status get_record_reader(int leaf_column_id, const ::parquet::ColumnDescriptor* descriptor, + const std::string& name, + std::shared_ptr<::parquet::internal::RecordReader>* reader) const; + + Status create_scalar_reader(int parquet_leaf_column_id, + const ParquetTypeDescriptor& type_descriptor, + const ::parquet::ColumnDescriptor* descriptor, DataTypePtr type, + std::string name, + std::shared_ptr<::parquet::internal::RecordReader> record_reader, + std::unique_ptr* reader) const; + + std::shared_ptr<::parquet::RowGroupReader> _row_group; + mutable std::vector> _record_readers; +}; + +} // namespace parquet +} // namespace doris diff --git a/be/src/format/new_parquet/parquet_column_schema.cpp b/be/src/format/new_parquet/parquet_column_schema.cpp new file mode 100644 index 00000000000000..cbca53c7f72fda --- /dev/null +++ b/be/src/format/new_parquet/parquet_column_schema.cpp @@ -0,0 +1,296 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/new_parquet/parquet_column_schema.h" + +#include + +#include +#include +#include +#include + +#include "core/data_type/data_type_array.h" +#include "core/data_type/data_type_map.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_struct.h" +#include "format/new_parquet/parquet_type.h" + +namespace doris::parquet { +namespace { + +struct SchemaBuildContext { + int32_t top_level_field_id = -1; + int32_t parent_schema_node_id = -1; + int16_t definition_level = 0; + int16_t repetition_level = 0; + int16_t nullable_definition_level = 0; + int16_t repeated_repetition_level = 0; + std::vector file_path; + std::vector field_id_path; + std::vector name_path; + int* next_schema_node_id = nullptr; +}; + +bool is_list_node(const ::parquet::schema::Node& node) { + const auto& logical_type = node.logical_type(); + return node.converted_type() == ::parquet::ConvertedType::LIST || + (logical_type != nullptr && logical_type->is_valid() && logical_type->is_list()); +} + +bool is_map_node(const ::parquet::schema::Node& node) { + const auto& logical_type = node.logical_type(); + return node.converted_type() == ::parquet::ConvertedType::MAP || + node.converted_type() == ::parquet::ConvertedType::MAP_KEY_VALUE || + (logical_type != nullptr && logical_type->is_valid() && logical_type->is_map()); +} + +DataTypePtr nullable_if_needed(DataTypePtr type, const ::parquet::schema::Node& node) { + return node.is_optional() ? 
make_nullable(type) : type; +} + +void inherit_common_schema_state(const ::parquet::schema::Node& node, + const SchemaBuildContext& context, + ParquetColumnSchema* column_schema) { + DORIS_CHECK(column_schema != nullptr); + DORIS_CHECK(context.next_schema_node_id != nullptr); + column_schema->field_id = node.field_id(); + column_schema->top_level_field_id = context.top_level_field_id; + column_schema->schema_node_id = (*context.next_schema_node_id)++; + column_schema->parent_schema_node_id = context.parent_schema_node_id; + column_schema->file_path = context.file_path; + column_schema->field_id_path = context.field_id_path; + column_schema->name_path = context.name_path; + column_schema->name = node.name(); + column_schema->node = &node; + column_schema->max_definition_level = context.definition_level; + column_schema->max_repetition_level = context.repetition_level; + column_schema->nullable_definition_level = context.nullable_definition_level; + column_schema->repeated_repetition_level = context.repeated_repetition_level; +} + +SchemaBuildContext child_context(const SchemaBuildContext& parent, + const ::parquet::schema::Node& child_node, int32_t child_idx, + int32_t parent_schema_node_id) { + SchemaBuildContext result = parent; + result.parent_schema_node_id = parent_schema_node_id; + result.file_path.push_back(child_idx); + result.field_id_path.push_back(child_node.field_id()); + result.name_path.push_back(child_node.name()); + if (child_node.repetition() != ::parquet::Repetition::REQUIRED) { + result.definition_level++; + result.nullable_definition_level = result.definition_level; + } + if (child_node.is_repeated()) { + result.repetition_level++; + result.repeated_repetition_level = result.repetition_level; + } + return result; +} + +void propagate_child_levels(ParquetColumnSchema* column_schema) { + DORIS_CHECK(column_schema != nullptr); + for (const auto& child : column_schema->children) { + column_schema->max_definition_level = + 
std::max(column_schema->max_definition_level, child->max_definition_level); + column_schema->max_repetition_level = + std::max(column_schema->max_repetition_level, child->max_repetition_level); + } +} + +Status build_node_schema(const ::parquet::SchemaDescriptor& schema, + const ::parquet::schema::Node& node, const SchemaBuildContext& context, + std::unique_ptr* result) { + if (result == nullptr) { + return Status::InvalidArgument("result is null"); + } + auto column_schema = std::make_unique(); + inherit_common_schema_state(node, context, column_schema.get()); + + if (node.is_primitive()) { + const int leaf_column_id = schema.ColumnIndex(node); + if (leaf_column_id < 0) { + return Status::InvalidArgument("Cannot find leaf column id for parquet column {}", + node.name()); + } + column_schema->kind = ParquetColumnSchemaKind::PRIMITIVE; + column_schema->leaf_column_id = leaf_column_id; + column_schema->descriptor = schema.Column(leaf_column_id); + if (column_schema->descriptor != nullptr) { + column_schema->max_definition_level = column_schema->descriptor->max_definition_level(); + column_schema->max_repetition_level = column_schema->descriptor->max_repetition_level(); + } + column_schema->type_descriptor = resolve_parquet_type(column_schema->descriptor); + column_schema->type = column_schema->type_descriptor.doris_type; + if (column_schema->type == nullptr) { + return Status::NotSupported("Unsupported parquet column type for column {}", + node.name()); + } + column_schema->type = node.is_optional() + ? 
make_nullable(remove_nullable(column_schema->type)) + : remove_nullable(column_schema->type); + *result = std::move(column_schema); + return Status::OK(); + } + + const auto& group = static_cast(node); + if (is_list_node(node)) { + column_schema->kind = ParquetColumnSchemaKind::LIST; + if (group.field_count() != 1) { + return Status::NotSupported("Unsupported parquet LIST encoding for column {}", + node.name()); + } + const auto& repeated_node = *group.field(0); + if (!repeated_node.is_repeated() || repeated_node.is_primitive()) { + return Status::NotSupported("Unsupported parquet LIST encoding for column {}", + node.name()); + } + const auto& repeated_group = + static_cast(repeated_node); + if (repeated_group.field_count() != 1) { + return Status::NotSupported("Unsupported parquet LIST element layout for column {}", + node.name()); + } + auto repeated_context = + child_context(context, repeated_node, 0, column_schema->schema_node_id); + column_schema->repeated_repetition_level = repeated_context.repeated_repetition_level; + std::unique_ptr child; + RETURN_IF_ERROR(build_node_schema(schema, *repeated_group.field(0), + child_context(repeated_context, *repeated_group.field(0), + 0, column_schema->schema_node_id), + &child)); + column_schema->type = + nullable_if_needed(std::make_shared(child->type), node); + column_schema->children.push_back(std::move(child)); + propagate_child_levels(column_schema.get()); + *result = std::move(column_schema); + return Status::OK(); + } + + if (is_map_node(node)) { + column_schema->kind = ParquetColumnSchemaKind::MAP; + if (group.field_count() != 1) { + return Status::NotSupported("Unsupported parquet MAP encoding for column {}", + node.name()); + } + const auto& key_value_node = *group.field(0); + if (!key_value_node.is_repeated()) { + return Status::NotSupported("Unsupported parquet MAP encoding for column {}", + node.name()); + } + auto key_value_context = + child_context(context, key_value_node, 0, column_schema->schema_node_id); 
+ column_schema->repeated_repetition_level = key_value_context.repeated_repetition_level; + if (key_value_node.is_primitive()) { + return Status::NotSupported("Unsupported parquet MAP key_value layout for column {}", + node.name()); + } + const auto& key_value_group = + static_cast(key_value_node); + if (key_value_group.field_count() != 2) { + return Status::NotSupported("Unsupported parquet MAP key_value layout for column {}", + node.name()); + } + auto key_value = std::make_unique(); + inherit_common_schema_state(key_value_node, key_value_context, key_value.get()); + key_value->kind = ParquetColumnSchemaKind::STRUCT; + DataTypes child_types; + Strings child_names; + child_types.reserve(key_value_group.field_count()); + child_names.reserve(key_value_group.field_count()); + for (int child_idx = 0; child_idx < key_value_group.field_count(); ++child_idx) { + std::unique_ptr child; + RETURN_IF_ERROR(build_node_schema( + schema, *key_value_group.field(child_idx), + child_context(key_value_context, *key_value_group.field(child_idx), child_idx, + key_value->schema_node_id), + &child)); + child_types.push_back(child->type); + child_names.push_back(child->name); + key_value->children.push_back(std::move(child)); + } + key_value->type = std::make_shared(child_types, child_names); + propagate_child_levels(key_value.get()); + if (key_value->children.size() != 2) { + return Status::NotSupported("Unsupported parquet MAP key_value layout for column {}", + node.name()); + } + if (key_value->children[0]->node == nullptr || + key_value->children[0]->node->repetition() != ::parquet::Repetition::REQUIRED) { + return Status::NotSupported("Unsupported nullable parquet MAP key for column {}", + node.name()); + } + auto key_type = key_value->children[0]->type; + auto value_type = key_value->children[1]->type; + column_schema->type = + nullable_if_needed(std::make_shared(key_type, value_type), node); + column_schema->children.push_back(std::move(key_value)); + 
propagate_child_levels(column_schema.get());
+        *result = std::move(column_schema);
+        return Status::OK();
+    }
+
+    // A group that was handled by neither the LIST nor the MAP branch above is a STRUCT:
+    // recurse into every child field and build the struct type from the children's types
+    // and names, in schema order.
+    column_schema->kind = ParquetColumnSchemaKind::STRUCT;
+    DataTypes child_types;
+    Strings child_names;
+    child_types.reserve(group.field_count());
+    child_names.reserve(group.field_count());
+    for (int child_idx = 0; child_idx < group.field_count(); ++child_idx) {
+        std::unique_ptr child;
+        RETURN_IF_ERROR(build_node_schema(schema, *group.field(child_idx),
+                                          child_context(context, *group.field(child_idx), child_idx,
+                                                        column_schema->schema_node_id),
+                                          &child));
+        child_types.push_back(child->type);
+        child_names.push_back(child->name);
+        column_schema->children.push_back(std::move(child));
+    }
+    column_schema->type =
+            nullable_if_needed(std::make_shared(child_types, child_names), node);
+    propagate_child_levels(column_schema.get());
+    *result = std::move(column_schema);
+    return Status::OK();
+}
+
+} // namespace
+
+// Builds one schema tree per top-level field of the schema's root group node.
+//
+// `fields` is cleared first and then receives the trees in root-field order.
+// Returns InvalidArgument when `fields` or the root group node is null, and otherwise
+// stops at, and returns, the first error reported by build_node_schema.
+Status build_parquet_column_schema(const ::parquet::SchemaDescriptor& schema,
+                                   std::vector>* fields) {
+    if (fields == nullptr) {
+        return Status::InvalidArgument("fields is null");
+    }
+    fields->clear();
+    const auto* root = schema.group_node();
+    if (root == nullptr) {
+        return Status::InvalidArgument("Parquet schema root is null");
+    }
+    // One counter is shared by every top-level field through the context pointer.
+    // NOTE(review): presumably this keeps schema_node_id unique across the whole file;
+    // confirm against the part of build_node_schema that consumes it.
+    int next_schema_node_id = 0;
+    fields->reserve(root->field_count());
+    for (int field_idx = 0; field_idx < root->field_count(); ++field_idx) {
+        std::unique_ptr field;
+        SchemaBuildContext context;
+        context.top_level_field_id = field_idx;
+        context.next_schema_node_id = &next_schema_node_id;
+        // Top-level fields have no parent schema node, hence the -1.
+        RETURN_IF_ERROR(build_node_schema(
+                schema, *root->field(field_idx),
+                child_context(context, *root->field(field_idx), field_idx, -1), &field));
+        fields->push_back(std::move(field));
+    }
+    return Status::OK();
+}
+
+} // namespace doris::parquet
diff --git a/be/src/format/new_parquet/parquet_column_schema.h b/be/src/format/new_parquet/parquet_column_schema.h
new file mode 100644
index
00000000000000..81f9536243e8ee
--- /dev/null
+++ b/be/src/format/new_parquet/parquet_column_schema.h
@@ -0,0 +1,77 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "common/status.h"
+#include "core/data_type/data_type.h"
+#include "format/new_parquet/parquet_type.h"
+
+namespace parquet {
+class ColumnDescriptor;
+class SchemaDescriptor;
+
+namespace schema {
+class Node;
+} // namespace schema
+} // namespace parquet
+
+namespace doris::parquet {
+
+enum class ParquetColumnSchemaKind {
+    PRIMITIVE,
+    STRUCT,
+    LIST,
+    MAP,
+};
+
+// File-local schema tree of the new Parquet reader.
+// It maps Parquet logical fields to leaf column ordinals and carries no table/global schema semantics.
+struct ParquetColumnSchema {
+    int field_id = -1;
+    int top_level_field_id = -1;
+    // Ordinal of the primitive leaf column in the Parquet schema.
+    // This id is used to access the ColumnDescriptor, RowGroupReader::RecordReader, ColumnChunk
+    // metadata and statistics. A complex-type node has no single leaf column, so it stays -1.
+    int leaf_column_id = -1;
+    int schema_node_id = -1;
+    int parent_schema_node_id = -1;
+    std::vector file_path;
+    std::vector field_id_path;
+    std::vector name_path;
+    std::string name;
+    DataTypePtr type;
+    ParquetTypeDescriptor type_descriptor;
+    ParquetColumnSchemaKind kind = ParquetColumnSchemaKind::PRIMITIVE;
+    const ::parquet::schema::Node* node = nullptr;
+    const ::parquet::ColumnDescriptor* descriptor = nullptr;
+    int16_t max_definition_level = 0;
+    int16_t max_repetition_level = 0;
+    int16_t nullable_definition_level = 0;
+    int16_t repeated_repetition_level = 0;
+    std::vector> children;
+};
+
+// Builds the file-local schema tree from the Arrow Parquet core schema.
+Status build_parquet_column_schema(const ::parquet::SchemaDescriptor& schema,
+                                   std::vector>* fields);
+
+} // namespace doris::parquet
diff --git a/be/src/format/new_parquet/parquet_reader.cpp b/be/src/format/new_parquet/parquet_reader.cpp
new file mode 100644
index 00000000000000..c38d8b810a9b2d
--- /dev/null
+++ b/be/src/format/new_parquet/parquet_reader.cpp
@@ -0,0 +1,924 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "format/new_parquet/parquet_reader.h"
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "common/exception.h"
+#include "core/assert_cast.h"
+#include "core/block/block.h"
+#include "core/data_type/data_type_array.h"
+#include "core/data_type/data_type_map.h"
+#include "core/data_type/data_type_nullable.h"
+#include "core/data_type/data_type_struct.h"
+#include "exprs/vexpr_context.h"
+#include "format/new_parquet/column_reader.h"
+#include "format/new_parquet/parquet_column_schema.h"
+#include "format/new_parquet/parquet_statistics.h"
+#include "format/new_parquet/selection_vector.h"
+#include "io/fs/file_reader.h"
+#include "storage/predicate/column_predicate.h"
+#include "util/slice.h"
+
+namespace doris::parquet {
+
+// Upper bound on the number of physical rows read from a row group per get_block() call.
+constexpr int64_t DEFAULT_PARQUET_READ_BATCH_SIZE = 4096;
+
+// Converts an arrow::Status into a Doris Status.
+// OK stays OK, IOError maps to IOError, Invalid maps to InvalidArgument and every other
+// Arrow error becomes InternalError. The message is always status.ToString().
+Status arrow_status_to_doris_status(const arrow::Status& status) {
+    if (status.ok()) {
+        return Status::OK();
+    }
+    if (status.IsIOError()) {
+        return Status::IOError(status.ToString());
+    }
+    if (status.IsInvalid()) {
+        return Status::InvalidArgument(status.ToString());
+    }
+    return Status::InternalError(status.ToString());
+}
+
+// Adapts a Doris io::FileReader to arrow::io::RandomAccessFile so the Arrow Parquet core
+// reader can read through it.
+// The adapter keeps its own cursor (_pos): Seek() and the cursor-based Read() overloads
+// update it, while the positional ReadAt() overloads leave it untouched.
+class DorisRandomAccessFile final : public arrow::io::RandomAccessFile {
+public:
+    DorisRandomAccessFile(io::FileReaderSPtr file_reader, io::IOContext* io_ctx)
+            : _file_reader(std::move(file_reader)), _io_ctx(io_ctx) {
+        set_mode(arrow::io::FileMode::READ);
+    }
+
+    // Only flips the closed flag; the wrapped Doris file reader is not touched.
+    // NOTE(review): none of the read/seek methods below consult _closed, so reads after
+    // Close() are still served. Confirm that this is intended.
+    arrow::Status Close() override {
+        _closed = true;
+        return arrow::Status::OK();
+    }
+
+    bool closed() const override { return _closed; }
+
+    arrow::Result Tell() const override { return _pos; }
+
+    // Moves the cursor. Only negative positions are rejected; the position is not
+    // compared with the file size here.
+    arrow::Status Seek(int64_t position) override {
+        if (position < 0) {
+            return arrow::Status::Invalid("negative seek position");
+        }
+        _pos = position;
+        return arrow::Status::OK();
+    }
+
+    arrow::Result GetSize() override {
+        if (!_file_reader) {
+            return arrow::Status::IOError("Doris file reader is not open");
+        }
+        return static_cast(_file_reader->size());
+    }
+
+    // Reads at the cursor into `out` and advances the cursor by the bytes actually read.
+    arrow::Result Read(int64_t nbytes, void* out) override {
+        ARROW_ASSIGN_OR_RAISE(auto bytes_read, ReadAt(_pos, nbytes, out));
+        _pos += bytes_read;
+        return bytes_read;
+    }
+
+    // Buffer-returning variant: allocates `nbytes`, reads at the cursor, then resizes the
+    // buffer to the number of bytes actually read.
+    arrow::Result> Read(int64_t nbytes) override {
+        ARROW_ASSIGN_OR_RAISE(auto buffer, arrow::AllocateResizableBuffer(nbytes));
+        ARROW_ASSIGN_OR_RAISE(auto bytes_read, Read(nbytes, buffer->mutable_data()));
+        ARROW_RETURN_NOT_OK(buffer->Resize(bytes_read, false));
+        buffer->ZeroPadding();
+        return buffer;
+    }
+
+    // Positional read that does not move the cursor.
+    // Rejects negative position/length, forwards to the Doris reader with the adapter's
+    // IOContext, and reports a Doris failure as an Arrow IOError carrying the Doris
+    // message without its stack trace.
+    arrow::Result ReadAt(int64_t position, int64_t nbytes, void* out) override {
+        if (!_file_reader) {
+            return arrow::Status::IOError("Doris file reader is not open");
+        }
+        if (position < 0 || nbytes < 0) {
+            return arrow::Status::Invalid("negative read position or length");
+        }
+        size_t bytes_read = 0;
+        Status st = _file_reader->read_at(
+                static_cast(position),
+                Slice(static_cast(out), static_cast(nbytes)), &bytes_read,
+                _io_ctx);
+        if (!st.ok()) {
+            return arrow::Status::IOError(st.to_string_no_stack());
+        }
+        return static_cast(bytes_read);
+    }
+
+    // Buffer-returning positional read; same allocate/read/resize sequence as Read(nbytes).
+    arrow::Result> ReadAt(int64_t position,
+                                                         int64_t nbytes) override {
+        ARROW_ASSIGN_OR_RAISE(auto buffer, arrow::AllocateResizableBuffer(nbytes));
+        ARROW_ASSIGN_OR_RAISE(auto bytes_read, ReadAt(position, nbytes, buffer->mutable_data()));
+        ARROW_RETURN_NOT_OK(buffer->Resize(bytes_read, false));
+        buffer->ZeroPadding();
+        return buffer;
+    }
+
+private:
+    io::FileReaderSPtr _file_reader;
+    io::IOContext* _io_ctx = nullptr;
+    int64_t _pos = 0;
+    bool _closed = false;
+};
+
+// All mutable state of one ParquetReader scan: the opened file, its metadata, the selected
+// row groups and the readers of the row group currently being consumed.
+struct ParquetReaderScanState {
+    // The Doris file handle adapted to an Arrow RandomAccessFile. This object only serves
+    // random reads and carries no table/global schema semantics.
+    std::shared_ptr arrow_file;
+
+    // Arrow Parquet core reader and footer metadata. ParquetReader depends only on the core
+    // API; it does not use the parquet::arrow reader and does not output Arrow Array/RecordBatch.
+    std::unique_ptr<::parquet::ParquetFileReader> file_reader;
+    std::shared_ptr<::parquet::FileMetaData> metadata;
+    const ::parquet::SchemaDescriptor* schema = nullptr;
+    std::vector> file_schema;
+
+    // Top-level file-local projection and row group list of the current scan. projected_fields
+    // determines the output block; concrete leaf column readers are created on demand by
+    // ParquetColumnReaderFactory.
+    // NOTE(review): this struct declares no member named projected_fields; the sentence
+    // above may predate the predicate/non-predicate split below. Confirm and update.
+    std::vector predicate_fields;
+    std::vector non_predicate_fields;
+    std::vector selected_row_groups;
+    // We need this to quickly determine the first row of each row group, which is needed for position delete and page index.
+    // TODO: this may be parsed by multiple ParquetReader with the same file but different scan ranges, so we should cache it
+    std::vector row_group_first_rows;
+    size_t next_row_group_idx = 0;
+    std::shared_ptr<::parquet::RowGroupReader> current_row_group;
+    std::vector> current_predicate_columns;
+    std::vector> current_non_predicate_columns;
+    int64_t current_row_group_rows = 0;
+    int64_t current_row_group_rows_read = 0;
+    int64_t current_row_group_first_row = 0;
+};
+
+// Rewinds the scan to the first selected row group and drops all per-row-group state.
+// It performs the same resets as _reset_current_row_group() and additionally sets
+// next_row_group_idx back to 0. Always returns OK.
+Status ParquetReader::_reset_reader_position() {
+    _state->next_row_group_idx = 0;
+    _state->current_row_group.reset();
+    _state->current_predicate_columns.clear();
+    _state->current_non_predicate_columns.clear();
+    _state->current_row_group_rows = 0;
+    _state->current_row_group_rows_read = 0;
+    _state->current_row_group_first_row = 0;
+    return Status::OK();
+}
+
+// Releases the current row group and its column readers and zeroes its row counters.
+// next_row_group_idx is left alone, so the scan continues with the following row group.
+void ParquetReader::_reset_current_row_group() {
+    _state->current_row_group.reset();
+    _state->current_predicate_columns.clear();
+    _state->current_non_predicate_columns.clear();
+    _state->current_row_group_rows = 0;
+    _state->current_row_group_rows_read = 0;
+    _state->current_row_group_first_row = 0;
+}
+
+// Copies a file-local schema node into the reader-facing SchemaField. Note that the
+// field id is taken from top_level_field_id, not from the node's own field_id.
+void ParquetReader::_fill_schema_field(const ParquetColumnSchema& column_schema,
+                                       reader::SchemaField* field) const {
+    field->id = column_schema.top_level_field_id;
+    field->name = column_schema.name;
+    field->type = column_schema.type;
+    field->file_path = column_schema.file_path;
+    field->field_id_path = column_schema.field_id_path;
+    field->name_path = column_schema.name_path;
+
field->children.clear(); + field->children.reserve(column_schema.children.size()); + for (const auto& child : column_schema.children) { + reader::SchemaField child_field; + _fill_schema_field(*child, &child_field); + field->children.push_back(std::move(child_field)); + } +} + +Status ParquetReader::_fill_projected_schema_field(const ParquetColumnSchema& column_schema, + const reader::FieldProjection* projection, + reader::SchemaField* field) const { + if (field == nullptr) { + return Status::InvalidArgument("projected schema field is null"); + } + _fill_schema_field(column_schema, field); + if (projection == nullptr || projection->project_all_children || + column_schema.children.empty()) { + return Status::OK(); + } + + field->children.clear(); + std::map child_projection_by_idx; + for (const auto& child_projection : projection->children) { + if (child_projection.file_path.empty()) { + return Status::InvalidArgument("Empty parquet projection path for column {}", + column_schema.name); + } + child_projection_by_idx.emplace(child_projection.file_path.back(), &child_projection); + } + + DataTypes child_types; + Strings child_names; + for (size_t child_idx = 0; child_idx < column_schema.children.size(); ++child_idx) { + auto it = child_projection_by_idx.find(static_cast(child_idx)); + if (it == child_projection_by_idx.end()) { + continue; + } + if (it->second->file_path != column_schema.children[child_idx]->file_path) { + return Status::InvalidArgument("Invalid parquet projection path for column {}", + column_schema.children[child_idx]->name); + } + reader::SchemaField child_field; + RETURN_IF_ERROR(_fill_projected_schema_field(*column_schema.children[child_idx], it->second, + &child_field)); + child_types.push_back(child_field.type); + child_names.push_back(child_field.name); + field->children.push_back(std::move(child_field)); + } + + if (field->children.empty()) { + return Status::NotSupported("Parquet projection for column {} contains no children", + 
column_schema.name); + } + + const auto primitive_type = remove_nullable(column_schema.type)->get_primitive_type(); + DataTypePtr projected_type; + switch (primitive_type) { + case TYPE_STRUCT: + projected_type = std::make_shared(child_types, child_names); + break; + case TYPE_ARRAY: + DORIS_CHECK(child_types.size() == 1); + projected_type = std::make_shared(child_types[0]); + break; + case TYPE_MAP: + DORIS_CHECK(child_types.size() == 1); + DORIS_CHECK(remove_nullable(child_types[0])->get_primitive_type() == TYPE_STRUCT); + { + const auto* entry_type = + assert_cast(remove_nullable(child_types[0]).get()); + DORIS_CHECK(entry_type->get_elements().size() == 2); + projected_type = std::make_shared(entry_type->get_element(0), + entry_type->get_element(1)); + } + break; + default: + return Status::InvalidArgument("Cannot project children from non-complex parquet column {}", + column_schema.name); + } + field->type = + column_schema.type->is_nullable() ? make_nullable(projected_type) : projected_type; + return Status::OK(); +} + +Status ParquetReader::_get_projected_schema_field(reader::ColumnId file_column_id, + const reader::FieldProjection* projection, + reader::SchemaField* field) const { + if (file_column_id < 0 || + file_column_id >= static_cast(_state->file_schema.size())) { + return Status::InvalidArgument("Invalid parquet field id {}", file_column_id); + } + RETURN_IF_ERROR( + _fill_projected_schema_field(*_state->file_schema[file_column_id], projection, field)); + field->id = file_column_id; + return Status::OK(); +} + +Status ParquetReader::_read_filter_columns(int64_t batch_rows, Block* file_block, + SelectionVector* selection, uint16_t* selected_rows) { + selection->resize(static_cast(batch_rows)); + for (size_t filter_idx = 0; filter_idx < _request->predicate_columns.size(); ++filter_idx) { + const int file_field_id = _request->predicate_columns[filter_idx]; + auto& column_reader = _state->current_predicate_columns[filter_idx]; + auto position_it = 
_request->column_positions.find(file_field_id); + DORIS_CHECK(position_it != _request->column_positions.end()); + const auto block_position = position_it->second; + auto column = file_block->get_by_position(block_position).column->assume_mutable(); + DCHECK_EQ(file_block->get_by_position(block_position).type->get_primitive_type(), + column_reader->type()->get_primitive_type()); + int64_t column_rows = 0; + RETURN_IF_ERROR(column_reader->read(batch_rows, column, &column_rows)); + if (column_rows != batch_rows) { + return Status::Corruption("Parquet filter column {} returned {} rows, expected {} rows", + column_reader->name(), column_rows, batch_rows); + } + file_block->replace_by_position(block_position, std::move(column)); + } + return _execute_filter_conjuncts(batch_rows, file_block, selection, selected_rows); +} + +Status ParquetReader::_execute_filter_conjuncts(int64_t batch_rows, Block* file_block, + SelectionVector* selection, + uint16_t* selected_rows) { + // Conjuncts may reference several predicate columns. Execute them only after all referenced + // predicate columns in the file-local block have been materialized. + for (const auto& conjunct : _request->conjuncts) { + if (*selected_rows == 0) { + break; + } + IColumn::Filter filter(static_cast(batch_rows), 1); + bool can_filter_all = false; + RETURN_IF_ERROR(conjunct->execute_filter(file_block, filter.data(), + static_cast(batch_rows), false, + &can_filter_all)); + *selected_rows = + can_filter_all ? 
0 : _apply_filter_to_selection(filter, selection, *selected_rows);
+    }
+    // Delete conjuncts mark the rows to drop: a non-zero result byte means "deleted", so
+    // the result is inverted into a keep-filter before it is applied to the selection.
+    for (const auto& delete_conjunct : _request->delete_conjuncts) {
+        if (*selected_rows == 0) {
+            break;
+        }
+        // Remember the block width so the trailing columns can be removed again with
+        // erase_tail() once the result has been consumed.
+        const size_t original_columns = file_block->columns();
+        int result_column_id = -1;
+        RETURN_IF_ERROR(delete_conjunct->root()->execute(delete_conjunct.get(), file_block,
+                                                         &result_column_id));
+        DORIS_CHECK(result_column_id >= 0 &&
+                    result_column_id < static_cast(file_block->columns()));
+        // delete_filter refers to data owned by the block's result column, so it has to be
+        // fully consumed before erase_tail() below.
+        const auto& delete_filter = assert_cast(
+                                            *file_block->get_by_position(result_column_id).column)
+                                            .get_data();
+        DORIS_CHECK(delete_filter.size() == static_cast(batch_rows));
+        IColumn::Filter keep_filter(static_cast(batch_rows), 1);
+        bool has_kept_row = false;
+        for (size_t row = 0; row < static_cast(batch_rows); ++row) {
+            keep_filter[row] = !delete_filter[row];
+            has_kept_row |= keep_filter[row] != 0;
+        }
+        file_block->erase_tail(original_columns);
+        // If every row of the batch is deleted, skip the per-row pass over the selection.
+        *selected_rows =
+                !has_kept_row ? 0
+                              : _apply_filter_to_selection(keep_filter, selection, *selected_rows);
+    }
+    return Status::OK();
+}
+
+// Expands the first `selected_rows` entries of `selection` into a dense filter with one
+// byte per row of the batch: 1 at every selected row index, 0 everywhere else.
+IColumn::Filter ParquetReader::_selection_to_filter(const SelectionVector& selection,
+                                                    uint16_t selected_rows, int64_t batch_rows) {
+    IColumn::Filter filter(static_cast(batch_rows), 0);
+    for (uint16_t selection_idx = 0; selection_idx < selected_rows; ++selection_idx) {
+        filter[selection.get_index(selection_idx)] = 1;
+    }
+    return filter;
+}
+
+// Narrows `selection` in place: of its first `selected_rows` row indices only those whose
+// `filter` byte is non-zero are kept, in their original order. Returns the new count.
+// `filter` is indexed by row index within the batch, not by position in the selection.
+uint16_t ParquetReader::_apply_filter_to_selection(const IColumn::Filter& filter,
+                                                   SelectionVector* selection,
+                                                   uint16_t selected_rows) {
+    uint16_t new_selected_rows = 0;
+    for (uint16_t selection_idx = 0; selection_idx < selected_rows; ++selection_idx) {
+        const auto row_idx = selection->get_index(selection_idx);
+        if (filter[row_idx] != 0) {
+            // The write position never passes the read position, so compacting in place
+            // cannot overwrite an entry that has not been examined yet.
+            selection->set_index(new_selected_rows++, static_cast(row_idx));
+        }
+    }
+    return new_selected_rows;
+}
+
+// Opens the next selected row group that contains rows, skipping empty ones.
+// *has_row_group stays false when no selected row group is left.
+Status ParquetReader::_open_next_row_group(bool* has_row_group) {
+    *has_row_group = false;
+    while
(_state->next_row_group_idx < _state->selected_row_groups.size()) { + const int row_group_idx = _state->selected_row_groups[_state->next_row_group_idx++]; + try { + _state->current_row_group = _state->file_reader->RowGroup(row_group_idx); + } catch (const ::parquet::ParquetException& e) { + return Status::Corruption("Failed to open parquet row group {}: {}", row_group_idx, + e.what()); + } catch (const std::exception& e) { + return Status::InternalError("Failed to open parquet row group {}: {}", row_group_idx, + e.what()); + } + + auto row_group_metadata = _state->metadata->RowGroup(row_group_idx); + _state->current_row_group_rows = + row_group_metadata == nullptr ? 0 : row_group_metadata->num_rows(); + if (_state->current_row_group_rows < 0) { + return Status::Corruption("Invalid negative row count in parquet row group {}", + row_group_idx); + } else if (_state->current_row_group_rows == 0) { + _reset_current_row_group(); + continue; + } + DORIS_CHECK(row_group_idx >= 0 && + row_group_idx < static_cast(_state->row_group_first_rows.size())); + _state->current_row_group_first_row = _state->row_group_first_rows[row_group_idx]; + _state->current_row_group_rows_read = 0; + _state->current_predicate_columns.clear(); + _state->current_non_predicate_columns.clear(); + + ParquetColumnReaderFactory column_reader_factory(_state->current_row_group, + _state->schema->num_columns()); + for (const auto file_column_id : _request->predicate_columns) { + if (file_column_id == ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID) { + _state->current_predicate_columns.push_back( + column_reader_factory.create_row_position_column_reader( + _state->current_row_group_first_row)); + continue; + } + const auto& column_schema = _state->file_schema[file_column_id]; + const auto projection_it = _request->complex_projections.find(file_column_id); + const auto* projection = projection_it == _request->complex_projections.end() + ? 
nullptr + : &projection_it->second; + std::unique_ptr column_reader; + RETURN_IF_ERROR( + column_reader_factory.create(*column_schema, projection, &column_reader)); + _state->current_predicate_columns.push_back(std::move(column_reader)); + } + for (const auto file_column_id : _request->non_predicate_columns) { + if (file_column_id == ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID) { + _state->current_non_predicate_columns.push_back( + column_reader_factory.create_row_position_column_reader( + _state->current_row_group_first_row)); + continue; + } + const auto& column_schema = _state->file_schema[file_column_id]; + const auto projection_it = _request->complex_projections.find(file_column_id); + const auto* projection = projection_it == _request->complex_projections.end() + ? nullptr + : &projection_it->second; + std::unique_ptr column_reader; + RETURN_IF_ERROR( + column_reader_factory.create(*column_schema, projection, &column_reader)); + _state->current_non_predicate_columns.push_back(std::move(column_reader)); + } + *has_row_group = true; + break; + } + return Status::OK(); +} + +// `file_block` has the same layout as FileScanRequest::column_positions. +Status ParquetReader::_read_current_row_group_batch(int64_t batch_rows, Block* file_block, + size_t* rows) { + if (_state->current_predicate_columns.empty() && + _state->current_non_predicate_columns.empty()) { + *rows = static_cast(batch_rows); + return Status::OK(); + } + SelectionVector selection; + DORIS_CHECK(batch_rows <= std::numeric_limits::max()); + uint16_t selected_rows = static_cast(batch_rows); + // 1. Read all predicate columns and evaluate selection vector. + RETURN_IF_ERROR(_read_filter_columns(batch_rows, file_block, &selection, &selected_rows)); + + // 2. Materialize all predicate columns after filtering. 
+ const bool need_filter_output = selected_rows != batch_rows; + if (need_filter_output) { + IColumn::Filter output_filter = _selection_to_filter(selection, selected_rows, batch_rows); + for (const auto file_field_id : _request->predicate_columns) { + auto position_it = _request->column_positions.find(file_field_id); + DORIS_CHECK(position_it != _request->column_positions.end()); + const auto block_position = position_it->second; + RETURN_IF_CATCH_EXCEPTION(file_block->replace_by_position( + block_position, file_block->get_by_position(block_position) + .column->filter(output_filter, selected_rows))); + } + } + + // 3. Materialize all non-predicate columns with selection. + for (size_t output_idx = 0; output_idx < _state->current_non_predicate_columns.size(); + ++output_idx) { + auto& column_reader = _state->current_non_predicate_columns[output_idx]; + auto position_it = + _request->column_positions.find(_request->non_predicate_columns[output_idx]); + DORIS_CHECK(position_it != _request->column_positions.end()); + const auto block_position = position_it->second; + auto col = file_block->get_columns()[block_position]->assume_mutable(); + DCHECK_EQ(file_block->get_by_position(block_position).type->get_primitive_type(), + column_reader->type()->get_primitive_type()); + if (need_filter_output) { + [[maybe_unused]] auto old_size = col->size(); + RETURN_IF_ERROR(column_reader->select(selection, selected_rows, batch_rows, col)); + if (col->size() != old_size + selected_rows) { + return Status::Corruption( + "Parquet selected output column {} returned {} rows, expected {} rows", + column_reader->name(), col->size(), old_size + selected_rows); + } + } else { + int64_t column_rows = 0; + RETURN_IF_ERROR(column_reader->read(batch_rows, col, &column_rows)); + if (column_rows != batch_rows) { + return Status::Corruption( + "Parquet output column {} returned {} rows, expected {} rows", + column_reader->name(), column_rows, batch_rows); + } + } + } + + *rows = 
static_cast(selected_rows);
+    return Status::OK();
+}
+
+// Returns the file offset of the first page of a column chunk.
+//
+// That is the dictionary page when the chunk has a usable one, otherwise the first data
+// page. has_dictionary_page() alone is not trusted: files exist whose dictionary page
+// offset is 0, or is not in front of the first data page, even though the field is set.
+// Using such an offset would move the row group start, and therefore the midpoint that
+// _is_row_group_outside_range() uses to assign the row group to a scan range.
+int64_t ParquetReader::_column_start_offset(
+        const ::parquet::ColumnChunkMetaData& column_metadata) const {
+    const int64_t data_page_offset = column_metadata.data_page_offset();
+    if (!column_metadata.has_dictionary_page()) {
+        return data_page_offset;
+    }
+    const int64_t dictionary_page_offset = column_metadata.dictionary_page_offset();
+    // A real dictionary page is written before the data pages of its chunk.
+    if (dictionary_page_offset > 0 && dictionary_page_offset < data_page_offset) {
+        return dictionary_page_offset;
+    }
+    return data_page_offset;
+}
+
+// Returns true when the given row group belongs to another scan range and must be skipped
+// by this reader. Ownership is decided by the byte offset of the row group's midpoint.
+bool ParquetReader::_is_row_group_outside_range(int row_group_idx) const {
+    DORIS_CHECK(_file_description != nullptr);
+    // This parquet file is not split
+    if (_file_description->range_size < 0) {
+        return false;
+    }
+    const int64_t range_start_offset = _file_description->range_start_offset;
+    const int64_t range_end_offset = range_start_offset + _file_description->range_size;
+    DORIS_CHECK(range_start_offset >= 0);
+    DORIS_CHECK(range_end_offset >= range_start_offset);
+    // read whole parquet file if the range covers the whole file, which is a common case when parquet files are not splittable.
+    if (range_start_offset == 0 &&
+        (_file_description->file_size < 0 || range_end_offset >= _file_description->file_size)) {
+        return false;
+    }
+
+    auto row_group_metadata = _state->metadata->RowGroup(row_group_idx);
+    DORIS_CHECK(row_group_metadata != nullptr);
+    DORIS_CHECK(row_group_metadata->num_columns() > 0);
+    const auto first_column = row_group_metadata->ColumnChunk(0);
+    const auto last_column = row_group_metadata->ColumnChunk(row_group_metadata->num_columns() - 1);
+    DORIS_CHECK(first_column != nullptr);
+    DORIS_CHECK(last_column != nullptr);
+    const int64_t row_group_start_offset = _column_start_offset(*first_column);
+    const int64_t row_group_end_offset =
+            _column_start_offset(*last_column) + last_column->total_compressed_size();
+    // A scan range is a byte split, while Parquet is read by row group. If a row group crosses
+    // split boundaries, using overlap would let adjacent ranges read the same row group.
Keep the + // same ownership rule as the legacy vparquet reader: the range containing the row group's + // midpoint owns the whole row group. + const int64_t row_group_mid_offset = + row_group_start_offset + (row_group_end_offset - row_group_start_offset) / 2; + return row_group_mid_offset < range_start_offset || row_group_mid_offset >= range_end_offset; +} + +ParquetReader::ParquetReader(std::shared_ptr& system_properties, + std::unique_ptr& file_description, + std::shared_ptr io_ctx, RuntimeProfile* profile) + : FileReader(system_properties, file_description, io_ctx, profile) {} + +ParquetReader::~ParquetReader() = default; + +Status ParquetReader::init(RuntimeState* state) { + RETURN_IF_ERROR(reader::FileReader::init(state)); + _state = std::make_unique(); + _state->arrow_file = + std::make_shared(_tracing_file_reader, _io_ctx.get()); + + try { + _state->file_reader = ::parquet::ParquetFileReader::Open( + _state->arrow_file, ::parquet::default_reader_properties()); + _state->metadata = _state->file_reader->metadata(); + _state->schema = _state->metadata != nullptr ? 
_state->metadata->schema() : nullptr; + } catch (const ::parquet::ParquetException& e) { + return Status::Corruption("Failed to open parquet file: {}", e.what()); + } catch (const std::exception& e) { + return Status::InternalError("Failed to open parquet file: {}", e.what()); + } + + if (_state->metadata == nullptr || _state->schema == nullptr) { + return Status::Corruption("Failed to read parquet metadata"); + } + RETURN_IF_ERROR(build_parquet_column_schema(*_state->schema, &_state->file_schema)); + return Status::OK(); +} + +Status ParquetReader::get_schema(std::vector* file_schema) const { + if (file_schema == nullptr) { + return Status::InvalidArgument("file_schema is null"); + } + file_schema->clear(); + if (_state == nullptr || _state->schema == nullptr) { + return Status::Uninitialized("ParquetReader is not open"); + } + + file_schema->reserve(_state->file_schema.size()); + for (size_t column_idx = 0; column_idx < _state->file_schema.size(); ++column_idx) { + reader::SchemaField field; + _fill_schema_field(*_state->file_schema[column_idx], &field); + field.id = static_cast(column_idx); + file_schema->push_back(std::move(field)); + } + return Status::OK(); +} + +Status ParquetReader::open(std::unique_ptr& request) { + if (_state == nullptr || _state->metadata == nullptr || _state->schema == nullptr) { + return Status::Uninitialized("ParquetReader is not open"); + } + RETURN_IF_ERROR(reader::FileReader::open(request)); + + // `_request->column_positions.empty()` means all columns are needed by table reader + if (_request->column_positions.empty()) { + for (const auto file_column_id : _request->predicate_columns) { + _request->column_positions.emplace(file_column_id, file_column_id); + } + for (const auto file_column_id : _request->non_predicate_columns) { + _request->column_positions.emplace(file_column_id, file_column_id); + } + } + + const int num_fields = static_cast(_state->file_schema.size()); + for (const auto file_column_id : 
_request->predicate_columns) { + DORIS_CHECK(_request->column_positions.count(file_column_id) > 0); + if (file_column_id == ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID) { + continue; + } + DORIS_CHECK(file_column_id >= 0 && file_column_id < num_fields); + } + for (const auto file_column_id : _request->non_predicate_columns) { + DORIS_CHECK(_request->column_positions.count(file_column_id) > 0); + if (file_column_id == ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID) { + continue; + } + DORIS_CHECK(file_column_id >= 0 && file_column_id < num_fields); + } + for (const auto& column_filter : _request->column_predicate_filters) { + if (column_filter.file_column_id < 0 || column_filter.file_column_id >= num_fields) { + return Status::InvalidArgument("Invalid parquet filter top-level field id {}", + column_filter.file_column_id); + } + } + for (const auto& [file_column_id, projection] : _request->complex_projections) { + if (file_column_id < 0 || file_column_id >= num_fields) { + return Status::InvalidArgument("Invalid parquet projection top-level field id {}", + file_column_id); + } + if (projection.file_column_id != file_column_id) { + return Status::InvalidArgument( + "Parquet projection column id mismatch: key={}, value={}", file_column_id, + projection.file_column_id); + } + if (!projection.file_path.empty() && projection.file_path.front() != file_column_id) { + return Status::InvalidArgument("Invalid parquet projection root path for column {}", + file_column_id); + } + reader::SchemaField projected_field; + RETURN_IF_ERROR(_get_projected_schema_field(file_column_id, &projection, &projected_field)); + } + RETURN_IF_ERROR(select_row_groups_by_statistics(*_state->metadata, _state->file_reader.get(), + _state->file_schema, *_request, + &_state->selected_row_groups)); + std::vector range_selected_row_groups; + range_selected_row_groups.reserve(_state->selected_row_groups.size()); + for (const auto row_group_idx : _state->selected_row_groups) { + if 
(!_is_row_group_outside_range(row_group_idx)) { + range_selected_row_groups.push_back(row_group_idx); + } + } + _state->selected_row_groups = std::move(range_selected_row_groups); + _state->row_group_first_rows.resize(_state->metadata->num_row_groups()); + int64_t next_row_group_first_row = 0; + for (int row_group_idx = 0; row_group_idx < _state->metadata->num_row_groups(); + ++row_group_idx) { + _state->row_group_first_rows[row_group_idx] = next_row_group_first_row; + auto row_group_metadata = _state->metadata->RowGroup(row_group_idx); + DORIS_CHECK(row_group_metadata != nullptr); + next_row_group_first_row += row_group_metadata->num_rows(); + } + RETURN_IF_ERROR(_reset_reader_position()); + _eof = _state->selected_row_groups.empty(); + return Status::OK(); +} + +Status ParquetReader::get_block(Block* file_block, size_t* rows, bool* eof) { + if (_state == nullptr || _state->file_reader == nullptr || _state->schema == nullptr) { + return Status::Uninitialized("ParquetReader is not open"); + } + *rows = 0; + if (_eof) { + *eof = true; + return Status::OK(); + } + + while (true) { + if (_state->current_row_group == nullptr) { + bool has_row_group = false; + RETURN_IF_ERROR(_open_next_row_group(&has_row_group)); + if (!has_row_group) { + _eof = true; + *eof = true; + return Status::OK(); + } + } + + const int64_t remaining_rows = + _state->current_row_group_rows - _state->current_row_group_rows_read; + if (remaining_rows <= 0) { + _reset_current_row_group(); + continue; + } + + const int64_t batch_rows = + std::min(DEFAULT_PARQUET_READ_BATCH_SIZE, remaining_rows); + const int64_t physical_rows_read = batch_rows; + RETURN_IF_ERROR(_read_current_row_group_batch(batch_rows, file_block, rows)); + _state->current_row_group_rows_read += physical_rows_read; + if (_state->current_row_group_rows_read >= _state->current_row_group_rows) { + _reset_current_row_group(); + } + if (*rows == 0) { + continue; + } + *eof = false; + // TODO: Compute _request->reader_expression_map to 
filter file_block + return Status::OK(); + } +} + +Status ParquetReader::get_aggregate_result(const reader::FileAggregateRequest& request, + reader::FileAggregateResult* result) { + DORIS_CHECK(result != nullptr); + if (_state == nullptr || _state->metadata == nullptr || _state->schema == nullptr) { + return Status::Uninitialized("ParquetReader is not open"); + } + result->count = 0; + result->columns.clear(); + if (request.agg_type != TPushAggOp::type::COUNT && + request.agg_type != TPushAggOp::type::MINMAX) { + return Status::NotSupported("Unsupported parquet aggregate pushdown type {}", + request.agg_type); + } + + // Aggregate row count in all selected row groups. For MIN/MAX aggregate, this is used to determine whether there is no row group selected. + for (const auto row_group_idx : _state->selected_row_groups) { + auto row_group_metadata = _state->metadata->RowGroup(row_group_idx); + DORIS_CHECK(row_group_metadata != nullptr); + result->count += row_group_metadata->num_rows(); + } + if (request.agg_type == TPushAggOp::type::COUNT) { + return Status::OK(); + } + + result->columns.resize(request.columns.size()); + for (size_t request_column_idx = 0; request_column_idx < request.columns.size(); + ++request_column_idx) { + const auto file_column_id = request.columns[request_column_idx].file_column_id; + if (file_column_id < 0 || + file_column_id >= static_cast(_state->file_schema.size())) { + return Status::InvalidArgument("Invalid parquet aggregate column id {}", + file_column_id); + } + const auto& column_schema = _state->file_schema[file_column_id]; + DORIS_CHECK(column_schema != nullptr); + // TODO: Support min/max pushdown for complex column by traversing down to the leaf column readers. This requires supporting complex column statistics in parquet file reader, which is currently not implemented in parquet-cpp. 
+ if (column_schema->leaf_column_id < 0) { + return Status::NotSupported( + "Parquet aggregate pushdown only supports primitive column {}", + column_schema->name); + } + + auto& aggregate_column = result->columns[request_column_idx]; + for (const auto row_group_idx : _state->selected_row_groups) { + auto row_group_metadata = _state->metadata->RowGroup(row_group_idx); + DORIS_CHECK(row_group_metadata != nullptr); + auto column_chunk = row_group_metadata->ColumnChunk(column_schema->leaf_column_id); + DORIS_CHECK(column_chunk != nullptr); + const auto statistics = ParquetStatisticsUtils::TransformColumnStatistics( + *column_schema, column_chunk->statistics()); + if (!statistics.has_min_max) { + return Status::NotSupported("Missing parquet min/max statistics for column {}", + column_schema->name); + } + if (!aggregate_column.has_min || statistics.min_value < aggregate_column.min_value) { + aggregate_column.min_value = statistics.min_value; + aggregate_column.has_min = true; + } + if (!aggregate_column.has_max || aggregate_column.max_value < statistics.max_value) { + aggregate_column.max_value = statistics.max_value; + aggregate_column.has_max = true; + } + } + if (!aggregate_column.has_min || !aggregate_column.has_max) { + return Status::NotSupported("No parquet row group selected for min/max pushdown"); + } + } + return Status::OK(); +} + +Status ParquetReader::close() { + if (_state != nullptr) { + if (_state->file_reader != nullptr) { + try { + _state->file_reader->Close(); + } catch (const std::exception&) { + // close 需要保持幂等;这里不覆盖此前 scan 路径上的真实错误。 + } + } + if (_state->arrow_file != nullptr) { + static_cast(arrow_status_to_doris_status(_state->arrow_file->Close())); + } + _state = std::make_unique(); + } + return FileReader::close(); +} + +void ParquetReader::_init_profile() { + if (_profile != nullptr) { + static const char* parquet_profile = "ParquetReader"; + ADD_TIMER_WITH_LEVEL(_profile, parquet_profile, 1); + + _parquet_profile.filtered_row_groups = 
ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "RowGroupsFiltered", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.filtered_row_groups_by_min_max = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "RowGroupsFilteredByMinMax", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.filtered_row_groups_by_bloom_filter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "RowGroupsFilteredByBloomFilter", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.to_read_row_groups = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "RowGroupsReadNum", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.total_row_groups = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "RowGroupsTotalNum", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.filtered_group_rows = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "FilteredRowsByGroup", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.filtered_page_rows = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "FilteredRowsByPage", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.lazy_read_filtered_rows = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "FilteredRowsByLazyRead", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.filtered_bytes = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "FilteredBytes", TUnit::BYTES, parquet_profile, 1); + _parquet_profile.raw_rows_read = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "RawRowsRead", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.column_read_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "ColumnReadTime", parquet_profile, 1); + _parquet_profile.parse_meta_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "ParseMetaTime", parquet_profile, 1); + _parquet_profile.parse_footer_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "ParseFooterTime", parquet_profile, 1); + _parquet_profile.file_reader_create_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "FileReaderCreateTime", parquet_profile, 1); + _parquet_profile.open_file_num = + ADD_CHILD_COUNTER_WITH_LEVEL(_profile, "FileNum", TUnit::UNIT, parquet_profile, 1); + 
_parquet_profile.page_index_read_calls = + ADD_COUNTER_WITH_LEVEL(_profile, "PageIndexReadCalls", TUnit::UNIT, 1); + _parquet_profile.page_index_filter_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "PageIndexFilterTime", parquet_profile, 1); + _parquet_profile.read_page_index_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "PageIndexReadTime", parquet_profile, 1); + _parquet_profile.parse_page_index_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "PageIndexParseTime", parquet_profile, 1); + _parquet_profile.row_group_filter_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "RowGroupFilterTime", parquet_profile, 1); + _parquet_profile.file_footer_read_calls = + ADD_COUNTER_WITH_LEVEL(_profile, "FileFooterReadCalls", TUnit::UNIT, 1); + _parquet_profile.file_footer_hit_cache = + ADD_COUNTER_WITH_LEVEL(_profile, "FileFooterHitCache", TUnit::UNIT, 1); + _parquet_profile.decompress_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "DecompressTime", parquet_profile, 1); + _parquet_profile.decompress_cnt = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "DecompressCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_read_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageReadCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_cache_write_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheWriteCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_cache_compressed_write_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheCompressedWriteCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_cache_decompressed_write_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheDecompressedWriteCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_cache_hit_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheHitCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_cache_missing_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheMissingCount", TUnit::UNIT, parquet_profile, 1); 
+ _parquet_profile.page_cache_compressed_hit_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheCompressedHitCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_cache_decompressed_hit_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheDecompressedHitCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.decode_header_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "PageHeaderDecodeTime", parquet_profile, 1); + _parquet_profile.read_page_header_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "PageHeaderReadTime", parquet_profile, 1); + _parquet_profile.decode_value_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "DecodeValueTime", parquet_profile, 1); + _parquet_profile.decode_dict_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "DecodeDictTime", parquet_profile, 1); + _parquet_profile.decode_level_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "DecodeLevelTime", parquet_profile, 1); + _parquet_profile.decode_null_map_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "DecodeNullMapTime", parquet_profile, 1); + _parquet_profile.skip_page_header_num = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "SkipPageHeaderNum", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.parse_page_header_num = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "ParsePageHeaderNum", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.predicate_filter_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "PredicateFilterTime", parquet_profile, 1); + _parquet_profile.dict_filter_rewrite_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "DictFilterRewriteTime", parquet_profile, 1); + _parquet_profile.convert_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "ConvertTime", parquet_profile, 1); + _parquet_profile.bloom_filter_read_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "BloomFilterReadTime", parquet_profile, 1); + } +} + +} // namespace doris::parquet diff --git a/be/src/format/new_parquet/parquet_reader.h b/be/src/format/new_parquet/parquet_reader.h new file mode 100644 index 
// Physical read layer for a Parquet file.
// This class only understands the Parquet file-local schema and ParquetScanRequest. It knows
// nothing about Iceberg/global schemas and does not handle table-level
// cast/default/generated/partition semantics.
class ParquetReader : public reader::FileReader {
public:
    ParquetReader(std::shared_ptr& system_properties,
                  std::unique_ptr& file_description,
                  std::shared_ptr io_ctx, RuntimeProfile* profile);
    ~ParquetReader() override;

    // Opens the Parquet file and parses the footer metadata.
    // Once init() succeeds, get_schema() can be called to obtain the Parquet file-local schema.
    Status init(RuntimeState* state) override;

    // Returns the Parquet file's own schema as parsed during init().
    // May only be called after init() has succeeded; open() does not need to have run.
    // No Iceberg schema evolution is applied here and fields are not converted into the
    // table/global schema.
    Status get_schema(std::vector* file_schema) const override;

    Status open(std::unique_ptr& request) override;
    // Reads the next batch of Parquet file-local rows into a block.
    // May only be called after init() has succeeded.
    // NOTE(review): open() is what computes the selected row groups and the initial EOF flag,
    // so callers presumably need open() as well before calling this — confirm.
    // Returned columns must keep file-local semantics; default/generated/partition columns
    // must not be filled in here.
    Status get_block(Block* file_block, size_t* rows, bool* eof) override;

    Status get_aggregate_result(const reader::FileAggregateRequest& request,
                                reader::FileAggregateResult* result) override;

    Status close() override;

protected:
    void _init_profile() override;

private:
    // Runtime-profile counters and timers registered by _init_profile(); all stay null when
    // no profile was supplied.
    struct ParquetProfile {
        RuntimeProfile::Counter* filtered_row_groups = nullptr;
        RuntimeProfile::Counter* filtered_row_groups_by_min_max = nullptr;
        RuntimeProfile::Counter* filtered_row_groups_by_bloom_filter = nullptr;
        RuntimeProfile::Counter* to_read_row_groups = nullptr;
        RuntimeProfile::Counter* total_row_groups = nullptr;
        RuntimeProfile::Counter* filtered_group_rows = nullptr;
        RuntimeProfile::Counter* filtered_page_rows = nullptr;
        RuntimeProfile::Counter* lazy_read_filtered_rows = nullptr;
        RuntimeProfile::Counter* filtered_bytes = nullptr;
        RuntimeProfile::Counter* raw_rows_read = nullptr;
        RuntimeProfile::Counter* column_read_time = nullptr;
        RuntimeProfile::Counter* parse_meta_time = nullptr;
        RuntimeProfile::Counter* parse_footer_time = nullptr;
        RuntimeProfile::Counter* file_reader_create_time = nullptr;
        RuntimeProfile::Counter* open_file_num = nullptr;
        RuntimeProfile::Counter* row_group_filter_time = nullptr;
        RuntimeProfile::Counter* page_index_read_calls = nullptr;
        RuntimeProfile::Counter* page_index_filter_time = nullptr;
        RuntimeProfile::Counter* read_page_index_time = nullptr;
        RuntimeProfile::Counter* parse_page_index_time = nullptr;
        RuntimeProfile::Counter* file_footer_read_calls = nullptr;
        RuntimeProfile::Counter* file_footer_hit_cache = nullptr;
        RuntimeProfile::Counter* decompress_time = nullptr;
        RuntimeProfile::Counter* decompress_cnt = nullptr;
        RuntimeProfile::Counter* page_read_counter = nullptr;
        RuntimeProfile::Counter* page_cache_write_counter = nullptr;
        RuntimeProfile::Counter* page_cache_compressed_write_counter = nullptr;
        RuntimeProfile::Counter* page_cache_decompressed_write_counter = nullptr;
        RuntimeProfile::Counter* page_cache_hit_counter = nullptr;
        RuntimeProfile::Counter* page_cache_missing_counter = nullptr;
        RuntimeProfile::Counter* page_cache_compressed_hit_counter = nullptr;
        RuntimeProfile::Counter* page_cache_decompressed_hit_counter = nullptr;
        RuntimeProfile::Counter* decode_header_time = nullptr;
        RuntimeProfile::Counter* read_page_header_time = nullptr;
        RuntimeProfile::Counter* decode_value_time = nullptr;
        RuntimeProfile::Counter* decode_dict_time = nullptr;
        RuntimeProfile::Counter* decode_level_time = nullptr;
        RuntimeProfile::Counter* decode_null_map_time = nullptr;
        RuntimeProfile::Counter* skip_page_header_num = nullptr;
        RuntimeProfile::Counter* parse_page_header_num = nullptr;
        RuntimeProfile::Counter* predicate_filter_time = nullptr;
        RuntimeProfile::Counter* dict_filter_rewrite_time = nullptr;
        RuntimeProfile::Counter* convert_time = nullptr;
        RuntimeProfile::Counter* bloom_filter_read_time = nullptr;
    };
    Status _reset_reader_position();
    void _reset_current_row_group();
    void _fill_schema_field(const ParquetColumnSchema& column_schema,
                            reader::SchemaField* field) const;
    Status _fill_projected_schema_field(const ParquetColumnSchema& column_schema,
                                        const reader::FieldProjection* projection,
                                        reader::SchemaField* field) const;
    Status _get_projected_schema_field(reader::ColumnId file_column_id,
                                       const reader::FieldProjection* projection,
                                       reader::SchemaField* field) const;
    Status _read_filter_columns(int64_t batch_rows, Block* file_block, SelectionVector* selection,
                                uint16_t* selected_rows);
    Status _execute_filter_conjuncts(int64_t batch_rows, Block* file_block,
                                     SelectionVector* selection, uint16_t* selected_rows);
    IColumn::Filter _selection_to_filter(const SelectionVector& selection, uint16_t selected_rows,
                                         int64_t batch_rows);
    uint16_t _apply_filter_to_selection(const IColumn::Filter& filter, SelectionVector* selection,
                                        uint16_t selected_rows);
    Status _open_next_row_group(bool* has_row_group);
    Status _read_current_row_group_batch(int64_t batch_rows, Block* file_block, size_t* rows);
    bool _is_row_group_outside_range(int row_group_idx) const;
    int64_t _column_start_offset(const ::parquet::ColumnChunkMetaData& column_metadata) const;

    // Scan state; close() replaces it with a freshly constructed instance.
    std::unique_ptr _state;
    ParquetProfile _parquet_profile;
};
+ +#include "format/new_parquet/parquet_statistics.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "core/data_type/data_type.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/primitive_type.h" +#include "core/field.h" +#include "format/new_parquet/parquet_column_schema.h" +#include "storage/index/zone_map/zone_map_index.h" +#include "storage/predicate/column_predicate.h" + +namespace doris::parquet { +namespace { + +PrimitiveType physical_filter_type(const ParquetColumnSchema& column_schema) { + if (column_schema.type == nullptr) { + return INVALID_TYPE; + } + switch (remove_nullable(column_schema.type)->get_primitive_type()) { + case TYPE_BOOLEAN: + case TYPE_INT: + case TYPE_BIGINT: + case TYPE_FLOAT: + case TYPE_DOUBLE: + case TYPE_STRING: + return remove_nullable(column_schema.type)->get_primitive_type(); + default: + return INVALID_TYPE; + } +} + +template +bool set_typed_min_max(const std::shared_ptr<::parquet::Statistics>& statistics, ConvertFn convert, + ParquetColumnStatistics* column_statistics) { + auto typed_statistics = + std::static_pointer_cast<::parquet::TypedStatistics>(statistics); + column_statistics->min_value = Field::create_field(convert(typed_statistics->min())); + column_statistics->max_value = Field::create_field(convert(typed_statistics->max())); + return true; +} + +bool set_string_min_max(const std::shared_ptr<::parquet::Statistics>& statistics, + const ::parquet::ColumnDescriptor* descriptor, + ParquetColumnStatistics* column_statistics) { + switch (statistics->physical_type()) { + case ::parquet::Type::BYTE_ARRAY: { + auto typed_statistics = + std::static_pointer_cast<::parquet::TypedStatistics<::parquet::ByteArrayType>>( + statistics); + column_statistics->min_value = Field::create_field( + ::parquet::ByteArrayToString(typed_statistics->min())); + column_statistics->max_value = Field::create_field( + 
::parquet::ByteArrayToString(typed_statistics->max())); + return true; + } + case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: { + if (descriptor == nullptr || descriptor->type_length() <= 0) { + return false; + } + auto typed_statistics = + std::static_pointer_cast<::parquet::TypedStatistics<::parquet::FLBAType>>( + statistics); + const int type_length = descriptor->type_length(); + column_statistics->min_value = Field::create_field(std::string( + reinterpret_cast(typed_statistics->min().ptr), type_length)); + column_statistics->max_value = Field::create_field(std::string( + reinterpret_cast(typed_statistics->max().ptr), type_length)); + return true; + } + default: + return false; + } +} + +bool is_null_only_predicate(const ColumnPredicate& predicate) { + return predicate.type() == PredicateType::IS_NULL || + predicate.type() == PredicateType::IS_NOT_NULL; +} + +bool is_supported_dictionary_predicate(const ColumnPredicate& predicate) { + switch (predicate.type()) { + case PredicateType::EQ: + case PredicateType::IN_LIST: + return true; + default: + return false; + } +} + +bool is_dictionary_data_encoding(::parquet::Encoding::type encoding) { + return encoding == ::parquet::Encoding::PLAIN_DICTIONARY || + encoding == ::parquet::Encoding::RLE_DICTIONARY; +} + +bool is_level_encoding(::parquet::Encoding::type encoding) { + return encoding == ::parquet::Encoding::RLE || encoding == ::parquet::Encoding::BIT_PACKED; +} + +bool is_data_page_type(::parquet::PageType::type page_type) { + return page_type == ::parquet::PageType::DATA_PAGE || + page_type == ::parquet::PageType::DATA_PAGE_V2; +} + +bool is_dictionary_encoded_chunk(const ::parquet::ColumnChunkMetaData& column_metadata) { + if (!column_metadata.has_dictionary_page()) { + return false; + } + + const auto& encoding_stats = column_metadata.encoding_stats(); + if (!encoding_stats.empty()) { + bool has_dictionary_data_page = false; + for (const auto& encoding_stat : encoding_stats) { + if 
(!is_data_page_type(encoding_stat.page_type) || encoding_stat.count <= 0) { + continue; + } + if (!is_dictionary_data_encoding(encoding_stat.encoding)) { + return false; + } + has_dictionary_data_page = true; + } + return has_dictionary_data_page; + } + + bool has_dictionary_encoding = false; + for (const auto encoding : column_metadata.encodings()) { + if (is_dictionary_data_encoding(encoding)) { + has_dictionary_encoding = true; + continue; + } + if (!is_level_encoding(encoding)) { + return false; + } + } + return has_dictionary_encoding; +} + +bool supports_dictionary_pruning(const ParquetColumnSchema& column_schema, + const ::parquet::ColumnChunkMetaData& column_metadata, + const reader::FileColumnPredicateFilter& column_filter) { + if (column_schema.kind != ParquetColumnSchemaKind::PRIMITIVE || + column_schema.descriptor == nullptr || column_schema.type == nullptr) { + return false; + } + if (!column_schema.type_descriptor.is_string_like) { + return false; + } + if (column_metadata.type() != ::parquet::Type::BYTE_ARRAY && + column_metadata.type() != ::parquet::Type::FIXED_LEN_BYTE_ARRAY) { + return false; + } + for (const auto& column_predicate : column_filter.predicates) { + if (column_predicate == nullptr || !is_supported_dictionary_predicate(*column_predicate)) { + return false; + } + } + return true; +} + +struct OwnedDictionaryWords { + std::vector values; + std::vector refs; + + void clear() { + values.clear(); + refs.clear(); + } + + void build_refs() { + refs.reserve(values.size()); + for (const auto& value : values) { + refs.emplace_back(value.data(), value.size()); + } + } +}; + +bool read_dictionary_words(::parquet::ParquetFileReader* file_reader, int row_group_idx, + int leaf_column_id, const ParquetColumnSchema& column_schema, + OwnedDictionaryWords* dict_words) { + DORIS_CHECK(dict_words != nullptr); + dict_words->clear(); + if (file_reader == nullptr || leaf_column_id < 0) { + return false; + } + + auto row_group_reader = 
file_reader->RowGroup(row_group_idx); + if (row_group_reader == nullptr) { + return false; + } + auto page_reader = row_group_reader->GetColumnPageReader(leaf_column_id); + if (page_reader == nullptr) { + return false; + } + + std::shared_ptr<::parquet::Page> page; + try { + page = page_reader->NextPage(); + } catch (const ::parquet::ParquetException&) { + return false; + } catch (const std::exception&) { + return false; + } + if (page == nullptr || page->type() != ::parquet::PageType::DICTIONARY_PAGE) { + return false; + } + const auto* dictionary_page = static_cast(page.get()); + if (dictionary_page->encoding() != ::parquet::Encoding::PLAIN && + dictionary_page->encoding() != ::parquet::Encoding::PLAIN_DICTIONARY) { + return false; + } + const int32_t dictionary_length = dictionary_page->num_values(); + if (dictionary_length <= 0) { + return false; + } + const auto* dictionary_data = dictionary_page->data(); + const int dictionary_size = dictionary_page->size(); + + dict_words->values.reserve(static_cast(dictionary_length)); + if (column_schema.descriptor->physical_type() == ::parquet::Type::BYTE_ARRAY) { + auto decoder = ::parquet::MakeTypedDecoder<::parquet::ByteArrayType>( + ::parquet::Encoding::PLAIN, column_schema.descriptor); + decoder->SetData(dictionary_length, dictionary_data, dictionary_size); + std::vector<::parquet::ByteArray> byte_array_values(static_cast(dictionary_length)); + if (decoder->Decode(byte_array_values.data(), dictionary_length) != dictionary_length) { + return false; + } + for (int32_t dict_idx = 0; dict_idx < dictionary_length; ++dict_idx) { + dict_words->values.emplace_back( + reinterpret_cast(byte_array_values[dict_idx].ptr), + byte_array_values[dict_idx].len); + } + dict_words->build_refs(); + return true; + } + if (column_schema.descriptor->physical_type() == ::parquet::Type::FIXED_LEN_BYTE_ARRAY) { + const int type_length = column_schema.descriptor->type_length(); + if (type_length <= 0) { + return false; + } + auto decoder = 
::parquet::MakeTypedDecoder<::parquet::FLBAType>(::parquet::Encoding::PLAIN, + column_schema.descriptor); + decoder->SetData(dictionary_length, dictionary_data, dictionary_size); + std::vector<::parquet::FixedLenByteArray> flba_values( + static_cast(dictionary_length)); + if (decoder->Decode(flba_values.data(), dictionary_length) != dictionary_length) { + return false; + } + for (int32_t dict_idx = 0; dict_idx < dictionary_length; ++dict_idx) { + dict_words->values.emplace_back( + reinterpret_cast(flba_values[dict_idx].ptr), type_length); + } + dict_words->build_refs(); + return true; + } + return false; +} + +segment_v2::ZoneMap to_column_predicate_statistics(const ParquetColumnStatistics& statistics) { + segment_v2::ZoneMap predicate_statistics; + predicate_statistics.min_value = statistics.min_value; + predicate_statistics.max_value = statistics.max_value; + predicate_statistics.has_null = statistics.has_null; + predicate_statistics.has_not_null = statistics.has_not_null; + return predicate_statistics; +} + +} // namespace + +ParquetColumnStatistics ParquetStatisticsUtils::TransformColumnStatistics( + const ParquetColumnSchema& column_schema, + const std::shared_ptr<::parquet::Statistics>& statistics) { + ParquetColumnStatistics result; + if (statistics == nullptr) { + return result; + } + + result.has_null = statistics->HasNullCount() && statistics->null_count() > 0; + result.has_not_null = statistics->num_values() > 0 || statistics->HasMinMax(); + result.has_null_count = statistics->HasNullCount(); + if (!result.has_not_null || !statistics->HasMinMax()) { + return result; + } + + switch (statistics->physical_type()) { + case ::parquet::Type::BOOLEAN: + result.has_min_max = set_typed_min_max<::parquet::BooleanType, TYPE_BOOLEAN>( + statistics, [](bool value) { return static_cast(value); }, &result); + return result; + case ::parquet::Type::INT32: + result.has_min_max = set_typed_min_max<::parquet::Int32Type, TYPE_INT>( + statistics, [](int32_t value) { return 
value; }, &result); + return result; + case ::parquet::Type::INT64: + result.has_min_max = set_typed_min_max<::parquet::Int64Type, TYPE_BIGINT>( + statistics, [](int64_t value) { return value; }, &result); + return result; + case ::parquet::Type::FLOAT: + result.has_min_max = set_typed_min_max<::parquet::FloatType, TYPE_FLOAT>( + statistics, [](float value) { return value; }, &result); + return result; + case ::parquet::Type::DOUBLE: + result.has_min_max = set_typed_min_max<::parquet::DoubleType, TYPE_DOUBLE>( + statistics, [](double value) { return value; }, &result); + return result; + case ::parquet::Type::BYTE_ARRAY: + case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: + result.has_min_max = set_string_min_max(statistics, column_schema.descriptor, &result); + return result; + default: + return result; + } +} + +bool ParquetStatisticsUtils::CheckStatistics(const reader::FileColumnPredicateFilter& column_filter, + const ParquetColumnStatistics& statistics) { + if (!statistics.has_any_statistics()) { + return false; + } + + for (const auto& column_predicate : column_filter.predicates) { + if (is_null_only_predicate(*column_predicate)) { + if (!statistics.has_null_count) { + continue; + } + } else if (!statistics.has_any_statistics()) { + continue; + } + if (!column_predicate->evaluate_and(to_column_predicate_statistics(statistics))) { + return true; + } + } + return false; +} + +bool ParquetStatisticsUtils::RowGroupExcludes( + const ::parquet::RowGroupMetaData& row_group, ::parquet::ParquetFileReader* file_reader, + int row_group_idx, const std::vector>& schema, + const reader::FileColumnPredicateFilter& column_filter) { + if (column_filter.predicates.empty()) { + return false; + } + DCHECK_LT(column_filter.file_column_id, schema.size()); + const auto& column_schema = *schema[column_filter.file_column_id]; + if (column_schema.kind != ParquetColumnSchemaKind::PRIMITIVE || + column_schema.leaf_column_id < 0) { + return false; + } + DCHECK_LT(column_schema.leaf_column_id, 
// Builds the list of row groups that survive row-group-level pruning.
//
// `selected_row_groups` is cleared, then receives, in ascending order, the index of every row
// group for which no filter in `request.column_predicate_filters` reports an exclusion via
// RowGroupExcludes().
//
// Returns InvalidArgument when `selected_row_groups` is null, OK otherwise.
Status ParquetStatisticsUtils::SelectRowGroups(
        const ::parquet::FileMetaData& metadata, ::parquet::ParquetFileReader* file_reader,
        const std::vector>& file_schema,
        const reader::FileScanRequest& request, std::vector* selected_row_groups) {
    if (selected_row_groups == nullptr) {
        return Status::InvalidArgument("selected_row_groups is null");
    }
    selected_row_groups->clear();

    const int num_row_groups = metadata.num_row_groups();
    selected_row_groups->reserve(num_row_groups);
    for (int row_group_idx = 0; row_group_idx < num_row_groups; ++row_group_idx) {
        auto row_group = metadata.RowGroup(row_group_idx);
        if (row_group == nullptr) {
            // No metadata to judge by: keep the row group rather than risk dropping data.
            selected_row_groups->push_back(row_group_idx);
            continue;
        }
        bool drop = false;
        // A single excluding filter is enough to drop the whole row group.
        for (const auto& column_filter : request.column_predicate_filters) {
            if (RowGroupExcludes(*row_group, file_reader, row_group_idx, file_schema,
                                 column_filter)) {
                drop = true;
                break;
            }
        }
        if (drop) {
            continue;
        }
        selected_row_groups->push_back(row_group_idx);
    }
    return Status::OK();
}
(physical_filter_type(column_schema)) { + case TYPE_BOOLEAN: + case TYPE_INT: + case TYPE_BIGINT: + case TYPE_FLOAT: + case TYPE_DOUBLE: + case TYPE_STRING: + return true; + default: + return false; + } +} + +Status select_row_groups_by_statistics( + const ::parquet::FileMetaData& metadata, ::parquet::ParquetFileReader* file_reader, + const std::vector>& file_schema, + const reader::FileScanRequest& request, std::vector* selected_row_groups) { + return ParquetStatisticsUtils::SelectRowGroups(metadata, file_reader, file_schema, request, + selected_row_groups); +} + +} // namespace doris::parquet diff --git a/be/src/format/new_parquet/parquet_statistics.h b/be/src/format/new_parquet/parquet_statistics.h new file mode 100644 index 00000000000000..ff1c300e84ca6f --- /dev/null +++ b/be/src/format/new_parquet/parquet_statistics.h @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "common/status.h" +#include "core/field.h" +#include "format/reader/file_reader.h" + +namespace parquet { +class FileMetaData; +class ParquetFileReader; +class RowGroupMetaData; +class Statistics; +} // namespace parquet + +namespace doris { +class ColumnPredicate; +} // namespace doris + +namespace doris::parquet { + +struct ParquetColumnSchema; + +// Parquet row group column statistics 转换后的 Doris 统计视图。 +// DuckDB 会把 Parquet stats 转换成 BaseStatistics,然后让 TableFilter 自己判断; +// Doris 新 reader 先保存 file-local min/max/null 信息,再交给 ColumnPredicate 判断。 +struct ParquetColumnStatistics { + Field min_value; + Field max_value; + bool has_null = false; + bool has_not_null = false; + bool has_null_count = false; + bool has_min_max = false; + + bool has_any_statistics() const { return has_null_count || has_min_max; } +}; + +// Parquet file-local statistics/page index/bloom filter 工具类。 +// 结构参考 DuckDB ParquetStatisticsUtils:先把 Parquet metadata 转成统一统计对象, +// 再由 filter/predicate 判断是否可以裁剪。这里不理解 table/global schema。 +struct ParquetStatisticsUtils { + static ParquetColumnStatistics TransformColumnStatistics( + const ParquetColumnSchema& column_schema, + const std::shared_ptr<::parquet::Statistics>& statistics); + + // Return true if the statistics indicate that the row group can be safely skipped according to + // the local single-column predicate filter. 
+ static bool CheckStatistics(const reader::FileColumnPredicateFilter& column_filter, + const ParquetColumnStatistics& statistics); + + static bool RowGroupExcludes(const ::parquet::RowGroupMetaData& row_group, + ::parquet::ParquetFileReader* file_reader, int row_group_idx, + const std::vector>& schema, + const reader::FileColumnPredicateFilter& column_filter); + + static Status SelectRowGroups( + const ::parquet::FileMetaData& metadata, ::parquet::ParquetFileReader* file_reader, + const std::vector>& file_schema, + const reader::FileScanRequest& request, std::vector* selected_row_groups); + + static bool BloomFilterSupported(const ParquetColumnSchema& column_schema); +}; + +// Parquet file-local statistics/page index/bloom filter 裁剪入口。 +// 这里只消费已经 localize 到 file schema 的 FileScanRequest,不理解 table/global schema。 +// 后续 page index、dictionary、bloom filter 等文件格式优化也应继续收敛在这一层,避免污染 +// ParquetReader 的 scan 调度代码。 +Status select_row_groups_by_statistics( + const ::parquet::FileMetaData& metadata, ::parquet::ParquetFileReader* file_reader, + const std::vector>& file_schema, + const reader::FileScanRequest& request, std::vector* selected_row_groups); + +} // namespace doris::parquet diff --git a/be/src/format/new_parquet/parquet_type.cpp b/be/src/format/new_parquet/parquet_type.cpp new file mode 100644 index 00000000000000..4079c989f7d232 --- /dev/null +++ b/be/src/format/new_parquet/parquet_type.cpp @@ -0,0 +1,345 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/new_parquet/parquet_type.h" + +#include + +#include +#include + +#include "core/data_type/data_type_factory.hpp" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_number.h" +#include "core/data_type/data_type_string.h" +#include "core/data_type/primitive_type.h" + +namespace doris::parquet { +namespace { + +DataTypePtr create_type(PrimitiveType type, bool nullable, int precision = 0, int scale = 0) { + return DataTypeFactory::instance().create_data_type(type, nullable, precision, scale); +} + +PrimitiveType decimal_primitive_type(int precision) { + return precision > 38 ? 
TYPE_DECIMAL256 : TYPE_DECIMAL128I; +} + +bool has_non_physical_annotation(const ::parquet::ColumnDescriptor* column) { + if (column == nullptr) { + return false; + } + const auto& logical_type = column->logical_type(); + return column->converted_type() != ::parquet::ConvertedType::NONE || + (logical_type != nullptr && logical_type->is_valid() && !logical_type->is_none()); +} + +void mark_decimal(const ::parquet::ColumnDescriptor* column, int precision, int scale, + ParquetTypeDescriptor* result) { + result->is_decimal = true; + result->decimal_precision = precision; + result->decimal_scale = scale; + switch (column->physical_type()) { + case ::parquet::Type::INT32: + result->extra_type_info = ParquetExtraTypeInfo::DECIMAL_INT32; + break; + case ::parquet::Type::INT64: + result->extra_type_info = ParquetExtraTypeInfo::DECIMAL_INT64; + break; + case ::parquet::Type::BYTE_ARRAY: + case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: + result->extra_type_info = ParquetExtraTypeInfo::DECIMAL_BYTE_ARRAY; + break; + default: + result->extra_type_info = ParquetExtraTypeInfo::NONE; + break; + } +} + +DataTypePtr converted_type_to_doris_type(const ::parquet::ColumnDescriptor* column, + ParquetTypeDescriptor* result) { + const bool nullable = column->max_definition_level() > 0; + switch (column->converted_type()) { + case ::parquet::ConvertedType::UTF8: + case ::parquet::ConvertedType::ENUM: + case ::parquet::ConvertedType::JSON: + case ::parquet::ConvertedType::BSON: + return create_type(TYPE_STRING, nullable); + case ::parquet::ConvertedType::DECIMAL: + mark_decimal(column, column->type_precision(), column->type_scale(), result); + return create_type(decimal_primitive_type(column->type_precision()), nullable, + column->type_precision(), column->type_scale()); + case ::parquet::ConvertedType::DATE: + return create_type(TYPE_DATEV2, nullable); + case ::parquet::ConvertedType::TIME_MILLIS: + result->time_unit = ParquetTimeUnit::MILLIS; + result->extra_type_info = 
ParquetExtraTypeInfo::UNIT_MS; + return create_type(TYPE_TIMEV2, nullable, 0, 3); + case ::parquet::ConvertedType::TIME_MICROS: + result->time_unit = ParquetTimeUnit::MICROS; + result->extra_type_info = ParquetExtraTypeInfo::UNIT_MICROS; + return create_type(TYPE_TIMEV2, nullable, 0, 6); + case ::parquet::ConvertedType::TIMESTAMP_MILLIS: + result->is_timestamp = true; + result->time_unit = ParquetTimeUnit::MILLIS; + result->extra_type_info = ParquetExtraTypeInfo::UNIT_MS; + return create_type(TYPE_DATETIMEV2, nullable, 0, 3); + case ::parquet::ConvertedType::TIMESTAMP_MICROS: + result->is_timestamp = true; + result->time_unit = ParquetTimeUnit::MICROS; + result->extra_type_info = ParquetExtraTypeInfo::UNIT_MICROS; + return create_type(TYPE_DATETIMEV2, nullable, 0, 6); + case ::parquet::ConvertedType::INT_8: + return create_type(TYPE_TINYINT, nullable); + case ::parquet::ConvertedType::UINT_8: + case ::parquet::ConvertedType::INT_16: + return create_type(TYPE_SMALLINT, nullable); + case ::parquet::ConvertedType::UINT_16: + case ::parquet::ConvertedType::INT_32: + return create_type(TYPE_INT, nullable); + case ::parquet::ConvertedType::UINT_32: + case ::parquet::ConvertedType::INT_64: + return create_type(TYPE_BIGINT, nullable); + case ::parquet::ConvertedType::UINT_64: + return create_type(TYPE_LARGEINT, nullable); + case ::parquet::ConvertedType::NONE: + default: + return nullptr; + } +} + +DataTypePtr logical_type_to_doris_type(const ::parquet::ColumnDescriptor* column, + ParquetTypeDescriptor* result) { + const auto& logical_type = column->logical_type(); + if (logical_type == nullptr || !logical_type->is_valid() || logical_type->is_none()) { + return nullptr; + } + const bool nullable = column->max_definition_level() > 0; + if (logical_type->is_string() || logical_type->is_enum() || logical_type->is_JSON() || + logical_type->is_BSON() || logical_type->is_UUID()) { + return create_type(TYPE_STRING, nullable); + } + if (logical_type->is_decimal()) { + const auto& 
decimal_type = static_cast(*logical_type); + mark_decimal(column, decimal_type.precision(), decimal_type.scale(), result); + return create_type(decimal_primitive_type(decimal_type.precision()), nullable, + decimal_type.precision(), decimal_type.scale()); + } + if (logical_type->is_date()) { + return create_type(TYPE_DATEV2, nullable); + } + if (logical_type->is_time()) { + const auto& time_type = static_cast(*logical_type); + int scale = 0; + if (time_type.time_unit() == ::parquet::LogicalType::TimeUnit::MILLIS) { + scale = 3; + result->time_unit = ParquetTimeUnit::MILLIS; + result->extra_type_info = ParquetExtraTypeInfo::UNIT_MS; + } else if (time_type.time_unit() == ::parquet::LogicalType::TimeUnit::MICROS) { + scale = 6; + result->time_unit = ParquetTimeUnit::MICROS; + result->extra_type_info = ParquetExtraTypeInfo::UNIT_MICROS; + } else { + return nullptr; + } + return create_type(TYPE_TIMEV2, nullable, 0, scale); + } + if (logical_type->is_timestamp()) { + const auto& timestamp_type = + static_cast(*logical_type); + int scale = 0; + if (timestamp_type.time_unit() == ::parquet::LogicalType::TimeUnit::MILLIS) { + scale = 3; + result->time_unit = ParquetTimeUnit::MILLIS; + result->extra_type_info = ParquetExtraTypeInfo::UNIT_MS; + } else if (timestamp_type.time_unit() == ::parquet::LogicalType::TimeUnit::MICROS) { + scale = 6; + result->time_unit = ParquetTimeUnit::MICROS; + result->extra_type_info = ParquetExtraTypeInfo::UNIT_MICROS; + } else { + return nullptr; + } + result->is_timestamp = true; + return create_type(TYPE_DATETIMEV2, nullable, 0, scale); + } + if (logical_type->is_int()) { + const auto& int_type = static_cast(*logical_type); + switch (int_type.bit_width()) { + case 8: + return create_type(int_type.is_signed() ? TYPE_TINYINT : TYPE_SMALLINT, nullable); + case 16: + return create_type(int_type.is_signed() ? TYPE_SMALLINT : TYPE_INT, nullable); + case 32: + return create_type(int_type.is_signed() ? 
TYPE_INT : TYPE_BIGINT, nullable); + case 64: + return create_type(int_type.is_signed() ? TYPE_BIGINT : TYPE_LARGEINT, nullable); + default: + return nullptr; + } + } + return nullptr; +} + +DataTypePtr physical_type_to_doris_type(const ::parquet::ColumnDescriptor* column) { + const bool nullable = column->max_definition_level() > 0; + DataTypePtr type; + switch (column->physical_type()) { + case ::parquet::Type::BOOLEAN: + type = std::make_shared(); + break; + case ::parquet::Type::INT32: + type = std::make_shared(); + break; + case ::parquet::Type::INT64: + type = std::make_shared(); + break; + case ::parquet::Type::FLOAT: + type = std::make_shared(); + break; + case ::parquet::Type::DOUBLE: + type = std::make_shared(); + break; + case ::parquet::Type::BYTE_ARRAY: + case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: + type = std::make_shared(); + break; + case ::parquet::Type::INT96: + type = std::make_shared(); + break; + default: + return nullptr; + } + return nullable ? make_nullable(type) : type; +} + +DataTypePtr direct_flat_primitive_doris_type(const ::parquet::ColumnDescriptor* column) { + if (column == nullptr || column->max_repetition_level() != 0 || + column->max_definition_level() > 1 || has_non_physical_annotation(column)) { + return nullptr; + } + + const bool nullable = column->max_definition_level() > 0; + switch (column->physical_type()) { + case ::parquet::Type::BOOLEAN: + return create_type(TYPE_BOOLEAN, nullable); + case ::parquet::Type::INT32: + return create_type(TYPE_INT, nullable); + case ::parquet::Type::INT64: + return create_type(TYPE_BIGINT, nullable); + case ::parquet::Type::FLOAT: + return create_type(TYPE_FLOAT, nullable); + case ::parquet::Type::DOUBLE: + return create_type(TYPE_DOUBLE, nullable); + default: + return nullptr; + } +} + +bool record_reader_physical_type_supported(::parquet::Type::type physical_type) { + switch (physical_type) { + case ::parquet::Type::BOOLEAN: + case ::parquet::Type::INT32: + case ::parquet::Type::INT64: + 
case ::parquet::Type::FLOAT: + case ::parquet::Type::DOUBLE: + case ::parquet::Type::BYTE_ARRAY: + case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: + return true; + default: + return false; + } +} + +bool record_reader_integer_annotation_supported(const ::parquet::ColumnDescriptor* column, + const DataTypePtr& doris_type) { + const auto& logical_type = column->logical_type(); + const bool has_int_logical_type = + logical_type != nullptr && logical_type->is_valid() && logical_type->is_int(); + const bool has_int_converted_type = + column->converted_type() == ::parquet::ConvertedType::INT_8 || + column->converted_type() == ::parquet::ConvertedType::UINT_8 || + column->converted_type() == ::parquet::ConvertedType::INT_16 || + column->converted_type() == ::parquet::ConvertedType::UINT_16 || + column->converted_type() == ::parquet::ConvertedType::INT_32 || + column->converted_type() == ::parquet::ConvertedType::UINT_32 || + column->converted_type() == ::parquet::ConvertedType::INT_64 || + column->converted_type() == ::parquet::ConvertedType::UINT_64; + auto primitive_type = remove_nullable(doris_type)->get_primitive_type(); + return (has_int_logical_type || has_int_converted_type) && + (primitive_type == TYPE_TINYINT || primitive_type == TYPE_SMALLINT || + primitive_type == TYPE_INT || primitive_type == TYPE_BIGINT); +} + +} // namespace + +std::string parquet_column_name(const ::parquet::ColumnDescriptor* column) { + if (column == nullptr) { + return {}; + } + auto path = column->path(); + if (path) { + return path->ToDotString(); + } + return column->name(); +} + +ParquetTypeDescriptor resolve_parquet_type(const ::parquet::ColumnDescriptor* column) { + ParquetTypeDescriptor result; + if (column == nullptr) { + return result; + } + + result.physical_type = column->physical_type(); + result.converted_type = column->converted_type(); + result.fixed_length = column->type_length(); + + if (auto logical_type = logical_type_to_doris_type(column, &result); logical_type != nullptr) 
{ + result.doris_type = logical_type; + } else if (auto converted_type = converted_type_to_doris_type(column, &result); + converted_type != nullptr) { + result.doris_type = converted_type; + } else { + result.doris_type = physical_type_to_doris_type(column); + if (result.physical_type == ::parquet::Type::INT96) { + result.extra_type_info = ParquetExtraTypeInfo::IMPALA_TIMESTAMP; + } + } + + result.is_string_like = + !result.is_decimal && (result.physical_type == ::parquet::Type::BYTE_ARRAY || + result.physical_type == ::parquet::Type::FIXED_LEN_BYTE_ARRAY); + + if (!record_reader_physical_type_supported(result.physical_type)) { + result.supports_record_reader = false; + return result; + } + if (direct_flat_primitive_doris_type(column) != nullptr || result.is_string_like || + (result.is_decimal && result.decimal_precision <= 38) || + (result.is_timestamp && result.physical_type == ::parquet::Type::INT64) || + record_reader_integer_annotation_supported(column, result.doris_type) || + remove_nullable(result.doris_type)->get_primitive_type() == TYPE_DATEV2 || + remove_nullable(result.doris_type)->get_primitive_type() == TYPE_TIMEV2) { + result.supports_record_reader = true; + } + return result; +} + +bool supports_record_reader(const ParquetTypeDescriptor& type_descriptor) { + return type_descriptor.supports_record_reader; +} + +} // namespace doris::parquet diff --git a/be/src/format/new_parquet/parquet_type.h b/be/src/format/new_parquet/parquet_type.h new file mode 100644 index 00000000000000..1404f84bc362d6 --- /dev/null +++ b/be/src/format/new_parquet/parquet_type.h @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include + +#include "core/data_type/data_type.h" + +namespace parquet { +class ColumnDescriptor; +} // namespace parquet + +namespace doris::parquet { + +// Parquet logical/converted annotation 解析后留下的额外编码信息。 +// 这对应 DuckDB ParquetColumnSchema::type_info:Doris type 只能表达最终展示类型, +// 读值时还需要知道 decimal/timestamp/time 在 Parquet 中的物理编码方式。 +enum class ParquetExtraTypeInfo { + NONE, + DECIMAL_INT32, + DECIMAL_INT64, + DECIMAL_BYTE_ARRAY, + UNIT_MS, + UNIT_MICROS, + UNIT_NS, + IMPALA_TIMESTAMP, +}; + +enum class ParquetTimeUnit { + UNKNOWN, + MILLIS, + MICROS, + NANOS, +}; + +// Parquet file-local column descriptor 的类型解析结果。 +// 该结构只解释 Parquet physical/logical/converted type,不包含 table/global schema +// evolution,也不依赖 Arrow internal RecordReader API。 +struct ParquetTypeDescriptor { + DataTypePtr doris_type; + ParquetExtraTypeInfo extra_type_info = ParquetExtraTypeInfo::NONE; + ParquetTimeUnit time_unit = ParquetTimeUnit::UNKNOWN; + ::parquet::Type::type physical_type = ::parquet::Type::UNDEFINED; + ::parquet::ConvertedType::type converted_type = ::parquet::ConvertedType::UNDEFINED; + int decimal_precision = -1; + int decimal_scale = -1; + int fixed_length = -1; + bool is_decimal = false; + bool is_timestamp = false; + bool is_string_like = false; + bool supports_record_reader = false; +}; + +// 返回 Parquet leaf column 的 file-local 展示名。 +std::string parquet_column_name(const ::parquet::ColumnDescriptor* column); + +// 将 Parquet file-local column descriptor 解析成 Doris file-local 类型和读值所需的 +// 编码信息。这里不做 table schema evolution;类型提升和 
default/generated/partition +// 列由 table 层处理。 +ParquetTypeDescriptor resolve_parquet_type(const ::parquet::ColumnDescriptor* column); + +// 判断当前阶段是否可以通过 Arrow Parquet RecordReader 读取该列。 +// 当前支持 flat primitive/string/decimal/timestamp。复杂 nested column 仍通过 children +// 递归组合,list/map assembler 后续补齐。 +bool supports_record_reader(const ParquetTypeDescriptor& type_descriptor); + +} // namespace doris::parquet diff --git a/be/src/format/new_parquet/selection_vector.h b/be/src/format/new_parquet/selection_vector.h new file mode 100644 index 00000000000000..22a9d3507e27dd --- /dev/null +++ b/be/src/format/new_parquet/selection_vector.h @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "common/status.h" + +namespace doris::parquet { + +// 类似 DuckDB SelectionVector 的轻量行号视图。 +// 它只表达一个 batch 内被选中的 row offset,不持有 table/global schema 语义。 +// 未绑定 data 时表示 identity selection:get_index(i) == i。 +class SelectionVector { +public: + using Index = uint16_t; + + SelectionVector() = default; + + explicit SelectionVector(size_t count) { resize(count); } + + SelectionVector(Index* data, size_t count) { initialize(data, count); } + + void initialize(Index* data, size_t count) { + _owned.clear(); + _data = data; + _size = count; + } + + void resize(size_t count) { + _owned.resize(count); + _data = _owned.data(); + _size = count; + for (size_t idx = 0; idx < count; ++idx) { + _data[idx] = static_cast(idx); + } + } + + void clear() { + _owned.clear(); + _data = nullptr; + _size = 0; + } + + size_t size() const { return _size; } + + bool is_set() const { return _data != nullptr; } + + Index* data() { return _data; } + + const Index* data() const { return _data; } + + size_t get_index(size_t idx) const { + if (_data == nullptr) { + return idx; + } + return _data[idx]; + } + + void set_index(size_t idx, Index value) { _data[idx] = value; } + + Status verify(size_t count, int64_t batch_rows) const { + if (batch_rows < 0) { + return Status::InvalidArgument("Negative parquet selection batch rows {}", batch_rows); + } + if (count > static_cast(batch_rows)) { + return Status::InvalidArgument("Parquet selection count {} exceeds batch rows {}", + count, batch_rows); + } + if (_data != nullptr && count > _size) { + return Status::InvalidArgument("Parquet selection count {} exceeds vector size {}", + count, _size); + } + size_t previous = 0; + for (size_t idx = 0; idx < count; ++idx) { + const size_t current = get_index(idx); + if (current >= static_cast(batch_rows)) { + return Status::InvalidArgument( + "Parquet selection index {} out of range [0, {}) at position {}", current, + batch_rows, idx); + } + if (idx > 0 && 
current <= previous) { + return Status::InvalidArgument( + "Parquet selection index {} is not strictly greater than previous {} at " + "position {}", + current, previous, idx); + } + previous = current; + } + return Status::OK(); + } + +private: + std::vector _owned; + Index* _data = nullptr; + size_t _size = 0; +}; + +} // namespace doris::parquet diff --git a/be/src/format/orc/vorc_reader.cpp b/be/src/format/orc/vorc_reader.cpp index bcb1a8d70f4b3f..25db29c49625af 100644 --- a/be/src/format/orc/vorc_reader.cpp +++ b/be/src/format/orc/vorc_reader.cpp @@ -348,7 +348,7 @@ Status OrcReader::_create_file_reader() { _file_description.mtime = _scan_range.__isset.modification_time ? _scan_range.modification_time : 0; io::FileReaderOptions reader_options = - FileFactory::get_reader_options(_state, _file_description); + FileFactory::get_reader_options(_state->query_options(), _file_description); io::FileReaderSPtr inner_reader; if (_io_ctx_holder != nullptr) { inner_reader = DORIS_TRY(io::DelegateReader::create_file_reader( diff --git a/be/src/format/parquet/vparquet_reader.cpp b/be/src/format/parquet/vparquet_reader.cpp index a2f2356085b171..35cb3b1944a22b 100644 --- a/be/src/format/parquet/vparquet_reader.cpp +++ b/be/src/format/parquet/vparquet_reader.cpp @@ -311,7 +311,7 @@ Status ParquetReader::_open_file() { _file_description.mtime = _scan_range.__isset.modification_time ? 
_scan_range.modification_time : 0; io::FileReaderOptions reader_options = - FileFactory::get_reader_options(_state, _file_description); + FileFactory::get_reader_options(_state->query_options(), _file_description); _file_reader = DORIS_TRY(io::DelegateReader::create_file_reader( _profile, _system_properties, _file_description, reader_options, io::DelegateReader::AccessMode::RANDOM, _io_ctx)); diff --git a/be/src/format/reader/column_mapper.cpp b/be/src/format/reader/column_mapper.cpp new file mode 100644 index 00000000000000..c6114b20df31cb --- /dev/null +++ b/be/src/format/reader/column_mapper.cpp @@ -0,0 +1,520 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "format/reader/column_mapper.h" + +#include +#include +#include +#include +#include + +#include "common/status.h" +#include "core/assert_cast.h" +#include "core/data_type/data_type_array.h" +#include "core/data_type/data_type_map.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_struct.h" +#include "format/reader/expr/cast.h" +#include "format/reader/expr/slot_ref.h" +#include "format/reader/file_reader.h" +#include "format/reader/table_reader.h" + +namespace doris::reader { + +struct FileSlotRewriteInfo { + size_t block_position = 0; + DataTypePtr file_type; + DataTypePtr table_type; + std::string file_column_name; +}; + +static VExprSPtr create_file_slot_ref(const VSlotRef& slot_ref, + const FileSlotRewriteInfo& rewrite_info) { + return TableSlotRef::create_shared(slot_ref.slot_id(), + cast_set(rewrite_info.block_position), -1, + rewrite_info.file_type, rewrite_info.file_column_name); +} + +static VExprSPtr rewrite_table_expr_to_file_expr( + const VExprSPtr& expr, + const std::map& table_column_to_file_slot) { + if (expr == nullptr) { + return nullptr; + } + if (expr->is_slot_ref()) { + const auto* slot_ref = assert_cast(expr.get()); + const auto rewrite_it = table_column_to_file_slot.find(slot_ref->slot_id()); + if (rewrite_it != table_column_to_file_slot.end()) { + const auto& rewrite_info = rewrite_it->second; + auto file_slot = create_file_slot_ref(*slot_ref, rewrite_info); + if (rewrite_info.file_type->equals(*rewrite_info.table_type)) { + return file_slot; + } + auto cast_expr = Cast::create_shared(rewrite_info.table_type); + cast_expr->add_child(std::move(file_slot)); + return cast_expr; + } + return expr; + } + // rewrite_table_expr_to_file_expr localizes the expression tree in-place because VExpr does + // not provide a generic deep-clone API. A previous split may already have inserted Cast(slot) + // for the same table-level conjunct. 
Keep that rewrite idempotent: rewrite the cast child + // from table slot to the current split's file slot, and drop the cast when the current split + // no longer needs it. + if (dynamic_cast(expr.get()) != nullptr && expr->get_num_children() == 1) { + const auto& child = expr->children()[0]; + if (child->is_slot_ref()) { + const auto* slot_ref = assert_cast(child.get()); + const auto rewrite_it = table_column_to_file_slot.find(slot_ref->slot_id()); + if (rewrite_it != table_column_to_file_slot.end() && + expr->data_type()->equals(*rewrite_it->second.table_type)) { + auto rewritten_child = create_file_slot_ref(*slot_ref, rewrite_it->second); + if (rewrite_it->second.file_type->equals(*rewrite_it->second.table_type)) { + return rewritten_child; + } + expr->set_children({std::move(rewritten_child)}); + return expr; + } + } + } + + // VExpr currently does not provide a generic deep-clone API for arbitrary expression types. + // Keep all slot-localization mutation inside ColumnMapper and rebuild it for every split + // before the localized expression is prepared/opened by TableReader. 
+ VExprSPtrs rewritten_children; + rewritten_children.reserve(expr->children().size()); + for (const auto& child : expr->children()) { + rewritten_children.push_back( + rewrite_table_expr_to_file_expr(child, table_column_to_file_slot)); + } + expr->set_children(std::move(rewritten_children)); + return expr; +} + +static constexpr const char* ROW_LINEAGE_ROW_ID = "_row_id"; +static constexpr const char* ROW_LINEAGE_LAST_UPDATED_SEQ_NUMBER = "_last_updated_sequence_number"; + +static void add_scan_column(FileScanRequest* file_request, ColumnId file_column_id, + std::vector* scan_columns) { + if (scan_columns == &file_request->non_predicate_columns && + std::find(file_request->predicate_columns.begin(), file_request->predicate_columns.end(), + file_column_id) != file_request->predicate_columns.end()) { + return; + } + // column_positions is the global read-column index for this scan request, so it also + // deduplicates predicate_columns and non_predicate_columns across all filter/projection paths. 
+ const bool newly_added = file_request->column_positions.count(file_column_id) == 0; + if (newly_added) { + file_request->column_positions.emplace(file_column_id, + file_request->column_positions.size()); + } + if (std::find(scan_columns->begin(), scan_columns->end(), file_column_id) == + scan_columns->end()) { + scan_columns->push_back(file_column_id); + } + if (scan_columns == &file_request->predicate_columns) { + file_request->non_predicate_columns.erase( + std::remove(file_request->non_predicate_columns.begin(), + file_request->non_predicate_columns.end(), file_column_id), + file_request->non_predicate_columns.end()); + } +} + +static void rebuild_projection(ColumnMapping* mapping, size_t block_position) { + DORIS_CHECK(mapping->file_column_id.has_value()); + if (mapping->is_trivial) { + mapping->projection = VExprContext::create_shared(TableSlotRef::create_shared( + cast_set(block_position), cast_set(block_position), -1, + mapping->file_type, mapping->file_column_name)); + return; + } + + auto expr = Cast::create_shared(mapping->table_type); + expr->add_child(TableSlotRef::create_shared(cast_set(block_position), + cast_set(block_position), -1, + mapping->file_type, mapping->file_column_name)); + mapping->projection = VExprContext::create_shared(expr); +} + +// Build a map from table column id to file slot rewrite info for all columns in the given mappings that have a file column id and are present in the file request. 
+static std::map build_file_slot_rewrite_map( + const std::vector& mappings, const FileScanRequest& file_request) { + std::map table_column_to_file_slot; + for (const auto& mapping : mappings) { + if (!mapping.file_column_id.has_value()) { + continue; + } + const auto position_it = file_request.column_positions.find(*mapping.file_column_id); + if (position_it != file_request.column_positions.end()) { + table_column_to_file_slot.emplace( + mapping.table_column_id, + FileSlotRewriteInfo {.block_position = position_it->second, + .file_type = mapping.file_type, + .table_type = mapping.table_type, + .file_column_name = mapping.file_column_name}); + } + } + return table_column_to_file_slot; +} + +static bool is_complex_type(const DataTypePtr& type) { + DORIS_CHECK(type != nullptr); + const auto primitive_type = remove_nullable(type)->get_primitive_type(); + return primitive_type == TYPE_STRUCT || primitive_type == TYPE_ARRAY || + primitive_type == TYPE_MAP; +} + +static const SchemaField* find_file_child_by_table_column( + const TableColumn& table_column, const std::vector& file_children, + TableColumnMappingMode mode) { + for (const auto& field : file_children) { + if (mode == TableColumnMappingMode::BY_FIELD_ID && !field.field_id_path.empty() && + field.field_id_path.back() != -1 && field.field_id_path.back() == table_column.id) { + return &field; + } + if (field.name == table_column.name) { + return &field; + } + } + return nullptr; +} + +static bool complex_projection_has_pruned_children(const ColumnMapping& mapping) { + if (!is_complex_type(mapping.file_type)) { + return false; + } + if (mapping.child_mappings.empty()) { + return false; + } + DORIS_CHECK(mapping.file_type != nullptr); + DORIS_CHECK(mapping.table_type != nullptr); + if (remove_nullable(mapping.file_type)->get_primitive_type() != + remove_nullable(mapping.table_type)->get_primitive_type()) { + return true; + } + if (!mapping.table_type->equals(*mapping.file_type)) { + return true; + } + for (const 
auto& child_mapping : mapping.child_mappings) { + if (!child_mapping.file_column_id.has_value() || + complex_projection_has_pruned_children(child_mapping)) { + return true; + } + } + return false; +} + +static Status rebuild_projected_file_type(ColumnMapping* mapping) { + if (mapping == nullptr) { + return Status::InvalidArgument("mapping is null"); + } + DORIS_CHECK(is_complex_type(mapping->file_type)); + DataTypes child_types; + Strings child_names; + child_types.reserve(mapping->child_mappings.size()); + child_names.reserve(mapping->child_mappings.size()); + for (auto& child_mapping : mapping->child_mappings) { + if (!child_mapping.file_column_id.has_value()) { + continue; + } + if (complex_projection_has_pruned_children(child_mapping)) { + RETURN_IF_ERROR(rebuild_projected_file_type(&child_mapping)); + } + child_types.push_back(child_mapping.file_type); + child_names.push_back(child_mapping.file_column_name); + } + if (child_types.empty()) { + return Status::NotSupported("Projection for complex column {} contains no file children", + mapping->file_column_name); + } + DataTypePtr projected_type; + const auto primitive_type = remove_nullable(mapping->file_type)->get_primitive_type(); + switch (primitive_type) { + case TYPE_STRUCT: + projected_type = std::make_shared(child_types, child_names); + break; + case TYPE_ARRAY: + DORIS_CHECK(child_types.size() == 1); + projected_type = std::make_shared(child_types[0]); + break; + case TYPE_MAP: + DORIS_CHECK(child_types.size() == 1); + DORIS_CHECK(remove_nullable(child_types[0])->get_primitive_type() == TYPE_STRUCT); + { + const auto* entry_type = + assert_cast(remove_nullable(child_types[0]).get()); + DORIS_CHECK(entry_type->get_elements().size() == 2); + projected_type = std::make_shared(entry_type->get_element(0), + entry_type->get_element(1)); + } + break; + default: + return Status::InvalidArgument("Cannot project children from non-complex column {}", + mapping->file_column_name); + } + mapping->file_type = + 
mapping->file_type->is_nullable() ? make_nullable(projected_type) : projected_type; + mapping->is_trivial = + mapping->table_type != nullptr && mapping->table_type->equals(*mapping->file_type); + mapping->has_complex_projection = true; + return Status::OK(); +} + +static std::vector filter_slot_ids(const TableFilter& table_filter) { + if (!table_filter.slot_ids.empty()) { + return table_filter.slot_ids; + } + return {}; +} + +Status TableColumnMapper::create_mapping(const std::vector& projected_columns, + const std::map& partition_values, + const std::vector& file_schema) { + _mappings.clear(); + for (const auto& table_column : projected_columns) { + ColumnMapping mapping; + mapping.table_column_id = table_column.id; + mapping.table_type = table_column.type; + if (table_column.is_partition_key && partition_values.count(table_column.name) > 0) { + // 1. Partition column, use partition value as a constant mapping. Note that partition column may also have default expression, but partition value should take precedence if it exists. + mapping.is_constant = true; + mapping.default_expr = VExprContext::create_shared(TableLiteral::create_shared( + mapping.table_type, partition_values.at(table_column.name))); + } else if (const auto* file_field = _find_file_field(table_column, file_schema)) { + // 2. Table column has a matching file column, use it as a direct mapping. + RETURN_IF_ERROR(_create_direct_mapping(table_column, *file_field, &mapping)); + } else if (table_column.default_expr != nullptr) { + // 3. Table column does not exist in file (column adding by schema evolution), which has a default expression, use it as a constant mapping. + mapping.is_constant = true; + mapping.default_expr = table_column.default_expr; + } else if (table_column.name == ROW_LINEAGE_ROW_ID) { + // 4. Virtual column, use special mapping to indicate it should be materialized by table reader instead of read from file or evaluated from expression. 
+ mapping.virtual_column_type = TableVirtualColumnType::ROW_ID; + } else if (table_column.name == ROW_LINEAGE_LAST_UPDATED_SEQ_NUMBER) { + mapping.virtual_column_type = TableVirtualColumnType::LAST_UPDATED_SEQUENCE_NUMBER; + } else { + if (table_column.is_partition_key) { + return Status::InvalidArgument( + "Table column '{}' (id={}) does not have a matching partition value", + table_column.name, table_column.id); + } + if (!_options.allow_missing_columns) { + return Status::InvalidArgument( + "Table column '{}' (id={}) does not have a matching file column", + table_column.name, table_column.id); + } + } + _mappings.push_back(std::move(mapping)); + } + return Status::OK(); +} + +Status TableColumnMapper::create_scan_request(const std::vector& table_filters, + const TableColumnPredicates& table_column_predicates, + const std::vector& projected_columns, + FileScanRequest* file_request) { + // FileReader evaluates expressions against a file-local block. This mapper owns the + // table-column to file-column conversion, so it also owns the file-local block positions. + file_request->predicate_columns.clear(); + file_request->non_predicate_columns.clear(); + file_request->column_positions.clear(); + file_request->complex_projections.clear(); + file_request->conjuncts.clear(); + file_request->delete_conjuncts.clear(); + file_request->column_predicate_filters.clear(); + file_request->reader_expression_map.clear(); + // 1. Build referenced non-predicate columns + for (const auto& table_column : projected_columns) { + auto* mapping = _find_mapping(table_column.id); + if (mapping != nullptr && mapping->file_column_id.has_value()) { + // A file column can be read lazily as a non-predicate column only when it is not used + // by either expression filters or single-column predicate pruning. 
+ bool used_by_filter = table_column_predicates.count(table_column.id) > 0; + if (!used_by_filter) { + for (const auto& table_filter : table_filters) { + const auto slot_ids = filter_slot_ids(table_filter); + if (std::find(slot_ids.begin(), slot_ids.end(), table_column.id) != + slot_ids.end()) { + used_by_filter = true; + break; + } + } + } + if (!used_by_filter) { + add_scan_column(file_request, *mapping->file_column_id, + &file_request->non_predicate_columns); + } + if (mapping->has_complex_projection || + complex_projection_has_pruned_children(*mapping)) { + if (!mapping->has_complex_projection) { + RETURN_IF_ERROR(rebuild_projected_file_type(mapping)); + } + FieldProjection projection; + RETURN_IF_ERROR(_build_complex_projection(*mapping, &projection)); + file_request->complex_projections.emplace(*mapping->file_column_id, + std::move(projection)); + } + } + } + // 2. Build referenced predicate columns + RETURN_IF_ERROR(localize_filters(table_filters, table_column_predicates, file_request)); + // 3. Re-build projections for all referenced file columns to point to the correct file-local block positions. 
+ for (auto& mapping : _mappings) { + if (!mapping.file_column_id.has_value()) { + continue; + } + auto position_it = file_request->column_positions.find(*mapping.file_column_id); + DORIS_CHECK(position_it != file_request->column_positions.end()); + rebuild_projection(&mapping, position_it->second); + } + return Status::OK(); +} + +Status TableColumnMapper::localize_filters(const std::vector& table_filters, + const TableColumnPredicates& table_column_predicates, + FileScanRequest* file_request) const { + // 真实实现会处理 trivial mapping、safe cast、reader expression fallback 和 + // finalize-only filter。stub 只复制能够直接定位到 file column 的谓词。 + for (const auto& table_filter : table_filters) { + if (!table_filter.can_be_localized()) { + // TODO: Rewrite table filter to reader_expression_map + // file_request->reader_expression_map.emplace_back(..., table_filter.conjunct); + continue; + } + for (const auto table_column_id : filter_slot_ids(table_filter)) { + const auto* mapping = _find_mapping(table_column_id); + if (mapping == nullptr || !mapping->file_column_id.has_value()) { + continue; + } + add_scan_column(file_request, *mapping->file_column_id, + &file_request->predicate_columns); + } + } + for (const auto& [table_column_id, _] : table_column_predicates) { + const auto* mapping = _find_mapping(table_column_id); + if (mapping == nullptr || !mapping->file_column_id.has_value()) { + continue; + } + add_scan_column(file_request, *mapping->file_column_id, &file_request->predicate_columns); + } + + // Build the complete table-slot rewrite map after all predicate columns have been assigned. + // This keeps expression localization independent from filter iteration order. 
+ const auto table_column_to_file_slot = build_file_slot_rewrite_map(_mappings, *file_request); + for (const auto& table_filter : table_filters) { + if (!table_filter.can_be_localized()) { + continue; + } + if (table_filter.conjunct != nullptr) { + file_request->conjuncts.push_back( + VExprContext::create_shared(rewrite_table_expr_to_file_expr( + table_filter.conjunct->root(), table_column_to_file_slot))); + } + } + for (const auto& [table_column_id, predicates] : table_column_predicates) { + const auto* mapping = _find_mapping(table_column_id); + if (mapping == nullptr || !mapping->file_column_id.has_value() || predicates.empty()) { + continue; + } + FileColumnPredicateFilter column_predicate_filter; + column_predicate_filter.file_column_id = *mapping->file_column_id; + column_predicate_filter.predicates = predicates; + file_request->column_predicate_filters.push_back(std::move(column_predicate_filter)); + } + return Status::OK(); +} + +const SchemaField* TableColumnMapper::_find_file_field( + const TableColumn& table_column, const std::vector& file_schema) const { + for (const auto& field : file_schema) { + if (_options.mode == TableColumnMappingMode::BY_FIELD_ID) { + if (!field.field_id_path.empty() && field.field_id_path.back() != -1 && + field.field_id_path.back() == table_column.id) { + return &field; + } + if ((field.field_id_path.empty() || field.field_id_path.back() == -1) && + field.id == table_column.id) { + return &field; + } + } + if (field.name == table_column.name) { + return &field; + } + } + return nullptr; +} + +Status TableColumnMapper::_create_direct_mapping(const TableColumn& table_column, + const SchemaField& file_field, + ColumnMapping* mapping) const { + if (mapping == nullptr) { + return Status::InvalidArgument("mapping is null"); + } + mapping->file_column_id = file_field.id; + mapping->file_column_name = file_field.name; + mapping->file_path = file_field.file_path; + mapping->file_type = file_field.type; + mapping->is_trivial = 
_is_same_type(mapping->table_type, mapping->file_type); + mapping->child_mappings.clear(); + + if (!table_column.children.empty() && is_complex_type(file_field.type)) { + for (const auto& table_child : table_column.children) { + const auto* file_child = find_file_child_by_table_column( + table_child, file_field.children, _options.mode); + if (file_child == nullptr) { + return Status::NotSupported( + "Complex schema change is not implemented: table child column '{}' " + "(id={}) does not have a matching file child under column '{}'", + table_child.name, table_child.id, table_column.name); + } + ColumnMapping child_mapping; + child_mapping.table_column_id = table_child.id; + child_mapping.table_type = table_child.type; + RETURN_IF_ERROR(_create_direct_mapping(table_child, *file_child, &child_mapping)); + mapping->child_mappings.push_back(std::move(child_mapping)); + } + } + return Status::OK(); +} + +Status TableColumnMapper::_build_complex_projection(const ColumnMapping& mapping, + FieldProjection* projection) const { + if (projection == nullptr) { + return Status::InvalidArgument("projection is null"); + } + DORIS_CHECK(mapping.file_column_id.has_value()); + projection->file_column_id = *mapping.file_column_id; + projection->file_path = mapping.file_path; + projection->project_all_children = mapping.child_mappings.empty(); + projection->children.clear(); + for (const auto& child_mapping : mapping.child_mappings) { + if (!child_mapping.file_column_id.has_value()) { + continue; + } + FieldProjection child_projection; + RETURN_IF_ERROR(_build_complex_projection(child_mapping, &child_projection)); + projection->children.push_back(std::move(child_projection)); + } + if (!projection->project_all_children && projection->children.empty()) { + return Status::NotSupported("Projection for complex column {} contains no file children", + mapping.file_column_name); + } + return Status::OK(); +} + +} // namespace doris::reader diff --git a/be/src/format/reader/column_mapper.h 
b/be/src/format/reader/column_mapper.h new file mode 100644 index 00000000000000..e1839652a4799a --- /dev/null +++ b/be/src/format/reader/column_mapper.h @@ -0,0 +1,161 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/status.h" +#include "core/data_type/data_type.h" +#include "exprs/vexpr_fwd.h" +#include "format/reader/expr/literal.h" + +namespace doris { +class ColumnPredicate; +} // namespace doris + +namespace doris::reader { + +struct TableColumn; +struct TableFilter; +struct SchemaField; +struct FileScanRequest; +struct FieldProjection; + +using TableColumnPredicates = std::map>>; + +enum class TableColumnMappingMode { + BY_FIELD_ID, + BY_NAME, +}; + +enum TableVirtualColumnType { + INVALID = 0, // not a virtual column + ROW_ID = 1, + LAST_UPDATED_SEQUENCE_NUMBER = 2, +}; + +// 单个 table column 到 file column 的映射结果。 +// 这是 table 层和 file 层的核心边界对象。 +struct ColumnMapping { + int32_t table_column_id = -1; + std::optional file_column_id; + std::string file_column_name; + std::vector file_path; + DataTypePtr file_type; + DataTypePtr table_type; + + // 最终输出表达式。用于把 file-local value 转成 table/global 
value,例如 cast、 + // default、partition、generated column 或复杂列 remap。 + VExprContextSPtr projection; + + // 读时过滤 fallback 表达式。只在 table filter 不能安全转换成 file-local predicate + // 时使用,服务 reader_expression_map,不等价于 finalize_expr。 + VExprContextSPtr reader_filter_expr; + + std::vector child_mappings; + bool is_trivial = false; + bool is_constant = false; + bool has_complex_projection = false; + TableVirtualColumnType virtual_column_type = TableVirtualColumnType::INVALID; + VExprContextSPtr default_expr; +}; + +struct TableColumnMapperOptions { + TableColumnMappingMode mode = TableColumnMappingMode::BY_FIELD_ID; + bool allow_missing_columns = true; + bool enable_reader_expression_fallback = true; +}; + +// 通用 table schema 到 file schema 映射层。 +// Iceberg 会使用 BY_FIELD_ID;普通 by-name 场景可以复用该组件,但不应把它命名成 +// Iceberg-only 组件。 +class TableColumnMapper { +public: + explicit TableColumnMapper(TableColumnMapperOptions options = {}) + : _options(std::move(options)) {} + virtual ~TableColumnMapper() = default; + + // 建立 table schema 到 file schema 的列映射。 + // 输出的 ColumnMapping 描述 table column 如何从 file column、常量列或表达式得到; + // 后续 projection、filter localization 和 table block finalize 都应复用这份映射。 + virtual Status create_mapping(const std::vector& projected_columns, + const std::map& partition_values, + const std::vector& file_schema); + + // 把 table-level scan 请求转换成 file-local scan 请求。 + // table_request 使用 table/global schema;file_request 只包含 FileReader 能理解的 + // projected_file_columns、conjuncts、delete_conjuncts、column_predicate_filters 和 + // reader_expression_map。 + virtual Status create_scan_request(const std::vector& table_filters, + const TableColumnPredicates& table_column_predicates, + const std::vector& projected_columns, + FileScanRequest* file_request); + + // 将 table-level filter 定位到文件 schema。 + // trivial mapping 可以直接复制结构化谓词;类型变化时可以尝试安全 cast;无法安全 + // 下推的表达式应通过 reader_expression_map 或 table-level finalize/filter fallback 处理。 + virtual Status localize_filters(const std::vector& 
table_filters, + const TableColumnPredicates& table_column_predicates, + FileScanRequest* file_request) const; + void clear() { _mappings.clear(); } + const std::vector& mappings() const { return _mappings; } + +private: + const SchemaField* _find_file_field(const TableColumn& table_column, + const std::vector& file_schema) const; + Status _create_direct_mapping(const TableColumn& table_column, const SchemaField& file_field, + ColumnMapping* mapping) const; + Status _build_complex_projection(const ColumnMapping& mapping, + FieldProjection* projection) const; + + ColumnMapping* _find_mapping(int32_t table_column_id) { + for (auto& mapping : _mappings) { + if (mapping.table_column_id == table_column_id) { + return &mapping; + } + } + return nullptr; + } + + const ColumnMapping* _find_mapping(int32_t table_column_id) const { + for (const auto& mapping : _mappings) { + if (mapping.table_column_id == table_column_id) { + return &mapping; + } + } + return nullptr; + } + + bool _is_same_type(const DataTypePtr& table_type, const DataTypePtr& file_type) const { + DORIS_CHECK(table_type != nullptr); + DORIS_CHECK(file_type != nullptr); + return table_type->equals(*file_type); + } + + TableColumnMapperOptions _options; + std::vector _mappings; +}; + +} // namespace doris::reader diff --git a/be/src/format/reader/expr/cast.cpp b/be/src/format/reader/expr/cast.cpp new file mode 100644 index 00000000000000..69af83c9e77ffe --- /dev/null +++ b/be/src/format/reader/expr/cast.cpp @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/reader/expr/cast.h" + +#include +#include +#include + +#include + +#include "common/status.h" +#include "core/block/block.h" +#include "core/block/column_with_type_and_name.h" +#include "core/block/columns_with_type_and_name.h" +#include "exprs/function/simple_function_factory.h" +#include "exprs/vexpr_context.h" +#include "exprs/vliteral.h" + +namespace doris { + +Status Cast::prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) { + RETURN_IF_ERROR_OR_PREPARED(VExpr::prepare(state, desc, context)); + if (_children.size() != 1) { + return Status::InternalError( + fmt::format("Cast should have exactly 1 child expr, but got {}", _children.size())); + } + ColumnsWithTypeAndName argument_template; + argument_template.reserve(_children.size()); + if (_children[0]->is_literal()) { + // For some functions, he needs some literal columns to derive the return type. + auto literal_node = std::dynamic_pointer_cast(_children[0]); + argument_template.emplace_back(literal_node->get_column_ptr(), _children[0]->data_type(), + _children[0]->expr_name()); + } else { + argument_template.emplace_back(nullptr, _children[0]->data_type(), + _children[0]->expr_name()); + } + + _expr_name = fmt::format("CAST(arguments={},return={})", _children[0]->data_type()->get_name(), + _data_type->get_name()); + // get the function. won't prepare function. 
+ _function = SimpleFunctionFactory::instance().get_function( + "CAST", argument_template, _data_type, + {.new_version_unix_timestamp = state->query_options().new_version_unix_timestamp}, + state->be_exec_version()); + if (_function == nullptr) { + return Status::InternalError("Could not find function {} ", _expr_name); + } + VExpr::register_function_context(state, context); + _prepare_finished = true; + return Status::OK(); +} + +Status Cast::open(RuntimeState* state, VExprContext* context, + FunctionContext::FunctionStateScope scope) { + DCHECK(_prepare_finished); + for (auto& i : _children) { + RETURN_IF_ERROR(i->open(state, context, scope)); + } + RETURN_IF_ERROR(VExpr::init_function_context(state, context, scope, _function)); + if (scope == FunctionContext::FRAGMENT_LOCAL) { + RETURN_IF_ERROR(VExpr::get_const_col(context, nullptr)); + } + _open_finished = true; + return Status::OK(); +} + +void Cast::close(VExprContext* context, FunctionContext::FunctionStateScope scope) { + VExpr::close_function_context(context, scope, _function); + VExpr::close(context, scope); +} + +Status Cast::execute_column_impl(VExprContext* context, const Block* block, + const Selector* selector, size_t count, + ColumnPtr& result_column) const { + return _do_execute(context, block, selector, count, result_column); +} + +std::string Cast::debug_string() const { + return _expr_name; +} + +Status Cast::_do_execute(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const { + DCHECK(_open_finished || block == nullptr) << debug_string(); + if (_children.size() != 1) { + return Status::InternalError( + fmt::format("Cast should have exactly 1 child expr, but got {}", _children.size())); + } + if (is_const_and_have_executed()) { // const have executed in open function + result_column = get_result_from_const(count); + return Status::OK(); + } + + Block temp_block; + ColumnNumbers args(1); + + ColumnPtr tmp_arg_column; + 
RETURN_IF_ERROR(_children[0]->execute_column(context, block, selector, count, tmp_arg_column)); + auto arg_type = _children[0]->execute_type(block); + temp_block.insert({tmp_arg_column, arg_type, _children[0]->expr_name()}); + args[0] = 0; + + uint32_t num_columns_without_result = temp_block.columns(); + // prepare a column to save result + temp_block.insert({nullptr, _data_type, _expr_name}); + + RETURN_IF_ERROR(_function->execute(context->fn_context(_fn_context_index), temp_block, args, + num_columns_without_result, count)); + result_column = temp_block.get_by_position(num_columns_without_result).column; + DCHECK_EQ(result_column->size(), count); + RETURN_IF_ERROR(result_column->column_self_check()); + return Status::OK(); +} + +} // namespace doris diff --git a/be/src/format/reader/expr/cast.h b/be/src/format/reader/expr/cast.h new file mode 100644 index 00000000000000..7d8ca437ba3fb0 --- /dev/null +++ b/be/src/format/reader/expr/cast.h @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "common/object_pool.h" +#include "common/status.h" +#include "exprs/function_context.h" +#include "exprs/vexpr.h" + +namespace doris { +class RowDescriptor; +class RuntimeState; +class TExprNode; +class Block; +class VExprContext; +} // namespace doris + +namespace doris { + +class Cast final : public VExpr { + ENABLE_FACTORY_CREATOR(Cast); + +public: + Cast(const DataTypePtr& type) { _data_type = type; } + ~Cast() override = default; + Status prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) override; + Status open(RuntimeState* state, VExprContext* context, + FunctionContext::FunctionStateScope scope) override; + void close(VExprContext* context, FunctionContext::FunctionStateScope scope) override; + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override; + std::string debug_string() const override; + uint64_t get_digest(uint64_t seed) const override { return 0; } + const std::string& expr_name() const override { return _expr_name; } + +private: + Status _do_execute(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const; + std::string _expr_name; + FunctionBasePtr _function; +}; +} // namespace doris diff --git a/be/src/format/reader/expr/delete_predicate.cpp b/be/src/format/reader/expr/delete_predicate.cpp new file mode 100644 index 00000000000000..31c6a057afd213 --- /dev/null +++ b/be/src/format/reader/expr/delete_predicate.cpp @@ -0,0 +1,117 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/reader/expr/delete_predicate.h" + +#include +#include +#include + +#include +#include +#include + +#include "common/status.h" +#include "core/block/block.h" +#include "core/block/column_numbers.h" +#include "core/block/column_with_type_and_name.h" +#include "core/block/columns_with_type_and_name.h" + +namespace doris { + +DeletePredicate::DeletePredicate(const std::vector& deleted_rows) + : VExpr(), _deleted_rows(deleted_rows) { + _node_type = TExprNodeType::PREDICATE; + _opcode = TExprOpcode::DELETE; + _data_type = std::make_shared(); +} + +Status DeletePredicate::prepare(RuntimeState* state, const RowDescriptor& desc, + VExprContext* context) { + RETURN_IF_ERROR_OR_PREPARED(VExpr::prepare(state, desc, context)); + _expr_name = "DeletePredicate"; + _prepare_finished = true; + return Status::OK(); +} + +Status DeletePredicate::open(RuntimeState* state, VExprContext* context, + FunctionContext::FunctionStateScope scope) { + DCHECK(_prepare_finished); + RETURN_IF_ERROR_OR_PREPARED(VExpr::open(state, context, scope)); + _open_finished = true; + return Status::OK(); +} + +void DeletePredicate::close(VExprContext* context, FunctionContext::FunctionStateScope scope) { + VExpr::close(context, scope); +} + +/** + * DeletePredicate is derived from 2 cases: + * 1. All row IDs indicates deleted rows. (e.g. Delete rows with row_id in (1, 2, 3)) + * 2. Bit vector indicates whether each row is deleted or not. (e.g. 
Bit vector[0,1,0,0,1] indicates row 1 and row 4 are deleted) + * + * So DeletePredicate should have exactly 1 child expr, which is the slot of row id. + * Row IDs should be generated by file reader as a virtual column in `block`. + **/ +Status DeletePredicate::execute(VExprContext* context, Block* block, int* result_column_id) const { + if (_children.size() != 1) { + return Status::InternalError(fmt::format( + "DeletePredicate should have exactly 1 child expr, but got {}", _children.size())); + } + int slot = -1; + RETURN_IF_ERROR(_children[0]->execute(context, block, &slot)); + const auto& row_ids = + assert_cast(*block->get_by_position(slot).column).get_data(); + const auto count = row_ids.size(); + auto res_col = ColumnBool::create(count, 0); + if (_deleted_rows.empty()) { + block->insert({std::move(res_col), std::make_shared(), expr_name()}); + *result_column_id = static_cast(block->get_columns().size() - 1); + return Status::OK(); + } + if (count == 0) { + block->insert({std::move(res_col), std::make_shared(), expr_name()}); + *result_column_id = static_cast(block->get_columns().size() - 1); + return Status::OK(); + } + const int64_t* delete_rows = _deleted_rows.data(); + const int64_t* delete_rows_end = delete_rows + _deleted_rows.size(); + const int64_t* start_pos = std::lower_bound(delete_rows, delete_rows_end, row_ids[0]); + int64_t start_index = start_pos - delete_rows; + const int64_t* end_pos = std::upper_bound(start_pos, delete_rows_end, row_ids[count - 1]); + const int64_t end_index = end_pos - delete_rows; + + while (start_index < end_index) { + int64_t delete_row = delete_rows[start_index]; + if (const auto it = std::ranges::lower_bound(row_ids, delete_row); + it != row_ids.end() && *it == delete_row) { + const size_t index = it - row_ids.begin(); + res_col->get_data()[index] = true; + } + ++start_index; + } + block->insert({std::move(res_col), std::make_shared(), expr_name()}); + *result_column_id = static_cast(block->get_columns().size() - 1); + 
return Status::OK(); +} + +std::string DeletePredicate::debug_string() const { + return _expr_name; +} + +} // namespace doris diff --git a/be/src/format/reader/expr/delete_predicate.h b/be/src/format/reader/expr/delete_predicate.h new file mode 100644 index 00000000000000..3a95c31d8bfe14 --- /dev/null +++ b/be/src/format/reader/expr/delete_predicate.h @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "common/object_pool.h" +#include "common/status.h" +#include "exprs/function_context.h" +#include "exprs/vexpr.h" + +namespace doris { +class RowDescriptor; +class RuntimeState; +class TExprNode; +class Block; +class VExprContext; +} // namespace doris + +namespace doris { + +class DeletePredicate final : public VExpr { + ENABLE_FACTORY_CREATOR(DeletePredicate); + +public: + DeletePredicate(const std::vector& deleted_rows); + ~DeletePredicate() override = default; + Status execute(VExprContext* context, Block* block, int* result_column_id) const override; + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override { + return Status::InternalError("Not implement DeletePredicate::execute_column_impl"); + } + Status prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) override; + Status open(RuntimeState* state, VExprContext* context, + FunctionContext::FunctionStateScope scope) override; + void close(VExprContext* context, FunctionContext::FunctionStateScope scope) override; + std::string debug_string() const override; + uint64_t get_digest(uint64_t seed) const override { return 0; } + const std::string& expr_name() const override { return _expr_name; } + +private: + std::string _expr_name; + const std::vector& _deleted_rows; +}; +} // namespace doris \ No newline at end of file diff --git a/be/src/format/reader/expr/equality_delete_predicate.cpp b/be/src/format/reader/expr/equality_delete_predicate.cpp new file mode 100644 index 00000000000000..2b714abade7cac --- /dev/null +++ b/be/src/format/reader/expr/equality_delete_predicate.cpp @@ -0,0 +1,158 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "format/reader/expr/equality_delete_predicate.h"
+
+#include
+
+#include
+
+#include "common/status.h"
+#include "core/assert_cast.h"
+#include "core/block/column_with_type_and_name.h"
+#include "core/column/column_nullable.h"
+#include "core/column/column_vector.h"
+#include "core/data_type/data_type_number.h"
+
+namespace doris {
+namespace {
+
+// Compares one cell of `lhs` with one cell of `rhs`, tolerating a mismatch in
+// nullability between the two columns.
+//  - Both nullable, or neither: the columns are compared directly through
+//    compare_at(), so NULL handling is whatever compare_at() does.
+//  - Exactly one nullable: a NULL on that side never matches; otherwise its
+//    nested column is compared with the other column.
+bool column_value_equal(const ColumnPtr& lhs, size_t lhs_row, const ColumnPtr& rhs,
+                        size_t rhs_row) {
+    if (lhs->is_nullable() && rhs->is_nullable()) {
+        return lhs->compare_at(lhs_row, rhs_row, *rhs, -1) == 0;
+    }
+    if (lhs->is_nullable()) {
+        const auto& nullable_lhs = assert_cast(*lhs);
+        return !nullable_lhs.is_null_at(lhs_row) &&
+               nullable_lhs.get_nested_column().compare_at(lhs_row, rhs_row, *rhs, -1) == 0;
+    }
+    if (rhs->is_nullable()) {
+        const auto& nullable_rhs = assert_cast(*rhs);
+        return !nullable_rhs.is_null_at(rhs_row) &&
+               lhs->compare_at(lhs_row, rhs_row, nullable_rhs.get_nested_column(), -1) == 0;
+    }
+    return lhs->compare_at(lhs_row, rhs_row, *rhs, -1) == 0;
+}
+
+} // namespace
+
+// Takes ownership of the delete rows and indexes them by row hash:
+// `_delete_hash_map` maps hash -> row index inside `_delete_block`.
+// The column count of `delete_block` is expected to equal `field_ids.size()`
+// (checked with DCHECK_EQ only).
+EqualityDeletePredicate::EqualityDeletePredicate(Block delete_block, std::vector field_ids)
+        : VExpr(), _delete_block(std::move(delete_block)), _field_ids(std::move(field_ids)) {
+    _node_type = TExprNodeType::PREDICATE;
+    _opcode = TExprOpcode::DELETE;
+    _data_type = std::make_shared();
+    _expr_name = "EqualityDeletePredicate";
+    DCHECK_EQ(_delete_block.columns(), _field_ids.size());
+    _delete_hashes = _build_hashes(_delete_block);
+    for (size_t row = 0; row < _delete_hashes.size(); ++row) {
+        _delete_hash_map.emplace(_delete_hashes[row], row);
+    }
+}
+
+// Delegates to VExpr::prepare(), then marks this node as prepared.
+Status EqualityDeletePredicate::prepare(RuntimeState* state, const RowDescriptor& desc,
+                                        VExprContext* context) {
+    RETURN_IF_ERROR_OR_PREPARED(VExpr::prepare(state, desc, context));
+    _expr_name = "EqualityDeletePredicate";
+    _prepare_finished = true;
+    return Status::OK();
+}
+
+// Opens every child expression; for FRAGMENT_LOCAL scope it additionally calls
+// VExpr::get_const_col().
+Status EqualityDeletePredicate::open(RuntimeState* state, VExprContext* context,
+                                     FunctionContext::FunctionStateScope scope) {
+    DCHECK(_prepare_finished);
+    for (auto& child : _children) {
+        RETURN_IF_ERROR(child->open(state, context, scope));
+    }
+    if (scope == FunctionContext::FRAGMENT_LOCAL) {
+        RETURN_IF_ERROR(VExpr::get_const_col(context, nullptr));
+    }
+    _open_finished = true;
+    return Status::OK();
+}
+
+void EqualityDeletePredicate::close(VExprContext* context,
+                                    FunctionContext::FunctionStateScope scope) {
+    VExpr::close(context, scope);
+}
+
+// Appends a boolean column to `block` and stores its index in
+// `*result_column_id`. A cell is set to true when the key built from the child
+// expressions for that row equals some row of `_delete_block`; all other cells
+// stay at the initial value 0.
+// Returns InternalError when the number of children differs from the number of
+// field ids.
+Status EqualityDeletePredicate::execute(VExprContext* context, Block* block,
+                                        int* result_column_id) const {
+    if (_children.size() != _field_ids.size()) {
+        return Status::InternalError(
+                "EqualityDeletePredicate should have {} child exprs, but got {}", _field_ids.size(),
+                _children.size());
+    }
+
+    // Evaluate each child and gather the resulting key columns, in child order.
+    Block data_key_block;
+    for (const auto& child : _children) {
+        int slot = -1;
+        RETURN_IF_ERROR(child->execute(context, block, &slot));
+        const auto& key_column = block->get_by_position(slot);
+        data_key_block.insert({key_column.column, key_column.type, key_column.name});
+    }
+
+    const auto rows = data_key_block.rows();
+    auto res_col = ColumnBool::create(rows, 0);
+    // Nothing can match: emit the all-zero column without hashing.
+    if (_delete_hash_map.empty() || rows == 0) {
+        block->insert({std::move(res_col), std::make_shared(), expr_name()});
+        *result_column_id = static_cast(block->columns() - 1);
+        return Status::OK();
+    }
+
+    auto data_hashes = _build_hashes(data_key_block);
+    auto& result_data = res_col->get_data();
+    for (size_t row = 0; row < rows; ++row) {
+        // Equal hashes are only candidates; _equal() confirms the match
+        // column by column.
+        const auto range = _delete_hash_map.equal_range(data_hashes[row]);
+        for (auto it = range.first; it != range.second; ++it) {
+            if (_equal(data_key_block, row, it->second)) {
+                result_data[row] = true;
+                break;
+            }
+        }
+    }
+
+    block->insert({std::move(res_col), std::make_shared(), expr_name()});
+    *result_column_id = static_cast(block->columns() - 1);
+    return Status::OK();
+}
+
+// Returns one hash per row of `block`, starting from 0 and folding in every
+// column through update_hashes_with_value().
+std::vector EqualityDeletePredicate::_build_hashes(const Block& block) {
+    std::vector hashes(block.rows(), 0);
+    for (const auto& column : block.get_columns()) {
+        column->update_hashes_with_value(hashes.data(), nullptr);
+    }
+    return hashes;
+}
+
+// True when row `data_row` of `data_block` equals row `delete_row` of
+// `_delete_block` in every column. Columns are paired by position.
+bool EqualityDeletePredicate::_equal(const Block& data_block, size_t data_row,
+                                     size_t delete_row) const {
+    for (size_t column_idx = 0; column_idx < _delete_block.columns(); ++column_idx) {
+        const auto& data_column = data_block.get_by_position(column_idx).column;
+        const auto& delete_column = _delete_block.get_by_position(column_idx).column;
+        if (!column_value_equal(data_column, data_row, delete_column, delete_row)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+std::string EqualityDeletePredicate::debug_string() const {
+    return _expr_name;
+}
+
+} // namespace doris
diff --git a/be/src/format/reader/expr/equality_delete_predicate.h b/be/src/format/reader/expr/equality_delete_predicate.h
new file mode 100644
index 00000000000000..2e33cffb3985df
--- /dev/null
+++ b/be/src/format/reader/expr/equality_delete_predicate.h
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "common/status.h"
+#include "core/block/block.h"
+#include "exprs/function_context.h"
+#include "exprs/vexpr.h"
+
+namespace doris {
+class RowDescriptor;
+class RuntimeState;
+class VExprContext;
+} // namespace doris
+
+namespace doris {
+
+// Expression node that owns a block of delete rows (`_delete_block`) together
+// with a hash index over those rows (`_delete_hashes`, `_delete_hash_map`).
+class EqualityDeletePredicate final : public VExpr {
+    ENABLE_FACTORY_CREATOR(EqualityDeletePredicate);
+
+public:
+    // Both arguments are taken by value.
+    EqualityDeletePredicate(Block delete_block, std::vector field_ids);
+    ~EqualityDeletePredicate() override = default;
+
+    Status execute(VExprContext* context, Block* block, int* result_column_id) const override;
+    // Column-level execution is not provided; this override always returns an
+    // InternalError status.
+    Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector,
+                               size_t count, ColumnPtr& result_column) const override {
+        return Status::InternalError("Not implement EqualityDeletePredicate::execute_column_impl");
+    }
+    Status prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) override;
+    Status open(RuntimeState* state, VExprContext* context,
+                FunctionContext::FunctionStateScope scope) override;
+    void close(VExprContext* context, FunctionContext::FunctionStateScope scope) override;
+    std::string debug_string() const override;
+    // Always returns 0, regardless of `seed`.
+    uint64_t get_digest(uint64_t seed) const override { return 0; }
+    const std::string& expr_name() const override { return _expr_name; }
+
+private:
+    static std::vector _build_hashes(const Block& block);
+    bool _equal(const Block& data_block, size_t data_row, size_t delete_row) const;
+
+    std::string _expr_name;
+    Block _delete_block;
+    std::vector _field_ids;
+    std::vector _delete_hashes;
+    // A multimap, so several delete rows may share one key.
+    std::multimap _delete_hash_map;
+};
+
+} // namespace doris
diff --git a/be/src/format/reader/expr/literal.h b/be/src/format/reader/expr/literal.h
new file mode 100644
index 00000000000000..9c4202994ee0ab
--- /dev/null
+++ b/be/src/format/reader/expr/literal.h
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "core/data_type/data_type.h"
+#include "exprs/vliteral.h"
+
+namespace doris {
+
+// VLiteral constructed directly from a data type and a Field value.
+class TableLiteral : public VLiteral {
+    ENABLE_FACTORY_CREATOR(TableLiteral);
+
+public:
+    // Stores `type` and materializes `field` as a const column of size 1.
+    TableLiteral(const DataTypePtr& type, const Field& field) : VLiteral(type) {
+        _data_type = type;
+        _column_ptr = _data_type->create_column_const(1, field);
+    }
+};
+
+} // namespace doris
diff --git a/be/src/format/reader/expr/slot_ref.h b/be/src/format/reader/expr/slot_ref.h
new file mode 100644
index 00000000000000..fd4782a1bdde54
--- /dev/null
+++ b/be/src/format/reader/expr/slot_ref.h
@@ -0,0 +1,51 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "core/data_type/data_type.h"
+#include "exprs/vslot_ref.h"
+
+namespace doris {
+
+// VSlotRef whose ids, data type and column name are all supplied by the
+// caller at construction time.
+class TableSlotRef : public VSlotRef {
+    ENABLE_FACTORY_CREATOR(TableSlotRef);
+
+public:
+    TableSlotRef(int slot_id, int column_id, int column_uniq_id, const DataTypePtr& type,
+                 const std::string& column_name)
+            : VSlotRef(slot_id, column_id, column_uniq_id), _cname(column_name) {
+        _data_type = type;
+    }
+
+    // Only flips the prepared flags. It neither calls VSlotRef::prepare() nor
+    // reads `state`, `desc` or `context`.
+    Status prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) override {
+        if (_prepared) {
+            return Status::OK();
+        }
+        _prepared = true;
+        _prepare_finished = true;
+        return Status::OK();
+    }
+
+    // Both accessors return the column name given to the constructor.
+    const std::string& expr_name() const override { return _cname; }
+    const std::string& column_name() const override { return _cname; }
+
+private:
+    const std::string _cname;
+};
+
+} // namespace doris
diff --git a/be/src/format/reader/file_reader.cpp b/be/src/format/reader/file_reader.cpp
new file mode 100644
index 00000000000000..daf9e2cf4f82a7
--- /dev/null
+++ b/be/src/format/reader/file_reader.cpp
@@ -0,0 +1,42 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "format/reader/file_reader.h"
+
+#include "io/fs/buffered_reader.h"
+#include "io/fs/tracing_file_reader.h"
+#include "runtime/runtime_state.h"
+
+namespace doris::reader {
+
+// Creates the underlying file reader in RANDOM access mode and, when an IO
+// context is present, wraps it in a tracing reader; otherwise
+// `_tracing_file_reader` aliases `_file_reader`.
+// `state`, `_file_description` and `_system_properties` are dereferenced
+// without a null check.
+// Returns the error from create_file_reader() on failure (via DORIS_TRY).
+Status FileReader::init(RuntimeState* state) {
+    _init_profile();
+    SCOPED_RAW_TIMER(&_reader_statistics.file_reader_create_time);
+    ++_reader_statistics.open_file_num;
+    io::FileReaderOptions reader_options =
+            FileFactory::get_reader_options(state->query_options(), *_file_description);
+    _file_reader = DORIS_TRY(io::DelegateReader::create_file_reader(
+            _profile, *_system_properties, *_file_description, reader_options,
+            io::DelegateReader::AccessMode::RANDOM, _io_ctx));
+    _tracing_file_reader = _io_ctx ? std::make_shared(
+                                             _file_reader, _io_ctx->file_reader_stats)
+                                   : _file_reader;
+    _eof = false;
+    return Status::OK();
+}
+
+} // namespace doris::reader
diff --git a/be/src/format/reader/file_reader.h b/be/src/format/reader/file_reader.h
new file mode 100644
index 00000000000000..7e6d18acedc2d4
--- /dev/null
+++ b/be/src/format/reader/file_reader.h
@@ -0,0 +1,242 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "common/status.h"
+#include "core/data_type/data_type.h"
+#include "core/field.h"
+#include "exprs/vexpr_fwd.h"
+#include "gen_cpp/PlanNodes_types.h"
+#include "io/file_factory.h"
+#include "io/fs/file_reader_writer_fwd.h"
+
+namespace doris {
+class Block;
+class ColumnPredicate;
+
+namespace io {
+struct IOContext;
+} // namespace io
+} // namespace doris
+
+namespace doris::reader {
+
+using ColumnId = int32_t;
+
+enum ColumnType {
+    DATA_COLUMN = 0, // normal data column
+    ROW_NUMBER = 1,  // row number in a file
+    FILE_NAME = 2,   // file name
+};
+
+// File-local schema field.
+// This is the file-local schema view that FileReader exposes to the table
+// layer; it carries no table/global schema semantics. Iceberg field ids, name
+// mapping and default/generated/partition columns are not interpreted inside
+// FileReader.
+struct SchemaField {
+    int32_t id = -1;
+    std::string name;
+    DataTypePtr type;
+    std::vector children;
+    std::vector file_path;
+    std::vector field_id_path;
+    std::vector name_path;
+    ColumnType column_type = ColumnType::DATA_COLUMN;
+};
+
+// File-local nested projection. The top-level scan column is still represented
+// by FileScanRequest::predicate_columns/non_predicate_columns; this tree only
+// describes which child paths are needed inside a complex top-level field.
+struct FieldProjection {
+    ColumnId file_column_id = -1;
+    std::vector file_path;
+    bool project_all_children = true;
+    std::vector children;
+};
+
+// File-local single-column predicates for file-layer pruning, such as min/max, page index,
+// dictionary and bloom filter. Predicates must all belong to file_column_id.
+struct FileColumnPredicateFilter {
+    ColumnId file_column_id = -1;
+    std::vector> predicates;
+};
+
+enum class FileFormat {
+    PARQUET,
+    ORC,
+    CSV,
+};
+
+// Generic file-layer scan request.
+// Describes the file-local read input that every file format can share. No
+// table/global schema appears here: all schema change, filter localization and
+// default/generated/partition columns must be handled in the table layer.
+struct FileScanRequest {
+    virtual ~FileScanRequest() = default;
+
+    std::vector predicate_columns;
+    std::vector non_predicate_columns;
+    std::map column_positions; // file_column_id -> file-local block position
+    std::map complex_projections;
+    // Complex conjuncts converted to file-local predicates from table-level predicates.
+    VExprContextSPtrs conjuncts;
+    // Delete predicates converted to file-local predicates.
+    VExprContextSPtrs delete_conjuncts;
+    // Only simple predicates that can be evaluated directly on a column, such as `a > 1`.
+    // Currently used for zone-map filtering.
+    std::vector column_predicate_filters;
+    // Fallback path if filters cannot be localized to file-local predicates. The expression can
+    // reference projected_file_columns and partition columns.
+    std::vector> reader_expression_map;
+};
+
+struct FileAggregateRequest {
+    struct Column {
+        ColumnId file_column_id = -1;
+    };
+
+    TPushAggOp::type agg_type = TPushAggOp::type::NONE;
+    std::vector columns;
+};
+
+struct FileAggregateResult {
+    struct Column {
+        bool has_min = false;
+        bool has_max = false;
+        Field min_value;
+        Field max_value;
+    };
+
+    int64_t count = 0;
+    std::vector columns;
+};
+
+// Generic interface of the physical file reading layer.
+// It only describes the file-local schema, the file-local scan request and the
+// file-local block. TableReader/IcebergTableReader can use it to compose
+// readers for different file formats.
+/**
+ *                               +-----> get_schema() -----------------+
+ * FileReader() -----> init() ---|                                     |-----> close()
+ *                               +-----> open() -----> get_block() ----+
+ */
+class FileReader {
+public:
+    struct ReaderStatistics {
+        int32_t filtered_row_groups = 0;
+        int32_t filtered_row_groups_by_min_max = 0;
+        int32_t filtered_row_groups_by_bloom_filter = 0;
+        int32_t read_row_groups = 0;
+        int64_t filtered_group_rows = 0;
+        int64_t filtered_page_rows = 0;
+        int64_t lazy_read_filtered_rows = 0;
+        int64_t read_rows = 0;
+        int64_t filtered_bytes = 0;
+        int64_t column_read_time = 0;
+        int64_t parse_meta_time = 0;
+        int64_t parse_footer_time = 0;
+        int64_t file_footer_read_calls = 0;
+        int64_t file_footer_hit_cache = 0;
+        int64_t file_reader_create_time = 0;
+        int64_t open_file_num = 0;
+        int64_t row_group_filter_time = 0;
+        int64_t page_index_filter_time = 0;
+        int64_t read_page_index_time = 0;
+        int64_t parse_page_index_time = 0;
+        int64_t predicate_filter_time = 0;
+        int64_t dict_filter_rewrite_time = 0;
+        int64_t bloom_filter_read_time = 0;
+    };
+
+    // NOTE: `file_description` is taken by non-const reference and moved
+    // from, so the caller's pointer is left empty after construction.
+    FileReader(std::shared_ptr& system_properties,
+               std::unique_ptr& file_description,
+               std::shared_ptr io_ctx, RuntimeProfile* profile)
+            : _system_properties(system_properties),
+              _file_description(std::move(file_description)),
+              _io_ctx(io_ctx),
+              _profile(profile) {}
+    virtual ~FileReader() = default;
+
+    // Initialize file reader and parse file metadata.
+    virtual Status init(RuntimeState* state);
+
+    // Get the file-local schema from file metadata. The file schema is determined by file format
+    // and file content, and does not contain table/global schema semantics. For example, Iceberg
+    // field id, name mapping and default/generated/partition columns are not interpreted in the
+    // file reader. This method can only be called after init() succeeded, but does not require
+    // open() to be called.
+    virtual Status get_schema(std::vector* file_schema) const = 0;
+
+    // Open the file reader with a file-local scan request. The file reader should initialize its
+    // internal state according to the request, but does not need to interpret table/global schema
+    // semantics. For example, all schema change, filter localization and
+    // default/generated/partition columns should be handled in the table reader layer. This method
+    // can only be called after init() succeeded.
+    // The default implementation takes ownership of `request` (the caller's pointer is moved from).
+    virtual Status open(std::unique_ptr& request) {
+        _request = std::move(request);
+        return Status::OK();
+    }
+
+    // Read the next file-local block.
+    // May only be called after open(FileScanRequest) succeeded.
+    // The column order and types of file_block must follow FileScanRequest, not the table/global
+    // schema.
+    // rows returns the number of output rows of this batch; eof tells whether this file reader is
+    // exhausted; switching between files is the responsibility of TableReader.
+    virtual Status get_block(Block* file_block, size_t* rows, bool* eof) {
+        // The default stub reports EOF immediately.
+        if (rows != nullptr) {
+            *rows = 0;
+        }
+        if (eof != nullptr) {
+            *eof = true;
+        }
+        _eof = true;
+        return Status::OK();
+    }
+
+    // The default implementation always returns NotSupported.
+    virtual Status get_aggregate_result(const FileAggregateRequest& request,
+                                        FileAggregateResult* result) {
+        return Status::NotSupported("FileReader does not support aggregate pushdown");
+    }
+
+    // Close the current physical file reader and release file-layer state.
+    // This method does not handle table-level delete/finalize state, which is managed by
+    // TableReader subclasses.
+    // Note that `_io_ctx` is released here as well.
+    virtual Status close() {
+        _file_reader.reset();
+        _tracing_file_reader.reset();
+        _io_ctx.reset();
+        _request.reset();
+        _eof = true;
+        return Status::OK();
+    }
+
+protected:
+    virtual void _init_profile() {}
+    io::FileReaderSPtr _file_reader;
+    // _tracing_file_reader wraps _file_reader.
+    // _file_reader is original file reader.
+    // _tracing_file_reader is tracing file reader with io context.
+    // If io_ctx is null, _tracing_file_reader will be the same as file_reader.
+    io::FileReaderSPtr _tracing_file_reader = nullptr;
+    std::unique_ptr _request;
+    bool _eof = true;
+    ReaderStatistics _reader_statistics;
+    std::shared_ptr _system_properties;
+    std::unique_ptr _file_description;
+    std::shared_ptr _io_ctx;
+    RuntimeProfile* _profile = nullptr;
+};
+
+} // namespace doris::reader
diff --git a/be/src/format/reader/table/paimon_reader.cpp b/be/src/format/reader/table/paimon_reader.cpp
new file mode 100644
index 00000000000000..d5c450b2c0172b
--- /dev/null
+++ b/be/src/format/reader/table/paimon_reader.cpp
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "format/reader/table/paimon_reader.h"
+
+#include
+#include
+
+#include "format/table/deletion_vector_reader.h"
+
+namespace doris::paimon {
+
+// Fills `desc` from the Paimon deletion file attached to `t_desc`, if any.
+// `*has_delete_file` is set to false and `desc` is left untouched when
+// `paimon_params.deletion_file` is not set; otherwise it is set to true.
+// Always returns OK; null `desc` / `has_delete_file` trip DORIS_CHECK.
+Status PaimonReader::_parse_deletion_vector_file(const TTableFormatFileDesc& t_desc,
+                                                 DeleteFileDesc* desc, bool* has_delete_file) {
+    DORIS_CHECK(desc != nullptr);
+    DORIS_CHECK(has_delete_file != nullptr);
+    *has_delete_file = false;
+    const auto& table_desc = t_desc.paimon_params;
+    if (!table_desc.__isset.deletion_file) {
+        return Status::OK();
+    }
+    const auto& deletion_file = table_desc.deletion_file;
+
+    // Key layout: "paimon_dv:" + path bytes + raw in-memory bytes of `offset`.
+    const std::string key_prefix = "paimon_dv:";
+    desc->key.resize(key_prefix.size() + deletion_file.path.size() + sizeof(deletion_file.offset));
+    char* key_data = desc->key.data();
+    memcpy(key_data, key_prefix.data(), key_prefix.size());
+    key_data += key_prefix.size();
+    memcpy(key_data, deletion_file.path.data(), deletion_file.path.size());
+    key_data += deletion_file.path.size();
+    memcpy(key_data, &deletion_file.offset, sizeof(deletion_file.offset));
+    desc->path = deletion_file.path;
+    desc->start_offset = deletion_file.offset;
+    // NOTE(review): the extra 4 bytes presumably cover a length prefix that
+    // `length` does not include -- confirm against the deletion-vector parser.
+    desc->size = deletion_file.length + 4;
+    desc->file_size = -1;
+    desc->format = DeleteFileDesc::Format::PAIMON;
+    *has_delete_file = true;
+    return Status::OK();
+}
+
+} // namespace doris::paimon
diff --git a/be/src/format/reader/table/paimon_reader.h b/be/src/format/reader/table/paimon_reader.h
new file mode 100644
index 00000000000000..ce386460a6e681
--- /dev/null
+++ b/be/src/format/reader/table/paimon_reader.h
@@ -0,0 +1,37 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "format/reader/table_reader.h"
+
+namespace doris {
+struct DeleteFileDesc;
+}
+namespace doris::paimon {
+
+// TableReader specialization for Paimon; the only behavior it overrides is
+// how the deletion-vector file of a split is described.
+class PaimonReader final : public reader::TableReader {
+public:
+    ENABLE_FACTORY_CREATOR(PaimonReader);
+    ~PaimonReader() final = default;
+
+protected:
+    // Fills `desc` from `t_desc` and reports through `has_delete_file`
+    // whether a deletion file is present.
+    Status _parse_deletion_vector_file(const TTableFormatFileDesc& t_desc, DeleteFileDesc* desc,
+                                       bool* has_delete_file) override;
+};
+
+} // namespace doris::paimon
diff --git a/be/src/format/reader/table_reader.cpp b/be/src/format/reader/table_reader.cpp
new file mode 100644
index 00000000000000..2c92b9ca1a1d0c
--- /dev/null
+++ b/be/src/format/reader/table_reader.cpp
@@ -0,0 +1,277 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "format/reader/table_reader.h"
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include "common/cast_set.h"
+#include "common/status.h"
+#include "core/assert_cast.h"
+#include "exec/common/endian.h"
+#include "exprs/vslot_ref.h"
+#include "format/new_parquet/parquet_reader.h"
+#include "format/reader/column_mapper.h"
+#include "format/table/deletion_vector_reader.h"
+#include "io/io_common.h"
+#include "roaring/roaring64map.hh"
+
+namespace doris::reader {
+namespace {
+
+// Recursively collects the slot id of every slot-ref node in `expr`.
+// A null `expr` is ignored.
+void collect_table_slot_ids(const VExprSPtr& expr, std::set* slot_ids) {
+    if (expr == nullptr) {
+        return;
+    }
+    if (expr->is_slot_ref()) {
+        const auto* slot_ref = assert_cast(expr.get());
+        slot_ids->insert(slot_ref->slot_id());
+    }
+    for (const auto& child : expr->children()) {
+        collect_table_slot_ids(child, slot_ids);
+    }
+}
+
+// Flattens a conjunct tree into `table_filters`: COMPOUND_AND nodes are
+// descended into, and every other sub-expression becomes one TableFilter
+// carrying the slot ids it references. Sub-expressions that reference no slot
+// are dropped.
+void build_table_filters_from_conjunct(const VExprSPtr& conjunct,
+                                       std::vector* table_filters) {
+    if (conjunct == nullptr) {
+        return;
+    }
+    if (conjunct->node_type() == TExprNodeType::COMPOUND_PRED &&
+        conjunct->op() == TExprOpcode::COMPOUND_AND) {
+        for (const auto& child : conjunct->children()) {
+            build_table_filters_from_conjunct(child, table_filters);
+        }
+        return;
+    }
+    std::set slot_ids;
+    collect_table_slot_ids(conjunct, &slot_ids);
+    if (!slot_ids.empty()) {
+        TableFilter table_filter;
+        table_filter.conjunct = VExprContext::create_shared(conjunct);
+        table_filter.slot_ids.assign(slot_ids.begin(), slot_ids.end());
+        table_filters->push_back(std::move(table_filter));
+    }
+}
+
+// Decodes a deletion-vector buffer and appends the deleted row positions to
+// `delete_rows`.
+// Layout as read here: 4-byte big-endian length, 4-byte magic number, roaring
+// bitmap, then -- for ICEBERG only -- 4 trailing bytes that are excluded from
+// the bitmap and not inspected by this function.
+// PAIMON buffers are decoded as a 32-bit roaring bitmap, ICEBERG buffers as a
+// 64-bit roaring map.
+// Returns DataQualityError when the buffer is too small, the length or magic
+// number does not match, or the bitmap cannot be decoded.
+Status parse_deletion_vector(const char* buf, size_t buffer_size, DeleteFileDesc::Format format,
+                             DeleteRows* delete_rows) {
+    DORIS_CHECK(buf != nullptr);
+    DORIS_CHECK(delete_rows != nullptr);
+    DORIS_CHECK(format == DeleteFileDesc::Format::PAIMON ||
+                format == DeleteFileDesc::Format::ICEBERG);
+
+    const size_t checksum_size = format == DeleteFileDesc::Format::ICEBERG ? 4 : 0;
+    if (buffer_size < 8 + checksum_size) [[unlikely]] {
+        return Status::DataQualityError("Deletion vector file size too small: {}", buffer_size);
+    }
+
+    auto total_length = BigEndian::Load32(buf);
+    if (total_length + 4 + checksum_size != buffer_size) [[unlikely]] {
+        return Status::DataQualityError("Deletion vector length mismatch, expected: {}, actual: {}",
+                                        total_length + 4 + checksum_size, buffer_size);
+    }
+
+    constexpr static char MAGIC_NUMBER[] = {'\xD1', '\xD3', '\x39', '\x64'};
+    if (memcmp(buf + sizeof(total_length), MAGIC_NUMBER, 4) != 0) [[unlikely]] {
+        return Status::DataQualityError("Deletion vector magic number mismatch");
+    }
+
+    const char* bitmap_buf = buf + 8;
+    const size_t bitmap_size = buffer_size - 8 - checksum_size;
+    if (format == DeleteFileDesc::Format::PAIMON) {
+        roaring::Roaring bitmap;
+        try {
+            bitmap = roaring::Roaring::readSafe(bitmap_buf, bitmap_size);
+        } catch (const std::runtime_error& e) {
+            return Status::DataQualityError("Decode roaring bitmap failed, {}", e.what());
+        }
+
+        delete_rows->reserve(bitmap.cardinality());
+        for (auto it = bitmap.begin(); it != bitmap.end(); it++) {
+            delete_rows->push_back(*it);
+        }
+        return Status::OK();
+    }
+
+    roaring::Roaring64Map bitmap;
+    try {
+        bitmap = roaring::Roaring64Map::readSafe(bitmap_buf, bitmap_size);
+    } catch (const std::runtime_error& e) {
+        return Status::DataQualityError("Decode roaring bitmap failed, {}", e.what());
+    }
+
+    delete_rows->reserve(bitmap.cardinality());
+    for (auto it = bitmap.begin(); it != bitmap.end(); it++) {
+        delete_rows->push_back(cast_set(*it));
+    }
+    return Status::OK();
+}
+
+} // namespace
+
+// Builds file-system properties from the scan parameters. Falls back to
+// FILE_LOCAL when `scan_params` is null or carries no file type.
+std::shared_ptr create_system_properties(
+        const TFileScanRangeParams* scan_params) {
+    auto system_properties = std::make_shared();
+    if (scan_params == nullptr || !scan_params->__isset.file_type) {
+        system_properties->system_type = TFileType::FILE_LOCAL;
+        return system_properties;
+    }
+    system_properties->system_type = scan_params->file_type;
+    system_properties->properties = scan_params->properties;
+    system_properties->hdfs_params = scan_params->hdfs_params;
+    if (scan_params->__isset.broker_addresses) {
+        system_properties->broker_addresses.assign(scan_params->broker_addresses.begin(),
+                                                   scan_params->broker_addresses.end());
+    }
+    return system_properties;
+}
+
+// Copies or moves the read options into the reader. The column mapper is
+// always configured for BY_FIELD_ID mapping. Always returns OK.
+Status TableReader::init(TableReadOptions options) {
+    _scan_params = options.scan_params;
+    _format = options.format;
+    _io_ctx = options.io_ctx;
+    _runtime_state = options.runtime_state;
+    _scanner_profile = options.scanner_profile;
+    _push_down_agg_type = options.push_down_agg_type;
+    _projected_columns = std::move(options.projected_columns);
+    _system_properties = create_system_properties(_scan_params);
+    _profile = std::move(options.profile);
+    TableColumnMapperOptions mapper_options;
+    mapper_options.mode = TableColumnMappingMode::BY_FIELD_ID;
+    mapper_options.allow_missing_columns = options.allow_missing_columns;
+    _data_reader.column_mapper = TableColumnMapper(mapper_options);
+    _conjuncts = std::move(options.conjuncts);
+    _table_column_predicates = std::move(options.column_predicates);
+    return Status::OK();
+}
+
+// Rebuilds `_table_filters` from the root of `_conjuncts`.
+Status TableReader::_build_table_filters_from_conjuncts() {
+    _table_filters.clear();
+    build_table_filters_from_conjunct(_conjuncts.root(), &_table_filters);
+    return Status::OK();
+}
+
+// Prepares and opens every conjunct and delete conjunct of `file_request`
+// against a default-constructed RowDescriptor. Stops at the first error.
+Status TableReader::_open_local_filter_exprs(const FileScanRequest& file_request) {
+    RowDescriptor row_desc;
+    for (const auto& conjunct : file_request.conjuncts) {
+        RETURN_IF_ERROR(conjunct->prepare(_runtime_state, row_desc));
+        RETURN_IF_ERROR(conjunct->open(_runtime_state));
+    }
+    for (const auto& delete_conjunct : file_request.delete_conjuncts) {
+        RETURN_IF_ERROR(delete_conjunct->prepare(_runtime_state, row_desc));
+        RETURN_IF_ERROR(delete_conjunct->open(_runtime_state));
+    }
+    return Status::OK();
+}
+
+// Creates, initializes and opens the file reader for the current task.
+// Sets `*eos` to true when there is no current task. Only PARQUET is
+// supported; ORC and CSV return NotSupported.
+Status TableReader::create_next_reader(bool* eos) {
+    DCHECK(_data_reader.reader == nullptr);
+    if (_current_task == nullptr) {
+        *eos = true;
+        return Status::OK();
+    }
+
+    switch (_format) {
+    case FileFormat::PARQUET: {
+        _data_reader.reader = std::make_unique(
+                _system_properties, _current_task->data_file, _io_ctx, _scanner_profile);
+        break;
+    }
+    case FileFormat::ORC:
+    case FileFormat::CSV:
+        return Status::NotSupported("TableReader does not support file format {}",
+                                    static_cast(_format));
+    }
+
+    RETURN_IF_ERROR(_data_reader.reader->init(_runtime_state));
+    RETURN_IF_ERROR(open_reader());
+    *eos = false;
+    return Status::OK();
+}
+
+// Translates a thrift file range into a file description. Unset optional
+// fields default to: file_size -1, range_start_offset 0, range_size -1.
+std::unique_ptr create_file_description(const TFileRangeDesc& range) {
+    auto file_description = std::make_unique();
+    file_description->path = range.path;
+    file_description->file_size = range.__isset.file_size ? range.file_size : -1;
+    file_description->range_start_offset = range.__isset.start_offset ? range.start_offset : 0;
+    file_description->range_size = range.__isset.size ? range.size : -1;
+    if (range.__isset.fs_name) {
+        file_description->fs_name = range.fs_name;
+    }
+    if (range.__isset.file_cache_admission) {
+        file_description->file_cache_admission = range.file_cache_admission;
+    }
+    return file_description;
+}
+
+// Resets per-split state for `options.current_range` and loads the delete
+// rows attached to the split, if any.
+Status TableReader::prepare_split(const SplitReadOptions& options) {
+    // NOTE(review): `options` is const, so this std::move presumably degrades
+    // to a copy -- confirm against the declaration of partition_values.
+    _partition_values = std::move(options.partition_values);
+    _current_task = std::make_unique();
+    _current_task->data_file = create_file_description(options.current_range);
+    _delete_rows = nullptr;
+    _aggregate_pushdown_tried = false;
+    return _parse_delete_predicates(options);
+}
+
+// Resolves the deletion-vector file of the split (if one exists) and loads its
+// row positions into `_delete_rows` through `options.cache`, keyed by
+// `desc.key`. The loader lambda reads `desc.size` bytes at `desc.start_offset`
+// and decodes them with parse_deletion_vector().
+// Returns the first error raised while opening, reading or decoding the file.
+Status TableReader::_parse_delete_predicates(const SplitReadOptions& options) {
+    DeleteFileDesc desc {.fs_name = options.current_range.fs_name};
+    bool has_delete_file = false;
+    RETURN_IF_ERROR(_parse_deletion_vector_file(options.current_range.table_format_params, &desc,
+                                                &has_delete_file));
+    if (has_delete_file) {
+        DORIS_CHECK(options.cache != nullptr);
+        Status create_status = Status::OK();
+
+        _delete_rows = options.cache->get(desc.key, [&]() -> DeleteRows* {
+            // Keep ownership in a unique_ptr until the rows are handed to the
+            // cache, so that every early `return nullptr` below frees them
+            // instead of leaking the allocation.
+            auto delete_rows = std::make_unique<DeleteRows>();
+
+            DeletionVectorReader dv_reader(_runtime_state, _scanner_profile, *_scan_params, desc,
+                                           _io_ctx.get());
+            create_status = dv_reader.open();
+            if (!create_status.ok()) [[unlikely]] {
+                return nullptr;
+            }
+
+            size_t bytes_read = desc.size;
+            std::vector buffer(bytes_read);
+            create_status = dv_reader.read_at(desc.start_offset, {buffer.data(), bytes_read});
+            if (!create_status.ok()) [[unlikely]] {
+                return nullptr;
+            }
+
+            const char* buf = buffer.data();
+            SCOPED_TIMER(_profile->parse_delete_file_time);
+            create_status = parse_deletion_vector(buf, bytes_read, desc.format, delete_rows.get());
+            if (!create_status.ok()) [[unlikely]] {
+                return nullptr;
+            }
+            COUNTER_UPDATE(_profile->num_delete_rows, delete_rows->size());
+            // Success: give the raw pointer to the cache, as before.
+            return delete_rows.release();
+        });
+        RETURN_IF_ERROR(create_status);
+    }
+
+    return Status::OK();
+}
+} // namespace doris::reader
diff --git a/be/src/format/reader/table_reader.h b/be/src/format/reader/table_reader.h
new file mode 100644
index 00000000000000..83e0ec44fc80fe
--- /dev/null
+++ b/be/src/format/reader/table_reader.h
@@ -0,0 +1,690 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +#include "common/cast_set.h" +#include "common/status.h" +#include "core/assert_cast.h" +#include "core/block/block.h" +#include "core/data_type/data_type.h" +#include "core/data_type/data_type_array.h" +#include "core/data_type/data_type_map.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_number.h" +#include "core/data_type/data_type_struct.h" +#include "core/field.h" +#include "exprs/vexpr_context.h" +#include "exprs/vexpr_fwd.h" +#include "format/new_parquet/column_reader.h" +#include "format/reader/column_mapper.h" +#include "format/reader/expr/delete_predicate.h" +#include "format/reader/expr/slot_ref.h" +#include "format/reader/file_reader.h" +#include "gen_cpp/PlanNodes_types.h" +#include "runtime/descriptors.h" + +namespace doris { +class Block; +class ColumnPredicate; +struct DeleteFileDesc; +} // namespace doris + +namespace doris::reader { + +using DeleteRows = std::vector; + +// Column view in the table/global schema. +// For Iceberg, id corresponds to the Iceberg field id by default. This struct does not describe physical columns in the file. +struct TableColumn { + ColumnId id = -1; + std::string name; + DataTypePtr type; + std::vector children; + VExprContextSPtr default_expr; + bool is_partition_key = false; +}; + +// All complex predicates on table/global schema, which cannot be directly localized to file +// schema. They will be evaluated at table level and may depend on multiple columns. +struct TableFilter { + // Expression filter, suited to semantics such as casts, complex expressions and complex column extraction. + VExprContextSPtr conjunct; + + // Table slot ids referenced by conjunct. A single expression filter may depend on multiple + // columns, while ColumnPredicate pruning still belongs to one concrete column.
+ std::vector slot_ids; + + bool can_be_localized() const { return true; } +}; + +enum class TableFilterConversion { + COPY_DIRECTLY, + CAST_FILTER, + EVALUATE_EXPRESSION, + FINALIZE_ONLY, +}; + +struct BaseDataFile { + virtual ~BaseDataFile() = default; + + std::string path; + std::string format; + int64_t record_count = 0; + int64_t file_size = 0; +}; + +struct ScanTask { + virtual ~ScanTask() = default; + + std::unique_ptr data_file; +}; + +struct ReadProfile { + RuntimeProfile::Counter* num_delete_files; + RuntimeProfile::Counter* num_delete_rows; + RuntimeProfile::Counter* parse_delete_file_time; +}; + +struct TableReadOptions { + // Columns need to be read from file and output by table reader. They are all in table/global + // schema semantics. + const std::vector projected_columns; + // Simple predicates for a single column, which is parsed on scan operator. + const TableColumnPredicates column_predicates; + // All complex conjuncts from scan operator + const VExprContext conjuncts; + // File format of the underlying data files, needed for reader initialization and reader-level + // filter pushdown. + const FileFormat format; + TFileScanRangeParams* scan_params; + std::shared_ptr io_ctx; + RuntimeState* runtime_state; + RuntimeProfile* scanner_profile; + const bool allow_missing_columns = true; + // Push-down aggregate type. + const TPushAggOp::type push_down_agg_type = TPushAggOp::type::NONE; + + std::unique_ptr profile; +}; + +struct SplitReadOptions { + // Split-level information for reader initialization, which may include file path, partition values, delete file info, etc. The content is table format specific and opaque to table reader base class; it's the responsibility of the concrete table reader implementation to parse necessary information for reader initialization and filter pushdown. 
+ std::map partition_values; + ShardedKVCache* cache; + TFileRangeDesc current_range; +}; + +// Base class for table-level readers. +// This layer handles common table-level logic such as multi-file orchestration and dynamic partition pruning, and outputs table blocks. +// Subclasses only need to implement the table-format semantics of "how to open the next concrete reader" and "how to read the current reader". +class TableReader { +public: + virtual ~TableReader() = default; + + // Initialize the common runtime parameters of the table reader. + // Subclasses may call this method from their own init(options); it does not take a table-format specific schema/task. + virtual Status init(TableReadOptions options); + + // Prepare for reading a new split/task. + // 1. Pass a new split/task to reader, which will be used in subsequent open_reader() to initialize the underlying file reader. + // 2. Parse delete predicates from split/task information, which will be used for later dynamic filtering and delete handling. + virtual Status prepare_split(const SplitReadOptions& options); + + // Entry point for table-level dynamic filtering. + // This method decides, based on the split, partition values or file-level statistics, whether subsequent readers can be skipped. + // can_filter_all=true means all data within the scope of the current table reader can be pruned. + virtual Status filter(const VExprContextSPtr& expr, bool* can_filter_all) { + // A real implementation would decide the dynamic partition pruning result based on split/partition/file stats. + (void)expr; + if (can_filter_all != nullptr) { + *can_filter_all = false; + } + return Status::OK(); + } + + // Unified external entry point for reading table blocks. + // The base class opens the current reader, switches after EOF and closes it; subclasses only implement the protected hooks. + // The columns of table_block must already follow table/global schema semantics. + Status get_block(Block* block, bool* eos) { + DORIS_CHECK(block->columns() == _projected_columns.size()); + block->clear_column_data(_projected_columns.size()); + + while (true) { + if (*eos) { + return Status::OK(); + } + if (!_data_reader.reader) { + RETURN_IF_ERROR(create_next_reader(eos)); + if (!_data_reader.reader) { + DCHECK(*eos); + return Status::OK(); + } + } + + // Materialize a reduced row set for upper aggregate operators when aggregate + // pushdown can be applied.
This is not the final aggregate result: COUNT emits + // `count` default rows for the upper COUNT(*), and MIN/MAX emits two rows containing + // file-level min/max values for the upper MIN/MAX. + if (!_aggregate_pushdown_tried) { + bool pushed_down = false; + RETURN_IF_ERROR(_try_materialize_aggregate_pushdown_rows(block, &pushed_down)); + if (pushed_down) { + return Status::OK(); + } + } + + bool current_eof = false; + _data_reader.block_template.clear_column_data(); + size_t current_rows = 0; + RETURN_IF_ERROR(_data_reader.reader->get_block(&_data_reader.block_template, + &current_rows, &current_eof)); + if (current_rows == 0) { + if (current_eof) { + RETURN_IF_ERROR(close_current_reader()); + } + continue; + } + DCHECK_EQ(_data_reader.block_template.columns(), _data_reader.block_schema.size()); + DORIS_CHECK(block->columns() == _data_reader.column_mapper.mappings().size()); + RETURN_IF_ERROR(finalize_chunk(block, current_rows)); + if (current_eof) { + RETURN_IF_ERROR(close_current_reader()); + } + return Status::OK(); + } + } + + // Close the table reader and the underlying reader that is currently being read. + // Subclasses holding extra table-format resources should override this and call TableReader::close() first. + virtual Status close() { + if (_data_reader.reader) { + RETURN_IF_ERROR(close_current_reader()); + } + return Status::OK(); + } + +protected: + // Parse deletion vector information from table format specific file description. + virtual Status _parse_deletion_vector_file(const TTableFormatFileDesc& t_desc, + DeleteFileDesc* desc, bool* has_delete_file) { + *has_delete_file = false; + return Status::OK(); + } + + // Common flow for switching to the next reader. + // This method first closes the current reader and then opens the next concrete reader; subclasses should not re-implement this loop. NOTE(review): the out-of-line definition DCHECKs that no reader is open rather than closing one - confirm which is intended. + Status create_next_reader(bool* eos); + + // Open the current concrete reader. + // Subclasses initialize the underlying FileReader here based on the current split/task. + virtual Status open_reader() { + // 1. Get file schema and create column mapping.
+ std::vector file_schema; + RETURN_IF_ERROR(_data_reader.reader->get_schema(&file_schema)); + _data_reader.file_schema = file_schema; + RETURN_IF_ERROR(_data_reader.column_mapper.create_mapping(_projected_columns, + _partition_values, file_schema)); + DORIS_CHECK(_data_reader.column_mapper.mappings().size() == _projected_columns.size()); + + // 2. Build table filters based on conjuncts and column predicates. + RETURN_IF_ERROR(_build_table_filters_from_conjuncts()); + + // 3. Create file scan request based on column mapping and table filters, then open file reader with the request. + // file scan request is the main carrier of file-level pruning information, including column mapping, column-level filters and expression filters. The file reader will evaluate the filters and only return rows that satisfy the filters to table reader. + auto file_request = std::make_unique(); + RETURN_IF_ERROR(_data_reader.column_mapper.create_scan_request( + _table_filters, _table_column_predicates, _projected_columns, file_request.get())); + RETURN_IF_ERROR(customize_file_scan_request(file_request.get())); + RETURN_IF_ERROR(_open_local_filter_exprs(*file_request)); + _data_reader.block_schema.clear(); + _data_reader.block_template.clear(); + _data_reader.block_schema.resize(file_request->column_positions.size()); + + // 4. Build block schema based on file schema and column mapping. The scan schema describes the column layout of the block returned by file reader, which is determined by the column mapping and file schema. 
+ for (const auto& [file_column_id, block_position] : file_request->column_positions) { + DORIS_CHECK(block_position < _data_reader.block_schema.size()); + const auto* field = _find_schema_field(_data_reader.file_schema, file_column_id); + DORIS_CHECK(field != nullptr); + auto projection_it = file_request->complex_projections.find(file_column_id); + if (projection_it == file_request->complex_projections.end()) { + _data_reader.block_schema[block_position] = *field; + } else { + RETURN_IF_ERROR(_project_schema_field(*field, projection_it->second, + &_data_reader.block_schema[block_position])); + } + } + + // 5. Prepare block template based on block schema. The block template is used to store the block returned by file reader before finalize; it has the same column layout as the file reader output block, which is determined by the column mapping and file schema. + _data_reader.block_template.reserve(_data_reader.block_schema.size()); + for (const auto& field : _data_reader.block_schema) { + _data_reader.block_template.insert( + {field.type->create_column(), field.type, field.name}); + } + RETURN_IF_ERROR(_data_reader.reader->open(file_request)); + RETURN_IF_ERROR(_open_mapping_exprs()); + return Status::OK(); + } + + Status _build_table_filters_from_conjuncts(); + Status _open_local_filter_exprs(const FileScanRequest& file_request); + + virtual Status customize_file_scan_request(FileScanRequest* file_request) { + return _append_delete_predicate(file_request); + } + + static size_t _next_block_position(const FileScanRequest& request) { + size_t next_position = 0; + for (const auto& [_, block_position] : request.column_positions) { + next_position = std::max(next_position, block_position + 1); + } + return next_position; + } + + void _append_file_scan_column(FileScanRequest* request, ColumnId column_id, + std::vector* scan_columns) { + DORIS_CHECK(request != nullptr); + DORIS_CHECK(scan_columns != nullptr); + if (scan_columns == &request->non_predicate_columns && + 
std::find(request->predicate_columns.begin(), request->predicate_columns.end(), + column_id) != request->predicate_columns.end()) { + return; + } + const bool newly_added = request->column_positions.count(column_id) == 0; + if (newly_added) { + request->column_positions.emplace(column_id, _next_block_position(*request)); + scan_columns->push_back(column_id); + } else if (std::find(scan_columns->begin(), scan_columns->end(), column_id) == + scan_columns->end()) { + scan_columns->push_back(column_id); + } + if (scan_columns == &request->predicate_columns) { + request->non_predicate_columns.erase( + std::remove(request->non_predicate_columns.begin(), + request->non_predicate_columns.end(), column_id), + request->non_predicate_columns.end()); + } + if (column_id == doris::parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID && + _find_schema_field(_data_reader.file_schema, column_id) == nullptr) { + _data_reader.file_schema.push_back( + doris::parquet::ParquetColumnReaderFactory::row_position_schema_field()); + } + } + + // Append DeletePredicate to file scan request if there are deletes. The predicate will be evaluated in file reader level and filter out deleted rows before returning data to table reader. 
+ Status _append_delete_predicate(FileScanRequest* request) { + DORIS_CHECK(request != nullptr); + if (_delete_rows == nullptr || _delete_rows->empty()) { + return Status::OK(); + } + const auto row_position_column_id = + parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID; + _append_file_scan_column(request, row_position_column_id, &request->predicate_columns); + + auto delete_predicate = std::make_shared(*_delete_rows); + const auto block_position = request->column_positions.at(row_position_column_id); + delete_predicate->add_child(TableSlotRef::create_shared( + cast_set(block_position), cast_set(block_position), -1, + std::make_shared(), + parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_NAME)); + + request->delete_conjuncts.push_back( + VExprContext::create_shared(std::move(delete_predicate))); + return Status::OK(); + } + + // Close the current concrete reader and drop its per-reader state (column mapping, table filters, schemas, block template) together with the current task. + // This hook is called from get_block, the aggregate pushdown path and close; implementations should stay idempotent. + virtual Status close_current_reader() { + // No reader open: nothing to release, so a repeated call is a no-op instead of a null dereference. + if (!_data_reader.reader) { + return Status::OK(); + } + RETURN_IF_ERROR(_data_reader.reader->close()); + _data_reader.reader.reset(); + _data_reader.column_mapper.clear(); + _table_filters.clear(); + _data_reader.file_schema.clear(); + _data_reader.block_schema.clear(); + _data_reader.block_template.clear(); + _current_task.reset(); + return Status::OK(); + } + + // Finalize file-local block to table/global schema block. + virtual Status finalize_chunk(Block* block, const size_t rows) { + size_t idx = 0; + for (const auto& mapping : _data_reader.column_mapper.mappings()) { + ColumnPtr column; + RETURN_IF_ERROR(_materialize_mapping_column(mapping, &_data_reader.block_template, rows, + &column)); + block->replace_by_position(idx, std::move(column)); + idx++; + } + RETURN_IF_ERROR(materialize_virtual_columns(block)); + return Status::OK(); + } + + // Materialize virtual columns in table block, such as _row_id and _last_updated_sequence_number in Iceberg. This is called after finalize_chunk, so the virtual column can be referenced in finalize_expr.
+ virtual Status materialize_virtual_columns(Block* table_block) { return Status::OK(); } + + Status _try_materialize_aggregate_pushdown_rows(Block* block, bool* pushed_down) { + DORIS_CHECK(block != nullptr); + DORIS_CHECK(pushed_down != nullptr); + *pushed_down = false; + block->clear_column_data(_projected_columns.size()); + _aggregate_pushdown_tried = true; + if (!_supports_aggregate_pushdown(_push_down_agg_type)) { + return Status::OK(); + } + + FileAggregateRequest file_request; + _build_file_aggregate_request(_push_down_agg_type, &file_request); + FileAggregateResult file_result; + const auto status = _data_reader.reader->get_aggregate_result(file_request, &file_result); + if (status.is()) { + return Status::OK(); + } + RETURN_IF_ERROR(status); + RETURN_IF_ERROR( + _materialize_aggregate_pushdown_rows(_push_down_agg_type, file_result, block)); + *pushed_down = true; + RETURN_IF_ERROR(close_current_reader()); + return Status::OK(); + } + + virtual bool _supports_aggregate_pushdown(TPushAggOp::type agg_type) const { + // Only COUNT and MIN/MAX can be push down. + if (agg_type != TPushAggOp::type::COUNT && agg_type != TPushAggOp::type::MINMAX) { + return false; + } + // Only support aggregate pushdown when there is no delete, filter and column predicate, so + // the reduced rows consumed by the upper aggregate remain semantically equivalent to a + // normal scan. + if (_delete_rows != nullptr && !_delete_rows->empty()) { + return false; + } + if (!_table_filters.empty() || !_table_column_predicates.empty()) { + return false; + } + if (agg_type == TPushAggOp::type::COUNT) { + return true; + } + // For MIN/MAX, only support direct file-to-table column mappings. The two emitted rows + // must be enough for the upper MIN/MAX aggregate without evaluating projections, default + // expressions or virtual columns. 
+ for (const auto& mapping : _data_reader.column_mapper.mappings()) { + if (!mapping.file_column_id.has_value() || mapping.has_complex_projection || + mapping.virtual_column_type != TableVirtualColumnType::INVALID || + mapping.default_expr != nullptr || mapping.file_type == nullptr || + mapping.table_type == nullptr) { + return false; + } + } + return true; + } + + Status _materialize_mapping_column(const ColumnMapping& mapping, Block* current_block, + const size_t rows, ColumnPtr* column) { + if (mapping.projection != nullptr) { + int res_id; + RETURN_IF_ERROR(mapping.projection->execute(current_block, &res_id)); + *column = current_block->get_columns()[res_id]; + return Status::OK(); + } + if (mapping.default_expr != nullptr) { + if (current_block->rows() == rows) { + int res_id; + RETURN_IF_ERROR(mapping.default_expr->execute(current_block, &res_id)); + *column = current_block->get_columns()[res_id]; + } else { + DORIS_CHECK(mapping.is_constant); + Block eval_block; + eval_block.insert({mapping.table_type->create_column_const_with_default_value(rows), + mapping.table_type, "__table_reader_const_rows"}); + int res_id; + RETURN_IF_ERROR(mapping.default_expr->execute(&eval_block, &res_id)); + *column = eval_block.get_columns()[res_id]; + } + return Status::OK(); + } + *column = mapping.table_type->create_column_const_with_default_value(rows); + return Status::OK(); + } + + Status _open_mapping_exprs() { + RowDescriptor row_desc; + for (const auto& mapping : _data_reader.column_mapper.mappings()) { + if (mapping.projection != nullptr) { + RETURN_IF_ERROR(mapping.projection->prepare(_runtime_state, row_desc)); + RETURN_IF_ERROR(mapping.projection->open(_runtime_state)); + } + if (mapping.default_expr != nullptr) { + RETURN_IF_ERROR(mapping.default_expr->prepare(_runtime_state, row_desc)); + RETURN_IF_ERROR(mapping.default_expr->open(_runtime_state)); + } + } + return Status::OK(); + } + + void _build_file_aggregate_request(TPushAggOp::type agg_type, + 
FileAggregateRequest* request) const { + DORIS_CHECK(request != nullptr); + DORIS_CHECK(_supports_aggregate_pushdown(agg_type)); + request->agg_type = agg_type; + request->columns.clear(); + if (agg_type == TPushAggOp::type::COUNT) { + return; + } + request->columns.reserve(_data_reader.column_mapper.mappings().size()); + for (const auto& mapping : _data_reader.column_mapper.mappings()) { + DORIS_CHECK(mapping.file_column_id.has_value()); + request->columns.push_back({*mapping.file_column_id}); + } + } + + Status _materialize_aggregate_pushdown_rows(TPushAggOp::type agg_type, + const FileAggregateResult& file_result, + Block* block) { + if (agg_type == TPushAggOp::type::COUNT) { + // COUNT pushdown is not a final count value. It emits `count` default rows so the + // upper COUNT(*) aggregate can count them and produce the final result, including + // zero rows when count is 0. + for (size_t column_idx = 0; column_idx < block->columns(); ++column_idx) { + block->replace_by_position(column_idx, + block->get_by_position(column_idx) + .type->create_column_const_with_default_value( + cast_set(file_result.count))); + } + return Status::OK(); + } + // MIN/MAX pushdown emits two rows, min first and max second, for each projected column. + // The upper MIN/MAX aggregate consumes those two rows to produce the final aggregate value. 
+ DORIS_CHECK(file_result.columns.size() == _data_reader.column_mapper.mappings().size()); + DORIS_CHECK(block->columns() == _data_reader.column_mapper.mappings().size()); + Block file_block; + file_block.reserve(_data_reader.block_schema.size()); + for (const auto& field : _data_reader.block_schema) { + file_block.insert({field.type->create_column(), field.type, field.name}); + } + for (size_t column_idx = 0; column_idx < file_result.columns.size(); ++column_idx) { + const auto& result_column = file_result.columns[column_idx]; + if (!result_column.has_min || !result_column.has_max) { + return Status::NotSupported("Missing min/max aggregate result for column {}", + _projected_columns[column_idx].name); + } + const auto& mapping = _data_reader.column_mapper.mappings()[column_idx]; + DORIS_CHECK(mapping.file_column_id.has_value()); + bool found_file_column = false; + for (size_t block_position = 0; block_position < _data_reader.block_schema.size(); + ++block_position) { + if (_data_reader.block_schema[block_position].id == *mapping.file_column_id) { + found_file_column = true; + auto column = + file_block.get_by_position(block_position).column->assume_mutable(); + if (column->empty()) { + column->insert(result_column.min_value); + column->insert(result_column.max_value); + file_block.replace_by_position(block_position, std::move(column)); + } + break; + } + } + DORIS_CHECK(found_file_column); + } + for (size_t column_idx = 0; column_idx < _data_reader.column_mapper.mappings().size(); + ++column_idx) { + ColumnPtr table_column; + RETURN_IF_ERROR( + _materialize_mapping_column(_data_reader.column_mapper.mappings()[column_idx], + &file_block, 2, &table_column)); + block->replace_by_position(column_idx, std::move(table_column)); + } + return Status::OK(); + } + + struct DataReader { + std::unique_ptr reader; + TableColumnMapper column_mapper; + std::vector + file_schema; // Schema of the data file, also including virtual column (row position). 
+ std::vector + block_schema; // Schema of the block returned by file reader, determined by column mapping and file schema. It is used for file reader to materialize columns into correct type and position. + Block block_template; + }; + DataReader _data_reader; + std::vector _projected_columns; + std::unique_ptr _current_task; + std::shared_ptr _system_properties; + // partition key -> value + std::map _partition_values; + // Predicates built from scan conjuncts before file-level localization. + std::vector _table_filters; + TableColumnPredicates _table_column_predicates; + VExprContext _conjuncts {nullptr}; + std::unique_ptr _profile; + // Parsed from row-position based delete files, including position delete and deletion vector. + DeleteRows* _delete_rows = nullptr; + TFileScanRangeParams* _scan_params; + std::shared_ptr _io_ctx; + RuntimeState* _runtime_state; + RuntimeProfile* _scanner_profile; + FileFormat _format; + TPushAggOp::type _push_down_agg_type = TPushAggOp::type::NONE; + bool _aggregate_pushdown_tried = false; + +private: + static const SchemaField* _find_schema_field(const std::vector& schema, + ColumnId column_id) { + for (const auto& field : schema) { + if (field.id == column_id) { + return &field; + } + } + return nullptr; + } + + static Status _project_schema_field(const SchemaField& field, const FieldProjection& projection, + SchemaField* projected_field) { + if (projected_field == nullptr) { + return Status::InvalidArgument("projected_field is null"); + } + *projected_field = field; + if (projection.project_all_children || projection.children.empty()) { + return Status::OK(); + } + projected_field->children.clear(); + for (const auto& child_projection : projection.children) { + if (child_projection.file_path.empty()) { + return Status::InvalidArgument("Empty projection path for field {}", field.name); + } + const int32_t child_idx = child_projection.file_path.back(); + if (child_idx < 0 || child_idx >= static_cast(field.children.size())) { + 
return Status::InvalidArgument("Invalid projection child index {} for field {}", + child_idx, field.name); + } + if (child_projection.file_path != field.children[child_idx].file_path) { + return Status::InvalidArgument("Invalid projection path for field {}", + field.children[child_idx].name); + } + SchemaField projected_child; + RETURN_IF_ERROR(_project_schema_field(field.children[child_idx], child_projection, + &projected_child)); + projected_field->children.push_back(std::move(projected_child)); + } + if (projected_field->children.empty()) { + return Status::NotSupported("Projection for field {} contains no children", field.name); + } + RETURN_IF_ERROR(_rebuild_projected_type(field.type, projected_field)); + return Status::OK(); + } + + static Status _rebuild_projected_type(const DataTypePtr& original_type, + SchemaField* projected_field) { + if (original_type == nullptr) { + return Status::InvalidArgument("Cannot rebuild projected type for field {}", + projected_field->name); + } + DataTypes child_types; + Strings child_names; + child_types.reserve(projected_field->children.size()); + child_names.reserve(projected_field->children.size()); + for (const auto& child : projected_field->children) { + child_types.push_back(child.type); + child_names.push_back(child.name); + } + + const auto primitive_type = remove_nullable(original_type)->get_primitive_type(); + DataTypePtr projected_type; + switch (primitive_type) { + case TYPE_STRUCT: + projected_type = std::make_shared(child_types, child_names); + break; + case TYPE_ARRAY: + DORIS_CHECK(child_types.size() == 1); + projected_type = std::make_shared(child_types[0]); + break; + case TYPE_MAP: + DORIS_CHECK(child_types.size() == 1); + DORIS_CHECK(remove_nullable(child_types[0])->get_primitive_type() == TYPE_STRUCT); + { + const auto* entry_type = + assert_cast(remove_nullable(child_types[0]).get()); + DORIS_CHECK(entry_type->get_elements().size() == 2); + projected_type = std::make_shared(entry_type->get_element(0), + 
entry_type->get_element(1)); + } + break; + default: + return Status::InvalidArgument("Cannot project children from non-complex field {}", + projected_field->name); + } + projected_field->type = + original_type->is_nullable() ? make_nullable(projected_type) : projected_type; + return Status::OK(); + } + + // Parse row-position deletes from table format specific parameters, and fill in _delete_rows. + Status _parse_delete_predicates(const SplitReadOptions& options); +}; + +} // namespace doris::reader diff --git a/be/src/format/table/deletion_vector_reader.cpp b/be/src/format/table/deletion_vector_reader.cpp index bfe34a5f555f94..d7e33c923d95b7 100644 --- a/be/src/format/table/deletion_vector_reader.cpp +++ b/be/src/format/table/deletion_vector_reader.cpp @@ -54,9 +54,9 @@ Status DeletionVectorReader::_create_file_reader() { return Status::EndOfFile("stop read."); } - _file_description.mtime = _range.__isset.modification_time ? _range.modification_time : 0; + _file_description.mtime = _desc.modification_time; io::FileReaderOptions reader_options = - FileFactory::get_reader_options(_state, _file_description); + FileFactory::get_reader_options(_state->query_options(), _file_description); _file_reader = DORIS_TRY(io::DelegateReader::create_file_reader( _profile, _system_properties, _file_description, reader_options, io::DelegateReader::AccessMode::RANDOM, _io_ctx)); @@ -64,20 +64,13 @@ Status DeletionVectorReader::_create_file_reader() { } void DeletionVectorReader::_init_file_description() { - _file_description.path = _range.path; - _file_description.file_size = _range.__isset.file_size ? 
_range.file_size : -1; - if (_range.__isset.fs_name) { - _file_description.fs_name = _range.fs_name; - } + _file_description.path = _desc.path; + _file_description.file_size = _desc.file_size; + _file_description.fs_name = _desc.fs_name; } void DeletionVectorReader::_init_system_properties() { - if (_range.__isset.file_type) { - // for compatibility - _system_properties.system_type = _range.file_type; - } else { - _system_properties.system_type = _params.file_type; - } + _system_properties.system_type = _params.file_type; _system_properties.properties = _params.properties; _system_properties.hdfs_params = _params.hdfs_params; if (_params.__isset.broker_addresses) { diff --git a/be/src/format/table/deletion_vector_reader.h b/be/src/format/table/deletion_vector_reader.h index 0663f3b28490ef..968344a8496bc7 100644 --- a/be/src/format/table/deletion_vector_reader.h +++ b/be/src/format/table/deletion_vector_reader.h @@ -36,6 +36,22 @@ struct IOContext; } // namespace io namespace doris { +struct DeleteFileDesc { + enum class Format { + PAIMON, + ICEBERG, + }; + + std::string key = ""; + std::string path = ""; + std::string fs_name = ""; + int64_t start_offset = 0; + int64_t size = 0; + int64_t file_size = -1; + int64_t modification_time = 0; + Format format = Format::PAIMON; +}; + class DeletionVectorReader { ENABLE_FACTORY_CREATOR(DeletionVectorReader); @@ -43,7 +59,22 @@ class DeletionVectorReader { DeletionVectorReader(RuntimeState* state, RuntimeProfile* profile, const TFileScanRangeParams& params, const TFileRangeDesc& range, io::IOContext* io_ctx) - : _state(state), _profile(profile), _range(range), _params(params), _io_ctx(io_ctx) {} + : _state(state), _profile(profile), _params(params), _io_ctx(io_ctx) { + _desc = DeleteFileDesc { + .key = "", + .path = range.path, + .fs_name = range.__isset.fs_name ? range.fs_name : "", + .start_offset = range.start_offset, + .size = range.size, + .file_size = range.__isset.file_size ? 
range.file_size : -1, + .modification_time = range.__isset.modification_time ? range.modification_time : 0}; + } + DeletionVectorReader(RuntimeState* state, RuntimeProfile* profile, + const TFileScanRangeParams& params, const DeleteFileDesc& desc, + io::IOContext* io_ctx) + : _state(state), _profile(profile), _params(params), _io_ctx(io_ctx) { + _desc = desc; + } ~DeletionVectorReader() = default; Status open(); Status read_at(size_t offset, Slice result); @@ -56,7 +87,7 @@ class DeletionVectorReader { private: RuntimeState* _state = nullptr; RuntimeProfile* _profile = nullptr; - const TFileRangeDesc& _range; + DeleteFileDesc _desc; const TFileScanRangeParams& _params; io::IOContext* _io_ctx = nullptr; diff --git a/be/src/format/table/iceberg_reader_mixin.h b/be/src/format/table/iceberg_reader_mixin.h index 42c80c9b7d4ddc..c9c84639b8faf0 100644 --- a/be/src/format/table/iceberg_reader_mixin.h +++ b/be/src/format/table/iceberg_reader_mixin.h @@ -341,9 +341,6 @@ class IcebergReaderMixin : public BaseReader, public TableSchemaChangeHelper { // id -> block column name std::unordered_map _id_to_block_column_name; - // File column names used during init - std::vector _file_col_names; - std::function()> _create_topn_row_id_column_iterator; diff --git a/be/src/format/table/iceberg_reader_v2.cpp b/be/src/format/table/iceberg_reader_v2.cpp new file mode 100644 index 00000000000000..f9587361dcf89d --- /dev/null +++ b/be/src/format/table/iceberg_reader_v2.cpp @@ -0,0 +1,547 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/table/iceberg_reader_v2.h" + +#include +#include +#include +#include + +#include "common/cast_set.h" +#include "core/assert_cast.h" +#include "core/block/block.h" +#include "core/column/column_const.h" +#include "core/column/column_nullable.h" +#include "core/column/column_string.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_number.h" +#include "core/data_type/define_primitive_type.h" +#include "core/field.h" +#include "format/new_parquet/column_reader.h" +#include "format/new_parquet/parquet_reader.h" +#include "format/reader/expr/cast.h" +#include "format/reader/expr/equality_delete_predicate.h" +#include "format/reader/expr/slot_ref.h" +#include "format/reader/table_reader.h" +#include "format/table/deletion_vector_reader.h" +#include "io/file_factory.h" + +namespace doris::iceberg { + +IcebergTableReader::PositionDeleteRowsCollector::PositionDeleteRowsCollector( + std::string data_file_path, reader::DeleteRows* rows) + : _data_file_path(std::move(data_file_path)), _rows(rows) {} + +Status IcebergTableReader::PositionDeleteRowsCollector::collect(const Block& block, + size_t read_rows) { + if (read_rows == 0) { + return Status::OK(); + } + const auto& file_path_column = assert_cast( + *block.get_by_position(ICEBERG_FILE_PATH_BLOCK_POSITION).column); + const auto& pos_column = assert_cast( + *block.get_by_position(ICEBERG_ROW_POS_BLOCK_POSITION).column); + for (size_t row = 0; row < read_rows; ++row) { + const auto file_path = file_path_column.get_data_at(row).to_string(); + if (file_path == 
_data_file_path) { + _rows->push_back(pos_column.get_element(row)); + } + } + return Status::OK(); +} + +Status IcebergTableReader::prepare_split(const reader::SplitReadOptions& options) { + _row_lineage_columns = {}; + _iceberg_params = nullptr; + _delete_predicates_initialized = false; + _position_delete_rows_storage.clear(); + _equality_delete_filters.clear(); + if (options.current_range.__isset.table_format_params && + options.current_range.table_format_params.__isset.iceberg_params) { + const auto& iceberg_params = options.current_range.table_format_params.iceberg_params; + _iceberg_params = &iceberg_params; + if (iceberg_params.__isset.first_row_id) { + _row_lineage_columns.first_row_id = iceberg_params.first_row_id; + } + if (iceberg_params.__isset.last_updated_sequence_number) { + _row_lineage_columns.last_updated_sequence_number = + iceberg_params.last_updated_sequence_number; + } + } + RETURN_IF_ERROR(TableReader::prepare_split(options)); + return _init_delete_predicates(options.current_range.table_format_params); +} + +Status IcebergTableReader::materialize_virtual_columns(Block* table_block) { + for (size_t column_idx = 0; column_idx < _data_reader.column_mapper.mappings().size(); + ++column_idx) { + const auto& mapping = _data_reader.column_mapper.mappings()[column_idx]; + switch (mapping.virtual_column_type) { + case reader::TableVirtualColumnType::ROW_ID: + RETURN_IF_ERROR(_materialize_row_lineage_row_id(table_block, column_idx)); + break; + case reader::TableVirtualColumnType::LAST_UPDATED_SEQUENCE_NUMBER: + RETURN_IF_ERROR( + _materialize_row_lineage_last_updated_sequence_number(table_block, column_idx)); + break; + case reader::TableVirtualColumnType::INVALID: + break; + } + } + return Status::OK(); +} + +Status IcebergTableReader::customize_file_scan_request(reader::FileScanRequest* file_request) { + RETURN_IF_ERROR(TableReader::customize_file_scan_request(file_request)); + if (_row_lineage_columns.first_row_id >= 0 && _need_row_lineage_row_id()) 
{ + RETURN_IF_ERROR(_append_row_position_output_column(file_request)); + } + RETURN_IF_ERROR(_append_equality_delete_predicates(file_request)); + return Status::OK(); +} + +bool IcebergTableReader::_supports_aggregate_pushdown(TPushAggOp::type agg_type) const { + if (!TableReader::_supports_aggregate_pushdown(agg_type)) { + return false; + } + return _equality_delete_filters.empty(); +} + +Status IcebergTableReader::_parse_deletion_vector_file(const TTableFormatFileDesc& t_desc, + DeleteFileDesc* desc, + bool* has_delete_file) { + DORIS_CHECK(desc != nullptr); + DORIS_CHECK(has_delete_file != nullptr); + *has_delete_file = false; + if (!t_desc.__isset.iceberg_params) { + return Status::OK(); + } + const auto& iceberg_params = t_desc.iceberg_params; + if (!iceberg_params.__isset.format_version || + iceberg_params.format_version < MIN_SUPPORT_DELETE_FILES_VERSION || + !iceberg_params.__isset.delete_files || iceberg_params.delete_files.empty()) { + return Status::OK(); + } + + const TIcebergDeleteFileDesc* deletion_vector = nullptr; + for (const auto& delete_file : iceberg_params.delete_files) { + if (!delete_file.__isset.content || delete_file.content != DELETION_VECTOR) { + continue; + } + if (deletion_vector != nullptr) { + return Status::DataQualityError("This iceberg data file has multiple DVs."); + } + deletion_vector = &delete_file; + } + if (deletion_vector == nullptr) { + return Status::OK(); + } + if (!deletion_vector->__isset.content_offset || + !deletion_vector->__isset.content_size_in_bytes) { + return Status::InternalError("Deletion vector is missing content offset or length"); + } + + desc->key = _iceberg_delete_vector_cache_key(*deletion_vector); + desc->path = deletion_vector->path; + desc->start_offset = deletion_vector->content_offset; + desc->size = deletion_vector->content_size_in_bytes; + desc->file_size = -1; + desc->format = DeleteFileDesc::Format::ICEBERG; + *has_delete_file = true; + return Status::OK(); +} + +Status 
IcebergTableReader::_init_delete_predicates(const TTableFormatFileDesc& t_desc) { + if (!t_desc.__isset.iceberg_params || _delete_predicates_initialized) { + _delete_predicates_initialized = true; + return Status::OK(); + } + const auto& iceberg_params = t_desc.iceberg_params; + if (!iceberg_params.__isset.format_version || + iceberg_params.format_version < MIN_SUPPORT_DELETE_FILES_VERSION || + !iceberg_params.__isset.delete_files || iceberg_params.delete_files.empty()) { + _delete_predicates_initialized = true; + return Status::OK(); + } + + std::vector position_delete_files; + std::vector equality_delete_files; + for (const auto& delete_file : iceberg_params.delete_files) { + if (!delete_file.__isset.content) { + continue; + } + if (delete_file.content == POSITION_DELETE) { + position_delete_files.push_back(delete_file); + } else if (delete_file.content == EQUALITY_DELETE) { + equality_delete_files.push_back(delete_file); + } + } + // `_delete_rows != nullptr` means DeleteVector is parsed + if (_delete_rows != nullptr) { + _position_delete_rows_storage = *_delete_rows; + _delete_rows = &_position_delete_rows_storage; + } + // Combine position delete rows from both deletion vector and position delete files, and + // initialize equality delete predicates. Position delete files contain row positions of + // deleted rows, which can be directly added to `_delete_rows`. Equality delete files contain + // values of deleted rows, which require reading the files and building predicates for later + // filtering. 
+ if (!position_delete_files.empty()) { + RETURN_IF_ERROR(_init_position_delete_rows(position_delete_files)); + } + if (!equality_delete_files.empty()) { + RETURN_IF_ERROR(_init_equality_delete_predicates(equality_delete_files)); + } + + _delete_predicates_initialized = true; + return Status::OK(); +} + +std::string IcebergTableReader::_iceberg_delete_vector_cache_key( + const TIcebergDeleteFileDesc& delete_file) { + const std::string key_prefix = "iceberg_dv:"; + std::string key; + key.resize(key_prefix.size() + delete_file.path.size() + sizeof(delete_file.content_offset) + + sizeof(delete_file.content_size_in_bytes)); + char* data = key.data(); + memcpy(data, key_prefix.data(), key_prefix.size()); + data += key_prefix.size(); + memcpy(data, delete_file.path.data(), delete_file.path.size()); + data += delete_file.path.size(); + memcpy(data, &delete_file.content_offset, sizeof(delete_file.content_offset)); + data += sizeof(delete_file.content_offset); + memcpy(data, &delete_file.content_size_in_bytes, sizeof(delete_file.content_size_in_bytes)); + return key; +} + +std::shared_ptr IcebergTableReader::_delete_file_system_properties( + const TFileScanRangeParams& scan_params) { + auto system_properties = std::make_shared(); + system_properties->system_type = + scan_params.__isset.file_type ? scan_params.file_type : TFileType::FILE_LOCAL; + system_properties->properties = scan_params.properties; + system_properties->hdfs_params = scan_params.hdfs_params; + if (scan_params.__isset.broker_addresses) { + system_properties->broker_addresses.assign(scan_params.broker_addresses.begin(), + scan_params.broker_addresses.end()); + } + return system_properties; +} + +std::unique_ptr IcebergTableReader::_delete_file_description( + const TFileRangeDesc& range) { + auto file_description = std::make_unique(); + file_description->path = range.path; + file_description->file_size = range.__isset.file_size ? 
range.file_size : -1; + file_description->range_start_offset = range.__isset.start_offset ? range.start_offset : 0; + file_description->range_size = range.__isset.size ? range.size : -1; + if (range.__isset.fs_name) { + file_description->fs_name = range.fs_name; + } + return file_description; +} + +std::string IcebergTableReader::_data_file_path() const { + if (_iceberg_params != nullptr && _iceberg_params->__isset.original_file_path) { + return _iceberg_params->original_file_path; + } + DORIS_CHECK(_current_task != nullptr); + DORIS_CHECK(_current_task->data_file != nullptr); + return _current_task->data_file->path; +} + +Status IcebergTableReader::_append_row_position_output_column(reader::FileScanRequest* request) { + const auto row_position_column_id = + doris::parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID; + _append_file_scan_column(request, row_position_column_id, &request->non_predicate_columns); + _row_position_block_position = request->column_positions.at(row_position_column_id); + return Status::OK(); +} + +Status IcebergTableReader::_append_equality_delete_predicates(reader::FileScanRequest* request) { + DORIS_CHECK(request != nullptr); + for (const auto& filter : _equality_delete_filters) { + auto delete_predicate = + std::make_shared(filter.delete_block, filter.field_ids); + DCHECK_EQ(filter.field_ids.size(), filter.key_types.size()); + for (size_t idx = 0; idx < filter.field_ids.size(); ++idx) { + const int field_id = filter.field_ids[idx]; + auto field_it = + std::find_if(_data_reader.file_schema.begin(), _data_reader.file_schema.end(), + [field_id](const reader::SchemaField& field) { + return !field.field_id_path.empty() && + field.field_id_path.back() == field_id; + }); + if (field_it == _data_reader.file_schema.end()) { + return Status::InternalError( + "Can not find equality delete column field id {} in data file schema", + field_id); + } + _append_file_scan_column(request, field_it->id, &request->predicate_columns); + const auto 
block_position = request->column_positions.at(field_it->id); + auto slot = TableSlotRef::create_shared(cast_set(block_position), + cast_set(block_position), -1, + field_it->type, field_it->name); + if (field_it->type->equals(*filter.key_types[idx])) { + delete_predicate->add_child(std::move(slot)); + } else { + auto cast_expr = Cast::create_shared(filter.key_types[idx]); + cast_expr->add_child(std::move(slot)); + delete_predicate->add_child(std::move(cast_expr)); + } + } + request->delete_conjuncts.push_back( + VExprContext::create_shared(std::move(delete_predicate))); + } + return Status::OK(); +} + +Status IcebergTableReader::_read_parquet_position_delete_file( + const TIcebergDeleteFileDesc& delete_file, const TFileScanRangeParams& scan_params, + IcebergDeleteFileIOContext* delete_io_ctx, PositionDeleteRowsCollector* collector) { + if (!delete_file.__isset.file_format) { + return Status::InternalError("Iceberg position delete file is missing file format"); + } + if (delete_file.file_format == TFileFormatType::FORMAT_ORC) { + return Status::NotSupported("Iceberg ORC position delete file is not supported"); + } + if (delete_file.file_format != TFileFormatType::FORMAT_PARQUET) { + return Status::NotSupported("Unsupported Iceberg delete file format {}", + delete_file.file_format); + } + + auto delete_range = build_iceberg_delete_file_range(delete_file.path); + if (_current_task != nullptr && _current_task->data_file != nullptr && + !_current_task->data_file->fs_name.empty()) { + delete_range.__set_fs_name(_current_task->data_file->fs_name); + } + auto system_properties = _delete_file_system_properties(scan_params); + auto file_description = _delete_file_description(delete_range); + std::shared_ptr io_ctx(&delete_io_ctx->io_ctx, [](io::IOContext*) {}); + parquet::ParquetReader reader(system_properties, file_description, io_ctx, _scanner_profile); + RETURN_IF_ERROR(reader.init(_runtime_state)); + + std::vector schema; + RETURN_IF_ERROR(reader.get_schema(&schema)); + 
reader::SchemaField* file_path_field = nullptr; + reader::SchemaField* pos_field = nullptr; + for (auto& field : schema) { + if (field.name == ICEBERG_FILE_PATH) { + file_path_field = &field; + } else if (field.name == ICEBERG_ROW_POS) { + pos_field = &field; + } + } + if (file_path_field == nullptr || pos_field == nullptr) { + return Status::InternalError("Position delete parquet file is missing required columns"); + } + + auto request = std::make_unique(); + request->non_predicate_columns = {file_path_field->id, pos_field->id}; + request->column_positions = { + {file_path_field->id, ICEBERG_FILE_PATH_BLOCK_POSITION}, + {pos_field->id, ICEBERG_ROW_POS_BLOCK_POSITION}, + }; + RETURN_IF_ERROR(reader.open(request)); + + bool eof = false; + auto build_position_delete_block = [](const reader::SchemaField& file_path_field, + const reader::SchemaField& pos_field) -> Block { + Block block; + block.insert( + {file_path_field.type->create_column(), file_path_field.type, ICEBERG_FILE_PATH}); + block.insert({pos_field.type->create_column(), pos_field.type, ICEBERG_ROW_POS}); + return block; + }; + while (!eof) { + Block block = build_position_delete_block(*file_path_field, *pos_field); + size_t read_rows = 0; + RETURN_IF_ERROR(reader.get_block(&block, &read_rows, &eof)); + RETURN_IF_ERROR(collector->collect(block, read_rows)); + } + return reader.close(); +} + +Status IcebergTableReader::_init_position_delete_rows( + const std::vector& delete_files) { + TFileScanRangeParams delete_scan_params = + _scan_params == nullptr ? 
TFileScanRangeParams() : *_scan_params; + reader::DeleteRows position_delete_rows; + IcebergDeleteFileIOContext delete_io_ctx(_runtime_state); + PositionDeleteRowsCollector collector(_data_file_path(), &position_delete_rows); + for (const auto& delete_file : delete_files) { + RETURN_IF_ERROR(_read_parquet_position_delete_file(delete_file, delete_scan_params, + &delete_io_ctx, &collector)); + } + if (position_delete_rows.empty()) { + return Status::OK(); + } + // Position delete files and deletion vectors both become row-position deletes for the + // common TableReader DeletePredicate path. Keep the merged rows in a member vector because + // DeletePredicate stores a reference to the vector used by _delete_rows. + _position_delete_rows_storage.insert(_position_delete_rows_storage.end(), + position_delete_rows.begin(), position_delete_rows.end()); + std::sort(_position_delete_rows_storage.begin(), _position_delete_rows_storage.end()); + _position_delete_rows_storage.erase( + std::unique(_position_delete_rows_storage.begin(), _position_delete_rows_storage.end()), + _position_delete_rows_storage.end()); + _delete_rows = &_position_delete_rows_storage; + return Status::OK(); +} + +Status IcebergTableReader::_init_equality_delete_predicates( + const std::vector& delete_files) { + TFileScanRangeParams delete_scan_params = + _scan_params == nullptr ? 
TFileScanRangeParams() : *_scan_params; + IcebergDeleteFileIOContext delete_io_ctx(_runtime_state); + for (const auto& delete_file : delete_files) { + RETURN_IF_ERROR(_read_parquet_equality_delete_file(delete_file, delete_scan_params, + &delete_io_ctx)); + } + return Status::OK(); +} + +Status IcebergTableReader::_read_parquet_equality_delete_file( + const TIcebergDeleteFileDesc& delete_file, const TFileScanRangeParams& scan_params, + IcebergDeleteFileIOContext* delete_io_ctx) { + if (!delete_file.__isset.file_format) { + return Status::InternalError("Iceberg equality delete file is missing file format"); + } + if (delete_file.file_format != TFileFormatType::FORMAT_PARQUET) { + return Status::NotSupported("Unsupported Iceberg equality delete file format {}", + delete_file.file_format); + } + if (!delete_file.__isset.field_ids || delete_file.field_ids.empty()) { + return Status::InternalError("Iceberg equality delete file is missing field ids"); + } + + auto delete_range = build_iceberg_delete_file_range(delete_file.path); + if (_current_task != nullptr && _current_task->data_file != nullptr && + !_current_task->data_file->fs_name.empty()) { + delete_range.__set_fs_name(_current_task->data_file->fs_name); + } + auto system_properties = _delete_file_system_properties(scan_params); + auto file_description = _delete_file_description(delete_range); + std::shared_ptr io_ctx(&delete_io_ctx->io_ctx, [](io::IOContext*) {}); + parquet::ParquetReader reader(system_properties, file_description, io_ctx, _scanner_profile); + RETURN_IF_ERROR(reader.init(_runtime_state)); + + std::vector schema; + RETURN_IF_ERROR(reader.get_schema(&schema)); + std::vector delete_fields; + std::vector delete_field_ids; + std::vector delete_key_types; + for (const auto field_id : delete_file.field_ids) { + auto field_it = std::find_if( + schema.begin(), schema.end(), [field_id](const reader::SchemaField& field) { + return !field.field_id_path.empty() && field.field_id_path.back() == field_id; + }); 
+ if (field_it == schema.end()) { + return Status::InternalError("Can not find field id {} in equality delete file {}", + field_id, delete_file.path); + } + if (!field_it->children.empty()) { + return Status::NotSupported( + "Iceberg equality delete does not support complex column {}", field_it->name); + } + delete_fields.push_back(*field_it); + delete_field_ids.push_back(field_id); + delete_key_types.push_back(field_it->type); + } + + auto request = std::make_unique(); + for (size_t idx = 0; idx < delete_fields.size(); ++idx) { + request->non_predicate_columns.push_back(delete_fields[idx].id); + request->column_positions.emplace(delete_fields[idx].id, idx); + } + RETURN_IF_ERROR(reader.open(request)); + + auto build_equality_delete_block = [](const std::vector fields) -> Block { + Block block; + for (const auto& field : fields) { + block.insert({field.type->create_column(), field.type, field.name}); + } + return block; + }; + Block delete_block = build_equality_delete_block(delete_fields); + bool eof = false; + while (!eof) { + Block block = build_equality_delete_block(delete_fields); + size_t read_rows = 0; + RETURN_IF_ERROR(reader.get_block(&block, &read_rows, &eof)); + if (read_rows > 0) { + MutableBlock mutable_block(&delete_block); + RETURN_IF_ERROR(mutable_block.merge(block)); + } + } + RETURN_IF_ERROR(reader.close()); + _equality_delete_filters.push_back( + EqualityDeleteFilter {.field_ids = std::move(delete_field_ids), + .key_types = std::move(delete_key_types), + .delete_block = std::move(delete_block)}); + return Status::OK(); +} + +Status IcebergTableReader::_materialize_row_lineage_row_id(Block* table_block, size_t column_idx) { + if (_row_lineage_columns.first_row_id < 0) { + return Status::OK(); + } + DORIS_CHECK(_row_position_block_position < _data_reader.block_template.columns()); + const auto& row_position_column = assert_cast( + *_data_reader.block_template.get_by_position(_row_position_block_position).column); + 
DORIS_CHECK(row_position_column.size() == table_block->rows()); + auto column = table_block->get_by_position(column_idx) + .column->convert_to_full_column_if_const() + ->assume_mutable(); + auto* nullable_column = assert_cast(column.get()); + auto& null_map = nullable_column->get_null_map_data(); + auto& data = assert_cast(*nullable_column->get_nested_column_ptr()).get_data(); + null_map.resize(row_position_column.size()); + std::fill(null_map.begin(), null_map.end(), 0); + data.resize(row_position_column.size()); + for (size_t row = 0; row < row_position_column.size(); ++row) { + data[row] = _row_lineage_columns.first_row_id + row_position_column.get_element(row); + } + table_block->replace_by_position(column_idx, std::move(column)); + return Status::OK(); +} + +Status IcebergTableReader::_materialize_row_lineage_last_updated_sequence_number( + Block* table_block, size_t column_idx) { + if (_row_lineage_columns.last_updated_sequence_number < 0) { + return Status::OK(); + } + const auto rows = table_block->rows(); + auto data_column = table_block->get_by_position(column_idx).type->create_column(); + data_column->insert( + Field::create_field(_row_lineage_columns.last_updated_sequence_number)); + auto column = ColumnConst::create(std::move(data_column), rows); + table_block->replace_by_position(column_idx, std::move(column)); + return Status::OK(); +} + +bool IcebergTableReader::_need_row_lineage_row_id() const { + for (const auto& mapping : _data_reader.column_mapper.mappings()) { + if (mapping.virtual_column_type == reader::TableVirtualColumnType::ROW_ID) { + return true; + } + } + return false; +} + +} // namespace doris::iceberg diff --git a/be/src/format/table/iceberg_reader_v2.h b/be/src/format/table/iceberg_reader_v2.h new file mode 100644 index 00000000000000..a543ae0797dec4 --- /dev/null +++ b/be/src/format/table/iceberg_reader_v2.h @@ -0,0 +1,141 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "common/status.h" +#include "core/block/block.h" +#include "format/reader/file_reader.h" +#include "format/reader/table_reader.h" +#include "format/table/iceberg_delete_file_reader_helper.h" +#include "gen_cpp/PlanNodes_types.h" + +namespace doris { +class Block; +struct DeleteFileDesc; +namespace io { +struct FileDescription; +struct FileSystemProperties; +} // namespace io +} // namespace doris + +namespace doris::iceberg { + +// Iceberg table-level reader。 +// 该层继承 TableReader,复用多文件编排和动态分区裁剪等通用能力;同时组合 +// FileReader 完成 data file 物理读取,不继承具体文件格式 reader。 +class IcebergTableReader : public reader::TableReader { +public: + ~IcebergTableReader() override = default; + + Status prepare_split(const reader::SplitReadOptions& options) override; + +protected: + Status materialize_virtual_columns(Block* table_block) override; + + Status customize_file_scan_request(reader::FileScanRequest* file_request) override; + + bool _supports_aggregate_pushdown(TPushAggOp::type agg_type) const override; + + Status _parse_deletion_vector_file(const TTableFormatFileDesc& t_desc, DeleteFileDesc* desc, + bool* has_delete_file) override; + + Status _init_delete_predicates(const TTableFormatFileDesc& t_desc); + +private: + static constexpr int 
MIN_SUPPORT_DELETE_FILES_VERSION = 2; + static constexpr int POSITION_DELETE = 1; + static constexpr int EQUALITY_DELETE = 2; + static constexpr int DELETION_VECTOR = 3; + + struct RowLineageColumns { + int64_t first_row_id = -1; + int64_t last_updated_sequence_number = -1; + }; + + static constexpr const char* ICEBERG_FILE_PATH = "file_path"; + static constexpr const char* ICEBERG_ROW_POS = "pos"; + static constexpr size_t ICEBERG_FILE_PATH_BLOCK_POSITION = 0; + static constexpr size_t ICEBERG_ROW_POS_BLOCK_POSITION = 1; + + class PositionDeleteRowsCollector final { + public: + PositionDeleteRowsCollector(std::string data_file_path, reader::DeleteRows* rows); + + Status collect(const Block& block, size_t read_rows); + + private: + std::string _data_file_path; + reader::DeleteRows* _rows = nullptr; + }; + + static std::string _iceberg_delete_vector_cache_key(const TIcebergDeleteFileDesc& delete_file); + + static std::shared_ptr _delete_file_system_properties( + const TFileScanRangeParams& scan_params); + + static std::unique_ptr _delete_file_description( + const TFileRangeDesc& range); + + std::string _data_file_path() const; + + // Append row position column to file scan request for position delete handling. + Status _append_row_position_output_column(reader::FileScanRequest* request); + // Append equality delete predicates to file scan request based on the delete files in iceberg + // params. DeleteVector and position delete files use the common DeleteRows path in TableReader. + Status _append_equality_delete_predicates(reader::FileScanRequest* request); + + Status _init_equality_delete_predicates( + const std::vector& delete_files); + + // Read equality/position delete files. 
+ Status _read_parquet_equality_delete_file(const TIcebergDeleteFileDesc& delete_file, + const TFileScanRangeParams& scan_params, + IcebergDeleteFileIOContext* delete_io_ctx); + Status _read_parquet_position_delete_file(const TIcebergDeleteFileDesc& delete_file, + const TFileScanRangeParams& scan_params, + IcebergDeleteFileIOContext* delete_io_ctx, + PositionDeleteRowsCollector* collector); + + // Read position delete files and collect deleted row positions to update DeletePredicate. + Status _init_position_delete_rows(const std::vector& delete_files); + + // Materialize row lineage virtual columns based on the position delete file. + Status _materialize_row_lineage_row_id(Block* table_block, size_t column_idx); + Status _materialize_row_lineage_last_updated_sequence_number(Block* table_block, + size_t column_idx); + + RowLineageColumns _row_lineage_columns; + size_t _row_position_block_position = 0; + const TIcebergFileDesc* _iceberg_params = nullptr; + bool _delete_predicates_initialized = false; + reader::DeleteRows _position_delete_rows_storage; + struct EqualityDeleteFilter { + std::vector field_ids; + std::vector key_types; + Block delete_block; + }; + std::vector _equality_delete_filters; + + bool _need_row_lineage_row_id() const; +}; + +} // namespace doris::iceberg diff --git a/be/src/io/file_factory.cpp b/be/src/io/file_factory.cpp index 553cdc4460e15c..9610bc028595ec 100644 --- a/be/src/io/file_factory.cpp +++ b/be/src/io/file_factory.cpp @@ -57,21 +57,20 @@ namespace doris { constexpr std::string_view RANDOM_CACHE_BASE_PATH = "random"; -io::FileReaderOptions FileFactory::get_reader_options(RuntimeState* state, +io::FileReaderOptions FileFactory::get_reader_options(const TQueryOptions& option, const io::FileDescription& fd) { io::FileReaderOptions opts { .cache_base_path {}, .file_size = fd.file_size, .mtime = fd.mtime, }; - if (config::enable_file_cache && state != nullptr && - state->query_options().__isset.enable_file_cache && - 
state->query_options().enable_file_cache && fd.file_cache_admission) { + if (config::enable_file_cache && option.__isset.enable_file_cache && option.enable_file_cache && + fd.file_cache_admission) { opts.cache_type = io::FileCachePolicy::FILE_BLOCK_CACHE; } - if (state != nullptr && state->query_options().__isset.file_cache_base_path && - state->query_options().file_cache_base_path != RANDOM_CACHE_BASE_PATH) { - opts.cache_base_path = state->query_options().file_cache_base_path; + if (option.__isset.file_cache_base_path && + option.file_cache_base_path != RANDOM_CACHE_BASE_PATH) { + opts.cache_base_path = option.file_cache_base_path; } return opts; } diff --git a/be/src/io/file_factory.h b/be/src/io/file_factory.h index 7d662e4fdde469..33595313b921b1 100644 --- a/be/src/io/file_factory.h +++ b/be/src/io/file_factory.h @@ -16,6 +16,7 @@ // under the License. #pragma once +#include #include #include #include @@ -64,6 +65,8 @@ struct FileDescription { // -1 means unset. // If the file length is not set, the file length will be fetched from the file system. int64_t file_size = -1; + int64_t range_start_offset = 0; + int64_t range_size = -1; // modification time of this file. // 0 means unset. 
int64_t mtime = 0; @@ -83,7 +86,7 @@ class FileFactory { ENABLE_FACTORY_CREATOR(FileFactory); public: - static io::FileReaderOptions get_reader_options(RuntimeState* state, + static io::FileReaderOptions get_reader_options(const TQueryOptions& option, const io::FileDescription& fd); /// Create a temporary FileSystem for accessing file corresponding to `file_description` diff --git a/be/test/core/data_type_serde/data_type_serde_decoded_values_test.cpp b/be/test/core/data_type_serde/data_type_serde_decoded_values_test.cpp new file mode 100644 index 00000000000000..1622775b6a871a --- /dev/null +++ b/be/test/core/data_type_serde/data_type_serde_decoded_values_test.cpp @@ -0,0 +1,279 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include +#include +#include + +#include "core/assert_cast.h" +#include "core/column/column_decimal.h" +#include "core/column/column_nullable.h" +#include "core/column/column_string.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_date_or_datetime_v2.h" +#include "core/data_type/data_type_decimal.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_number.h" +#include "core/data_type/data_type_string.h" +#include "core/data_type_serde/decoded_column_view.h" +#include "core/string_ref.h" + +namespace doris { + +TEST(DataTypeSerDeDecodedValuesTest, ReadInt32Values) { + auto type = std::make_shared(); + auto column = type->create_column(); + const int32_t values[] = {10, -20, 30}; + + DecodedColumnView view; + view.value_kind = DecodedValueKind::INT32; + view.row_count = 3; + view.values = reinterpret_cast(values); + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + ASSERT_TRUE(st.ok()) << st; + + const auto& int_column = assert_cast(*column); + ASSERT_EQ(int_column.size(), 3); + EXPECT_EQ(int_column.get_element(0), 10); + EXPECT_EQ(int_column.get_element(1), -20); + EXPECT_EQ(int_column.get_element(2), 30); +} + +TEST(DataTypeSerDeDecodedValuesTest, ReadPrimitiveNumberValues) { + { + auto type = std::make_shared(); + auto column = type->create_column(); + const bool values[] = {true, false, true}; + + DecodedColumnView view; + view.value_kind = DecodedValueKind::BOOL; + view.row_count = 3; + view.values = reinterpret_cast(values); + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + ASSERT_TRUE(st.ok()) << st; + + const auto& bool_column = assert_cast(*column); + ASSERT_EQ(bool_column.size(), 3); + EXPECT_EQ(bool_column.get_element(0), 1); + EXPECT_EQ(bool_column.get_element(1), 0); + EXPECT_EQ(bool_column.get_element(2), 1); + } + { + auto type = std::make_shared(); + auto column = type->create_column(); + const int64_t values[] 
= {10000000000L, -9L, 42L}; + + DecodedColumnView view; + view.value_kind = DecodedValueKind::INT64; + view.row_count = 3; + view.values = reinterpret_cast(values); + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + ASSERT_TRUE(st.ok()) << st; + + const auto& int_column = assert_cast(*column); + ASSERT_EQ(int_column.size(), 3); + EXPECT_EQ(int_column.get_element(0), 10000000000L); + EXPECT_EQ(int_column.get_element(1), -9L); + EXPECT_EQ(int_column.get_element(2), 42L); + } + { + auto type = std::make_shared(); + auto column = type->create_column(); + const float values[] = {1.5F, -2.25F}; + + DecodedColumnView view; + view.value_kind = DecodedValueKind::FLOAT; + view.row_count = 2; + view.values = reinterpret_cast(values); + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + ASSERT_TRUE(st.ok()) << st; + + const auto& float_column = assert_cast(*column); + ASSERT_EQ(float_column.size(), 2); + EXPECT_FLOAT_EQ(float_column.get_element(0), 1.5F); + EXPECT_FLOAT_EQ(float_column.get_element(1), -2.25F); + } + { + auto type = std::make_shared(); + auto column = type->create_column(); + const double values[] = {3.5, -4.75}; + + DecodedColumnView view; + view.value_kind = DecodedValueKind::DOUBLE; + view.row_count = 2; + view.values = reinterpret_cast(values); + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + ASSERT_TRUE(st.ok()) << st; + + const auto& double_column = assert_cast(*column); + ASSERT_EQ(double_column.size(), 2); + EXPECT_DOUBLE_EQ(double_column.get_element(0), 3.5); + EXPECT_DOUBLE_EQ(double_column.get_element(1), -4.75); + } +} + +TEST(DataTypeSerDeDecodedValuesTest, ReadStringValues) { + auto type = std::make_shared(); + auto column = type->create_column(); + std::vector values = { + StringRef("alpha", 5), + StringRef("beta", 4), + StringRef("gamma", 5), + }; + + DecodedColumnView view; + view.value_kind = DecodedValueKind::BINARY; + view.row_count = 
values.size(); + view.binary_values = &values; + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + ASSERT_TRUE(st.ok()) << st; + + const auto& string_column = assert_cast(*column); + ASSERT_EQ(string_column.size(), 3); + EXPECT_EQ(string_column.get_data_at(0).to_string(), "alpha"); + EXPECT_EQ(string_column.get_data_at(1).to_string(), "beta"); + EXPECT_EQ(string_column.get_data_at(2).to_string(), "gamma"); +} + +TEST(DataTypeSerDeDecodedValuesTest, ReadDateAndDateTimeValues) { + { + auto type = std::make_shared(); + auto column = type->create_column(); + const int32_t values[] = {0, 1, 18628}; + + DecodedColumnView view; + view.value_kind = DecodedValueKind::INT32; + view.row_count = 3; + view.values = reinterpret_cast(values); + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + ASSERT_TRUE(st.ok()) << st; + + ASSERT_EQ(column->size(), 3); + EXPECT_EQ(type->to_string(*column, 0), "1970-01-01"); + EXPECT_EQ(type->to_string(*column, 1), "1970-01-02"); + EXPECT_EQ(type->to_string(*column, 2), "2021-01-01"); + } + { + auto type = std::make_shared(6); + auto column = type->create_column(); + const int64_t values[] = {0, 1234567, -1}; + + DecodedColumnView view; + view.value_kind = DecodedValueKind::INT64; + view.time_unit = DecodedTimeUnit::MICROS; + view.row_count = 3; + view.values = reinterpret_cast(values); + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + ASSERT_TRUE(st.ok()) << st; + + ASSERT_EQ(column->size(), 3); + EXPECT_EQ(type->to_string(*column, 0), "1970-01-01 00:00:00.000000"); + EXPECT_EQ(type->to_string(*column, 1), "1970-01-01 00:00:01.234567"); + EXPECT_EQ(type->to_string(*column, 2), "1969-12-31 23:59:59.999999"); + } +} + +TEST(DataTypeSerDeDecodedValuesTest, ReadDecimalValues) { + auto type = std::make_shared(18, 2); + auto column = type->create_column(); + const int64_t values[] = {12345, -67, 0}; + + DecodedColumnView view; + view.value_kind = 
DecodedValueKind::INT64; + view.row_count = 3; + view.values = reinterpret_cast(values); + view.decimal_precision = 18; + view.decimal_scale = 2; + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + ASSERT_TRUE(st.ok()) << st; + + const auto& decimal_column = assert_cast(*column); + ASSERT_EQ(decimal_column.size(), 3); + EXPECT_EQ(decimal_column.get_element(0), Decimal128V3(12345)); + EXPECT_EQ(decimal_column.get_element(1), Decimal128V3(-67)); + EXPECT_EQ(decimal_column.get_element(2), Decimal128V3(0)); + EXPECT_EQ(type->to_string(*column, 0), "123.45"); + EXPECT_EQ(type->to_string(*column, 1), "-0.67"); +} + +TEST(DataTypeSerDeDecodedValuesTest, ReadNullableInt32Values) { + auto type = std::make_shared(std::make_shared()); + auto column = type->create_column(); + const int32_t values[] = {1, 2, 3, 4}; + const uint8_t null_map[] = {0, 1, 0, 1}; + + DecodedColumnView view; + view.value_kind = DecodedValueKind::INT32; + view.row_count = 4; + view.values = reinterpret_cast(values); + view.null_map = null_map; + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + ASSERT_TRUE(st.ok()) << st; + + const auto& nullable_column = assert_cast(*column); + const auto& nested_column = + assert_cast(nullable_column.get_nested_column()); + ASSERT_EQ(nullable_column.size(), 4); + EXPECT_FALSE(nullable_column.is_null_at(0)); + EXPECT_TRUE(nullable_column.is_null_at(1)); + EXPECT_FALSE(nullable_column.is_null_at(2)); + EXPECT_TRUE(nullable_column.is_null_at(3)); + EXPECT_EQ(nested_column.get_element(0), 1); + EXPECT_EQ(nested_column.get_element(1), 2); + EXPECT_EQ(nested_column.get_element(2), 3); + EXPECT_EQ(nested_column.get_element(3), 4); +} + +TEST(DataTypeSerDeDecodedValuesTest, RejectMismatchedValueKind) { + auto type = std::make_shared(); + auto column = type->create_column(); + const int64_t values[] = {1}; + + DecodedColumnView view; + view.value_kind = DecodedValueKind::INT64; + view.row_count = 1; + view.values = 
reinterpret_cast(values); + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + EXPECT_FALSE(st.ok()); +} + +TEST(DataTypeSerDeDecodedValuesTest, RejectMissingValueBuffer) { + auto type = std::make_shared(); + auto column = type->create_column(); + + DecodedColumnView view; + view.value_kind = DecodedValueKind::INT32; + view.row_count = 1; + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + EXPECT_FALSE(st.ok()); +} + +} // namespace doris diff --git a/be/test/format/new_parquet/parquet_column_reader_test.cpp b/be/test/format/new_parquet/parquet_column_reader_test.cpp new file mode 100644 index 00000000000000..ca4003cf3772b6 --- /dev/null +++ b/be/test/format/new_parquet/parquet_column_reader_test.cpp @@ -0,0 +1,1206 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "core/assert_cast.h" +#include "core/column/column_array.h" +#include "core/column/column_decimal.h" +#include "core/column/column_map.h" +#include "core/column/column_nullable.h" +#include "core/column/column_string.h" +#include "core/column/column_struct.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type.h" +#include "core/data_type/data_type_array.h" +#include "core/data_type/data_type_map.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_struct.h" +#include "core/types.h" +#include "format/new_parquet/column_reader.h" +#include "format/new_parquet/parquet_column_schema.h" +#include "format/new_parquet/selection_vector.h" +#include "format/reader/file_reader.h" + +namespace doris::parquet { +namespace { + +constexpr int64_t ROW_COUNT = 5; + +std::shared_ptr finish_array(arrow::ArrayBuilder* builder) { + std::shared_ptr array; + EXPECT_TRUE(builder->Finish(&array).ok()); + return array; +} + +class ParquetColumnReaderTest : public testing::Test { +protected: + void SetUp() override { + _test_dir = std::filesystem::temp_directory_path() / "doris_parquet_column_reader_test"; + std::filesystem::remove_all(_test_dir); + std::filesystem::create_directories(_test_dir); + _file_path = (_test_dir / "reader.parquet").string(); + write_parquet_file(); + _file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); + auto metadata = _file_reader->metadata(); + ASSERT_EQ(metadata->num_row_groups(), 1); + _row_group = _file_reader->RowGroup(0); + ASSERT_NE(_row_group, nullptr); + auto schema_descriptor = _file_reader->metadata()->schema(); + ASSERT_NE(schema_descriptor, nullptr); + auto st = build_parquet_column_schema(*schema_descriptor, &_fields); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(_fields.size(), _expected_by_field.size()); + } + + void TearDown() override { 
std::filesystem::remove_all(_test_dir); } + + template + std::shared_ptr build_required_array(const std::vector& values) { + Builder builder; + for (const auto& value : values) { + EXPECT_TRUE(builder.Append(value).ok()); + } + return finish_array(&builder); + } + + std::shared_ptr build_string_array(const std::vector& values) { + arrow::StringBuilder builder; + for (const auto& value : values) { + EXPECT_TRUE(builder.Append(value).ok()); + } + return finish_array(&builder); + } + + std::shared_ptr build_binary_array(const std::vector& values) { + arrow::BinaryBuilder builder; + for (const auto& value : values) { + EXPECT_TRUE(builder.Append(reinterpret_cast(value.data()), + static_cast(value.size())) + .ok()); + } + return finish_array(&builder); + } + + std::shared_ptr build_fixed_binary_array( + const std::shared_ptr& type, const std::vector& values) { + arrow::FixedSizeBinaryBuilder builder(type, arrow::default_memory_pool()); + for (const auto& value : values) { + EXPECT_TRUE(builder.Append(reinterpret_cast(value.data())).ok()); + } + return finish_array(&builder); + } + + std::shared_ptr build_nullable_int32_array() { + arrow::Int32Builder builder; + EXPECT_TRUE(builder.Append(1).ok()); + EXPECT_TRUE(builder.AppendNull().ok()); + EXPECT_TRUE(builder.Append(3).ok()); + EXPECT_TRUE(builder.AppendNull().ok()); + EXPECT_TRUE(builder.Append(5).ok()); + return finish_array(&builder); + } + + std::shared_ptr build_required_struct_array() { + auto struct_type = arrow::struct_({arrow::field("a", arrow::int32(), false), + arrow::field("b", arrow::utf8(), false)}); + std::vector> field_builders; + auto a_array_builder = std::make_unique(); + field_builders.push_back(std::shared_ptr(std::move(a_array_builder))); + auto b_array_builder = std::make_unique(); + field_builders.push_back(std::shared_ptr(std::move(b_array_builder))); + arrow::StructBuilder builder(struct_type, arrow::default_memory_pool(), + std::move(field_builders)); + auto* a_builder = 
assert_cast(builder.field_builder(0)); + auto* b_builder = assert_cast(builder.field_builder(1)); + const std::vector a_values = {101, 102, 103, 104, 105}; + const std::vector b_values = {"sa", "sb", "sc", "sd", "se"}; + for (size_t row = 0; row < a_values.size(); ++row) { + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(a_builder->Append(a_values[row]).ok()); + EXPECT_TRUE(b_builder->Append(b_values[row]).ok()); + } + return finish_array(&builder); + } + + std::shared_ptr build_nullable_struct_array() { + auto struct_type = arrow::struct_( + {arrow::field("a", arrow::int32(), false), arrow::field("b", arrow::utf8(), true)}); + std::vector> field_builders; + auto a_array_builder = std::make_unique(); + field_builders.push_back(std::shared_ptr(std::move(a_array_builder))); + auto b_array_builder = std::make_unique(); + field_builders.push_back(std::shared_ptr(std::move(b_array_builder))); + arrow::StructBuilder builder(struct_type, arrow::default_memory_pool(), + std::move(field_builders)); + auto* a_builder = assert_cast(builder.field_builder(0)); + auto* b_builder = assert_cast(builder.field_builder(1)); + + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(a_builder->Append(201).ok()); + EXPECT_TRUE(b_builder->Append("nsa").ok()); + EXPECT_TRUE(builder.AppendNull().ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(a_builder->Append(203).ok()); + EXPECT_TRUE(b_builder->AppendNull().ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(a_builder->Append(204).ok()); + EXPECT_TRUE(b_builder->Append("nsd").ok()); + EXPECT_TRUE(builder.AppendNull().ok()); + return finish_array(&builder); + } + + std::shared_ptr build_required_int_list_array() { + auto value_builder = std::make_shared(); + arrow::ListBuilder builder(arrow::default_memory_pool(), value_builder, + arrow::list(arrow::field("element", arrow::int32(), false))); + const std::vector> values = { + {1, 2}, {3}, {4, 5, 6}, {7}, {8, 9}, + }; + for (const auto& row : values) { + 
EXPECT_TRUE(builder.Append().ok()); + for (const auto value : row) { + EXPECT_TRUE(value_builder->Append(value).ok()); + } + } + return finish_array(&builder); + } + + std::shared_ptr build_nullable_int_list_array() { + auto value_builder = std::make_shared(); + arrow::ListBuilder builder(arrow::default_memory_pool(), value_builder, + arrow::list(arrow::field("element", arrow::int32(), true))); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(value_builder->Append(10).ok()); + EXPECT_TRUE(value_builder->Append(20).ok()); + EXPECT_TRUE(builder.AppendNull().ok()); + EXPECT_TRUE(builder.AppendEmptyValue().ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(value_builder->AppendNull().ok()); + EXPECT_TRUE(value_builder->Append(30).ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(value_builder->Append(40).ok()); + return finish_array(&builder); + } + + std::shared_ptr build_required_nullable_int_list_array() { + auto value_builder = std::make_shared(); + arrow::ListBuilder builder(arrow::default_memory_pool(), value_builder, + arrow::list(arrow::field("element", arrow::int32(), true))); + EXPECT_TRUE(builder.AppendEmptyValue().ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(value_builder->AppendNull().ok()); + EXPECT_TRUE(value_builder->Append(110).ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(value_builder->Append(120).ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(value_builder->Append(130).ok()); + EXPECT_TRUE(value_builder->AppendNull().ok()); + EXPECT_TRUE(builder.Append().ok()); + return finish_array(&builder); + } + + std::shared_ptr build_required_int_string_map_array() { + auto key_builder = std::make_shared(); + auto value_builder = std::make_shared(); + auto map_type = arrow::map(arrow::int32(), arrow::field("value", arrow::utf8(), false)); + arrow::MapBuilder builder(arrow::default_memory_pool(), key_builder, value_builder, + map_type); + const std::vector>> values = { + {{1, "a"}, {2, "b"}}, {{3, 
"c"}}, {{4, "d"}, {5, "e"}, {6, "f"}}, + {{7, "g"}}, {{8, "h"}, {9, "i"}}, + }; + for (const auto& row : values) { + EXPECT_TRUE(builder.Append().ok()); + for (const auto& [key, value] : row) { + EXPECT_TRUE(key_builder->Append(key).ok()); + EXPECT_TRUE(value_builder->Append(value).ok()); + } + } + return finish_array(&builder); + } + + std::shared_ptr build_nullable_int_string_map_array() { + auto key_builder = std::make_shared(); + auto value_builder = std::make_shared(); + auto map_type = arrow::map(arrow::int32(), arrow::field("value", arrow::utf8(), true)); + arrow::MapBuilder builder(arrow::default_memory_pool(), key_builder, value_builder, + map_type); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(key_builder->Append(10).ok()); + EXPECT_TRUE(value_builder->Append("aa").ok()); + EXPECT_TRUE(key_builder->Append(20).ok()); + EXPECT_TRUE(value_builder->AppendNull().ok()); + EXPECT_TRUE(builder.AppendNull().ok()); + EXPECT_TRUE(builder.AppendEmptyValue().ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(key_builder->Append(30).ok()); + EXPECT_TRUE(value_builder->Append("cc").ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(key_builder->Append(40).ok()); + EXPECT_TRUE(value_builder->AppendNull().ok()); + return finish_array(&builder); + } + + std::shared_ptr build_required_nullable_string_map_array() { + auto key_builder = std::make_shared(); + auto value_builder = std::make_shared(); + auto map_type = arrow::map(arrow::int32(), arrow::field("value", arrow::utf8(), true)); + arrow::MapBuilder builder(arrow::default_memory_pool(), key_builder, value_builder, + map_type); + EXPECT_TRUE(builder.AppendEmptyValue().ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(key_builder->Append(101).ok()); + EXPECT_TRUE(value_builder->AppendNull().ok()); + EXPECT_TRUE(key_builder->Append(102).ok()); + EXPECT_TRUE(value_builder->Append("bb").ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(key_builder->Append(103).ok()); + 
EXPECT_TRUE(value_builder->Append("cc").ok()); + EXPECT_TRUE(builder.AppendEmptyValue().ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(key_builder->Append(104).ok()); + EXPECT_TRUE(value_builder->AppendNull().ok()); + return finish_array(&builder); + } + + std::shared_ptr build_time32_array(const std::shared_ptr& type, + const std::vector& values) { + arrow::Time32Builder builder(type, arrow::default_memory_pool()); + for (const auto value : values) { + EXPECT_TRUE(builder.Append(value).ok()); + } + return finish_array(&builder); + } + + std::shared_ptr build_time64_array(const std::shared_ptr& type, + const std::vector& values) { + arrow::Time64Builder builder(type, arrow::default_memory_pool()); + for (const auto value : values) { + EXPECT_TRUE(builder.Append(value).ok()); + } + return finish_array(&builder); + } + + std::shared_ptr build_timestamp_array( + const std::shared_ptr& type, const std::vector& values) { + arrow::TimestampBuilder builder(type, arrow::default_memory_pool()); + for (const auto value : values) { + EXPECT_TRUE(builder.Append(value).ok()); + } + return finish_array(&builder); + } + + std::shared_ptr build_decimal_array(const std::shared_ptr& type, + const std::vector& values) { + arrow::Decimal128Builder builder(type, arrow::default_memory_pool()); + for (const auto value : values) { + EXPECT_TRUE(builder.Append(arrow::Decimal128(value)).ok()); + } + return finish_array(&builder); + } + + void add_field(const std::shared_ptr& field, std::shared_ptr array, + std::function validator) { + _arrow_fields.push_back(field); + _arrays.push_back(std::move(array)); + _expected_by_field.push_back(std::move(validator)); + } + + void write_parquet_file() { + add_field( + arrow::field("bool_col", arrow::boolean(), false), + build_required_array({true, false, true, false, true}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::BOOLEAN); + const auto& values = 
assert_cast(column); + EXPECT_EQ(values.get_element(0), 1); + EXPECT_EQ(values.get_element(1), 0); + EXPECT_EQ(values.get_element(4), 1); + }); + add_field(arrow::field("int32_col", arrow::int32(), false), + build_required_array({10, 20, 30, 40, 50}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::INT32); + const auto& values = assert_cast(column); + EXPECT_EQ(values.get_element(0), 10); + EXPECT_EQ(values.get_element(4), 50); + }); + add_field(arrow::field("int64_col", arrow::int64(), false), + build_required_array( + {10000000000L, -9L, 42L, 77L, 123L}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::INT64); + const auto& values = assert_cast(column); + EXPECT_EQ(values.get_element(0), 10000000000L); + EXPECT_EQ(values.get_element(1), -9L); + }); + add_field( + arrow::field("float_col", arrow::float32(), false), + build_required_array({1.5F, -2.25F, 3.0F, 4.5F, 5.75F}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::FLOAT); + const auto& values = assert_cast(column); + EXPECT_FLOAT_EQ(values.get_element(0), 1.5F); + EXPECT_FLOAT_EQ(values.get_element(1), -2.25F); + }); + add_field(arrow::field("double_col", arrow::float64(), false), + build_required_array({3.5, -4.75, 6.0, 7.25, 8.5}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::DOUBLE); + const auto& values = assert_cast(column); + EXPECT_DOUBLE_EQ(values.get_element(0), 3.5); + EXPECT_DOUBLE_EQ(values.get_element(1), -4.75); + }); + add_field(arrow::field("binary_col", arrow::binary(), false), + build_binary_array({"bin_a", "bin_b", "bin_c", "bin_d", "bin_e"}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, 
::parquet::Type::BYTE_ARRAY); + const auto& values = assert_cast(column); + EXPECT_EQ(values.get_data_at(0).to_string(), "bin_a"); + EXPECT_EQ(values.get_data_at(3).to_string(), "bin_d"); + }); + add_field(arrow::field("string_col", arrow::utf8(), false), + build_string_array({"alpha", "beta", "gamma", "delta", "epsilon"}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_TRUE(schema.type_descriptor.is_string_like); + const auto& values = assert_cast(column); + EXPECT_EQ(values.get_data_at(0).to_string(), "alpha"); + EXPECT_EQ(values.get_data_at(4).to_string(), "epsilon"); + }); + add_field(arrow::field("fixed_binary_col", arrow::fixed_size_binary(4), false), + build_fixed_binary_array(arrow::fixed_size_binary(4), + {"aaaa", "bbbb", "cccc", "dddd", "eeee"}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, + ::parquet::Type::FIXED_LEN_BYTE_ARRAY); + EXPECT_EQ(schema.type_descriptor.fixed_length, 4); + const auto& values = assert_cast(column); + EXPECT_EQ(values.get_data_at(0).to_string(), "aaaa"); + EXPECT_EQ(values.get_data_at(2).to_string(), "cccc"); + }); + add_field(arrow::field("date_col", arrow::date32(), false), + build_required_array({0, 1, 18628, 18629, 18630}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::INT32); + EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), TYPE_DATEV2); + EXPECT_EQ(schema.type->to_string(column, 0), "1970-01-01"); + EXPECT_EQ(schema.type->to_string(column, 2), "2021-01-01"); + }); + add_field(arrow::field("time_millis_col", arrow::time32(arrow::TimeUnit::MILLI), false), + build_time32_array(arrow::time32(arrow::TimeUnit::MILLI), + {0, 1000, 3723004, 43200000, 86399000}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::INT32); + 
EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), TYPE_TIMEV2); + EXPECT_EQ(schema.type->to_string(column, 1), "00:00:01.000"); + EXPECT_EQ(schema.type->to_string(column, 2), "01:02:03.004"); + }); + add_field(arrow::field("time_micros_col", arrow::time64(arrow::TimeUnit::MICRO), false), + build_time64_array(arrow::time64(arrow::TimeUnit::MICRO), + {0, 1000000, 3723004567, 43200000000, 86399000000}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::INT64); + EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), TYPE_TIMEV2); + EXPECT_EQ(schema.type->to_string(column, 1), "00:00:01.000000"); + EXPECT_EQ(schema.type->to_string(column, 2), "01:02:03.004567"); + }); + add_field(arrow::field("timestamp_millis_col", arrow::timestamp(arrow::TimeUnit::MILLI), + false), + build_timestamp_array(arrow::timestamp(arrow::TimeUnit::MILLI), + {0, 1234, 1609459200000, 1609459201000, -1}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::INT64); + EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), + TYPE_DATETIMEV2); + EXPECT_EQ(schema.type->to_string(column, 1), "1970-01-01 00:00:01.234"); + EXPECT_EQ(schema.type->to_string(column, 4), "1969-12-31 23:59:59.999"); + }); + add_field(arrow::field("timestamp_micros_col", arrow::timestamp(arrow::TimeUnit::MICRO), + false), + build_timestamp_array(arrow::timestamp(arrow::TimeUnit::MICRO), + {0, 1234567, 1609459200000000, 1609459201000000, -1}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::INT64); + EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), + TYPE_DATETIMEV2); + EXPECT_EQ(schema.type->to_string(column, 1), "1970-01-01 00:00:01.234567"); + EXPECT_EQ(schema.type->to_string(column, 4), "1969-12-31 23:59:59.999999"); + }); + 
add_field(arrow::field("decimal_fixed_binary_9_2_col", arrow::decimal128(9, 2), false), + build_decimal_array(arrow::decimal128(9, 2), {12345, -67, 0, 987, 1000}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, + ::parquet::Type::FIXED_LEN_BYTE_ARRAY); + EXPECT_TRUE(schema.type_descriptor.is_decimal); + EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), TYPE_DECIMAL32); + const auto& values = assert_cast(column); + EXPECT_EQ(values.get_element(0), Decimal32(12345)); + EXPECT_EQ(schema.type->to_string(column, 0), "123.45"); + }); + add_field(arrow::field("decimal_fixed_binary_18_6_col", arrow::decimal128(18, 6), false), + build_decimal_array(arrow::decimal128(18, 6), + {1234567, -670000, 0, 9870000, 1000000}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, + ::parquet::Type::FIXED_LEN_BYTE_ARRAY); + EXPECT_TRUE(schema.type_descriptor.is_decimal); + EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), TYPE_DECIMAL64); + const auto& values = assert_cast(column); + EXPECT_EQ(values.get_element(0), Decimal64(1234567)); + EXPECT_EQ(schema.type->to_string(column, 0), "1.234567"); + }); + add_field(arrow::field("nullable_int_col", arrow::int32(), true), + build_nullable_int32_array(), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_TRUE(schema.type->is_nullable()); + const auto& nullable_column = assert_cast(column); + const auto& nested_column = + assert_cast(nullable_column.get_nested_column()); + ASSERT_EQ(nullable_column.size(), ROW_COUNT); + EXPECT_FALSE(nullable_column.is_null_at(0)); + EXPECT_TRUE(nullable_column.is_null_at(1)); + EXPECT_FALSE(nullable_column.is_null_at(2)); + EXPECT_TRUE(nullable_column.is_null_at(3)); + EXPECT_EQ(nested_column.get_element(0), 1); + EXPECT_EQ(nested_column.get_element(2), 3); + }); + add_field(arrow::field("struct_col", + arrow::struct_({ + 
arrow::field("a", arrow::int32(), false), + arrow::field("b", arrow::utf8(), false), + }), + false), + build_required_struct_array(), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), TYPE_STRUCT); + const auto& struct_column = assert_cast(column); + ASSERT_EQ(struct_column.get_columns().size(), 2); + const auto& a_values = + assert_cast(struct_column.get_column(0)); + const auto& b_values = + assert_cast(struct_column.get_column(1)); + EXPECT_EQ(a_values.get_element(0), 101); + EXPECT_EQ(a_values.get_element(4), 105); + EXPECT_EQ(b_values.get_data_at(1).to_string(), "sb"); + EXPECT_EQ(b_values.get_data_at(4).to_string(), "se"); + }); + add_field(arrow::field("nullable_struct_col", + arrow::struct_({ + arrow::field("a", arrow::int32(), false), + arrow::field("b", arrow::utf8(), true), + }), + true), + build_nullable_struct_array(), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_TRUE(schema.type->is_nullable()); + const auto& nullable_column = assert_cast(column); + ASSERT_EQ(nullable_column.size(), ROW_COUNT); + EXPECT_FALSE(nullable_column.is_null_at(0)); + EXPECT_TRUE(nullable_column.is_null_at(1)); + EXPECT_FALSE(nullable_column.is_null_at(2)); + EXPECT_FALSE(nullable_column.is_null_at(3)); + EXPECT_TRUE(nullable_column.is_null_at(4)); + + const auto& struct_column = + assert_cast(nullable_column.get_nested_column()); + ASSERT_EQ(struct_column.get_columns().size(), 2); + const auto& a_values = + assert_cast(struct_column.get_column(0)); + const auto& b_values = + assert_cast(struct_column.get_column(1)); + const auto& b_nested = + assert_cast(b_values.get_nested_column()); + EXPECT_EQ(a_values.get_element(0), 201); + EXPECT_EQ(a_values.get_element(2), 203); + EXPECT_EQ(a_values.get_element(3), 204); + EXPECT_FALSE(b_values.is_null_at(0)); + EXPECT_TRUE(b_values.is_null_at(2)); + EXPECT_FALSE(b_values.is_null_at(3)); + 
EXPECT_EQ(b_nested.get_data_at(0).to_string(), "nsa"); + EXPECT_EQ(b_nested.get_data_at(3).to_string(), "nsd"); + }); + add_field(arrow::field("list_int_col", + arrow::list(arrow::field("element", arrow::int32(), false)), false), + build_required_int_list_array(), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), TYPE_ARRAY); + const auto* array_type = + assert_cast(remove_nullable(schema.type).get()); + EXPECT_EQ( + remove_nullable(array_type->get_nested_type())->get_primitive_type(), + TYPE_INT); + const auto& array_column = assert_cast(column); + ASSERT_EQ(array_column.size(), ROW_COUNT); + const auto array_size_at = [&array_column](size_t row_idx) { + return array_column.get_offsets()[row_idx] - + (row_idx == 0 ? 0 : array_column.get_offsets()[row_idx - 1]); + }; + EXPECT_EQ(array_size_at(0), 2); + EXPECT_EQ(array_size_at(1), 1); + EXPECT_EQ(array_size_at(2), 3); + EXPECT_EQ(array_size_at(4), 2); + const auto& values = assert_cast(array_column.get_data()); + ASSERT_EQ(values.size(), 9); + EXPECT_EQ(values.get_element(0), 1); + EXPECT_EQ(values.get_element(5), 6); + EXPECT_EQ(values.get_element(8), 9); + }); + add_field(arrow::field("nullable_list_int_col", + arrow::list(arrow::field("element", arrow::int32(), true)), true), + build_nullable_int_list_array(), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_TRUE(schema.type->is_nullable()); + const auto& nullable_column = assert_cast(column); + ASSERT_EQ(nullable_column.size(), ROW_COUNT); + EXPECT_FALSE(nullable_column.is_null_at(0)); + EXPECT_TRUE(nullable_column.is_null_at(1)); + EXPECT_FALSE(nullable_column.is_null_at(2)); + EXPECT_FALSE(nullable_column.is_null_at(3)); + const auto& array_column = + assert_cast(nullable_column.get_nested_column()); + const auto& offsets = array_column.get_offsets(); + ASSERT_EQ(offsets.size(), ROW_COUNT); + EXPECT_EQ(offsets[0], 2); + EXPECT_EQ(offsets[1], 2); + 
EXPECT_EQ(offsets[2], 2); + EXPECT_EQ(offsets[3], 4); + EXPECT_EQ(offsets[4], 5); + const auto& elements = + assert_cast(array_column.get_data()); + const auto& values = + assert_cast(elements.get_nested_column()); + ASSERT_EQ(elements.size(), 5); + EXPECT_EQ(values.get_element(0), 10); + EXPECT_EQ(values.get_element(1), 20); + EXPECT_TRUE(elements.is_null_at(2)); + EXPECT_EQ(values.get_element(3), 30); + EXPECT_EQ(values.get_element(4), 40); + }); + add_field(arrow::field("required_nullable_list_int_col", + arrow::list(arrow::field("element", arrow::int32(), true)), false), + build_required_nullable_int_list_array(), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_FALSE(schema.type->is_nullable()); + const auto& array_column = assert_cast(column); + const auto& offsets = array_column.get_offsets(); + ASSERT_EQ(offsets.size(), ROW_COUNT); + EXPECT_EQ(offsets[0], 0); + EXPECT_EQ(offsets[1], 2); + EXPECT_EQ(offsets[2], 3); + EXPECT_EQ(offsets[3], 5); + EXPECT_EQ(offsets[4], 5); + const auto& elements = + assert_cast(array_column.get_data()); + ASSERT_EQ(elements.size(), 5); + EXPECT_TRUE(elements.is_null_at(0)); + EXPECT_FALSE(elements.is_null_at(1)); + EXPECT_TRUE(elements.is_null_at(4)); + }); + add_field(arrow::field( + "map_int_string_col", + arrow::map(arrow::int32(), arrow::field("value", arrow::utf8(), false)), + false), + build_required_int_string_map_array(), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), TYPE_MAP); + const auto* map_type = + assert_cast(remove_nullable(schema.type).get()); + EXPECT_EQ(remove_nullable(map_type->get_key_type())->get_primitive_type(), + TYPE_INT); + EXPECT_EQ(remove_nullable(map_type->get_value_type())->get_primitive_type(), + TYPE_STRING); + const auto& map_column = assert_cast(column); + ASSERT_EQ(map_column.size(), ROW_COUNT); + const auto map_size_at = [&map_column](size_t row_idx) { + return 
map_column.get_offsets()[row_idx] - + (row_idx == 0 ? 0 : map_column.get_offsets()[row_idx - 1]); + }; + EXPECT_EQ(map_size_at(0), 2); + EXPECT_EQ(map_size_at(1), 1); + EXPECT_EQ(map_size_at(2), 3); + EXPECT_EQ(map_size_at(4), 2); + const auto& keys = assert_cast(map_column.get_keys()); + const auto& values = + assert_cast(map_column.get_values()); + ASSERT_EQ(keys.size(), 9); + ASSERT_EQ(values.size(), 9); + EXPECT_EQ(keys.get_element(0), 1); + EXPECT_EQ(keys.get_element(5), 6); + EXPECT_EQ(keys.get_element(8), 9); + EXPECT_EQ(values.get_data_at(0).to_string(), "a"); + EXPECT_EQ(values.get_data_at(5).to_string(), "f"); + EXPECT_EQ(values.get_data_at(8).to_string(), "i"); + }); + add_field( + arrow::field("nullable_map_int_string_col", + arrow::map(arrow::int32(), arrow::field("value", arrow::utf8(), true)), + true), + build_nullable_int_string_map_array(), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_TRUE(schema.type->is_nullable()); + const auto& nullable_column = assert_cast(column); + ASSERT_EQ(nullable_column.size(), ROW_COUNT); + EXPECT_FALSE(nullable_column.is_null_at(0)); + EXPECT_TRUE(nullable_column.is_null_at(1)); + EXPECT_FALSE(nullable_column.is_null_at(2)); + const auto& map_column = + assert_cast(nullable_column.get_nested_column()); + const auto& offsets = map_column.get_offsets(); + ASSERT_EQ(offsets.size(), ROW_COUNT); + EXPECT_EQ(offsets[0], 2); + EXPECT_EQ(offsets[1], 2); + EXPECT_EQ(offsets[2], 2); + EXPECT_EQ(offsets[3], 3); + EXPECT_EQ(offsets[4], 4); + const auto& keys = assert_cast(map_column.get_keys()); + const auto& values = + assert_cast(map_column.get_values()); + const auto& value_data = + assert_cast(values.get_nested_column()); + ASSERT_EQ(keys.size(), 4); + EXPECT_EQ(keys.get_element(0), 10); + EXPECT_EQ(keys.get_element(1), 20); + EXPECT_EQ(keys.get_element(3), 40); + EXPECT_EQ(value_data.get_data_at(0).to_string(), "aa"); + EXPECT_TRUE(values.is_null_at(1)); + 
EXPECT_EQ(value_data.get_data_at(2).to_string(), "cc"); + EXPECT_TRUE(values.is_null_at(3)); + }); + add_field( + arrow::field("required_nullable_map_int_string_col", + arrow::map(arrow::int32(), arrow::field("value", arrow::utf8(), true)), + false), + build_required_nullable_string_map_array(), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_FALSE(schema.type->is_nullable()); + const auto& map_column = assert_cast(column); + const auto& offsets = map_column.get_offsets(); + ASSERT_EQ(offsets.size(), ROW_COUNT); + EXPECT_EQ(offsets[0], 0); + EXPECT_EQ(offsets[1], 2); + EXPECT_EQ(offsets[2], 3); + EXPECT_EQ(offsets[3], 3); + EXPECT_EQ(offsets[4], 4); + const auto& values = + assert_cast(map_column.get_values()); + ASSERT_EQ(values.size(), 4); + EXPECT_TRUE(values.is_null_at(0)); + EXPECT_FALSE(values.is_null_at(1)); + EXPECT_TRUE(values.is_null_at(3)); + }); + + auto schema = arrow::schema(_arrow_fields); + auto table = arrow::Table::Make(schema, _arrays); + + auto file_result = arrow::io::FileOutputStream::Open(_file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, + ROW_COUNT, builder.build())); + } + + std::unique_ptr create_reader(size_t field_idx) const { + ParquetColumnReaderFactory factory(_row_group, _file_reader->metadata()->num_columns()); + std::unique_ptr reader; + auto st = factory.create(*_fields[field_idx], &reader); + EXPECT_TRUE(st.ok()) << st; + return reader; + } + + void read_and_validate(size_t field_idx) const { + auto reader = create_reader(field_idx); + ASSERT_NE(reader, nullptr); + MutableColumnPtr column = 
reader->type()->create_column(); + int64_t rows_read = 0; + auto st = reader->read(ROW_COUNT, column, &rows_read); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(rows_read, ROW_COUNT); + ASSERT_EQ(column->size(), ROW_COUNT); + _expected_by_field[field_idx](*_fields[field_idx], *column); + } + + size_t find_field_idx(const std::string& name) const { + for (size_t field_idx = 0; field_idx < _fields.size(); ++field_idx) { + if (_fields[field_idx]->name == name) { + return field_idx; + } + } + ADD_FAILURE() << "Cannot find parquet test field " << name; + return _fields.size(); + } + + std::filesystem::path _test_dir; + std::string _file_path; + std::unique_ptr<::parquet::ParquetFileReader> _file_reader; + std::shared_ptr<::parquet::RowGroupReader> _row_group; + std::vector> _fields; + std::vector> _arrow_fields; + std::vector> _arrays; + std::vector> _expected_by_field; +}; + +TEST_F(ParquetColumnReaderTest, ReadAllSupportedPhysicalAndLogicalTypes) { + for (size_t field_idx = 0; field_idx < _fields.size(); ++field_idx) { + SCOPED_TRACE(_fields[field_idx]->name); + if (_fields[field_idx]->kind != ParquetColumnSchemaKind::PRIMITIVE) { + continue; + } + ASSERT_TRUE(supports_record_reader(_fields[field_idx]->type_descriptor)); + read_and_validate(field_idx); + } +} + +TEST_F(ParquetColumnReaderTest, ReadSupportedComplexTypes) { + read_and_validate(find_field_idx("struct_col")); + read_and_validate(find_field_idx("nullable_struct_col")); + read_and_validate(find_field_idx("list_int_col")); + read_and_validate(find_field_idx("nullable_list_int_col")); + read_and_validate(find_field_idx("required_nullable_list_int_col")); + read_and_validate(find_field_idx("map_int_string_col")); + read_and_validate(find_field_idx("nullable_map_int_string_col")); + read_and_validate(find_field_idx("required_nullable_map_int_string_col")); +} + +TEST_F(ParquetColumnReaderTest, SkipThenRead) { + auto reader = create_reader(1); + auto st = reader->skip(2); + ASSERT_TRUE(st.ok()) << st; + + 
MutableColumnPtr column = reader->type()->create_column(); + int64_t rows_read = 0; + st = reader->read(2, column, &rows_read); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(rows_read, 2); + + const auto& int_values = assert_cast(*column); + ASSERT_EQ(int_values.size(), 2); + EXPECT_EQ(int_values.get_element(0), 30); + EXPECT_EQ(int_values.get_element(1), 40); +} + +TEST_F(ParquetColumnReaderTest, SelectReadsOnlySelectedRanges) { + auto reader = create_reader(1); + SelectionVector selection(3); + selection.set_index(0, 0); + selection.set_index(1, 2); + selection.set_index(2, 4); + + MutableColumnPtr column = reader->type()->create_column(); + auto st = reader->select(selection, 3, ROW_COUNT, column); + ASSERT_TRUE(st.ok()) << st; + + const auto& int_values = assert_cast(*column); + ASSERT_EQ(int_values.size(), 3); + EXPECT_EQ(int_values.get_element(0), 10); + EXPECT_EQ(int_values.get_element(1), 30); + EXPECT_EQ(int_values.get_element(2), 50); +} + +TEST_F(ParquetColumnReaderTest, ReadProjectedStructChildren) { + const auto field_idx = find_field_idx("struct_col"); + ASSERT_LT(field_idx, _fields.size()); + const auto& struct_schema = *_fields[field_idx]; + ASSERT_EQ(struct_schema.name, "struct_col"); + ASSERT_EQ(struct_schema.children.size(), 2); + + reader::FieldProjection projection; + projection.file_column_id = struct_schema.top_level_field_id; + projection.file_path = struct_schema.file_path; + projection.project_all_children = false; + reader::FieldProjection child_projection; + child_projection.file_column_id = struct_schema.top_level_field_id; + child_projection.file_path = struct_schema.children[1]->file_path; + projection.children.push_back(std::move(child_projection)); + + ParquetColumnReaderFactory factory(_row_group, _file_reader->metadata()->num_columns()); + std::unique_ptr reader; + auto st = factory.create(struct_schema, &projection, &reader); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(remove_nullable(reader->type())->get_primitive_type(), 
TYPE_STRUCT); + const auto* projected_type = + assert_cast(remove_nullable(reader->type()).get()); + ASSERT_EQ(projected_type->get_elements().size(), 1); + EXPECT_EQ(projected_type->get_element_name(0), "b"); + + MutableColumnPtr column = reader->type()->create_column(); + int64_t rows_read = 0; + st = reader->read(ROW_COUNT, column, &rows_read); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(rows_read, ROW_COUNT); + const auto& struct_column = assert_cast(*column); + ASSERT_EQ(struct_column.get_columns().size(), 1); + const auto& values = assert_cast(struct_column.get_column(0)); + EXPECT_EQ(values.get_data_at(0).to_string(), "sa"); + EXPECT_EQ(values.get_data_at(4).to_string(), "se"); +} + +TEST_F(ParquetColumnReaderTest, ReadProjectedNullableStructChildren) { + const auto field_idx = find_field_idx("nullable_struct_col"); + ASSERT_LT(field_idx, _fields.size()); + const auto& struct_schema = *_fields[field_idx]; + ASSERT_EQ(struct_schema.name, "nullable_struct_col"); + ASSERT_EQ(struct_schema.children.size(), 2); + + reader::FieldProjection projection; + projection.file_column_id = struct_schema.top_level_field_id; + projection.file_path = struct_schema.file_path; + projection.project_all_children = false; + reader::FieldProjection child_projection; + child_projection.file_column_id = struct_schema.top_level_field_id; + child_projection.file_path = struct_schema.children[1]->file_path; + projection.children.push_back(std::move(child_projection)); + + ParquetColumnReaderFactory factory(_row_group, _file_reader->metadata()->num_columns()); + std::unique_ptr reader; + auto st = factory.create(struct_schema, &projection, &reader); + ASSERT_TRUE(st.ok()) << st; + ASSERT_TRUE(reader->type()->is_nullable()); + ASSERT_EQ(remove_nullable(reader->type())->get_primitive_type(), TYPE_STRUCT); + const auto* projected_type = + assert_cast(remove_nullable(reader->type()).get()); + ASSERT_EQ(projected_type->get_elements().size(), 1); + EXPECT_EQ(projected_type->get_element_name(0), 
"b"); + + MutableColumnPtr column = reader->type()->create_column(); + int64_t rows_read = 0; + st = reader->read(ROW_COUNT, column, &rows_read); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(rows_read, ROW_COUNT); + const auto& nullable_column = assert_cast(*column); + EXPECT_FALSE(nullable_column.is_null_at(0)); + EXPECT_TRUE(nullable_column.is_null_at(1)); + EXPECT_FALSE(nullable_column.is_null_at(2)); + EXPECT_FALSE(nullable_column.is_null_at(3)); + EXPECT_TRUE(nullable_column.is_null_at(4)); + const auto& struct_column = + assert_cast(nullable_column.get_nested_column()); + ASSERT_EQ(struct_column.get_columns().size(), 1); + const auto& values = assert_cast(struct_column.get_column(0)); + const auto& nested_values = assert_cast(values.get_nested_column()); + EXPECT_FALSE(values.is_null_at(0)); + EXPECT_TRUE(values.is_null_at(2)); + EXPECT_FALSE(values.is_null_at(3)); + EXPECT_EQ(nested_values.get_data_at(0).to_string(), "nsa"); + EXPECT_EQ(nested_values.get_data_at(3).to_string(), "nsd"); +} + +TEST_F(ParquetColumnReaderTest, ReadListWithOverflowAcrossChunks) { + const auto field_idx = find_field_idx("nullable_list_int_col"); + auto reader = create_reader(field_idx); + MutableColumnPtr column = reader->type()->create_column(); + + int64_t rows_read = 0; + auto st = reader->read(2, column, &rows_read); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(rows_read, 2); + st = reader->read(3, column, &rows_read); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(rows_read, 3); + + _expected_by_field[field_idx](*_fields[field_idx], *column); +} + +TEST_F(ParquetColumnReaderTest, SkipListWithOverflowThenRead) { + const auto field_idx = find_field_idx("nullable_list_int_col"); + auto reader = create_reader(field_idx); + auto st = reader->skip(1); + ASSERT_TRUE(st.ok()) << st; + + MutableColumnPtr column = reader->type()->create_column(); + int64_t rows_read = 0; + st = reader->read(3, column, &rows_read); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(rows_read, 3); + + const auto& 
nullable_column = assert_cast(*column); + ASSERT_EQ(nullable_column.size(), 3); + EXPECT_TRUE(nullable_column.is_null_at(0)); + const auto& array_column = assert_cast(nullable_column.get_nested_column()); + const auto& offsets = array_column.get_offsets(); + ASSERT_EQ(offsets.size(), 3); + EXPECT_EQ(offsets[0], 0); + EXPECT_EQ(offsets[1], 0); + EXPECT_EQ(offsets[2], 2); +} + +TEST_F(ParquetColumnReaderTest, SelectListWithOverflow) { + const auto field_idx = find_field_idx("nullable_list_int_col"); + auto reader = create_reader(field_idx); + SelectionVector selection(3); + selection.set_index(0, 0); + selection.set_index(1, 3); + selection.set_index(2, 4); + + MutableColumnPtr column = reader->type()->create_column(); + auto st = reader->select(selection, 3, ROW_COUNT, column); + ASSERT_TRUE(st.ok()) << st; + + const auto& nullable_column = assert_cast(*column); + ASSERT_EQ(nullable_column.size(), 3); + EXPECT_FALSE(nullable_column.is_null_at(0)); + EXPECT_FALSE(nullable_column.is_null_at(1)); + EXPECT_FALSE(nullable_column.is_null_at(2)); + const auto& array_column = assert_cast(nullable_column.get_nested_column()); + const auto& offsets = array_column.get_offsets(); + ASSERT_EQ(offsets.size(), 3); + EXPECT_EQ(offsets[0], 2); + EXPECT_EQ(offsets[1], 4); + EXPECT_EQ(offsets[2], 5); +} + +TEST_F(ParquetColumnReaderTest, ReadMapWithOverflowAcrossChunks) { + const auto field_idx = find_field_idx("nullable_map_int_string_col"); + auto reader = create_reader(field_idx); + MutableColumnPtr column = reader->type()->create_column(); + + int64_t rows_read = 0; + auto st = reader->read(2, column, &rows_read); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(rows_read, 2); + st = reader->read(3, column, &rows_read); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(rows_read, 3); + + _expected_by_field[field_idx](*_fields[field_idx], *column); +} + +TEST_F(ParquetColumnReaderTest, SkipMapWithOverflowThenRead) { + const auto field_idx = find_field_idx("nullable_map_int_string_col"); + auto 
reader = create_reader(field_idx); + auto st = reader->skip(1); + ASSERT_TRUE(st.ok()) << st; + + MutableColumnPtr column = reader->type()->create_column(); + int64_t rows_read = 0; + st = reader->read(3, column, &rows_read); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(rows_read, 3); + + const auto& nullable_column = assert_cast(*column); + ASSERT_EQ(nullable_column.size(), 3); + EXPECT_TRUE(nullable_column.is_null_at(0)); + const auto& map_column = assert_cast(nullable_column.get_nested_column()); + const auto& offsets = map_column.get_offsets(); + ASSERT_EQ(offsets.size(), 3); + EXPECT_EQ(offsets[0], 0); + EXPECT_EQ(offsets[1], 0); + EXPECT_EQ(offsets[2], 1); +} + +TEST_F(ParquetColumnReaderTest, SelectMapWithOverflow) { + const auto field_idx = find_field_idx("nullable_map_int_string_col"); + auto reader = create_reader(field_idx); + SelectionVector selection(3); + selection.set_index(0, 0); + selection.set_index(1, 3); + selection.set_index(2, 4); + + MutableColumnPtr column = reader->type()->create_column(); + auto st = reader->select(selection, 3, ROW_COUNT, column); + ASSERT_TRUE(st.ok()) << st; + + const auto& nullable_column = assert_cast(*column); + ASSERT_EQ(nullable_column.size(), 3); + EXPECT_FALSE(nullable_column.is_null_at(0)); + EXPECT_FALSE(nullable_column.is_null_at(1)); + EXPECT_FALSE(nullable_column.is_null_at(2)); + const auto& map_column = assert_cast(nullable_column.get_nested_column()); + const auto& offsets = map_column.get_offsets(); + ASSERT_EQ(offsets.size(), 3); + EXPECT_EQ(offsets[0], 2); + EXPECT_EQ(offsets[1], 3); + EXPECT_EQ(offsets[2], 4); +} + +TEST_F(ParquetColumnReaderTest, BuildComplexSchemaPathMetadata) { + const auto field_idx = find_field_idx("struct_col"); + ASSERT_LT(field_idx, _fields.size()); + const auto& struct_schema = *_fields[field_idx]; + ASSERT_EQ(struct_schema.name, "struct_col"); + ASSERT_EQ(struct_schema.children.size(), 2); + EXPECT_EQ(struct_schema.file_path, std::vector({static_cast(field_idx)})); + 
EXPECT_EQ(struct_schema.name_path, std::vector({"struct_col"})); + EXPECT_EQ(struct_schema.children[0]->file_path, + std::vector({static_cast(field_idx), 0})); + EXPECT_EQ(struct_schema.children[1]->file_path, + std::vector({static_cast(field_idx), 1})); + EXPECT_EQ(struct_schema.children[0]->name_path, std::vector({"struct_col", "a"})); + EXPECT_EQ(struct_schema.children[1]->name_path, std::vector({"struct_col", "b"})); + EXPECT_EQ(struct_schema.max_definition_level, 0); + EXPECT_EQ(struct_schema.max_repetition_level, 0); +} + +TEST_F(ParquetColumnReaderTest, ResolveSupportedPhysicalAndLogicalSchemas) { + std::vector<::parquet::schema::NodePtr> nodes = { + ::parquet::schema::PrimitiveNode::Make("required_bool", ::parquet::Repetition::REQUIRED, + ::parquet::Type::BOOLEAN), + ::parquet::schema::PrimitiveNode::Make( + "required_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32), + ::parquet::schema::PrimitiveNode::Make( + "required_int64", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT64), + ::parquet::schema::PrimitiveNode::Make( + "required_float", ::parquet::Repetition::REQUIRED, ::parquet::Type::FLOAT), + ::parquet::schema::PrimitiveNode::Make( + "required_double", ::parquet::Repetition::REQUIRED, ::parquet::Type::DOUBLE), + ::parquet::schema::PrimitiveNode::Make("required_binary", + ::parquet::Repetition::REQUIRED, + ::parquet::Type::BYTE_ARRAY), + ::parquet::schema::PrimitiveNode::Make( + "required_fixed_binary", ::parquet::Repetition::REQUIRED, + ::parquet::Type::FIXED_LEN_BYTE_ARRAY, ::parquet::ConvertedType::NONE, 4), + ::parquet::schema::PrimitiveNode::Make( + "optional_int32", ::parquet::Repetition::OPTIONAL, ::parquet::Type::INT32), + ::parquet::schema::PrimitiveNode::Make("utf8_binary", ::parquet::Repetition::REQUIRED, + ::parquet::Type::BYTE_ARRAY, + ::parquet::ConvertedType::UTF8), + ::parquet::schema::PrimitiveNode::Make("enum_binary", ::parquet::Repetition::REQUIRED, + ::parquet::Type::BYTE_ARRAY, + 
::parquet::ConvertedType::ENUM), + ::parquet::schema::PrimitiveNode::Make("json_binary", ::parquet::Repetition::REQUIRED, + ::parquet::Type::BYTE_ARRAY, + ::parquet::ConvertedType::JSON), + ::parquet::schema::PrimitiveNode::Make("bson_binary", ::parquet::Repetition::REQUIRED, + ::parquet::Type::BYTE_ARRAY, + ::parquet::ConvertedType::BSON), + ::parquet::schema::PrimitiveNode::Make("decimal_int32", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT32, + ::parquet::ConvertedType::DECIMAL, -1, 9, 2), + ::parquet::schema::PrimitiveNode::Make("decimal_int64", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT64, + ::parquet::ConvertedType::DECIMAL, -1, 18, 6), + ::parquet::schema::PrimitiveNode::Make( + "decimal_binary", ::parquet::Repetition::REQUIRED, ::parquet::Type::BYTE_ARRAY, + ::parquet::ConvertedType::DECIMAL, -1, 18, 6), + ::parquet::schema::PrimitiveNode::Make("decimal_fixed_binary", + ::parquet::Repetition::REQUIRED, + ::parquet::Type::FIXED_LEN_BYTE_ARRAY, + ::parquet::ConvertedType::DECIMAL, 8, 18, 6), + ::parquet::schema::PrimitiveNode::Make("date_int32", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT32, + ::parquet::ConvertedType::DATE), + ::parquet::schema::PrimitiveNode::Make( + "time_millis_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32, + ::parquet::ConvertedType::TIME_MILLIS), + ::parquet::schema::PrimitiveNode::Make( + "time_micros_int64", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT64, + ::parquet::ConvertedType::TIME_MICROS), + ::parquet::schema::PrimitiveNode::Make( + "timestamp_millis_int64", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT64, ::parquet::ConvertedType::TIMESTAMP_MILLIS), + ::parquet::schema::PrimitiveNode::Make( + "timestamp_micros_int64", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT64, ::parquet::ConvertedType::TIMESTAMP_MICROS), + ::parquet::schema::PrimitiveNode::Make("int8_int32", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT32, + 
::parquet::ConvertedType::INT_8), + ::parquet::schema::PrimitiveNode::Make("uint8_int32", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT32, + ::parquet::ConvertedType::UINT_8), + ::parquet::schema::PrimitiveNode::Make("int16_int32", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT32, + ::parquet::ConvertedType::INT_16), + ::parquet::schema::PrimitiveNode::Make("uint16_int32", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT32, + ::parquet::ConvertedType::UINT_16), + ::parquet::schema::PrimitiveNode::Make("int32_int32", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT32, + ::parquet::ConvertedType::INT_32), + ::parquet::schema::PrimitiveNode::Make("uint32_int32", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT32, + ::parquet::ConvertedType::UINT_32), + ::parquet::schema::PrimitiveNode::Make("int64_int64", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT64, + ::parquet::ConvertedType::INT_64), + }; + + auto schema = + ::parquet::schema::GroupNode::Make("schema", ::parquet::Repetition::REQUIRED, nodes); + ::parquet::SchemaDescriptor descriptor; + descriptor.Init(schema); + + std::vector> fields; + auto st = build_parquet_column_schema(descriptor, &fields); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(fields.size(), nodes.size()); + + for (const auto& field : fields) { + SCOPED_TRACE(field->name); + ASSERT_TRUE(supports_record_reader(field->type_descriptor)); + ASSERT_NE(field->type, nullptr); + } +} + +TEST_F(ParquetColumnReaderTest, RejectUnsupportedPhysicalAndLogicalTypes) { + auto schema = ::parquet::schema::GroupNode::Make( + "schema", ::parquet::Repetition::REQUIRED, + { + ::parquet::schema::PrimitiveNode::Make( + "int96_col", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT96), + ::parquet::schema::PrimitiveNode::Make("repeated_int32_col", + ::parquet::Repetition::REPEATED, + ::parquet::Type::INT32), + ::parquet::schema::PrimitiveNode::Make( + "decimal256_fixed_col", ::parquet::Repetition::REQUIRED, + 
::parquet::Type::FIXED_LEN_BYTE_ARRAY, + ::parquet::ConvertedType::DECIMAL, 20, 39, 6), + ::parquet::schema::PrimitiveNode::Make( + "uint64_col", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT64, + ::parquet::ConvertedType::UINT_64), + ::parquet::schema::PrimitiveNode::Make( + "time_nanos_col", ::parquet::Repetition::REQUIRED, + ::parquet::LogicalType::Time(false, + ::parquet::LogicalType::TimeUnit::NANOS), + ::parquet::Type::INT64), + ::parquet::schema::PrimitiveNode::Make( + "timestamp_nanos_col", ::parquet::Repetition::REQUIRED, + ::parquet::LogicalType::Timestamp( + false, ::parquet::LogicalType::TimeUnit::NANOS), + ::parquet::Type::INT64), + }); + ::parquet::SchemaDescriptor descriptor; + descriptor.Init(schema); + + std::vector> fields; + auto st = build_parquet_column_schema(descriptor, &fields); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(fields.size(), 6); + + for (const auto& field : fields) { + SCOPED_TRACE(field->name); + ASSERT_FALSE(supports_record_reader(field->type_descriptor)); + } +} + +} // namespace +} // namespace doris::parquet diff --git a/be/test/format/new_parquet/parquet_reader_test.cpp b/be/test/format/new_parquet/parquet_reader_test.cpp new file mode 100644 index 00000000000000..fb6c6d8ab35707 --- /dev/null +++ b/be/test/format/new_parquet/parquet_reader_test.cpp @@ -0,0 +1,1051 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/new_parquet/parquet_reader.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "core/assert_cast.h" +#include "core/block/block.h" +#include "core/column/column_string.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_number.h" +#include "core/data_type/data_type_string.h" +#include "core/data_type/data_type_struct.h" +#include "core/data_type/primitive_type.h" +#include "core/field.h" +#include "exprs/vexpr.h" +#include "exprs/vexpr_context.h" +#include "format/new_parquet/column_reader.h" +#include "format/reader/column_mapper.h" +#include "format/reader/expr/delete_predicate.h" +#include "format/reader/expr/slot_ref.h" +#include "format/reader/file_reader.h" +#include "format/reader/table_reader.h" +#include "gen_cpp/Types_types.h" +#include "io/io_common.h" +#include "runtime/runtime_state.h" +#include "storage/predicate/predicate_creator.h" + +namespace doris { +namespace { + +constexpr int64_t ROW_COUNT = 5; + +class Int32GreaterThanExpr final : public VExpr { +public: + Int32GreaterThanExpr(int column_id, int32_t value) + : VExpr(std::make_shared(), false), + _column_id(column_id), + _value(value) {} + + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override { + const auto& input = + assert_cast(*block->get_by_position(_column_id).column); + auto result = ColumnUInt8::create(); + auto& result_data = result->get_data(); + 
result_data.resize(count); + for (size_t row = 0; row < count; ++row) { + const size_t input_row = selector == nullptr ? row : (*selector)[row]; + result_data[row] = input.get_element(input_row) > _value; + } + result_column = std::move(result); + return Status::OK(); + } + + const std::string& expr_name() const override { return _expr_name; } + +private: + const int _column_id; + const int32_t _value; + const std::string _expr_name = "Int32GreaterThanExpr"; +}; + +class Int32SumGreaterThanExpr final : public VExpr { +public: + Int32SumGreaterThanExpr(int left_column_id, int right_column_id, int32_t value) + : VExpr(std::make_shared(), false), + _left_column_id(left_column_id), + _right_column_id(right_column_id), + _value(value) {} + + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override { + const auto& left_input = + assert_cast(*block->get_by_position(_left_column_id).column); + const auto& right_input = + assert_cast(*block->get_by_position(_right_column_id).column); + auto result = ColumnUInt8::create(); + auto& result_data = result->get_data(); + result_data.resize(count); + for (size_t row = 0; row < count; ++row) { + const size_t input_row = selector == nullptr ? 
row : (*selector)[row]; + result_data[row] = + left_input.get_element(input_row) + right_input.get_element(input_row) > _value; + } + result_column = std::move(result); + return Status::OK(); + } + + const std::string& expr_name() const override { return _expr_name; } + +private: + const int _left_column_id; + const int _right_column_id; + const int32_t _value; + const std::string _expr_name = "Int32SumGreaterThanExpr"; +}; + +VExprContextSPtr create_int32_greater_than_conjunct(int column_id, int32_t value) { + auto ctx = + VExprContext::create_shared(std::make_shared(column_id, value)); + ctx->_prepared = true; + ctx->_opened = true; + return ctx; +} + +VExprContextSPtr create_int32_sum_greater_than_conjunct(int left_column_id, int right_column_id, + int32_t value) { + auto ctx = VExprContext::create_shared( + std::make_shared(left_column_id, right_column_id, value)); + ctx->_prepared = true; + ctx->_opened = true; + return ctx; +} + +std::shared_ptr finish_array(arrow::ArrayBuilder* builder) { + std::shared_ptr array; + EXPECT_TRUE(builder->Finish(&array).ok()); + return array; +} + +std::shared_ptr build_int32_array(const std::vector& values) { + arrow::Int32Builder builder; + for (const auto value : values) { + EXPECT_TRUE(builder.Append(value).ok()); + } + return finish_array(&builder); +} + +std::shared_ptr build_string_array(const std::vector& values) { + arrow::StringBuilder builder; + for (const auto& value : values) { + EXPECT_TRUE(builder.Append(value).ok()); + } + return finish_array(&builder); +} + +void write_parquet_file(const std::string& file_path, int64_t row_group_size = ROW_COUNT) { + auto schema = arrow::schema({ + arrow::field("id", arrow::int32(), false), + arrow::field("value", arrow::utf8(), false), + }); + auto table = arrow::Table::Make(schema, + {build_int32_array({1, 2, 3, 4, 5}), + build_string_array({"one", "two", "three", "four", "five"})}); + + auto file_result = arrow::io::FileOutputStream::Open(file_path); + 
ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, + row_group_size, builder.build())); +} + +void write_int_pair_parquet_file(const std::string& file_path, int64_t row_group_size = ROW_COUNT) { + auto schema = arrow::schema({ + arrow::field("id", arrow::int32(), false), + arrow::field("score", arrow::int32(), false), + arrow::field("value", arrow::utf8(), false), + }); + auto table = arrow::Table::Make( + schema, {build_int32_array({1, 2, 3, 4, 5}), build_int32_array({1, 2, 3, 4, 5}), + build_string_array({"one", "two", "three", "four", "five"})}); + + auto file_result = arrow::io::FileOutputStream::Open(file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, + row_group_size, builder.build())); +} + +void write_dictionary_filter_parquet_file(const std::string& file_path) { + auto schema = arrow::schema({ + arrow::field("id", arrow::int32(), false), + arrow::field("value", arrow::utf8(), false), + }); + auto table = + arrow::Table::Make(schema, {build_int32_array({1, 2, 3, 4, 5, 6}), + build_string_array({"aa", "az", "lm", "lz", "za", "zz"})}); + + auto file_result = arrow::io::FileOutputStream::Open(file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr out = *file_result; + + 
::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + builder.enable_dictionary("value"); + builder.disable_dictionary("id"); + builder.disable_statistics(); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, 1, + builder.build())); +} + +void write_dictionary_edge_parquet_file(const std::string& file_path) { + auto schema = arrow::schema({ + arrow::field("id", arrow::int32(), false), + arrow::field("value", arrow::utf8(), false), + }); + auto table = arrow::Table::Make( + schema, + {build_int32_array({1, 2, 3, 4, 5, 6, 7, 8}), + build_string_array({"", "same", "other", "long-value", "", "tail", "same", "last"})}); + + auto file_result = arrow::io::FileOutputStream::Open(file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + builder.enable_dictionary("value"); + builder.disable_dictionary("id"); + builder.disable_statistics(); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, 2, + builder.build())); +} + +Block build_file_block(const std::vector& schema) { + Block block; + for (const auto& field : schema) { + block.insert({field.type->create_column(), field.type, field.name}); + } + return block; +} + +Block build_file_block_with_row_position(const std::vector& schema) { + auto block = build_file_block(schema); + const auto row_position_field = + parquet::ParquetColumnReaderFactory::row_position_schema_field(); + block.insert({row_position_field.type->create_column(), row_position_field.type, + 
row_position_field.name}); + return block; +} + +int64_t parquet_column_start_offset(const ::parquet::ColumnChunkMetaData& column_metadata) { + return column_metadata.has_dictionary_page() + ? static_cast(column_metadata.dictionary_page_offset()) + : static_cast(column_metadata.data_page_offset()); +} + +std::pair row_group_mid_range(const std::string& file_path, int row_group_idx) { + auto reader = ::parquet::ParquetFileReader::OpenFile(file_path, false); + auto metadata = reader->metadata(); + auto row_group_metadata = metadata->RowGroup(row_group_idx); + auto first_column = row_group_metadata->ColumnChunk(0); + auto last_column = row_group_metadata->ColumnChunk(row_group_metadata->num_columns() - 1); + const int64_t row_group_start_offset = parquet_column_start_offset(*first_column); + const int64_t row_group_end_offset = + parquet_column_start_offset(*last_column) + last_column->total_compressed_size(); + const int64_t row_group_mid_offset = + row_group_start_offset + (row_group_end_offset - row_group_start_offset) / 2; + return {row_group_mid_offset, 1}; +} + +class TestFileReader final : public reader::FileReader { +public: + TestFileReader(std::shared_ptr& system_properties, + std::unique_ptr& file_description, + std::shared_ptr io_ctx) + : reader::FileReader(system_properties, file_description, io_ctx, nullptr) {} + + Status get_schema(std::vector* file_schema) const override { + file_schema->clear(); + reader::SchemaField field; + field.id = 0; + field.name = "id"; + field.type = std::make_shared(); + file_schema->push_back(std::move(field)); + return Status::OK(); + } + + bool has_request() const { return _request != nullptr; } + + bool eof() const { return _eof; } + + bool has_io_context() const { return _io_ctx != nullptr; } + + long io_context_use_count() const { return _io_ctx.use_count(); } +}; + +TEST(FileReaderTest, OpenStoresRequestAndCloseClearsState) { + auto system_properties = std::make_shared(); + system_properties->system_type = 
TFileType::FILE_LOCAL; + auto file_description = std::make_unique(); + auto io_ctx = std::make_shared(); + TestFileReader reader(system_properties, file_description, io_ctx); + + auto request = std::make_unique(); + request->non_predicate_columns.push_back(0); + ASSERT_TRUE(reader.open(request).ok()); + EXPECT_EQ(request, nullptr); + EXPECT_TRUE(reader.has_request()); + + ASSERT_TRUE(reader.close().ok()); + EXPECT_FALSE(reader.has_request()); + EXPECT_TRUE(reader.eof()); +} + +TEST(FileReaderTest, CloseReleasesSharedIOContext) { + auto system_properties = std::make_shared(); + system_properties->system_type = TFileType::FILE_LOCAL; + auto file_description = std::make_unique(); + auto io_ctx = std::make_shared(); + std::weak_ptr weak_io_ctx = io_ctx; + TestFileReader reader(system_properties, file_description, io_ctx); + + EXPECT_TRUE(reader.has_io_context()); + EXPECT_EQ(reader.io_context_use_count(), 2); + io_ctx.reset(); + EXPECT_FALSE(weak_io_ctx.expired()); + EXPECT_EQ(reader.io_context_use_count(), 1); + + ASSERT_TRUE(reader.close().ok()); + EXPECT_FALSE(reader.has_io_context()); + EXPECT_TRUE(weak_io_ctx.expired()); +} + +TEST(TableColumnMapperTest, CreatesComplexProjectionForStructChildren) { + reader::SchemaField struct_field; + struct_field.id = 0; + struct_field.name = "s"; + struct_field.file_path = {0}; + reader::SchemaField a_field; + a_field.id = 0; + a_field.name = "a"; + a_field.type = std::make_shared(); + a_field.file_path = {0, 0}; + reader::SchemaField b_field; + b_field.id = 0; + b_field.name = "b"; + b_field.type = std::make_shared(); + b_field.file_path = {0, 1}; + struct_field.children = {a_field, b_field}; + struct_field.type = std::make_shared(DataTypes {a_field.type, b_field.type}, + Strings {"a", "b"}); + + reader::TableColumn table_child; + table_child.id = 101; + table_child.name = "b"; + table_child.type = b_field.type; + reader::TableColumn table_column; + table_column.id = 100; + table_column.name = "s"; + table_column.type = 
std::make_shared(DataTypes {b_field.type}, Strings {"b"}); + table_column.children = {table_child}; + + reader::TableColumnMapperOptions options; + options.mode = reader::TableColumnMappingMode::BY_NAME; + reader::TableColumnMapper mapper(options); + ASSERT_TRUE(mapper.create_mapping({table_column}, {}, {struct_field}).ok()); + + auto request = std::make_unique(); + ASSERT_TRUE(mapper.create_scan_request({}, {}, {table_column}, request.get()).ok()); + ASSERT_EQ(request->non_predicate_columns, std::vector({0})); + ASSERT_EQ(request->complex_projections.size(), 1); + const auto& projection = request->complex_projections.at(0); + EXPECT_EQ(projection.file_path, std::vector({0})); + ASSERT_FALSE(projection.project_all_children); + ASSERT_EQ(projection.children.size(), 1); + EXPECT_EQ(projection.children[0].file_path, std::vector({0, 1})); + + ASSERT_EQ(mapper.mappings().size(), 1); + const auto* projected_type = + assert_cast(mapper.mappings()[0].file_type.get()); + ASSERT_EQ(projected_type->get_elements().size(), 1); + EXPECT_EQ(projected_type->get_element_name(0), "b"); +} + +class NewParquetReaderTest : public testing::Test { +protected: + void SetUp() override { + _test_dir = std::filesystem::temp_directory_path() / "doris_new_parquet_reader_test"; + std::filesystem::remove_all(_test_dir); + std::filesystem::create_directories(_test_dir); + _file_path = (_test_dir / "reader.parquet").string(); + write_parquet_file(_file_path); + } + + void TearDown() override { std::filesystem::remove_all(_test_dir); } + + std::unique_ptr create_reader(int64_t range_start_offset = 0, + int64_t range_size = -1) const { + auto system_properties = std::make_shared(); + system_properties->system_type = TFileType::FILE_LOCAL; + auto file_description = std::make_unique(); + file_description->path = _file_path; + file_description->file_size = static_cast(std::filesystem::file_size(_file_path)); + file_description->range_start_offset = range_start_offset; + file_description->range_size = 
range_size; + return std::make_unique(system_properties, file_description, + nullptr, nullptr); + } + + std::filesystem::path _test_dir; + std::string _file_path; +}; + +TEST_F(NewParquetReaderTest, GetSchemaReturnsFileLocalColumns) { + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + ASSERT_EQ(schema.size(), 2); + EXPECT_EQ(schema[0].id, 0); + EXPECT_EQ(schema[0].name, "id"); + EXPECT_EQ(schema[0].type->get_primitive_type(), TYPE_INT); + EXPECT_EQ(schema[1].id, 1); + EXPECT_EQ(schema[1].name, "value"); + EXPECT_EQ(schema[1].type->get_primitive_type(), TYPE_STRING); +} + +TEST_F(NewParquetReaderTest, ReadSingleRowGroupThenEof) { + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + Block block = build_file_block(schema); + + auto request = std::make_unique(); + request->non_predicate_columns = {0, 1}; + ASSERT_TRUE(reader->open(request).ok()); + + size_t rows = 0; + bool eof = false; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + EXPECT_FALSE(eof); + ASSERT_EQ(rows, ROW_COUNT); + + const auto& ids = assert_cast(*block.get_by_position(0).column); + const auto& values = assert_cast(*block.get_by_position(1).column); + ASSERT_EQ(ids.size(), ROW_COUNT); + ASSERT_EQ(values.size(), ROW_COUNT); + EXPECT_EQ(ids.get_element(0), 1); + EXPECT_EQ(ids.get_element(4), 5); + EXPECT_EQ(values.get_data_at(0).to_string(), "one"); + EXPECT_EQ(values.get_data_at(4).to_string(), "five"); + + rows = 0; + eof = false; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + EXPECT_TRUE(eof); + EXPECT_EQ(rows, 0); +} + +TEST_F(NewParquetReaderTest, ReadMultipleRowGroups) { + write_parquet_file(_file_path, 2); + auto parquet_file_reader = 
::parquet::ParquetFileReader::OpenFile(_file_path, false); + ASSERT_EQ(parquet_file_reader->metadata()->num_row_groups(), 3); + + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + auto request = std::make_unique(); + request->non_predicate_columns = {0, 1}; + ASSERT_TRUE(reader->open(request).ok()); + + std::vector ids; + std::vector values; + bool eof = false; + while (!eof) { + Block block = build_file_block(schema); + size_t rows = 0; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + if (rows == 0) { + continue; + } + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& value_column = + assert_cast(*block.get_by_position(1).column); + for (size_t row = 0; row < rows; ++row) { + ids.push_back(id_column.get_element(row)); + values.push_back(value_column.get_data_at(row).to_string()); + } + } + + EXPECT_EQ(ids, std::vector({1, 2, 3, 4, 5})); + EXPECT_EQ(values, std::vector({"one", "two", "three", "four", "five"})); +} + +TEST_F(NewParquetReaderTest, ReadPredicateAndNonPredicateColumnsWithSelection) { + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + Block block = build_file_block(schema); + + auto request = std::make_unique(); + request->predicate_columns = {0}; + request->non_predicate_columns = {1}; + request->conjuncts.push_back(create_int32_greater_than_conjunct(0, 2)); + reader::FileColumnPredicateFilter column_filter; + column_filter.file_column_id = 0; + column_filter.predicates.push_back(create_comparison_predicate( + 0, "id", schema[0].type, Field::create_field(2), false)); + request->column_predicate_filters.push_back(std::move(column_filter)); + ASSERT_TRUE(reader->open(request).ok()); + + size_t rows = 
0; + bool eof = false; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + EXPECT_FALSE(eof); + ASSERT_EQ(rows, 3); + + const auto& ids = assert_cast(*block.get_by_position(0).column); + const auto& values = assert_cast(*block.get_by_position(1).column); + ASSERT_EQ(ids.size(), 3); + ASSERT_EQ(values.size(), 3); + EXPECT_EQ(ids.get_element(0), 3); + EXPECT_EQ(ids.get_element(1), 4); + EXPECT_EQ(ids.get_element(2), 5); + EXPECT_EQ(values.get_data_at(0).to_string(), "three"); + EXPECT_EQ(values.get_data_at(1).to_string(), "four"); + EXPECT_EQ(values.get_data_at(2).to_string(), "five"); + + rows = 0; + eof = false; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + EXPECT_TRUE(eof); + EXPECT_EQ(rows, 0); +} + +TEST_F(NewParquetReaderTest, ReadMultiPredicateColumnsBeforeExpressionFilter) { + write_int_pair_parquet_file(_file_path); + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + Block block = build_file_block(schema); + + auto request = std::make_unique(); + request->predicate_columns = {0, 1}; + request->non_predicate_columns = {}; + request->conjuncts.push_back(create_int32_sum_greater_than_conjunct(0, 1, 7)); + ASSERT_TRUE(reader->open(request).ok()); + + size_t rows = 0; + bool eof = false; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + EXPECT_FALSE(eof); + ASSERT_EQ(rows, 2); + + const auto& ids = assert_cast(*block.get_by_position(0).column); + const auto& scores = assert_cast(*block.get_by_position(1).column); + ASSERT_EQ(ids.size(), 2); + ASSERT_EQ(scores.size(), 2); + EXPECT_EQ(ids.get_element(0), 4); + EXPECT_EQ(ids.get_element(1), 5); + EXPECT_EQ(scores.get_element(0), 4); + EXPECT_EQ(scores.get_element(1), 5); +} + +TEST_F(NewParquetReaderTest, PredicateFiltersRowGroupsByStatistics) { + write_parquet_file(_file_path, 2); + auto parquet_file_reader = 
::parquet::ParquetFileReader::OpenFile(_file_path, false); + ASSERT_EQ(parquet_file_reader->metadata()->num_row_groups(), 3); + + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + auto request = std::make_unique(); + request->predicate_columns = {0}; + request->non_predicate_columns = {1}; + request->conjuncts.push_back(create_int32_greater_than_conjunct(0, 2)); + reader::FileColumnPredicateFilter column_filter; + column_filter.file_column_id = 0; + column_filter.predicates.push_back(create_comparison_predicate( + 0, "id", schema[0].type, Field::create_field(2), false)); + request->column_predicate_filters.push_back(std::move(column_filter)); + ASSERT_TRUE(reader->open(request).ok()); + + std::vector ids; + std::vector values; + bool eof = false; + while (!eof) { + Block block = build_file_block(schema); + size_t rows = 0; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + if (rows == 0) { + continue; + } + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& value_column = + assert_cast(*block.get_by_position(1).column); + for (size_t row = 0; row < rows; ++row) { + ids.push_back(id_column.get_element(row)); + values.push_back(value_column.get_data_at(row).to_string()); + } + } + + EXPECT_EQ(ids, std::vector({3, 4, 5})); + EXPECT_EQ(values, std::vector({"three", "four", "five"})); +} + +TEST_F(NewParquetReaderTest, PredicateFiltersRowGroupsByDictionary) { + write_dictionary_filter_parquet_file(_file_path); + auto parquet_file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); + ASSERT_EQ(parquet_file_reader->metadata()->num_row_groups(), 6); + for (int row_group_idx = 0; row_group_idx < 6; ++row_group_idx) { + auto row_group = parquet_file_reader->metadata()->RowGroup(row_group_idx); + ASSERT_NE(row_group, nullptr); + auto value_chunk = 
row_group->ColumnChunk(1); + ASSERT_NE(value_chunk, nullptr); + ASSERT_TRUE(value_chunk->has_dictionary_page()); + ASSERT_TRUE(value_chunk->statistics() == nullptr || + !value_chunk->statistics()->HasMinMax()); + } + + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + auto request = std::make_unique(); + request->predicate_columns = {1}; + request->non_predicate_columns = {0}; + reader::FileColumnPredicateFilter column_filter; + column_filter.file_column_id = 1; + column_filter.predicates.push_back(create_comparison_predicate( + 1, "value", schema[1].type, Field::create_field("lm"), false)); + request->column_predicate_filters.push_back(std::move(column_filter)); + ASSERT_TRUE(reader->open(request).ok()); + + std::vector ids; + std::vector values; + bool eof = false; + while (!eof) { + Block block = build_file_block(schema); + size_t rows = 0; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + if (rows == 0) { + continue; + } + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& value_column = + assert_cast(*block.get_by_position(1).column); + for (size_t row = 0; row < rows; ++row) { + ids.push_back(id_column.get_element(row)); + values.push_back(value_column.get_data_at(row).to_string()); + } + } + + EXPECT_EQ(ids, std::vector({3})); + EXPECT_EQ(values, std::vector({"lm"})); +} + +TEST_F(NewParquetReaderTest, InPredicateFiltersRowGroupsByDictionary) { + write_dictionary_filter_parquet_file(_file_path); + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + auto request = std::make_unique(); + request->predicate_columns = {1}; + request->non_predicate_columns = {0}; + auto set = build_set(); + 
set->insert(const_cast("az"), 2); + set->insert(const_cast("za"), 2); + reader::FileColumnPredicateFilter column_filter; + column_filter.file_column_id = 1; + column_filter.predicates.push_back(create_in_list_predicate( + 1, "value", schema[1].type, set, false)); + request->column_predicate_filters.push_back(std::move(column_filter)); + ASSERT_TRUE(reader->open(request).ok()); + + std::vector ids; + std::vector values; + bool eof = false; + while (!eof) { + Block block = build_file_block(schema); + size_t rows = 0; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + if (rows == 0) { + continue; + } + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& value_column = + assert_cast(*block.get_by_position(1).column); + for (size_t row = 0; row < rows; ++row) { + ids.push_back(id_column.get_element(row)); + values.push_back(value_column.get_data_at(row).to_string()); + } + } + + EXPECT_EQ(ids, std::vector({2, 5})); + EXPECT_EQ(values, std::vector({"az", "za"})); +} + +TEST_F(NewParquetReaderTest, DictionaryPageV2StringEdgesSurviveSelection) { + write_dictionary_edge_parquet_file(_file_path); + auto parquet_file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); + ASSERT_EQ(parquet_file_reader->metadata()->num_row_groups(), 4); + for (int row_group_idx = 0; row_group_idx < 4; ++row_group_idx) { + auto row_group = parquet_file_reader->metadata()->RowGroup(row_group_idx); + ASSERT_NE(row_group, nullptr); + ASSERT_TRUE(row_group->ColumnChunk(1)->has_dictionary_page()); + } + + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + auto request = std::make_unique(); + request->predicate_columns = {1}; + request->non_predicate_columns = {0}; + auto set = build_set(); + set->insert(const_cast(""), 0); + set->insert(const_cast("same"), 4); + 
reader::FileColumnPredicateFilter column_filter; + column_filter.file_column_id = 1; + column_filter.predicates.push_back(create_in_list_predicate( + 1, "value", schema[1].type, set, false)); + request->column_predicate_filters.push_back(std::move(column_filter)); + ASSERT_TRUE(reader->open(request).ok()); + + std::vector ids; + std::vector values; + bool eof = false; + while (!eof) { + Block block = build_file_block(schema); + size_t rows = 0; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + if (rows == 0) { + continue; + } + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& value_column = + assert_cast(*block.get_by_position(1).column); + for (size_t row = 0; row < rows; ++row) { + ids.push_back(id_column.get_element(row)); + values.push_back(value_column.get_data_at(row).to_string()); + } + } + + EXPECT_EQ(ids, std::vector({1, 2, 5, 6, 7, 8})); + EXPECT_EQ(values, std::vector({"", "same", "", "tail", "same", "last"})); +} + +TEST_F(NewParquetReaderTest, StatisticsPruningSkipsPrefixRowGroupsAndReadsLaterGroups) { + write_parquet_file(_file_path, 1); + auto parquet_file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); + ASSERT_EQ(parquet_file_reader->metadata()->num_row_groups(), 5); + + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + auto request = std::make_unique(); + request->predicate_columns = {0}; + request->non_predicate_columns = {1}; + reader::FileColumnPredicateFilter column_filter; + column_filter.file_column_id = 0; + column_filter.predicates.push_back(create_comparison_predicate( + 0, "id", schema[0].type, Field::create_field(4), false)); + request->column_predicate_filters.push_back(std::move(column_filter)); + ASSERT_TRUE(reader->open(request).ok()); + + std::vector ids; + std::vector values; + bool eof = false; + while (!eof) { 
+ Block block = build_file_block(schema); + size_t rows = 0; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + if (rows == 0) { + continue; + } + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& value_column = + assert_cast(*block.get_by_position(1).column); + for (size_t row = 0; row < rows; ++row) { + ids.push_back(id_column.get_element(row)); + values.push_back(value_column.get_data_at(row).to_string()); + } + } + + EXPECT_EQ(ids, std::vector({4, 5})); + EXPECT_EQ(values, std::vector({"four", "five"})); +} + +TEST_F(NewParquetReaderTest, RowPositionReaderReturnsFileLocalPositions) { + write_parquet_file(_file_path, 2); + auto parquet_file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); + ASSERT_EQ(parquet_file_reader->metadata()->num_row_groups(), 3); + + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + auto request = std::make_unique(); + request->non_predicate_columns = {parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID, + 0}; + request->column_positions = { + {0, 0}, + {parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID, 2}, + }; + ASSERT_TRUE(reader->open(request).ok()); + + std::vector row_positions; + std::vector ids; + bool eof = false; + while (!eof) { + Block block = build_file_block_with_row_position(schema); + size_t rows = 0; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + if (rows == 0) { + continue; + } + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& row_position_column = + assert_cast(*block.get_by_position(2).column); + for (size_t row = 0; row < rows; ++row) { + ids.push_back(id_column.get_element(row)); + row_positions.push_back(row_position_column.get_element(row)); + } + } + + EXPECT_EQ(ids, std::vector({1, 2, 3, 4, 5})); + EXPECT_EQ(row_positions, 
std::vector({0, 1, 2, 3, 4})); +} + +TEST_F(NewParquetReaderTest, RowPositionReaderKeepsPositionsAfterSelection) { + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + Block block = build_file_block_with_row_position(schema); + + auto request = std::make_unique(); + request->predicate_columns = {0}; + request->non_predicate_columns = {parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID}; + request->column_positions = { + {0, 0}, + {parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID, 2}, + }; + request->conjuncts.push_back(create_int32_greater_than_conjunct(0, 2)); + ASSERT_TRUE(reader->open(request).ok()); + + size_t rows = 0; + bool eof = false; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + EXPECT_FALSE(eof); + ASSERT_EQ(rows, 3); + + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& row_position_column = + assert_cast(*block.get_by_position(2).column); + EXPECT_EQ(id_column.get_element(0), 3); + EXPECT_EQ(id_column.get_element(1), 4); + EXPECT_EQ(id_column.get_element(2), 5); + EXPECT_EQ(row_position_column.get_element(0), 2); + EXPECT_EQ(row_position_column.get_element(1), 3); + EXPECT_EQ(row_position_column.get_element(2), 4); +} + +TEST_F(NewParquetReaderTest, DeletePredicateFiltersRowPositions) { + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + Block block = build_file_block_with_row_position(schema); + + static const std::vector deleted_rows {1, 3}; + auto delete_predicate = std::make_shared(deleted_rows); + delete_predicate->add_child(TableSlotRef::create_shared( + 2, 2, -1, std::make_shared(), + parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_NAME)); + + auto request = 
std::make_unique(); + request->predicate_columns = {parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID}; + request->non_predicate_columns = {0}; + request->column_positions = { + {0, 0}, + {parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID, 2}, + }; + request->delete_conjuncts.push_back(VExprContext::create_shared(std::move(delete_predicate))); + ASSERT_TRUE(reader->open(request).ok()); + + size_t rows = 0; + bool eof = false; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + EXPECT_FALSE(eof); + ASSERT_EQ(rows, 3); + + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& row_position_column = + assert_cast(*block.get_by_position(2).column); + EXPECT_EQ(id_column.get_element(0), 1); + EXPECT_EQ(id_column.get_element(1), 3); + EXPECT_EQ(id_column.get_element(2), 5); + EXPECT_EQ(row_position_column.get_element(0), 0); + EXPECT_EQ(row_position_column.get_element(1), 2); + EXPECT_EQ(row_position_column.get_element(2), 4); +} + +TEST_F(NewParquetReaderTest, QueryPredicateAndDeletePredicateFilterRowPositions) { + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + Block block = build_file_block_with_row_position(schema); + + static const std::vector deleted_rows {3}; + auto delete_predicate = std::make_shared(deleted_rows); + delete_predicate->add_child(TableSlotRef::create_shared( + 2, 2, -1, std::make_shared(), + parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_NAME)); + + auto request = std::make_unique(); + request->predicate_columns = {0, parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID}; + request->non_predicate_columns = {}; + request->column_positions = { + {0, 0}, + {parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID, 2}, + }; + request->conjuncts.push_back(create_int32_greater_than_conjunct(0, 2)); + 
request->delete_conjuncts.push_back(VExprContext::create_shared(std::move(delete_predicate))); + ASSERT_TRUE(reader->open(request).ok()); + + size_t rows = 0; + bool eof = false; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + EXPECT_FALSE(eof); + ASSERT_EQ(rows, 2); + + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& row_position_column = + assert_cast(*block.get_by_position(2).column); + EXPECT_EQ(id_column.get_element(0), 3); + EXPECT_EQ(id_column.get_element(1), 5); + EXPECT_EQ(row_position_column.get_element(0), 2); + EXPECT_EQ(row_position_column.get_element(1), 4); +} + +TEST_F(NewParquetReaderTest, RowPositionReaderUsesFileLocalPositionsForScanRange) { + write_parquet_file(_file_path, 2); + auto parquet_file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); + ASSERT_EQ(parquet_file_reader->metadata()->num_row_groups(), 3); + + const std::vector> expected_ids = {{1, 2}, {3, 4}, {5}}; + const std::vector> expected_row_positions = {{0, 1}, {2, 3}, {4}}; + for (int row_group_idx = 0; row_group_idx < 3; ++row_group_idx) { + const auto [range_start_offset, range_size] = + row_group_mid_range(_file_path, row_group_idx); + auto reader = create_reader(range_start_offset, range_size); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + auto request = std::make_unique(); + request->non_predicate_columns = { + parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID, 0}; + request->column_positions = { + {0, 0}, + {parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID, 2}, + }; + ASSERT_TRUE(reader->open(request).ok()); + + std::vector ids; + std::vector row_positions; + bool eof = false; + while (!eof) { + Block block = build_file_block_with_row_position(schema); + size_t rows = 0; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + if (rows == 0) { + continue; 
+ } + const auto& id_column = + assert_cast(*block.get_by_position(0).column); + const auto& row_position_column = + assert_cast(*block.get_by_position(2).column); + for (size_t row = 0; row < rows; ++row) { + ids.push_back(id_column.get_element(row)); + row_positions.push_back(row_position_column.get_element(row)); + } + } + + EXPECT_EQ(ids, expected_ids[row_group_idx]); + EXPECT_EQ(row_positions, expected_row_positions[row_group_idx]); + } +} + +} // namespace +} // namespace doris diff --git a/be/test/format/reader/expr/cast_test.cpp b/be/test/format/reader/expr/cast_test.cpp new file mode 100644 index 00000000000000..93858dbf53ef85 --- /dev/null +++ b/be/test/format/reader/expr/cast_test.cpp @@ -0,0 +1,484 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "format/reader/expr/cast.h" + +#include + +#include +#include +#include + +#include "common/status.h" +#include "core/block/block.h" +#include "core/column/column_nullable.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_number.h" +#include "core/data_type/data_type_string.h" +#include "core/field.h" +#include "exprs/vexpr_context.h" +#include "format/reader/column_mapper.h" +#include "format/reader/expr/literal.h" +#include "format/reader/expr/slot_ref.h" +#include "format/reader/file_reader.h" +#include "format/reader/table_reader.h" +#include "runtime/descriptors.h" +#include "testutil/column_helper.h" +#include "testutil/mock/mock_runtime_state.h" + +namespace doris { + +class CastTest : public testing::Test { +protected: + void SetUp() override { state.set_enable_strict_cast(true); } + + static VExprContextSPtr create_context(const DataTypePtr& return_type, + const DataTypePtr& child_type, int child_column_id = 0) { + auto cast = Cast::create_shared(return_type); + cast->add_child(TableSlotRef::create_shared(child_column_id, child_column_id, -1, + child_type, "source_column")); + return VExprContext::create_shared(cast); + } + + Status prepare_open_execute(VExprContext* context, Block* block, int* result_column_id) { + RETURN_IF_ERROR(context->prepare(&state, RowDescriptor())); + RETURN_IF_ERROR(context->open(&state)); + return context->execute(block, result_column_id); + } + + MockRuntimeState state; +}; + +class Int64ChildGreaterThanExpr final : public VExpr { +public: + explicit Int64ChildGreaterThanExpr(int64_t value) + : VExpr(std::make_shared(), false), _value(value) {} + + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override { + ColumnPtr child_column; + RETURN_IF_ERROR( + get_child(0)->execute_column(context, block, selector, count, child_column)); + const auto& 
input = assert_cast(*child_column); + auto result = ColumnUInt8::create(); + auto& result_data = result->get_data(); + result_data.resize(count); + for (size_t row = 0; row < count; ++row) { + result_data[row] = input.get_element(row) > _value; + } + result_column = std::move(result); + return Status::OK(); + } + + const std::string& expr_name() const override { return _expr_name; } + +private: + const int64_t _value; + const std::string _expr_name = "Int64ChildGreaterThanExpr"; +}; + +TEST_F(CastTest, CastIntSlotToBigInt) { + auto source_type = std::make_shared(); + auto return_type = std::make_shared(); + auto context = create_context(return_type, source_type); + Block block; + block.insert(ColumnHelper::create_column_with_name({1, -2, 3})); + + int result_column_id = -1; + auto status = prepare_open_execute(context.get(), &block, &result_column_id); + ASSERT_TRUE(status.ok()) << status; + + ASSERT_EQ(result_column_id, 1); + ASSERT_EQ(block.columns(), 2); + EXPECT_EQ(block.get_by_position(result_column_id).type, return_type); + const auto& result_column = + assert_cast(*block.get_by_position(result_column_id).column); + EXPECT_EQ(result_column.get_data()[0], 1); + EXPECT_EQ(result_column.get_data()[1], -2); + EXPECT_EQ(result_column.get_data()[2], 3); + + context->close(); +} + +TEST_F(CastTest, CastStringSlotToNullableInt) { + state.set_enable_strict_cast(false); + auto source_type = std::make_shared(); + auto return_type = std::make_shared(std::make_shared()); + auto context = create_context(return_type, source_type); + Block block; + block.insert(ColumnHelper::create_column_with_name({"10", "bad", "-3"})); + + int result_column_id = -1; + auto status = prepare_open_execute(context.get(), &block, &result_column_id); + ASSERT_TRUE(status.ok()) << status; + + const auto& nullable_column = + assert_cast(*block.get_by_position(result_column_id).column); + const auto& result_column = + assert_cast(nullable_column.get_nested_column()); + const auto& null_map = 
nullable_column.get_null_map_data(); + EXPECT_EQ(result_column.get_data()[0], 10); + EXPECT_EQ(result_column.get_data()[2], -3); + EXPECT_EQ(null_map[0], 0); + EXPECT_EQ(null_map[1], 1); + EXPECT_EQ(null_map[2], 0); + + context->close(); +} + +TEST_F(CastTest, CastLiteralToString) { + auto source_type = std::make_shared(); + auto return_type = std::make_shared(); + auto cast = Cast::create_shared(return_type); + cast->add_child(TableLiteral::create_shared(source_type, Field::create_field(123))); + auto context = VExprContext::create_shared(cast); + Block block; + block.insert(ColumnHelper::create_column_with_name({1, 2, 3})); + + int result_column_id = -1; + auto status = prepare_open_execute(context.get(), &block, &result_column_id); + ASSERT_TRUE(status.ok()) << status; + + const auto& result = block.get_by_position(result_column_id); + EXPECT_EQ(result.type->to_string(*result.column, 0), "123"); + EXPECT_EQ(result.type->to_string(*result.column, 1), "123"); + EXPECT_EQ(result.type->to_string(*result.column, 2), "123"); + + context->close(); +} + +TEST_F(CastTest, EmptyBlockAppendsEmptyResultColumn) { + auto source_type = std::make_shared(); + auto return_type = std::make_shared(); + auto context = create_context(return_type, source_type); + Block block; + block.insert(ColumnHelper::create_column_with_name({})); + + int result_column_id = -1; + auto status = prepare_open_execute(context.get(), &block, &result_column_id); + ASSERT_TRUE(status.ok()) << status; + + ASSERT_EQ(result_column_id, 1); + EXPECT_EQ(block.get_by_position(result_column_id).column->size(), 0); + + context->close(); +} + +TEST_F(CastTest, PrepareRejectsMissingChild) { + auto cast = Cast::create_shared(std::make_shared()); + VExprContext context(cast); + + auto status = context.prepare(&state, RowDescriptor()); + ASSERT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find("exactly 1 child expr"), std::string::npos); +} + +TEST_F(CastTest, PrepareRejectsMultipleChildren) { + auto child_type = 
std::make_shared(); + auto cast = Cast::create_shared(std::make_shared()); + cast->add_child(TableSlotRef::create_shared(0, 0, -1, child_type, "c0")); + cast->add_child(TableSlotRef::create_shared(1, 1, -1, child_type, "c1")); + VExprContext context(cast); + + auto status = context.prepare(&state, RowDescriptor()); + ASSERT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find("exactly 1 child expr"), std::string::npos); +} + +TEST_F(CastTest, ColumnMapperBuildsCastProjectionForTypeMismatch) { + reader::TableColumnMapper mapper; + reader::TableColumn table_column; + table_column.id = 7; + table_column.name = "value"; + table_column.type = std::make_shared(); + std::vector projected_columns {table_column}; + + reader::SchemaField file_field; + file_field.id = 0; + file_field.name = "value"; + file_field.type = std::make_shared(); + std::vector file_schema {file_field}; + + auto status = mapper.create_mapping(projected_columns, {}, file_schema); + ASSERT_TRUE(status.ok()) << status; + ASSERT_EQ(mapper.mappings().size(), 1); + reader::FileScanRequest file_request; + status = mapper.create_scan_request({}, {}, projected_columns, &file_request); + ASSERT_TRUE(status.ok()) << status; + const auto& mapping = mapper.mappings()[0]; + EXPECT_FALSE(mapping.is_trivial); + ASSERT_NE(mapping.projection, nullptr); + + Block block; + block.insert(ColumnHelper::create_column_with_name({11, 22})); + int result_column_id = -1; + status = prepare_open_execute(mapping.projection.get(), &block, &result_column_id); + ASSERT_TRUE(status.ok()) << status; + + const auto& result_column = + assert_cast(*block.get_by_position(result_column_id).column); + EXPECT_EQ(result_column.get_data()[0], 11); + EXPECT_EQ(result_column.get_data()[1], 22); + + mapping.projection->close(); +} + +TEST_F(CastTest, ColumnMapperTreatsEquivalentTypesAsTrivial) { + reader::TableColumnMapper mapper; + reader::TableColumn table_column; + table_column.id = 7; + table_column.name = "value"; + table_column.type = 
std::make_shared(); + std::vector projected_columns {table_column}; + + reader::SchemaField file_field; + file_field.id = 0; + file_field.name = "value"; + file_field.type = std::make_shared(); + std::vector file_schema {file_field}; + + auto status = mapper.create_mapping(projected_columns, {}, file_schema); + ASSERT_TRUE(status.ok()) << status; + ASSERT_EQ(mapper.mappings().size(), 1); + EXPECT_TRUE(mapper.mappings()[0].is_trivial); +} + +TEST_F(CastTest, ColumnMapperBuildsCastFilterForTypeMismatch) { + reader::TableColumnMapper mapper; + reader::TableColumn table_column; + table_column.id = 7; + table_column.name = "value"; + table_column.type = std::make_shared(); + std::vector projected_columns {table_column}; + + reader::SchemaField file_field; + file_field.id = 0; + file_field.name = "value"; + file_field.type = std::make_shared(); + std::vector file_schema {file_field}; + + auto status = mapper.create_mapping(projected_columns, {}, file_schema); + ASSERT_TRUE(status.ok()) << status; + + auto predicate = std::make_shared(15); + predicate->add_child(TableSlotRef::create_shared(7, 7, -1, table_column.type, "value")); + reader::TableFilter table_filter; + table_filter.conjunct = VExprContext::create_shared(predicate); + table_filter.slot_ids = {7}; + + reader::FileScanRequest file_request; + ASSERT_TRUE( + mapper.create_scan_request({table_filter}, {}, projected_columns, &file_request).ok()); + ASSERT_EQ(file_request.conjuncts.size(), 1); + ASSERT_EQ(file_request.predicate_columns, std::vector({0})); + const auto& localized_expr = file_request.conjuncts[0]->root(); + ASSERT_EQ(localized_expr->get_num_children(), 1); + const auto& localized_child = localized_expr->children()[0]; + ASSERT_NE(dynamic_cast(localized_child.get()), nullptr); + ASSERT_EQ(localized_child->get_num_children(), 1); + const auto* localized_slot = + assert_cast(localized_child->children()[0].get()); + EXPECT_EQ(localized_slot->column_id(), 0); + 
EXPECT_TRUE(localized_slot->data_type()->equals(*file_field.type)); + EXPECT_TRUE(localized_child->data_type()->equals(*table_column.type)); + + Block block; + block.insert(ColumnHelper::create_column_with_name({11, 22})); + auto* conjunct = file_request.conjuncts[0].get(); + status = conjunct->prepare(&state, RowDescriptor()); + ASSERT_TRUE(status.ok()) << status; + status = conjunct->open(&state); + ASSERT_TRUE(status.ok()) << status; + IColumn::Filter filter(block.rows(), 1); + bool can_filter_all = false; + status = conjunct->execute_filter(&block, filter.data(), block.rows(), false, &can_filter_all); + ASSERT_TRUE(status.ok()) << status; + EXPECT_FALSE(can_filter_all); + ASSERT_EQ(filter.size(), 2); + EXPECT_EQ(filter[0], 0); + EXPECT_EQ(filter[1], 1); + + file_request.conjuncts[0]->close(); +} + +TEST_F(CastTest, ColumnMapperDoesNotNestCastFilterAcrossScanRequests) { + reader::TableColumnMapper mapper; + reader::TableColumn table_column; + table_column.id = 7; + table_column.name = "value"; + table_column.type = std::make_shared(); + std::vector projected_columns {table_column}; + + reader::SchemaField file_field; + file_field.id = 0; + file_field.name = "value"; + file_field.type = std::make_shared(); + std::vector file_schema {file_field}; + + auto status = mapper.create_mapping(projected_columns, {}, file_schema); + ASSERT_TRUE(status.ok()) << status; + + auto predicate = std::make_shared(15); + predicate->add_child(TableSlotRef::create_shared(7, 7, -1, table_column.type, "value")); + reader::TableFilter table_filter; + table_filter.conjunct = VExprContext::create_shared(predicate); + table_filter.slot_ids = {7}; + + reader::FileScanRequest first_request; + ASSERT_TRUE( + mapper.create_scan_request({table_filter}, {}, projected_columns, &first_request).ok()); + reader::FileScanRequest second_request; + ASSERT_TRUE(mapper.create_scan_request({table_filter}, {}, projected_columns, &second_request) + .ok()); + + ASSERT_EQ(second_request.conjuncts.size(), 1); 
+ const auto& localized_expr = second_request.conjuncts[0]->root(); + ASSERT_EQ(localized_expr->get_num_children(), 1); + const auto& localized_child = localized_expr->children()[0]; + ASSERT_NE(dynamic_cast(localized_child.get()), nullptr); + ASSERT_EQ(localized_child->get_num_children(), 1); + const auto* localized_slot = + assert_cast(localized_child->children()[0].get()); + EXPECT_EQ(localized_slot->column_id(), 0); +} + +TEST_F(CastTest, ColumnMapperRewritesPreviousCastFilterToMatchingSplitType) { + reader::TableColumn table_column; + table_column.id = 7; + table_column.name = "value"; + table_column.type = std::make_shared(); + std::vector projected_columns {table_column}; + + auto predicate = std::make_shared(15); + predicate->add_child(TableSlotRef::create_shared(7, 7, -1, table_column.type, "value")); + reader::TableFilter table_filter; + table_filter.conjunct = VExprContext::create_shared(predicate); + table_filter.slot_ids = {7}; + + reader::SchemaField int_file_field; + int_file_field.id = 0; + int_file_field.name = "value"; + int_file_field.type = std::make_shared(); + + reader::TableColumnMapper int_mapper; + ASSERT_TRUE(int_mapper.create_mapping(projected_columns, {}, {int_file_field}).ok()); + reader::FileScanRequest int_request; + ASSERT_TRUE(int_mapper.create_scan_request({table_filter}, {}, projected_columns, &int_request) + .ok()); + + const auto& int_localized_expr = int_request.conjuncts[0]->root(); + ASSERT_EQ(int_localized_expr->get_num_children(), 1); + ASSERT_NE(dynamic_cast(int_localized_expr->children()[0].get()), nullptr); + + reader::SchemaField bigint_file_field; + bigint_file_field.id = 0; + bigint_file_field.name = "value"; + bigint_file_field.type = std::make_shared(); + + reader::TableColumnMapper bigint_mapper; + ASSERT_TRUE(bigint_mapper.create_mapping(projected_columns, {}, {bigint_file_field}).ok()); + reader::FileScanRequest bigint_request; + ASSERT_TRUE(bigint_mapper + .create_scan_request({table_filter}, {}, 
projected_columns, &bigint_request) + .ok()); + + const auto& bigint_localized_expr = bigint_request.conjuncts[0]->root(); + ASSERT_EQ(bigint_localized_expr->get_num_children(), 1); + const auto& bigint_localized_child = bigint_localized_expr->children()[0]; + const auto* localized_slot = assert_cast(bigint_localized_child.get()); + EXPECT_EQ(localized_slot->column_id(), 0); + EXPECT_TRUE(localized_slot->data_type()->equals(*bigint_file_field.type)); + + Block block; + block.insert(ColumnHelper::create_column_with_name({11, 22})); + auto* conjunct = bigint_request.conjuncts[0].get(); + auto status = conjunct->prepare(&state, RowDescriptor()); + ASSERT_TRUE(status.ok()) << status; + status = conjunct->open(&state); + ASSERT_TRUE(status.ok()) << status; + IColumn::Filter filter(block.rows(), 1); + bool can_filter_all = false; + status = conjunct->execute_filter(&block, filter.data(), block.rows(), false, &can_filter_all); + ASSERT_TRUE(status.ok()) << status; + EXPECT_FALSE(can_filter_all); + ASSERT_EQ(filter.size(), 2); + EXPECT_EQ(filter[0], 0); + EXPECT_EQ(filter[1], 1); + conjunct->close(); +} + +TEST_F(CastTest, ColumnMapperKeepsTableSlotIdWhenFileBlockPositionChanges) { + reader::TableColumn table_column; + table_column.id = 7; + table_column.name = "value"; + table_column.type = std::make_shared(); + std::vector projected_columns {table_column}; + + reader::SchemaField file_field; + file_field.id = 10; + file_field.name = "value"; + file_field.type = std::make_shared(); + + reader::TableColumnMapper mapper; + ASSERT_TRUE(mapper.create_mapping(projected_columns, {}, {file_field}).ok()); + + auto predicate = std::make_shared(15); + predicate->add_child(TableSlotRef::create_shared(7, 7, -1, table_column.type, "value")); + reader::TableFilter table_filter; + table_filter.conjunct = VExprContext::create_shared(predicate); + table_filter.slot_ids = {7}; + + reader::FileScanRequest first_request; + ASSERT_TRUE(mapper.localize_filters({table_filter}, {}, 
&first_request).ok()); + ASSERT_EQ(first_request.conjuncts.size(), 1); + const auto* first_slot = assert_cast( + first_request.conjuncts[0]->root()->children()[0].get()); + EXPECT_EQ(first_slot->slot_id(), 7); + EXPECT_EQ(first_slot->column_id(), 0); + + reader::FileScanRequest second_request; + second_request.column_positions.emplace(9, 0); + second_request.column_positions.emplace(10, 1); + second_request.non_predicate_columns.push_back(9); + ASSERT_TRUE(mapper.localize_filters({table_filter}, {}, &second_request).ok()); + ASSERT_EQ(second_request.conjuncts.size(), 1); + const auto* second_slot = assert_cast( + second_request.conjuncts[0]->root()->children()[0].get()); + EXPECT_EQ(second_slot->slot_id(), 7); + EXPECT_EQ(second_slot->column_id(), 1); + + Block block; + block.insert(ColumnHelper::create_column_with_name({100, 100})); + block.insert(ColumnHelper::create_column_with_name({11, 22})); + auto* conjunct = second_request.conjuncts[0].get(); + auto status = conjunct->prepare(&state, RowDescriptor()); + ASSERT_TRUE(status.ok()) << status; + status = conjunct->open(&state); + ASSERT_TRUE(status.ok()) << status; + IColumn::Filter filter(block.rows(), 1); + bool can_filter_all = false; + status = conjunct->execute_filter(&block, filter.data(), block.rows(), false, &can_filter_all); + ASSERT_TRUE(status.ok()) << status; + EXPECT_FALSE(can_filter_all); + ASSERT_EQ(filter.size(), 2); + EXPECT_EQ(filter[0], 0); + EXPECT_EQ(filter[1], 1); + conjunct->close(); +} + +} // namespace doris diff --git a/be/test/format/reader/expr/delete_predicate_test.cpp b/be/test/format/reader/expr/delete_predicate_test.cpp new file mode 100644 index 00000000000000..9d9f7387a2267a --- /dev/null +++ b/be/test/format/reader/expr/delete_predicate_test.cpp @@ -0,0 +1,155 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/reader/expr/delete_predicate.h" + +#include + +#include +#include +#include + +#include "common/status.h" +#include "core/block/block.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_number.h" +#include "exprs/vexpr_context.h" +#include "runtime/descriptors.h" +#include "testutil/mock/mock_slot_ref.h" + +namespace doris { + +class DeletePredicateTest : public testing::Test { +protected: + static Block make_block(const std::vector& row_ids) { + auto column = ColumnInt64::create(); + for (auto row_id : row_ids) { + column->insert_value(row_id); + } + + Block block; + block.insert({std::move(column), std::make_shared(), "row_id"}); + return block; + } + + static std::vector result_column_data(const Block& block, int result_column_id) { + const auto& result_column = + assert_cast(*block.get_by_position(result_column_id).column); + return {result_column.get_data().begin(), result_column.get_data().end()}; + } + + static Status execute_delete_predicate(const std::vector& deleted_rows, Block* block, + int* result_column_id) { + auto delete_predicate = std::make_shared(deleted_rows); + delete_predicate->_open_finished = true; + delete_predicate->add_child( + std::make_shared(0, std::make_shared())); + + VExprContext 
context(delete_predicate); + return delete_predicate->execute(&context, block, result_column_id); + } +}; + +TEST_F(DeletePredicateTest, MatchDeletedRowsInInputRange) { + const std::vector deleted_rows {-3, 1, 4, 8, 12, 20}; + auto block = make_block({0, 1, 2, 3, 4, 5, 8, 12}); + + int result_column_id = -1; + auto status = execute_delete_predicate(deleted_rows, &block, &result_column_id); + ASSERT_TRUE(status.ok()) << status; + + EXPECT_EQ(result_column_id, 1); + EXPECT_EQ(result_column_data(block, result_column_id), + std::vector({0, 1, 0, 0, 1, 0, 1, 1})); +} + +TEST_F(DeletePredicateTest, EmptyDeletedRowsReturnAllFalse) { + const std::vector deleted_rows; + auto block = make_block({1, 2, 3}); + + int result_column_id = -1; + auto status = execute_delete_predicate(deleted_rows, &block, &result_column_id); + ASSERT_TRUE(status.ok()) << status; + + EXPECT_EQ(result_column_data(block, result_column_id), std::vector({0, 0, 0})); +} + +TEST_F(DeletePredicateTest, DeletedRowsOutsideInputRangeReturnAllFalse) { + const std::vector deleted_rows {-10, -1, 10, 11}; + auto block = make_block({1, 2, 3}); + + int result_column_id = -1; + auto status = execute_delete_predicate(deleted_rows, &block, &result_column_id); + ASSERT_TRUE(status.ok()) << status; + + EXPECT_EQ(result_column_data(block, result_column_id), std::vector({0, 0, 0})); +} + +TEST_F(DeletePredicateTest, EmptyBlockDoesNotAppendResultColumn) { + const std::vector deleted_rows {1, 2, 3}; + Block block; + + int result_column_id = -1; + auto status = execute_delete_predicate(deleted_rows, &block, &result_column_id); + ASSERT_TRUE(status.ok()) << status; + + EXPECT_EQ(block.columns(), 0); + EXPECT_EQ(result_column_id, -1); +} + +TEST_F(DeletePredicateTest, MissingRowIdChildReturnsError) { + const std::vector deleted_rows {1}; + auto block = make_block({1}); + auto delete_predicate = std::make_shared(deleted_rows); + delete_predicate->_open_finished = true; + VExprContext context(delete_predicate); + + int 
result_column_id = -1; + auto status = delete_predicate->execute(&context, &block, &result_column_id); + ASSERT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find("exactly 1 child expr"), std::string::npos); +} + +TEST_F(DeletePredicateTest, ExecuteColumnImplReturnsError) { + const std::vector deleted_rows {1}; + DeletePredicate delete_predicate(deleted_rows); + VExprContext context(std::make_shared(deleted_rows)); + ColumnPtr result_column; + + auto status = + delete_predicate.execute_column_impl(&context, nullptr, nullptr, 0, result_column); + ASSERT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find("DeletePredicate::execute_column_impl"), std::string::npos); +} + +TEST_F(DeletePredicateTest, LifecycleAndDebugString) { + const std::vector deleted_rows {1}; + DeletePredicate delete_predicate(deleted_rows); + VExprContext context(std::make_shared(deleted_rows)); + RowDescriptor row_desc; + + auto status = delete_predicate.prepare(nullptr, row_desc, &context); + ASSERT_TRUE(status.ok()) << status; + EXPECT_EQ(delete_predicate.expr_name(), "DeletePredicate"); + EXPECT_EQ(delete_predicate.debug_string(), "DeletePredicate"); + + status = delete_predicate.open(nullptr, &context, FunctionContext::THREAD_LOCAL); + ASSERT_TRUE(status.ok()) << status; + delete_predicate.close(&context, FunctionContext::THREAD_LOCAL); +} + +} // namespace doris diff --git a/be/test/format/reader/expr/equality_delete_predicate_test.cpp b/be/test/format/reader/expr/equality_delete_predicate_test.cpp new file mode 100644 index 00000000000000..07ff0f78f81e88 --- /dev/null +++ b/be/test/format/reader/expr/equality_delete_predicate_test.cpp @@ -0,0 +1,181 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/reader/expr/equality_delete_predicate.h" + +#include + +#include +#include +#include +#include + +#include "common/status.h" +#include "core/assert_cast.h" +#include "core/block/block.h" +#include "core/column/column_nullable.h" +#include "core/column/column_string.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_number.h" +#include "core/data_type/data_type_string.h" +#include "exprs/vexpr_context.h" +#include "format/reader/expr/cast.h" +#include "runtime/descriptors.h" +#include "testutil/column_helper.h" +#include "testutil/mock/mock_runtime_state.h" +#include "testutil/mock/mock_slot_ref.h" + +namespace doris { + +class EqualityDeletePredicateTest : public testing::Test { +protected: + static ColumnWithTypeAndName make_nullable_int_column( + const std::string& name, const std::vector>& values) { + auto data = ColumnInt32::create(); + auto null_map = ColumnUInt8::create(); + for (const auto& value : values) { + data->insert_value(value.value_or(0)); + null_map->insert_value(!value.has_value()); + } + auto type = make_nullable(std::make_shared()); + return {ColumnNullable::create(std::move(data), std::move(null_map)), type, name}; + } + + static ColumnWithTypeAndName make_nullable_string_column( + const std::string& name, const std::vector>& values) { + auto data = 
ColumnString::create(); + auto null_map = ColumnUInt8::create(); + for (const auto& value : values) { + const std::string data_value = value.value_or(""); + data->insert_data(data_value.data(), data_value.size()); + null_map->insert_value(!value.has_value()); + } + auto type = make_nullable(std::make_shared()); + return {ColumnNullable::create(std::move(data), std::move(null_map)), type, name}; + } + + static std::vector result_column_data(const Block& block, int result_column_id) { + const auto& result_column = + assert_cast(*block.get_by_position(result_column_id).column); + return {result_column.get_data().begin(), result_column.get_data().end()}; + } + + static Status execute_equality_delete_predicate(Block delete_block, std::vector field_ids, + Block* data_block, int* result_column_id) { + auto predicate = + std::make_shared(std::move(delete_block), field_ids); + predicate->_open_finished = true; + for (size_t idx = 0; idx < field_ids.size(); ++idx) { + predicate->add_child( + std::make_shared(idx, data_block->get_by_position(idx).type)); + } + + VExprContext context(predicate); + return predicate->execute(&context, data_block, result_column_id); + } + + static Status execute_prepared_equality_delete_predicate(const VExprContextSPtr& context, + MockRuntimeState* state, + Block* data_block, + int* result_column_id) { + RETURN_IF_ERROR(context->prepare(state, RowDescriptor())); + RETURN_IF_ERROR(context->open(state)); + return context->execute(data_block, result_column_id); + } +}; + +TEST_F(EqualityDeletePredicateTest, MatchSingleColumn) { + Block delete_block; + delete_block.insert(make_nullable_int_column("id", {1, 4})); + Block data_block; + data_block.insert(make_nullable_int_column("id", {1, 2, 3, 4})); + + int result_column_id = -1; + auto status = execute_equality_delete_predicate(std::move(delete_block), {1}, &data_block, + &result_column_id); + ASSERT_TRUE(status.ok()) << status; + EXPECT_EQ(result_column_data(data_block, result_column_id), 
std::vector({1, 0, 0, 1})); +} + +TEST_F(EqualityDeletePredicateTest, MatchMultipleColumns) { + Block delete_block; + delete_block.insert(make_nullable_int_column("id", {1, 2})); + delete_block.insert(make_nullable_string_column("name", {"a", "b"})); + Block data_block; + data_block.insert(make_nullable_int_column("id", {1, 1, 2, 2})); + data_block.insert(make_nullable_string_column("name", {"a", "b", "a", "b"})); + + int result_column_id = -1; + auto status = execute_equality_delete_predicate(std::move(delete_block), {1, 2}, &data_block, + &result_column_id); + ASSERT_TRUE(status.ok()) << status; + EXPECT_EQ(result_column_data(data_block, result_column_id), std::vector({1, 0, 0, 1})); +} + +TEST_F(EqualityDeletePredicateTest, MatchNullValues) { + Block delete_block; + delete_block.insert(make_nullable_int_column("id", {std::nullopt})); + Block data_block; + data_block.insert(make_nullable_int_column("id", {1, std::nullopt, 3})); + + int result_column_id = -1; + auto status = execute_equality_delete_predicate(std::move(delete_block), {1}, &data_block, + &result_column_id); + ASSERT_TRUE(status.ok()) << status; + EXPECT_EQ(result_column_data(data_block, result_column_id), std::vector({0, 1, 0})); +} + +TEST_F(EqualityDeletePredicateTest, MatchAfterCastToDeleteKeyType) { + Block delete_block; + delete_block.insert(make_nullable_int_column("id", {1, 4})); + Block data_block; + data_block.insert(ColumnHelper::create_column_with_name({1, 2, 4})); + + auto predicate = std::make_shared(std::move(delete_block), + std::vector {1}); + auto cast_expr = Cast::create_shared(make_nullable(std::make_shared())); + cast_expr->add_child(std::make_shared(0, data_block.get_by_position(0).type)); + predicate->add_child(std::move(cast_expr)); + auto context = VExprContext::create_shared(predicate); + MockRuntimeState state; + + int result_column_id = -1; + auto status = execute_prepared_equality_delete_predicate(context, &state, &data_block, + &result_column_id); + 
ASSERT_TRUE(status.ok()) << status; + EXPECT_EQ(result_column_data(data_block, result_column_id), std::vector({1, 0, 1})); + context->close(); +} + +TEST_F(EqualityDeletePredicateTest, ChildCountMismatchReturnsError) { + Block delete_block; + delete_block.insert(make_nullable_int_column("id", {1})); + auto predicate = std::make_shared(std::move(delete_block), + std::vector {1}); + predicate->_open_finished = true; + Block data_block; + data_block.insert(make_nullable_int_column("id", {1})); + VExprContext context(predicate); + + int result_column_id = -1; + auto status = predicate->execute(&context, &data_block, &result_column_id); + ASSERT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find("should have 1 child exprs"), std::string::npos); +} + +} // namespace doris diff --git a/be/test/format/reader/expr/table_expr_test.cpp b/be/test/format/reader/expr/table_expr_test.cpp new file mode 100644 index 00000000000000..3caca73c6c5d13 --- /dev/null +++ b/be/test/format/reader/expr/table_expr_test.cpp @@ -0,0 +1,123 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include +#include + +#include "core/block/block.h" +#include "core/column/column_const.h" +#include "core/data_type/data_type_number.h" +#include "core/data_type/primitive_type.h" +#include "core/field.h" +#include "format/reader/expr/literal.h" +#include "format/reader/expr/slot_ref.h" +#include "runtime/descriptors.h" +#include "testutil/column_helper.h" + +namespace doris { + +TEST(TableLiteralTest, CreatesConstColumnWithGivenTypeAndField) { + auto type = std::make_shared(); + auto literal = TableLiteral::create_shared(type, Field::create_field(123)); + + ASSERT_EQ(literal->data_type(), type); + ASSERT_TRUE(literal->is_literal()); + + const auto& column = literal->get_column_ptr(); + ASSERT_EQ(column->size(), 1); + ASSERT_TRUE(is_column_const(*column)); + EXPECT_EQ(column->get_int(0), 123); +} + +TEST(TableLiteralTest, ExecutesAsConstColumn) { + auto type = std::make_shared(); + auto literal = TableLiteral::create_shared(type, Field::create_field(456)); + + ColumnPtr result_column; + ASSERT_TRUE(literal->execute_column(nullptr, nullptr, nullptr, 3, result_column).ok()); + + ASSERT_EQ(result_column->size(), 3); + ASSERT_TRUE(is_column_const(*result_column)); + EXPECT_EQ(result_column->get_int(0), 456); + EXPECT_EQ(result_column->get_int(2), 456); +} + +TEST(TableLiteralTest, ExecuteAppendsConstColumnToBlock) { + auto type = std::make_shared(); + auto literal = TableLiteral::create_shared(type, Field::create_field(789)); + Block block; + block.insert(ColumnHelper::create_column_with_name({1, 2, 3})); + + int result_column_id = -1; + ASSERT_TRUE(literal->execute(nullptr, &block, &result_column_id).ok()); + + ASSERT_EQ(result_column_id, 1); + ASSERT_EQ(block.columns(), 2); + const auto& result_column = block.get_by_position(result_column_id).column; + ASSERT_EQ(result_column->size(), 3); + ASSERT_TRUE(is_column_const(*result_column)); + EXPECT_EQ(result_column->get_int(0), 789); + EXPECT_EQ(result_column->get_int(2), 789); + 
EXPECT_EQ(block.get_by_position(result_column_id).type, type); +} + +TEST(TableSlotRefTest, KeepsSlotColumnIdsAndType) { + auto type = std::make_shared(); + std::string name = "file_col"; + auto slot_ref = TableSlotRef::create_shared(10, 20, 30, type, name); + + EXPECT_EQ(slot_ref->slot_id(), 10); + EXPECT_EQ(slot_ref->column_id(), 20); + EXPECT_EQ(slot_ref->data_type(), type); + EXPECT_EQ(slot_ref->expr_name(), "file_col"); + EXPECT_EQ(slot_ref->column_name(), "file_col"); + EXPECT_FALSE(slot_ref->is_constant()); + + std::set column_ids; + slot_ref->collect_slot_column_ids(column_ids); + ASSERT_EQ(column_ids.size(), 1); + EXPECT_EQ(*column_ids.begin(), 20); +} + +TEST(TableSlotRefTest, PrepareDoesNotRequireRowDescriptor) { + auto type = std::make_shared(); + std::string name = ""; + auto slot_ref = TableSlotRef::create_shared(10, 20, 30, type, name); + + EXPECT_TRUE(slot_ref->prepare(nullptr, RowDescriptor(), nullptr).ok()); +} + +TEST(TableSlotRefTest, ExecuteReturnsReferencedColumnId) { + auto type = std::make_shared(); + std::string name = ""; + auto slot_ref = TableSlotRef::create_shared(10, 1, 30, type, name); + Block block; + block.insert(ColumnHelper::create_column_with_name({1, 2, 3})); + block.insert(ColumnHelper::create_column_with_name({4, 5, 6})); + + int result_column_id = -1; + ASSERT_TRUE(slot_ref->execute(nullptr, &block, &result_column_id).ok()); + + EXPECT_EQ(result_column_id, 1); + EXPECT_EQ(block.columns(), 2); + EXPECT_EQ(block.get_by_position(result_column_id).column->get_int(0), 4); + EXPECT_EQ(block.get_by_position(result_column_id).column->get_int(2), 6); +} + +} // namespace doris diff --git a/be/test/format/reader/table_reader_test.cpp b/be/test/format/reader/table_reader_test.cpp new file mode 100644 index 00000000000000..c5efa0512e603f --- /dev/null +++ b/be/test/format/reader/table_reader_test.cpp @@ -0,0 +1,2372 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/reader/table_reader.h" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "core/assert_cast.h" +#include "core/block/block.h" +#include "core/column/column_nullable.h" +#include "core/column/column_string.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_number.h" +#include "core/data_type/data_type_string.h" +#include "exec/common/endian.h" +#include "exprs/vexpr.h" +#include "format/format_common.h" +#include "format/reader/expr/slot_ref.h" +#include "format/table/deletion_vector_reader.h" +#include "format/table/iceberg_reader_v2.h" +#include "gen_cpp/PlanNodes_types.h" +#include "io/io_common.h" +#include "roaring/roaring64map.hh" +#include "runtime/runtime_profile.h" +#include "runtime/runtime_state.h" +#include "storage/predicate/predicate_creator.h" + +namespace doris::reader { +namespace { + +class TableInt32GreaterThanExpr final : public VExpr { +public: + TableInt32GreaterThanExpr(int slot_id, int column_id, int32_t value) + : VExpr(std::make_shared(), false), _value(value) { + add_child(TableSlotRef::create_shared(slot_id, column_id, -1, + std::make_shared(), "id")); + 
set_node_type(TExprNodeType::BINARY_PRED); + _opcode = TExprOpcode::GT; + } + + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override { + const auto* slot_ref = assert_cast(get_child(0).get()); + const auto& input = assert_cast( + *block->get_by_position(slot_ref->column_id()).column); + auto result = ColumnUInt8::create(); + auto& result_data = result->get_data(); + result_data.resize(count); + for (size_t row = 0; row < count; ++row) { + const size_t input_row = selector == nullptr ? row : (*selector)[row]; + result_data[row] = input.get_element(input_row) > _value; + } + result_column = std::move(result); + return Status::OK(); + } + + const std::string& expr_name() const override { return _expr_name; } + +private: + const int32_t _value; + const std::string _expr_name = "TableInt32GreaterThanExpr"; +}; + +class IcebergTableReaderDeleteFileTestHelper final : public doris::iceberg::IcebergTableReader { +public: + Status parse_deletion_vector_file(const TTableFormatFileDesc& t_desc, DeleteFileDesc* desc, + bool* has_delete_file) { + return _parse_deletion_vector_file(t_desc, desc, has_delete_file); + } +}; + +class IcebergTableReaderScanRequestTestHelper final : public doris::iceberg::IcebergTableReader { +public: + Status init_for_scan_request_test(std::vector projected_columns) { + _query_options = std::make_unique(); + _query_globals = std::make_unique(); + _state = std::make_unique(*_query_options, *_query_globals); + RETURN_IF_ERROR(init({ + .projected_columns = std::move(projected_columns), + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = _state.get(), + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + })); + + SplitReadOptions split_options; + split_options.current_range.__set_path("scan-request-test.parquet"); 
+ TTableFormatFileDesc table_format_params; + TIcebergFileDesc iceberg_params; + iceberg_params.__set_first_row_id(1000); + table_format_params.__set_iceberg_params(iceberg_params); + split_options.current_range.__set_table_format_params(table_format_params); + RETURN_IF_ERROR(prepare_split(split_options)); + + _delete_rows_storage = {1}; + _delete_rows = &_delete_rows_storage; + return Status::OK(); + } + + Status customize_request(FileScanRequest* request) { + return customize_file_scan_request(request); + } + +private: + std::unique_ptr _query_options; + std::unique_ptr _query_globals; + std::unique_ptr _state; + DeleteRows _delete_rows_storage; +}; + +class TableInt32SumGreaterThanExpr final : public VExpr { +public: + TableInt32SumGreaterThanExpr(int left_slot_id, int left_column_id, int right_slot_id, + int right_column_id, int32_t value) + : VExpr(std::make_shared(), false), _value(value) { + add_child(TableSlotRef::create_shared(left_slot_id, left_column_id, -1, + std::make_shared(), "id")); + add_child(TableSlotRef::create_shared(right_slot_id, right_column_id, -1, + std::make_shared(), "score")); + set_node_type(TExprNodeType::BINARY_PRED); + _opcode = TExprOpcode::GT; + } + + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override { + const auto* left_slot_ref = assert_cast(get_child(0).get()); + const auto* right_slot_ref = assert_cast(get_child(1).get()); + const auto& left_input = assert_cast( + *block->get_by_position(left_slot_ref->column_id()).column); + const auto& right_input = assert_cast( + *block->get_by_position(right_slot_ref->column_id()).column); + auto result = ColumnUInt8::create(); + auto& result_data = result->get_data(); + result_data.resize(count); + for (size_t row = 0; row < count; ++row) { + const size_t input_row = selector == nullptr ? 
row : (*selector)[row]; + result_data[row] = + left_input.get_element(input_row) + right_input.get_element(input_row) > _value; + } + result_column = std::move(result); + return Status::OK(); + } + + const std::string& expr_name() const override { return _expr_name; } + +private: + const int32_t _value; + const std::string _expr_name = "TableInt32SumGreaterThanExpr"; +}; + +class TableInt32SumLessThanExpr final : public VExpr { +public: + TableInt32SumLessThanExpr(int left_slot_id, int left_column_id, int right_slot_id, + int right_column_id, int32_t value) + : VExpr(std::make_shared(), false), _value(value) { + add_child(TableSlotRef::create_shared(left_slot_id, left_column_id, -1, + std::make_shared(), "id")); + add_child(TableSlotRef::create_shared(right_slot_id, right_column_id, -1, + std::make_shared(), "score")); + set_node_type(TExprNodeType::BINARY_PRED); + _opcode = TExprOpcode::LT; + } + + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override { + const auto* left_slot_ref = assert_cast(get_child(0).get()); + const auto* right_slot_ref = assert_cast(get_child(1).get()); + const auto& left_input = assert_cast( + *block->get_by_position(left_slot_ref->column_id()).column); + const auto& right_input = assert_cast( + *block->get_by_position(right_slot_ref->column_id()).column); + auto result = ColumnUInt8::create(); + auto& result_data = result->get_data(); + result_data.resize(count); + for (size_t row = 0; row < count; ++row) { + const size_t input_row = selector == nullptr ? 
row : (*selector)[row]; + result_data[row] = + left_input.get_element(input_row) + right_input.get_element(input_row) < _value; + } + result_column = std::move(result); + return Status::OK(); + } + + const std::string& expr_name() const override { return _expr_name; } + +private: + const int32_t _value; + const std::string _expr_name = "TableInt32SumLessThanExpr"; +}; + +std::shared_ptr finish_array(arrow::ArrayBuilder* builder) { + std::shared_ptr array; + EXPECT_TRUE(builder->Finish(&array).ok()); + return array; +} + +std::shared_ptr build_int32_array(const std::vector& values) { + arrow::Int32Builder builder; + for (const auto value : values) { + EXPECT_TRUE(builder.Append(value).ok()); + } + return finish_array(&builder); +} + +std::shared_ptr build_int64_array(const std::vector& values) { + arrow::Int64Builder builder; + for (const auto value : values) { + EXPECT_TRUE(builder.Append(value).ok()); + } + return finish_array(&builder); +} + +std::shared_ptr build_string_array(const std::vector& values) { + arrow::StringBuilder builder; + for (const auto& value : values) { + EXPECT_TRUE(builder.Append(value).ok()); + } + return finish_array(&builder); +} + +void write_parquet_file(const std::string& file_path, int32_t id, const std::string& value) { + auto schema = arrow::schema({ + arrow::field("id", arrow::int32(), false), + arrow::field("value", arrow::utf8(), false), + }); + auto table = arrow::Table::Make(schema, {build_int32_array({id}), build_string_array({value})}); + + auto file_result = arrow::io::FileOutputStream::Open(file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, 
1, + builder.build())); +} + +void write_iceberg_equality_delete_parquet_file(const std::string& file_path, int32_t field_id, + int32_t value) { + const auto metadata = + arrow::key_value_metadata({"PARQUET:field_id"}, {std::to_string(field_id)}); + auto schema = arrow::schema({ + arrow::field("id", arrow::int32(), false)->WithMetadata(metadata), + }); + auto table = arrow::Table::Make(schema, {build_int32_array({value})}); + + auto file_result = arrow::io::FileOutputStream::Open(file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, 1, + builder.build())); +} + +void write_iceberg_equality_delete_bigint_parquet_file(const std::string& file_path, + int32_t field_id, int64_t value) { + const auto metadata = + arrow::key_value_metadata({"PARQUET:field_id"}, {std::to_string(field_id)}); + auto schema = arrow::schema({ + arrow::field("id", arrow::int64(), false)->WithMetadata(metadata), + }); + auto table = arrow::Table::Make(schema, {build_int64_array({value})}); + + auto file_result = arrow::io::FileOutputStream::Open(file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, 1, + builder.build())); +} + +void write_int_pair_parquet_file(const std::string& file_path, const std::vector& ids, + const std::vector& 
scores, + const std::vector& values, + int64_t row_group_size = -1) { + const auto id_metadata = arrow::key_value_metadata({"PARQUET:field_id"}, {"0"}); + const auto score_metadata = arrow::key_value_metadata({"PARQUET:field_id"}, {"1"}); + const auto value_metadata = arrow::key_value_metadata({"PARQUET:field_id"}, {"2"}); + auto schema = arrow::schema({ + arrow::field("id", arrow::int32(), false)->WithMetadata(id_metadata), + arrow::field("score", arrow::int32(), false)->WithMetadata(score_metadata), + arrow::field("value", arrow::utf8(), false)->WithMetadata(value_metadata), + }); + auto table = arrow::Table::Make(schema, {build_int32_array(ids), build_int32_array(scores), + build_string_array(values)}); + + auto file_result = arrow::io::FileOutputStream::Open(file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + const auto write_row_group_size = + row_group_size > 0 ? 
row_group_size : static_cast(ids.size()); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, + write_row_group_size, builder.build())); +} + +void write_position_delete_parquet_file(const std::string& file_path, + const std::vector& data_file_paths, + const std::vector& positions) { + auto schema = arrow::schema({ + arrow::field("file_path", arrow::utf8(), false), + arrow::field("pos", arrow::int64(), false), + }); + auto table = arrow::Table::Make( + schema, {build_string_array(data_file_paths), build_int64_array(positions)}); + + auto file_result = arrow::io::FileOutputStream::Open(file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, + static_cast(positions.size()), + builder.build())); +} + +int64_t write_iceberg_deletion_vector_file(const std::string& file_path, + const std::vector& deleted_positions) { + roaring::Roaring64Map rows; + for (const auto position : deleted_positions) { + rows.add(position); + } + + const size_t bitmap_size = rows.getSizeInBytes(); + std::vector blob(4 + 4 + bitmap_size + 4); + rows.write(blob.data() + 8); + + const uint32_t total_length = static_cast(4 + bitmap_size); + BigEndian::Store32(blob.data(), total_length); + constexpr char DV_MAGIC[] = {'\xD1', '\xD3', '\x39', '\x64'}; + memcpy(blob.data() + 4, DV_MAGIC, 4); + BigEndian::Store32(blob.data() + 8 + bitmap_size, 0); + + std::ofstream output(file_path, std::ios::binary); + EXPECT_TRUE(output.is_open()); + output.write(blob.data(), static_cast(blob.size())); + EXPECT_TRUE(output.good()); + return static_cast(blob.size()); +} + +Block 
build_table_block(const std::vector& columns) { + Block block; + for (const auto& column : columns) { + block.insert({column.type->create_column(), column.type, column.name}); + } + return block; +} + +void expect_nullable_int64_column_values(const IColumn& column, + const std::vector& expected_values) { + const auto full_column = column.convert_to_full_column_if_const(); + const auto& nullable_column = assert_cast(*full_column); + const auto& values = + assert_cast(nullable_column.get_nested_column()).get_data(); + ASSERT_EQ(nullable_column.size(), expected_values.size()); + for (size_t row = 0; row < expected_values.size(); ++row) { + EXPECT_EQ(nullable_column.get_null_map_data()[row], 0); + EXPECT_EQ(values[row], expected_values[row]); + } +} + +SplitReadOptions build_split_options(const std::string& file_path) { + SplitReadOptions options; + options.current_range.__set_path(file_path); + options.current_range.__set_file_size( + static_cast(std::filesystem::file_size(file_path))); + return options; +} + +void set_iceberg_row_lineage_params(SplitReadOptions* split_options, int64_t first_row_id, + int64_t last_updated_sequence_number) { + TTableFormatFileDesc table_format_params; + TIcebergFileDesc iceberg_params; + iceberg_params.__set_first_row_id(first_row_id); + iceberg_params.__set_last_updated_sequence_number(last_updated_sequence_number); + table_format_params.__set_iceberg_params(iceberg_params); + split_options->current_range.__set_table_format_params(table_format_params); +} + +TIcebergDeleteFileDesc make_iceberg_deletion_vector(const std::string& path, int64_t offset, + int64_t size) { + TIcebergDeleteFileDesc delete_file; + delete_file.__set_content(3); + delete_file.__set_path(path); + delete_file.__set_content_offset(offset); + delete_file.__set_content_size_in_bytes(size); + return delete_file; +} + +TIcebergDeleteFileDesc make_iceberg_position_delete_file(const std::string& path) { + TIcebergDeleteFileDesc delete_file; + 
delete_file.__set_content(1); + delete_file.__set_path(path); + delete_file.__set_file_format(TFileFormatType::FORMAT_PARQUET); + return delete_file; +} + +TIcebergDeleteFileDesc make_iceberg_equality_delete_file(const std::string& path, + const std::vector& field_ids) { + TIcebergDeleteFileDesc delete_file; + delete_file.__set_content(2); + delete_file.__set_path(path); + delete_file.__set_field_ids(field_ids); + delete_file.__set_file_format(TFileFormatType::FORMAT_PARQUET); + return delete_file; +} + +TFileScanRangeParams make_local_parquet_scan_params() { + TFileScanRangeParams scan_params; + scan_params.__set_file_type(TFileType::FILE_LOCAL); + scan_params.__set_format_type(TFileFormatType::FORMAT_PARQUET); + return scan_params; +} + +std::shared_ptr make_io_context(io::FileReaderStats* file_reader_stats, + io::FileCacheStatistics* file_cache_stats) { + auto io_ctx = std::make_shared(); + io_ctx->file_reader_stats = file_reader_stats; + io_ctx->file_cache_stats = file_cache_stats; + return io_ctx; +} + +std::unique_ptr make_table_read_profile(RuntimeProfile* profile) { + auto read_profile = std::make_unique(); + read_profile->num_delete_files = ADD_COUNTER(profile, "NumDeleteFiles", TUnit::UNIT); + read_profile->num_delete_rows = ADD_COUNTER(profile, "NumDeleteRows", TUnit::UNIT); + read_profile->parse_delete_file_time = ADD_TIMER(profile, "ParseDeleteFileTime"); + return read_profile; +} + +TTableFormatFileDesc make_iceberg_table_format_desc( + const std::string& data_file_path, + const std::vector& delete_files) { + TTableFormatFileDesc table_format_params; + TIcebergFileDesc iceberg_params; + iceberg_params.__set_format_version(2); + iceberg_params.__set_original_file_path(data_file_path); + iceberg_params.__set_delete_files(delete_files); + table_format_params.__set_iceberg_params(iceberg_params); + return table_format_params; +} + +std::vector read_iceberg_ids(doris::iceberg::IcebergTableReader* reader, + const std::vector& projected_columns) { + 
std::vector ids; + bool eos = false; + while (!eos) { + Block block = build_table_block(projected_columns); + auto status = reader->get_block(&block, &eos); + if (!status.ok()) { + ADD_FAILURE() << status; + return ids; + } + if (block.rows() == 0) { + continue; + } + const auto& id_column = assert_cast(*block.get_by_position(0).column); + for (size_t row = 0; row < block.rows(); ++row) { + ids.push_back(id_column.get_element(row)); + } + } + return ids; +} + +int64_t parquet_column_start_offset(const ::parquet::ColumnChunkMetaData& column_metadata) { + return column_metadata.has_dictionary_page() + ? static_cast(column_metadata.dictionary_page_offset()) + : static_cast(column_metadata.data_page_offset()); +} + +SplitReadOptions build_split_options_for_row_group_mid(const std::string& file_path, + int row_group_idx) { + auto options = build_split_options(file_path); + auto reader = ::parquet::ParquetFileReader::OpenFile(file_path, false); + auto metadata = reader->metadata(); + auto row_group_metadata = metadata->RowGroup(row_group_idx); + auto first_column = row_group_metadata->ColumnChunk(0); + auto last_column = row_group_metadata->ColumnChunk(row_group_metadata->num_columns() - 1); + const int64_t row_group_start_offset = parquet_column_start_offset(*first_column); + const int64_t row_group_end_offset = + parquet_column_start_offset(*last_column) + last_column->total_compressed_size(); + const int64_t row_group_mid_offset = + row_group_start_offset + (row_group_end_offset - row_group_start_offset) / 2; + options.current_range.__set_start_offset(row_group_mid_offset); + options.current_range.__set_size(1); + return options; +} + +TableColumn make_table_column(ColumnId id, const std::string& name, const DataTypePtr& type) { + TableColumn column; + column.id = id; + column.name = name; + column.type = type; + return column; +} + +TEST(TableReaderTest, ReopenSplitAfterClose) { + const auto test_dir = std::filesystem::temp_directory_path() / 
"doris_table_reader_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const std::vector file_paths = { + (test_dir / "split_1.parquet").string(), + (test_dir / "split_2.parquet").string(), + (test_dir / "split_3.parquet").string(), + }; + write_parquet_file(file_paths[0], 1, "one"); + write_parquet_file(file_paths[1], 2, "two"); + write_parquet_file(file_paths[2], 3, "three"); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(1, "value", std::make_shared())); + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext( + std::make_shared(0, 0, 0)), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + // Simulate the scanner lifecycle for three different splits: + // init() once, then repeat prepare_split() -> get_block() -> close(). + // This verifies TableReader::close() fully releases the previous low-level reader and task + // state, so a later prepare_split() can open and read a new split on the same TableReader. + // The table-level conjunct is also rebuilt for each split. The projection order puts value + // before id, so the pushed conjunct has to be rewritten to the ParquetReader file-local block + // position every time a new split is opened. 
+ std::vector ids; + std::vector values; + for (const auto& file_path : file_paths) { + auto split_options = build_split_options(file_path); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + + const auto& value_column = + assert_cast(*block.get_by_position(0).column); + const auto& id_column = assert_cast(*block.get_by_position(1).column); + ASSERT_EQ(id_column.size(), 1); + ASSERT_EQ(value_column.size(), 1); + ids.push_back(id_column.get_element(0)); + values.push_back(value_column.get_data_at(0).to_string()); + + ASSERT_TRUE(reader.close().ok()); + } + + EXPECT_EQ(ids, std::vector({1, 2, 3})); + EXPECT_EQ(values, std::vector({"one", "two", "three"})); + + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, PushDownCountFromNewParquetReader) { + const auto test_dir = std::filesystem::temp_directory_path() / "doris_table_reader_count_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3, 4, 5}, {10, 20, 30, 40, 50}, + {"one", "two", "three", "four", "five"}, 2); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::COUNT, + .profile = nullptr, + }) + .ok()); + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + Block block = 
build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 5); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, PushDownMinMaxFromNewParquetReader) { + const auto test_dir = std::filesystem::temp_directory_path() / "doris_table_reader_minmax_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {3, 1, 5, 2}, {30, 10, 50, 20}, + {"three", "one", "five", "two"}, 2); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + projected_columns.push_back(make_table_column(1, "score", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::MINMAX, + .profile = nullptr, + }) + .ok()); + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 2); + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& score_column = assert_cast(*block.get_by_position(1).column); + EXPECT_EQ(id_column.get_element(0), 1); + EXPECT_EQ(id_column.get_element(1), 5); + EXPECT_EQ(score_column.get_element(0), 10); + EXPECT_EQ(score_column.get_element(1), 50); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); 
+} + +TEST(TableReaderTest, PushDownMinMaxCastsFileValueToTableType) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_minmax_cast_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {3, 1, 5, 2}, {30, 10, 50, 20}, + {"three", "one", "five", "two"}, 2); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::MINMAX, + .profile = nullptr, + }) + .ok()); + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 2); + const auto& id_column = assert_cast(*block.get_by_position(0).column); + EXPECT_EQ(id_column.get_element(0), 1); + EXPECT_EQ(id_column.get_element(1), 5); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, PushDownMinMaxOnlyUsesSelectedRowGroupInFileRange) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_minmax_range_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {10, 1, 100}, {100, 10, 1000}, {"ten", "one", "hundred"}, + 1); + + std::vector 
projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::MINMAX, + .profile = nullptr, + }) + .ok()); + ASSERT_TRUE(reader.prepare_split(build_split_options_for_row_group_mid(file_path, 1)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 2); + const auto& id_column = assert_cast(*block.get_by_position(0).column); + EXPECT_EQ(id_column.get_element(0), 1); + EXPECT_EQ(id_column.get_element(1), 1); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, PushDownCountOnlyUsesSelectedRowGroupInFileRange) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_count_range_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}, 1); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + 
.allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::COUNT, + .profile = nullptr, + }) + .ok()); + ASSERT_TRUE(reader.prepare_split(build_split_options_for_row_group_mid(file_path, 2)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 1); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, PushDownCountFallsBackWithTableConjunct) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_count_conjunct_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext( + std::make_shared(0, 0, 2)), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::COUNT, + .profile = nullptr, + }) + .ok()); + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 1); + const auto& id_column = assert_cast(*block.get_by_position(0).column); + EXPECT_EQ(id_column.get_element(0), 3); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, 
PushDownCountFallsBackWithColumnPredicate) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_count_predicate_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}, 1); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + TableColumnPredicates column_predicates; + column_predicates[0].push_back(create_comparison_predicate( + 0, "id", std::make_shared(), Field::create_field(2), false)); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = std::move(column_predicates), + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::COUNT, + .profile = nullptr, + }) + .ok()); + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 1); + const auto& id_column = assert_cast(*block.get_by_position(0).column); + EXPECT_EQ(id_column.get_element(0), 3); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, PushDownMinMaxFallsBackWithoutDirectFileMapping) { + const auto test_dir = std::filesystem::temp_directory_path() / + "doris_table_reader_minmax_missing_mapping_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + 
write_parquet_file(file_path, 1, "one"); + + std::vector projected_columns; + projected_columns.push_back( + make_table_column(99, "missing_id", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::MINMAX, + .profile = nullptr, + }) + .ok()); + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 1); + EXPECT_EQ(block.get_by_position(0).column->get_int(0), 0); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, OpenReaderBuildsTableFiltersFromConjuncts) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_conjunct_filter_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_parquet_file(file_path, 3, "three"); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(1, "value", std::make_shared())); + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext( + std::make_shared(0, 0, 2)), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + 
.allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + // open_reader() should convert the table-level conjunct on projected column id 0 into + // _table_filters before ColumnMapper creates the FileScanRequest. ColumnMapper then rewrites + // the conjunct's slot ref from table column id 0 to the file-local block position used by + // ParquetReader. The projection order intentionally puts value before id, so the id filter + // column is not at position 0 in the file block. + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + const auto& id_column = assert_cast(*block.get_by_position(1).column); + ASSERT_EQ(id_column.size(), 1); + EXPECT_EQ(id_column.get_element(0), 3); + + ASSERT_TRUE(reader.close().ok()); + + TableReader filtered_reader; + ASSERT_TRUE(filtered_reader + .init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext( + std::make_shared(0, 0, 4)), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + ASSERT_TRUE(filtered_reader.prepare_split(build_split_options(file_path)).ok()); + + block = build_table_block(projected_columns); + eos = false; + ASSERT_TRUE(filtered_reader.get_block(&block, &eos).ok()); + EXPECT_TRUE(eos); + EXPECT_EQ(block.get_by_position(1).column->size(), 0); + + ASSERT_TRUE(filtered_reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, OpenReaderBuildsColumnPredicateFilters) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_column_predicate_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / 
"split.parquet").string(); + // ColumnPredicate is only used for row-group/statistics pruning. Keep one row per row + // group so the predicate can prune the first two row groups and leave only id = 3. + write_int_pair_parquet_file(file_path, {1, 2, 3}, {1, 5, 8}, {"one", "two", "three"}, 1); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(2, "value", std::make_shared())); + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + TableColumnPredicates column_predicates; + column_predicates[0].push_back(create_comparison_predicate( + 0, "id", std::make_shared(), Field::create_field(2), false)); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = std::move(column_predicates), + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + + const auto& value_column = assert_cast(*block.get_by_position(0).column); + const auto& id_column = assert_cast(*block.get_by_position(1).column); + ASSERT_EQ(id_column.size(), 1); + ASSERT_EQ(value_column.size(), 1); + EXPECT_EQ(id_column.get_element(0), 3); + EXPECT_EQ(value_column.get_data_at(0).to_string(), "three"); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, ColumnPredicateSurvivesReopenSplit) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_predicate_reopen_test"; + std::filesystem::remove_all(test_dir); + 
std::filesystem::create_directories(test_dir); + + const std::vector file_paths = { + (test_dir / "split_1.parquet").string(), + (test_dir / "split_2.parquet").string(), + }; + write_int_pair_parquet_file(file_paths[0], {1, 3}, {10, 30}, {"one", "three"}, 1); + write_int_pair_parquet_file(file_paths[1], {2, 4}, {20, 40}, {"two", "four"}, 1); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + TableColumnPredicates column_predicates; + column_predicates[0].push_back(create_comparison_predicate( + 0, "id", std::make_shared(), Field::create_field(2), false)); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = std::move(column_predicates), + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + std::vector ids; + for (const auto& file_path : file_paths) { + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + const auto& id_column = assert_cast(*block.get_by_position(0).column); + ASSERT_EQ(id_column.size(), 1); + ids.push_back(id_column.get_element(0)); + + ASSERT_TRUE(reader.close().ok()); + } + + EXPECT_EQ(ids, std::vector({3, 4})); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, CreateScanRequestDeduplicatesSharedPredicateColumns) { + const auto int_type = std::make_shared(); + const std::vector projected_columns = { + make_table_column(0, "a", int_type), + make_table_column(1, "b", int_type), + make_table_column(2, "c", int_type), + make_table_column(3, "value", std::make_shared()), + }; + const 
std::vector file_schema = { + {.id = 0, + .name = "a", + .type = int_type, + .children = {}, + .file_path = {0}, + .field_id_path = {0}, + .name_path = {"a"}, + .column_type = DATA_COLUMN}, + {.id = 1, + .name = "b", + .type = int_type, + .children = {}, + .file_path = {1}, + .field_id_path = {1}, + .name_path = {"b"}, + .column_type = DATA_COLUMN}, + {.id = 2, + .name = "c", + .type = int_type, + .children = {}, + .file_path = {2}, + .field_id_path = {2}, + .name_path = {"c"}, + .column_type = DATA_COLUMN}, + {.id = 3, + .name = "value", + .type = std::make_shared(), + .children = {}, + .file_path = {3}, + .field_id_path = {3}, + .name_path = {"value"}, + .column_type = DATA_COLUMN}, + }; + + TableColumnMapper mapper; + ASSERT_TRUE(mapper.create_mapping(projected_columns, {}, file_schema).ok()); + + std::vector table_filters; + table_filters.push_back({ + .conjunct = VExprContext::create_shared( + std::make_shared(0, 0, 1, 1, 1)), + .slot_ids = {0, 1}, + }); + table_filters.push_back({ + .conjunct = VExprContext::create_shared( + std::make_shared(0, 0, 2, 2, 3)), + .slot_ids = {0, 2}, + }); + + FileScanRequest file_request; + ASSERT_TRUE( + mapper.create_scan_request(table_filters, {}, projected_columns, &file_request).ok()); + + // Both filters reference column a. It must still be read once as a predicate column, and a + // predicate column must not be repeated as a non-predicate column. 
+ EXPECT_EQ(file_request.predicate_columns, std::vector({0, 1, 2})); + EXPECT_EQ(file_request.non_predicate_columns, std::vector({3})); + ASSERT_EQ(file_request.column_positions.size(), 4); + EXPECT_EQ(file_request.column_positions.at(3), 0); + EXPECT_EQ(file_request.column_positions.at(0), 1); + EXPECT_EQ(file_request.column_positions.at(1), 2); + EXPECT_EQ(file_request.column_positions.at(2), 3); + for (const auto predicate_column : file_request.predicate_columns) { + EXPECT_TRUE(std::find(file_request.non_predicate_columns.begin(), + file_request.non_predicate_columns.end(), + predicate_column) == file_request.non_predicate_columns.end()); + } +} + +TEST(TableReaderTest, CreateScanRequestPromotesProjectedColumnToPredicateColumn) { + const auto int_type = std::make_shared(); + const std::vector projected_columns = { + make_table_column(0, "id", int_type), + make_table_column(1, "score", int_type), + }; + const std::vector file_schema = { + {.id = 0, + .name = "id", + .type = int_type, + .children = {}, + .file_path = {0}, + .field_id_path = {0}, + .name_path = {"id"}, + .column_type = DATA_COLUMN}, + {.id = 1, + .name = "score", + .type = int_type, + .children = {}, + .file_path = {1}, + .field_id_path = {1}, + .name_path = {"score"}, + .column_type = DATA_COLUMN}, + }; + + TableColumnMapper mapper; + ASSERT_TRUE(mapper.create_mapping(projected_columns, {}, file_schema).ok()); + + TableFilter table_filter { + .conjunct = VExprContext::create_shared( + std::make_shared(0, 0, 1)), + .slot_ids = {0}, + }; + + FileScanRequest file_request; + ASSERT_TRUE( + mapper.create_scan_request({table_filter}, {}, projected_columns, &file_request).ok()); + + EXPECT_EQ(file_request.predicate_columns, std::vector({0})); + EXPECT_EQ(file_request.non_predicate_columns, std::vector({1})); + ASSERT_EQ(file_request.column_positions.size(), 2); + EXPECT_EQ(file_request.column_positions.at(0), 1); + EXPECT_EQ(file_request.column_positions.at(1), 0); +} + +TEST(TableReaderTest, 
OpenReaderPushesMultiColumnConjunctToParquetReader) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_multi_conjunct_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {1, 5, 8}, {"one", "two", "three"}); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(2, "value", std::make_shared())); + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + projected_columns.push_back(make_table_column(1, "score", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext( + std::make_shared(0, 0, 1, + 1, 8)), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + // The conjunct references both id and score, so ColumnMapper must put both file columns into + // predicate_columns and rewrite both slot refs to ParquetReader's file-local block positions. + // ParquetReader then evaluates the expression after all predicate columns have been read. 
+ Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + + const auto& value_column = assert_cast(*block.get_by_position(0).column); + const auto& id_column = assert_cast(*block.get_by_position(1).column); + const auto& score_column = assert_cast(*block.get_by_position(2).column); + ASSERT_EQ(id_column.size(), 1); + ASSERT_EQ(score_column.size(), 1); + ASSERT_EQ(value_column.size(), 1); + EXPECT_EQ(id_column.get_element(0), 3); + EXPECT_EQ(score_column.get_element(0), 8); + EXPECT_EQ(value_column.get_data_at(0).to_string(), "three"); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, ProjectedColumnsFillDefaultForParquetSchemaMismatch) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_schema_mismatch_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_parquet_file(file_path, 1, "one"); + + std::vector projected_columns; + projected_columns.push_back( + make_table_column(99, "missing_value", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + // The table projection asks for field id 99, but the ParquetReader exposes only file-local + // fields 0 and 1. 
Missing columns are allowed by the current mapper options, so TableReader + // should still use the Parquet row count and fill a default column in table schema. + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + EXPECT_EQ(block.get_by_position(0).column->size(), 1); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, ProjectedColumnsRejectParquetSchemaMismatchWhenMissingColumnsDisallowed) { + const auto test_dir = std::filesystem::temp_directory_path() / + "doris_table_reader_schema_mismatch_reject_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_parquet_file(file_path, 1, "one"); + + std::vector projected_columns; + projected_columns.push_back( + make_table_column(99, "missing_value", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = false, + .profile = nullptr, + }) + .ok()); + + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + // With allow_missing_columns disabled, the same missing projected column should fail while + // opening the split instead of being materialized as a default column. 
+ Block block = build_table_block(projected_columns); + bool eos = false; + const auto status = reader.get_block(&block, &eos); + ASSERT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find("does not have a matching file column"), std::string::npos); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, ProjectedPartitionColumnUsesSplitPartitionValue) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_partition_value_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_parquet_file(file_path, 1, "one"); + + std::vector projected_columns; + auto partition_column = make_table_column(1, "value", std::make_shared()); + partition_column.is_partition_key = true; + projected_columns.push_back(std::move(partition_column)); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.partition_values.emplace("value", Field::create_field("p1")); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + // The file has a physical column with the same id/name. The split partition value should still + // take precedence and be materialized by TableReader. 
+ Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + + const auto& partition_value = + assert_cast(*block.get_by_position(0).column); + ASSERT_EQ(partition_value.size(), 1); + EXPECT_EQ(partition_value.get_data_at(0).to_string(), "p1"); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergVirtualColumnsUseRowLineageMetadata) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_virtual_columns_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}); + + std::vector projected_columns; + projected_columns.push_back( + make_table_column(100, "_row_id", make_nullable(std::make_shared()))); + projected_columns.push_back( + make_table_column(101, "_last_updated_sequence_number", + make_nullable(std::make_shared()))); + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext( + std::make_shared(0, 0, 1)), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + auto split_options = build_split_options(file_path); + set_iceberg_row_lineage_params(&split_options, 1000, 77); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + + const auto& 
id_column = assert_cast(*block.get_by_position(2).column); + + ASSERT_EQ(block.rows(), 2); + EXPECT_EQ(id_column.get_element(0), 2); + EXPECT_EQ(id_column.get_element(1), 3); + expect_nullable_int64_column_values(*block.get_by_position(0).column, {1001, 1002}); + expect_nullable_int64_column_values(*block.get_by_position(1).column, {77, 77}); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergVirtualColumnsKeepRowLineageAfterConjunctFiltering) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_virtual_columns_conjunct_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}); + + std::vector projected_columns; + projected_columns.push_back( + make_table_column(100, "_row_id", make_nullable(std::make_shared()))); + projected_columns.push_back( + make_table_column(101, "_last_updated_sequence_number", + make_nullable(std::make_shared()))); + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext( + std::make_shared(0, 0, 1)), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + auto split_options = build_split_options(file_path); + set_iceberg_row_lineage_params(&split_options, 3000, 88); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + 
ASSERT_FALSE(eos); + + const auto& id_column = assert_cast(*block.get_by_position(2).column); + + ASSERT_EQ(block.rows(), 2); + EXPECT_EQ(id_column.get_element(0), 2); + EXPECT_EQ(id_column.get_element(1), 3); + expect_nullable_int64_column_values(*block.get_by_position(0).column, {3001, 3002}); + expect_nullable_int64_column_values(*block.get_by_position(1).column, {88, 88}); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergVirtualColumnsKeepRowLineageAfterRowGroupPredicatePruning) { + const auto test_dir = std::filesystem::temp_directory_path() / + "doris_iceberg_virtual_columns_row_group_predicate_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + // ColumnPredicate is used for row-group/statistics pruning. Keep one row per row group so + // id > 2 prunes the first two row groups and leaves only the third file-local row. 
+ write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}, 1); + + std::vector projected_columns; + projected_columns.push_back( + make_table_column(100, "_row_id", make_nullable(std::make_shared()))); + projected_columns.push_back( + make_table_column(101, "_last_updated_sequence_number", + make_nullable(std::make_shared()))); + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + TableColumnPredicates column_predicates; + column_predicates[0].push_back(create_comparison_predicate( + 0, "id", std::make_shared(), Field::create_field(2), false)); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = std::move(column_predicates), + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + auto split_options = build_split_options(file_path); + set_iceberg_row_lineage_params(&split_options, 4000, 99); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + + const auto& id_column = assert_cast(*block.get_by_position(2).column); + + ASSERT_EQ(block.rows(), 1); + EXPECT_EQ(id_column.get_element(0), 3); + expect_nullable_int64_column_values(*block.get_by_position(0).column, {4002}); + expect_nullable_int64_column_values(*block.get_by_position(1).column, {99}); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergDeletionVectorUsesTableReaderDeleteFileInterface) { + TTableFormatFileDesc table_format_desc; + TIcebergFileDesc iceberg_desc; + iceberg_desc.__set_format_version(2); + 
iceberg_desc.__set_delete_files({make_iceberg_deletion_vector("dv.bin", 8, 128)}); + table_format_desc.__set_iceberg_params(iceberg_desc); + + IcebergTableReaderDeleteFileTestHelper reader; + DeleteFileDesc desc; + bool has_delete_file = false; + ASSERT_TRUE(reader.parse_deletion_vector_file(table_format_desc, &desc, &has_delete_file).ok()); + + EXPECT_TRUE(has_delete_file); + EXPECT_EQ(desc.path, "dv.bin"); + EXPECT_EQ(desc.start_offset, 8); + EXPECT_EQ(desc.size, 128); + EXPECT_EQ(desc.file_size, -1); + EXPECT_EQ(desc.format, DeleteFileDesc::Format::ICEBERG); +} + +TEST(TableReaderTest, IcebergDeletionVectorRejectsMultipleDeleteFiles) { + TTableFormatFileDesc table_format_desc; + TIcebergFileDesc iceberg_desc; + iceberg_desc.__set_format_version(2); + iceberg_desc.__set_delete_files({make_iceberg_deletion_vector("dv-a.bin", 8, 128), + make_iceberg_deletion_vector("dv-b.bin", 16, 256)}); + table_format_desc.__set_iceberg_params(iceberg_desc); + + IcebergTableReaderDeleteFileTestHelper reader; + DeleteFileDesc desc; + bool has_delete_file = false; + auto status = reader.parse_deletion_vector_file(table_format_desc, &desc, &has_delete_file); + + EXPECT_FALSE(status.ok()); +} + +TEST(TableReaderTest, IcebergTableReaderAppliesDeletionVectorFile) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_deletion_vector_file_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + const auto dv_path = (test_dir / "delete-vector.bin").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3, 4, 5}, {10, 20, 30, 40, 50}, + {"one", "two", "three", "four", "five"}); + const auto dv_size = write_iceberg_deletion_vector_file(dv_path, {0, 4}); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeProfile profile("test_profile"); + RuntimeState state {TQueryOptions(), 
TQueryGlobals()}; + auto scan_params = make_local_parquet_scan_params(); + io::FileReaderStats file_reader_stats; + io::FileCacheStatistics file_cache_stats; + auto io_ctx = make_io_context(&file_reader_stats, &file_cache_stats); + ShardedKVCache cache(1); + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = &scan_params, + .io_ctx = io_ctx, + .runtime_state = &state, + .scanner_profile = &profile, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::COUNT, + .profile = make_table_read_profile(&profile), + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.cache = &cache; + split_options.current_range.__set_table_format_params(make_iceberg_table_format_desc( + file_path, {make_iceberg_deletion_vector(dv_path, 0, dv_size)})); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + EXPECT_EQ(read_iceberg_ids(&reader, projected_columns), std::vector({2, 3, 4})); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergTableReaderDoesNotPushDownAggregateWithDeletes) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_aggregate_delete_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + const auto dv_path = (test_dir / "delete-vector.bin").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}); + const auto dv_size = write_iceberg_deletion_vector_file(dv_path, {0}); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeProfile profile("test_profile"); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + auto 
scan_params = make_local_parquet_scan_params(); + io::FileReaderStats file_reader_stats; + io::FileCacheStatistics file_cache_stats; + auto io_ctx = make_io_context(&file_reader_stats, &file_cache_stats); + ShardedKVCache cache(1); + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = &scan_params, + .io_ctx = io_ctx, + .runtime_state = &state, + .scanner_profile = &profile, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::COUNT, + .profile = make_table_read_profile(&profile), + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.cache = &cache; + split_options.current_range.__set_table_format_params(make_iceberg_table_format_desc( + file_path, {make_iceberg_deletion_vector(dv_path, 0, dv_size)})); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 2); + const auto& id_column = assert_cast(*block.get_by_position(0).column); + EXPECT_EQ(id_column.get_element(0), 2); + EXPECT_EQ(id_column.get_element(1), 3); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergTableReaderDoesNotPushDownAggregateWithPositionDelete) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_aggregate_position_delete_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + const auto delete_file_path = (test_dir / "position-delete.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}); + 
write_position_delete_parquet_file(delete_file_path, {file_path}, {1}); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeProfile profile("test_profile"); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + auto scan_params = make_local_parquet_scan_params(); + io::FileReaderStats file_reader_stats; + io::FileCacheStatistics file_cache_stats; + auto io_ctx = make_io_context(&file_reader_stats, &file_cache_stats); + ShardedKVCache cache(1); + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = &scan_params, + .io_ctx = io_ctx, + .runtime_state = &state, + .scanner_profile = &profile, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::COUNT, + .profile = make_table_read_profile(&profile), + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.cache = &cache; + split_options.current_range.__set_table_format_params(make_iceberg_table_format_desc( + file_path, {make_iceberg_position_delete_file(delete_file_path)})); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 2); + const auto& id_column = assert_cast(*block.get_by_position(0).column); + EXPECT_EQ(id_column.get_element(0), 1); + EXPECT_EQ(id_column.get_element(1), 3); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergPositionDeleteFallsBackToSplitPath) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_position_delete_path_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto 
file_path = (test_dir / "split.parquet").string(); + const auto delete_file_path = (test_dir / "position-delete.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}); + write_position_delete_parquet_file(delete_file_path, {file_path}, {1}); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeProfile profile("test_profile"); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + auto scan_params = make_local_parquet_scan_params(); + io::FileReaderStats file_reader_stats; + io::FileCacheStatistics file_cache_stats; + auto io_ctx = make_io_context(&file_reader_stats, &file_cache_stats); + ShardedKVCache cache(1); + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = &scan_params, + .io_ctx = io_ctx, + .runtime_state = &state, + .scanner_profile = &profile, + .allow_missing_columns = true, + .profile = make_table_read_profile(&profile), + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.cache = &cache; + TTableFormatFileDesc table_format_params; + TIcebergFileDesc iceberg_params; + iceberg_params.__set_format_version(2); + iceberg_params.__set_delete_files({make_iceberg_position_delete_file(delete_file_path)}); + table_format_params.__set_iceberg_params(iceberg_params); + split_options.current_range.__set_table_format_params(table_format_params); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + EXPECT_EQ(read_iceberg_ids(&reader, projected_columns), std::vector({1, 3})); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergTableReaderDoesNotPushDownAggregateWithEqualityDelete) { + const auto test_dir = + std::filesystem::temp_directory_path() / 
"doris_iceberg_aggregate_equality_delete_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + const auto delete_file_path = (test_dir / "equality-delete.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}); + write_iceberg_equality_delete_parquet_file(delete_file_path, 0, 2); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeProfile profile("test_profile"); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + auto scan_params = make_local_parquet_scan_params(); + io::FileReaderStats file_reader_stats; + io::FileCacheStatistics file_cache_stats; + auto io_ctx = make_io_context(&file_reader_stats, &file_cache_stats); + ShardedKVCache cache(1); + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = &scan_params, + .io_ctx = io_ctx, + .runtime_state = &state, + .scanner_profile = &profile, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::COUNT, + .profile = make_table_read_profile(&profile), + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.cache = &cache; + split_options.current_range.__set_table_format_params(make_iceberg_table_format_desc( + file_path, {make_iceberg_equality_delete_file(delete_file_path, {0})})); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 2); + const auto& id_column = assert_cast(*block.get_by_position(0).column); + EXPECT_EQ(id_column.get_element(0), 1); + 
EXPECT_EQ(id_column.get_element(1), 3); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergEqualityDeleteCastsDataColumnToDeleteKeyType) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_equality_delete_cast_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + const auto delete_file_path = (test_dir / "equality-delete.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}); + write_iceberg_equality_delete_bigint_parquet_file(delete_file_path, 0, 2); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeProfile profile("test_profile"); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + auto scan_params = make_local_parquet_scan_params(); + io::FileReaderStats file_reader_stats; + io::FileCacheStatistics file_cache_stats; + auto io_ctx = make_io_context(&file_reader_stats, &file_cache_stats); + ShardedKVCache cache(1); + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = &scan_params, + .io_ctx = io_ctx, + .runtime_state = &state, + .scanner_profile = &profile, + .allow_missing_columns = true, + .profile = make_table_read_profile(&profile), + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.cache = &cache; + split_options.current_range.__set_table_format_params(make_iceberg_table_format_desc( + file_path, {make_iceberg_equality_delete_file(delete_file_path, {0})})); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + EXPECT_EQ(read_iceberg_ids(&reader, projected_columns), std::vector({1, 3})); + + 
ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergPositionDeleteOnlyMatchesOriginalDataFilePath) { + const auto test_dir = std::filesystem::temp_directory_path() / + "doris_iceberg_position_delete_path_match_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + const auto other_file_path = (test_dir / "other.parquet").string(); + const auto delete_file_path = (test_dir / "position-delete.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}); + write_position_delete_parquet_file(delete_file_path, {other_file_path, file_path}, {0, 1}); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeProfile profile("test_profile"); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + auto scan_params = make_local_parquet_scan_params(); + io::FileReaderStats file_reader_stats; + io::FileCacheStatistics file_cache_stats; + auto io_ctx = make_io_context(&file_reader_stats, &file_cache_stats); + ShardedKVCache cache(1); + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = &scan_params, + .io_ctx = io_ctx, + .runtime_state = &state, + .scanner_profile = &profile, + .allow_missing_columns = true, + .profile = make_table_read_profile(&profile), + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.cache = &cache; + split_options.current_range.__set_table_format_params(make_iceberg_table_format_desc( + file_path, {make_iceberg_position_delete_file(delete_file_path)})); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + EXPECT_EQ(read_iceberg_ids(&reader, 
projected_columns), std::vector({1, 3})); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergRowLineageRemainsFileLocalAfterDeleteFiltering) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_row_lineage_delete_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + const auto delete_file_path = (test_dir / "position-delete.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}); + write_position_delete_parquet_file(delete_file_path, {file_path}, {1}); + + std::vector projected_columns; + projected_columns.push_back( + make_table_column(100, "_row_id", make_nullable(std::make_shared()))); + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeProfile profile("test_profile"); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + auto scan_params = make_local_parquet_scan_params(); + io::FileReaderStats file_reader_stats; + io::FileCacheStatistics file_cache_stats; + auto io_ctx = make_io_context(&file_reader_stats, &file_cache_stats); + ShardedKVCache cache(1); + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = &scan_params, + .io_ctx = io_ctx, + .runtime_state = &state, + .scanner_profile = &profile, + .allow_missing_columns = true, + .profile = make_table_read_profile(&profile), + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.cache = &cache; + TTableFormatFileDesc table_format_params = make_iceberg_table_format_desc( + file_path, {make_iceberg_position_delete_file(delete_file_path)}); + table_format_params.iceberg_params.__set_first_row_id(1000); + 
split_options.current_range.__set_table_format_params(table_format_params); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 2); + expect_nullable_int64_column_values(*block.get_by_position(0).column, {1000, 1002}); + const auto& id_column = assert_cast(*block.get_by_position(1).column); + EXPECT_EQ(id_column.get_element(0), 1); + EXPECT_EQ(id_column.get_element(1), 3); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergTableReaderAppliesPositionDeleteFile) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_position_delete_file_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + const auto delete_file_path = (test_dir / "position-delete.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3, 4, 5}, {10, 20, 30, 40, 50}, + {"one", "two", "three", "four", "five"}); + write_position_delete_parquet_file(delete_file_path, {file_path, file_path}, {1, 3}); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeProfile profile("test_profile"); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + auto scan_params = make_local_parquet_scan_params(); + io::FileReaderStats file_reader_stats; + io::FileCacheStatistics file_cache_stats; + auto io_ctx = make_io_context(&file_reader_stats, &file_cache_stats); + ShardedKVCache cache(1); + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = &scan_params, + .io_ctx = io_ctx, + 
.runtime_state = &state, + .scanner_profile = &profile, + .allow_missing_columns = true, + .profile = make_table_read_profile(&profile), + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.cache = &cache; + split_options.current_range.__set_table_format_params(make_iceberg_table_format_desc( + file_path, {make_iceberg_position_delete_file(delete_file_path)})); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + EXPECT_EQ(read_iceberg_ids(&reader, projected_columns), std::vector({1, 3, 5})); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergTableReaderMergesDeletionVectorAndPositionDeleteFiles) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_delete_files_merge_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + const auto dv_path = (test_dir / "delete-vector.bin").string(); + const auto position_delete_path = (test_dir / "position-delete.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3, 4, 5}, {10, 20, 30, 40, 50}, + {"one", "two", "three", "four", "five"}); + const auto dv_size = write_iceberg_deletion_vector_file(dv_path, {0}); + write_position_delete_parquet_file(position_delete_path, {file_path, file_path}, {3, 3}); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeProfile profile("test_profile"); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + auto scan_params = make_local_parquet_scan_params(); + io::FileReaderStats file_reader_stats; + io::FileCacheStatistics file_cache_stats; + auto io_ctx = make_io_context(&file_reader_stats, &file_cache_stats); + ShardedKVCache cache(1); + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + 
.column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = &scan_params, + .io_ctx = io_ctx, + .runtime_state = &state, + .scanner_profile = &profile, + .allow_missing_columns = true, + .profile = make_table_read_profile(&profile), + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.cache = &cache; + split_options.current_range.__set_table_format_params(make_iceberg_table_format_desc( + file_path, {make_iceberg_deletion_vector(dv_path, 0, dv_size), + make_iceberg_position_delete_file(position_delete_path)})); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + EXPECT_EQ(read_iceberg_ids(&reader, projected_columns), std::vector({2, 3, 5})); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, RowPositionDeletePredicateColumnIsNotRepeatedAsOutputColumn) { + const auto row_position_column_id = + doris::parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID; + std::vector projected_columns; + projected_columns.push_back( + make_table_column(100, "_row_id", make_nullable(std::make_shared()))); + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + IcebergTableReaderScanRequestTestHelper reader; + ASSERT_TRUE(reader.init_for_scan_request_test(projected_columns).ok()); + + FileScanRequest request; + request.non_predicate_columns.push_back(0); + request.column_positions.emplace(0, 0); + + ASSERT_TRUE(reader.customize_request(&request).ok()); + + EXPECT_EQ(request.predicate_columns, std::vector({row_position_column_id})); + EXPECT_EQ(request.non_predicate_columns, std::vector({0})); + ASSERT_TRUE(request.column_positions.contains(row_position_column_id)); + EXPECT_EQ(request.column_positions.at(row_position_column_id), 1); + ASSERT_TRUE(request.conjuncts.empty()); + ASSERT_EQ(request.delete_conjuncts.size(), 1); + EXPECT_NE(request.delete_conjuncts[0], nullptr); +} + +TEST(TableReaderTest, 
ParquetReaderReadsOnlyRowGroupsInFileRange) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_file_range_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, + {"range_group_one", "range_group_two", "range_group_three"}, 1); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + projected_columns.push_back(make_table_column(2, "value", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + ASSERT_TRUE(reader.prepare_split(build_split_options_for_row_group_mid(file_path, 1)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& value_column = assert_cast(*block.get_by_position(1).column); + ASSERT_EQ(block.rows(), 1); + EXPECT_EQ(id_column.get_element(0), 2); + EXPECT_EQ(value_column.get_data_at(0).to_string(), "range_group_two"); + + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + EXPECT_TRUE(eos); + EXPECT_EQ(block.rows(), 0); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, ProjectedColumnsUseMapperExpressionForSameNameDifferentIdParquetSchema) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_same_name_diff_id_test"; + 
std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_parquet_file(file_path, 1, "one"); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(99, "id", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + // The table column has the same name as the Parquet field, but a different field id. + // ColumnMapper should still resolve it by name and build a SlotRef projection from the file + // column into the requested table column. 
+ Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + + const auto& id_column = assert_cast(*block.get_by_position(0).column); + ASSERT_EQ(id_column.size(), 1); + EXPECT_EQ(id_column.get_element(0), 1); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, ProjectedColumnsUseMapperExpressionsForParquetSchemaMismatch) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_mapper_expr_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_parquet_file(file_path, 7, "seven"); + + std::vector projected_columns; + projected_columns.push_back( + make_table_column(0, "table_id", std::make_shared())); + projected_columns.push_back( + make_table_column(1, "table_value", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + // The table projection is intentionally different from the Parquet schema: + // field id 0 is requested as BIGINT instead of the file INT, so ColumnMapper should build a + // Cast expression; field id 1 has a different table name but the same type, so it should build + // a SlotRef projection. Both columns should still materialize in table schema order. 
+ Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + + ASSERT_EQ(block.get_by_position(0).name, "table_id"); + ASSERT_EQ(block.get_by_position(1).name, "table_value"); + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& value_column = assert_cast(*block.get_by_position(1).column); + ASSERT_EQ(id_column.size(), 1); + ASSERT_EQ(value_column.size(), 1); + EXPECT_EQ(id_column.get_element(0), 7); + EXPECT_EQ(value_column.get_data_at(0).to_string(), "seven"); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +} // namespace +} // namespace doris::reader diff --git a/docs/doris-arrow-parquet-complex-types-implementation.md b/docs/doris-arrow-parquet-complex-types-implementation.md new file mode 100644 index 00000000000000..1ee0dabc944fe7 --- /dev/null +++ b/docs/doris-arrow-parquet-complex-types-implementation.md @@ -0,0 +1,559 @@ +# Doris Arrow Parquet Reader 复杂类型完整支持方案 + +本文档描述 `be/src/format/new_parquet/` 新 Parquet reader 对 `STRUCT`、`LIST`、`MAP` 复杂类型的完整支持方案。 + +目标是在现有 file-local reader 边界内补齐复杂类型读取能力: + +- 继续复用 Arrow C++ Parquet core API 解析文件、row group、column chunk 和 leaf value。 +- 输出仍然是 Doris `Block` / `Column`,不引入 `parquet::arrow::FileReader`、`arrow::RecordBatch` 或 `arrow::Table` 作为 scan 输出路径。 +- `ParquetReader` 仍只理解 Parquet file-local schema,不处理 Iceberg/global schema evolution。 +- schema change、default/generated/partition column、delete、virtual column 仍由 `TableReader` / `TableColumnMapper` 负责。 +- 复杂类型读取必须以 Parquet definition level / repetition level 为准,不能依赖简单 row count 拼接。 +- 复杂类型列裁剪是本轮实现目标:读取 top-level complex column 时,只读取被请求的 child subtree。 +- 复杂类型 schema change 不在本轮实现,但本轮设计必须保留 field id、path、level 和 projection tree 边界,保证后续可以在 `TableColumnMapper` 中补齐 child-level mapping。 + +## 参考实现:DuckDB Parquet Reader + +参考目录: + +```text +/Users/xiaogangsu/code/duckdb/extension/parquet/ +``` + +重点参考文件: + +```text 
+extension/parquet/include/parquet_column_schema.hpp +extension/parquet/include/column_reader.hpp +extension/parquet/parquet_reader.cpp +extension/parquet/column_reader.cpp +extension/parquet/reader/struct_column_reader.cpp +extension/parquet/reader/list_column_reader.cpp +``` + +DuckDB 中值得借鉴的核心结构: + +- `ParquetColumnSchema` 保存 `max_define`、`max_repeat`、`schema_index`、`column_index`,schema tree 本身携带 Dremel level 信息。 +- `ParseSchemaRecursive()` 在解析 schema 时递增 definition/repetition level,并把 legacy repeated field、3-level LIST、MAP/MAP_KEY_VALUE 统一成 reader 可消费的 schema tree。 +- primitive reader 读取 leaf value 的同时输出 definition/repetition level。 +- struct reader 递归读取 children,并用 child 输出的 definition level 设置 struct null。 +- list/map reader 不直接按 row 数读取 child;它读取 child leaf stream,根据当前 list/map 层的 repetition level 折叠出 parent rows、offsets 和 null map。 +- skip/select 是 reader 级语义,不是 column filter fallback;复杂类型 skip 也必须消费对应的 level stream,保证所有 child reader 游标一致。 + +Doris 不需要照搬 DuckDB 的 thrift/page decoder;当前方案仍优先封装 Arrow internal `RecordReader`。但 DuckDB 的 reader 分层和 level 组装模型应作为 Doris 复杂类型支持的主参考。 + +## 当前 Doris 状态 + +现有文件: + +```text +be/src/format/new_parquet/parquet_reader.* +be/src/format/new_parquet/column_reader.* +be/src/format/new_parquet/parquet_column_schema.* +be/src/format/new_parquet/parquet_type.* +be/src/format/new_parquet/selection_vector.h +``` + +已有能力: + +- schema builder 可以识别 `STRUCT`、`LIST`、`MAP`,并生成 `DataTypeStruct`、`DataTypeArray`、`DataTypeMap`。 +- `ScalarColumnReader` 支持 flat primitive/string/decimal/date/time/timestamp。 +- `StructColumnReader` 递归读取 children,支持非常基础的非 nullable struct。 +- `ColumnReader::select()` 已经定义为 `skip + read` 的 selected read,不退化为整批读取后过滤。 + +主要缺口: + +- `ParquetColumnSchema` 没有保存完整 `max_definition_level` / `max_repetition_level` 和各复杂节点的 level 边界。 +- `ScalarColumnReader` 当前只支持 `max_repetition_level == 0 && max_definition_level <= 1`。 +- primitive reader 没有向 parent reader 暴露 leaf definition/repetition level stream。 +- nullable 
struct、list、map 没有 assembler。 +- repeated primitive、legacy repeated group、嵌套 list/map/struct 没有统一 schema 规约。 +- `skip(rows)` 对复杂类型还不是 parent-row 语义。 + +## 总体设计 + +复杂类型读取分为两层: + +```text +ParquetReader + -> ParquetColumnReader public API + read(parent_rows, output_column, rows_read) + skip(parent_rows) + select(selection, selected_rows, batch_rows, output_column) + -> Nested read API + read_nested(parent_rows, level_state, output_column, rows_read) + skip_nested(parent_rows, level_state) + -> Leaf RecordReader adapter + read leaf values + definition levels + repetition levels + -> Dremel assembler + Struct / List / Map build Doris columns +``` + +对 `ParquetReader` 来说,接口仍然是 top-level file-local row batch;复杂类型细节只存在于 `column_reader.*` 内部。 + +### 关键原则 + +- public `read(rows)` 和 `skip(rows)` 的 `rows` 始终表示当前 reader 对外暴露的 parent rows。 +- leaf reader 内部可以读取更多 physical records,但不能把 physical value count 泄露给 `ParquetReader`。 +- list/map 的 offsets 只能由 repetition level 生成,不能用 child column size 推断。 +- nullable 信息只能由 definition level 生成,不能通过 value 缺失猜测。 +- 所有复杂类型 reader 必须保持 child reader 游标严格同步;遇到不一致 level stream 应返回 `Corruption`。 +- 复杂类型 reader 不处理 table/global schema change;child-level schema evolution 后续在 `TableColumnMapper` 处理。 +- 复杂类型 reader 必须支持 file-local child projection。未投影 child 不创建 leaf reader,不读取对应 column chunk,不参与 value materialization。 +- 即使 child 被裁剪,也必须保留足够的 schema/path/level 元数据,使后续 schema change 可以把 table child 映射到 file child、default child 或 cast projection。 + +## Schema 扩展 + +扩展 `ParquetColumnSchema`: + +```text +struct ParquetColumnSchema { + int field_id; + int top_level_field_id; + int leaf_column_id; + int schema_node_id; + int parent_schema_node_id; + std::vector<int> file_path; + std::vector<int> field_id_path; + std::vector<std::string> name_path; + std::string name; + DataTypePtr type; + ParquetColumnSchemaKind kind; + const parquet::schema::Node* node; + const parquet::ColumnDescriptor* descriptor; + ParquetTypeDescriptor type_descriptor; + int16_t max_definition_level; +
int16_t max_repetition_level; + int16_t nullable_definition_level; + int16_t repeated_repetition_level; + std::vector<std::unique_ptr<ParquetColumnSchema>> children; +}; +``` + +字段含义: + +- `schema_node_id`:Parquet schema tree 中的 node ordinal,用于 debug、error message、field id tracing。 +- `top_level_field_id`:FileScanRequest 使用的 file-local top-level id。 +- `leaf_column_id`:Parquet physical leaf column ordinal。复杂节点为 `-1`。 +- `file_path`:从 top-level field 到当前节点的 file-local child ordinal path,例如 `profile.address.city` 可以表示为 `[3, 0, 1]`。 +- `field_id_path`:从 top-level field 到当前节点的 Parquet field id path。缺失 field id 时使用 `-1` 占位,不在 file reader 层解释 Iceberg 语义。 +- `name_path`:从 top-level field 到当前节点的 Parquet node name path,用于 by-name fallback、error message 和后续 schema change。 +- `max_definition_level` / `max_repetition_level`:该节点下 leaf stream 的最大 level。复杂节点取其 subtree leaf 的约束值。 +- `nullable_definition_level`:该节点自身从 null 变成 defined 所需的 definition level。required 节点为 parent level,不额外增加。 +- `repeated_repetition_level`:该 repeated/list/map 层对应的 repetition level。非 repeated 节点为 parent level。 + +Schema builder 改造: + +- 从 root 递归解析,每进入 optional 节点 `definition_level + 1`。 +- 每进入 repeated 节点 `definition_level + 1` 且 `repetition_level + 1`。 +- 识别标准 3-level LIST: + +```text +optional group a (LIST) { + repeated group list { + optional element_type element; + } +} +``` + +- 识别 legacy repeated primitive/group: + +```text +repeated int32 a; +repeated group a { ...
} +``` + +并规约为 Doris `Array(element_type)`。 + +- 识别 MAP/MAP_KEY_VALUE: + +```text +optional group m (MAP) { + repeated group key_value { + required key_type key; + optional value_type value; + } +} +``` + +并规约为 Doris `Map(key_type, value_type)`。 + +- MAP key 按 Parquet 规范应为 required。若文件声明 nullable key,应在 schema 阶段返回 `NotSupported` 或 `Corruption`,不生成可继续执行的 reader。 + +## 复杂类型列裁剪 + +复杂类型列裁剪应在 file-local 层实现,语义是“只读取投影需要的 child subtree”,不是 table schema evolution。 + +建议扩展 `reader::FileScanRequest`,增加嵌套 projection tree: + +```text +struct FieldProjection { + ColumnId file_column_id; + std::vector<int> file_path; + bool project_all_children; + std::vector<FieldProjection> children; +}; + +struct FileScanRequest { + std::vector<ColumnId> predicate_columns; + std::vector<ColumnId> non_predicate_columns; + std::map<ColumnId, size_t> column_positions; + std::map<ColumnId, FieldProjection> complex_projections; + ... +}; +``` + +约束: + +- `predicate_columns` / `non_predicate_columns` 仍表示 top-level file-local fields。 +- `complex_projections` 只描述 top-level complex field 内部需要读取哪些 child。 +- 没有出现在 `complex_projections` 的 top-level complex field 默认 `project_all_children = true`,保持兼容。 +- 对 `STRUCT`,允许只投影部分 children,输出 `DataTypeStruct` 只包含被投影 children,child 顺序保持 file schema 顺序。 +- 对 `LIST`,允许裁剪 element subtree。例如 `Array(Struct<a,b,c>)` 投影 `a,c` 时,输出 `Array(Struct<a,c>)`。 +- 对 `MAP`,key 永远需要读取并输出;value subtree 可以裁剪。例如 `Map<K, Struct<a,b>>` 投影 value.a 时,输出 `Map<K, Struct<a>>`。 +- 对 nullable parent,parent null map 和 offsets 必须完整生成;裁剪只影响 child value materialization,不能影响 parent row shape。 +- 对所有 children 都被裁剪的 `STRUCT`,仍要能够根据某个保留的 level-driving child 生成 parent row/null 形态。第一版可以要求至少保留一个 child;如果上层真的只需要 parent 存在性,后续补充 `NullShapeColumnReader`。 + +`ParquetColumnReaderFactory` 应接收 projection tree: + +```text +Status create(const ParquetColumnSchema& column_schema, + const FieldProjection* projection, + std::unique_ptr<ParquetColumnReader>* reader) const; +``` + +实现要求: + +- factory 只为投影中的 leaf 创建 `ScalarColumnReader`。 +- struct/list/map reader 保存 child reader slot;未投影 child 用 `nullptr` 表示,参考 DuckDB `StructColumnReader` 的 child reader 布局。 +-
`TotalCompressedSize`、prefetch、statistics 等后续能力只能统计已投影 leaf。 +- 对 top-level output block,`TableReader` 需要使用 projection 后的 `SchemaField` / `DataTypePtr` 构建 block template,而不是原始完整 file schema。 + +列裁剪与延时物化的关系: + +- predicate complex child projection 和 output complex child projection 需要合并,避免同一 leaf 重复读取。 +- 如果 predicate 只依赖 complex child,FileScanRequest 应能表达该 child path 是 predicate projection。 +- 本轮可以先支持 output child pruning;predicate child pruning 可在 batch 内 complex predicate 接入时补齐,但 projection tree 的结构必须现在预留。 + +## Schema Change 兼容边界 + +复杂类型 schema change 不在本轮实现,原因是它涉及 table/global schema、Iceberg field id、default value、cast、generated column 和 filter fallback,属于 `TableColumnMapper` / `TableReader` 范围。 + +但本轮实现必须保证后续可扩展: + +- file schema 中每个 node 都必须导出 `file_path`、`field_id_path`、`name_path`、file-local type 和 child schema。 +- reader 内部不得把 `SchemaField::id` 同时当作 Iceberg field id 和 file-local column id。top-level scan id 只表示 file-local top-level ordinal。 +- `TableColumnMapper` 后续可以根据 table child field id/name path 生成 `FieldProjection`,也可以为缺失 child 生成 default/constant/finalize projection。 +- file reader 输出的 pruned complex type 是 file-local projected type;table reader 负责把它 finalize 成 table/global type。 +- filter localization 后续可以定位到 complex child path。无法安全定位或需要 cast 的 filter 进入 `reader_expression_map` 或 table-level finalize filter。 +- 不在 `ParquetReader` 中补缺失 child,不在 `ParquetReader` 中做 child cast,不在 `ParquetReader` 中解释 Iceberg field id。 + +后续 schema change 的目标形态: + +```text +table projection/filter + -> TableColumnMapper child-level mapping + -> FieldProjection(file-local child paths) + -> ParquetReader reads projected file-local complex block + -> TableReader fills default/generated/partition children + -> TableReader applies child cast/finalize/delete/virtual semantics +``` + +因此,本轮列裁剪实现时不能把 output type 和 original file type 强绑定。所有 `ColumnReader` 创建和 block template 构造都应基于 projected schema view。 + +## Level 读取抽象 + +新增内部结构,位置建议: + +```text +be/src/format/new_parquet/level.h 
+be/src/format/new_parquet/level.cpp +``` + +核心结构: + +```text +struct LevelBatch { + int64_t record_count; + int64_t value_count; + std::vector definition_levels; + std::vector repetition_levels; +}; + +struct NestedReadResult { + int64_t parent_rows; + int64_t physical_records; +}; +``` + +`ScalarColumnReader` 内部新增 leaf read 路径: + +```text +read_leaf_records(max_records, decoded_values, level_batch) +skip_leaf_records(max_records, level_batch) +``` + +要求: + +- Arrow internal `RecordReader` 的创建和调用继续封装在 `column_reader.*`,不能泄露到 `ParquetReader`。 +- flat primitive 保持当前 `read()` 快路径。 +- nested primitive 必须允许 `max_repetition_level > 0` 或 `max_definition_level > 1`,并输出 definition/repetition levels。 +- `DecodedColumnView::row_count` 对 nested leaf 应表示 value slots 数量,null slot 由 definition level 决定。 + +如果 Arrow internal `RecordReader` 无法稳定提供 Doris 需要的 level/value 对齐语义,则新增 Doris 自己的 leaf page decoder,范围仍限制在 `format/new_parquet/`,不要把 page decoder 细节扩散到 `ParquetReader` 主流程。 + +## Reader 分层 + +建议拆分 `column_reader.cpp`,避免复杂类型 assembler 混在 scalar 读值热路径: + +```text +be/src/format/new_parquet/column_reader.h +be/src/format/new_parquet/column_reader.cpp +be/src/format/new_parquet/scalar_column_reader.cpp +be/src/format/new_parquet/struct_column_reader.cpp +be/src/format/new_parquet/list_column_reader.cpp +be/src/format/new_parquet/map_column_reader.cpp +be/src/format/new_parquet/level.h +be/src/format/new_parquet/level.cpp +``` + +### ScalarColumnReader + +职责: + +- 读取 primitive leaf values。 +- 生成 leaf-level definition/repetition level。 +- 对 flat column 直接写 Doris scalar/nullable column。 +- 对 nested leaf 只作为 child reader 被复杂类型 assembler 调用。 + +flat path: + +```text +read(rows) + -> RecordReader::ReadRecords(rows) + -> DecodedColumnView + -> DataTypeSerDe::read_column_from_decoded_values +``` + +nested path: + +```text +read_nested(parent_rows, level_state) + -> read leaf records until parent_rows complete + -> append valid leaf values into child column + -> expose level_batch to parent 
assembler +``` + +### StructColumnReader + +输出: + +- non-nullable struct:`ColumnStruct`。 +- nullable struct:`ColumnNullable(ColumnStruct, null_map)`。 + +算法: + +1. 对每个 child reader 读取同样的 parent row count。 +2. child reader 返回的 parent rows 必须一致。 +3. struct 自身 nullable 时,根据 definition level 判断 struct row 是否 null。 +4. 对 null struct row,每个 child column 仍必须补一个 default/null slot,保证 `ColumnStruct` 所有 child size 等于 struct row count。 +5. child 本身的 null 由 child reader 自己根据更深层 definition level 处理。 + +注意: + +- 当前实现仅递归读取 children,没有处理 nullable struct;应改为显式处理 struct-level null map。 +- 对未投影 children 不创建 reader、不写入 output `ColumnStruct`。 +- 对所有 children 都未投影的 struct,第一版可以返回 `NotSupported`,后续用 shape-only reader 支持 parent 存在性读取。 + +### ListColumnReader + +输出: + +- non-nullable array:`ColumnArray(element_column, offsets)`。 +- nullable array:`ColumnNullable(ColumnArray, null_map)`。 + +核心算法参考 DuckDB list reader: + +1. 从 child reader 读取 leaf stream,获得 child values、definition levels、repetition levels。 +2. 根据当前 list 层的 `repeated_repetition_level` 判断一个 child record 是否属于当前 list: + - `rep == list_repetition_level`:当前 list 的后续 element。 + - `rep < list_repetition_level`:新的 parent row 开始。 +3. 根据 definition level 判断 parent row 状态: + - `def < list_defined_level`:null list。 + - `def == empty_list_level`:empty list。 + - `def >= element_defined_level`:有 element。 +4. 对每个 parent row 写一个 offset。 +5. 
只有 element defined 时向 element column append value;empty/null list 不 append element。 + +需要维护 overflow: + +- child reader 一次读取可能跨过本次 `parent_rows` 的边界。 +- list reader 必须缓存未消费的 child values 和 levels,下一次 `read()` 继续使用。 +- 该缓存是 reader 游标状态的一部分,`skip()` 和 `read()` 都必须共享。 + +### MapColumnReader + +输出: + +- non-nullable map:`ColumnMap(key_column, value_column, offsets)`。 +- nullable map:`ColumnNullable(ColumnMap, null_map)`。 + +实现方式: + +- 按 Parquet schema 将 map 规约为 `LIST<STRUCT<key, value>>` 的 level stream。 +- 复用 list assembler 的 parent row 边界判断。 +- 对每个 entry: + - key 必须 defined;key 缺失是文件格式错误。 + - value 可 nullable;由 value child definition level 生成 value null map。 +- append entry 时分别写 key column 和 value column。 +- offsets 表示每个 map row 的 entry 数。 + +不要把 `MAP` 先 materialize 成 `Array(Struct(key,value))` 再转换为 `ColumnMap`,否则会产生额外内存和拷贝。可以在内部复用 list 的边界识别逻辑,但直接写 `ColumnMap` 的 keys/values/offsets。 + +## Skip 和 Select + +public 语义保持不变: + +```text +skip(parent_rows) +select(selection, selected_rows, batch_rows, column) +``` + +复杂类型要求: + +- `skip()` 必须消费 parent rows 对应的所有 child physical records 和 level stream。 +- `select()` 继续使用现有 range 合并策略,即按 selected row ranges 调用 `skip()` + `read()`。 +- list/map 的 `skip()` 不能只跳过 child value count;必须按 repetition level 找到 parent row 边界。 +- empty selection 时必须跳过整个 batch 的 parent rows,保证 reader 游标推进。 + +第一阶段不实现 page-level row range selection;只保证 `skip + read` 的 selected read 正确。 + +## 与 ParquetReader Scan Loop 的关系 + +`ParquetReader::_read_current_row_group_batch()` 不需要理解复杂类型: + +- predicate columns 仍先读。 +- non-predicate columns 仍根据 selection 调用 `read()` 或 `select()`。 +- column reader 自己负责 complex column 的 parent-row 语义。 + +限制: + +- 初期不支持复杂类型直接作为 filter column 执行 batch predicate。 +- row group statistics 仍只对 primitive leaf 做保守裁剪。 +- complex child-level projection 是本轮 reader 实现目标;但 complex child predicate 执行和 schema change finalize 不在本轮完成。 + +## 错误处理 + +遇到明确违反 Parquet spec 或 reader invariant 的情况,应返回错误或触发检查,不能静默修复: + +- MAP key nullable 或 key definition level 缺失。 +- 同一 
struct 的 children parent row count 不一致。 +- list/map repetition level 非法回退或超过当前 schema 最大值。 +- leaf reader 返回的 value count、definition/repetition level 数量不一致。 +- child reader overflow 状态与下一次 read/skip 请求冲突。 + +对合法但暂未支持的编码形态返回 `NotSupported`,例如后续若发现 Arrow internal `RecordReader` 无法支持某类 nested level 输出。 + +## 测试计划 + +新增或扩展 BE UT: + +```text +be/test/format/new_parquet/parquet_complex_reader_test.cpp +``` + +优先用 Arrow writer 生成小 Parquet 文件,覆盖: + +- required struct。 +- optional struct。 +- struct child nullable。 +- array of primitive:null array、empty array、array with null element。 +- array of struct。 +- nested array:`Array(Array(String))`。 +- map:empty map、null map、nullable value。 +- struct containing array/map。 +- multiple row groups。 +- child projection:struct child 裁剪、array element struct child 裁剪、map value struct child 裁剪。 +- selected read:复杂列作为 non-predicate column,predicate column 过滤出稀疏 selection。 +- skip then read:直接验证复杂列 reader 游标。 + +后续回归测试: + +```text +regression-test/suites/external_table_p0/parquet_complex_types.groovy +``` + +要求: + +- 结果排序稳定,使用 `order_qt` 或显式 `order by`。 +- 错误场景使用 `test { sql; exception }`。 +- 测试前 drop table,不在测试末尾 drop,便于失败后排查。 + +## 分阶段落地 + +### 阶段 1:Schema level 信息补齐 + +- 扩展 `ParquetColumnSchema`,保存 definition/repetition level。 +- 增加 `file_path`、`field_id_path`、`name_path`,并明确 top-level file-local id 与 table field id 的边界。 +- 重写 `build_parquet_column_schema()` 的复杂类型规约逻辑。 +- 增加 schema-only UT,覆盖 LIST/MAP legacy 和 standard encodings。 + +### 阶段 1.5:Projection tree 和 projected schema view + +- 扩展 `FileScanRequest`,表达 top-level complex field 的 child projection tree。 +- 增加 projected `SchemaField` / `DataTypePtr` 构造逻辑。 +- `ParquetColumnReaderFactory` 接收 projection tree,只创建被投影 child reader。 +- 增加 child pruning UT,验证未投影 leaf 不创建 reader、不读取 column chunk。 + +### 阶段 2:Leaf level reader + +- 为 `ScalarColumnReader` 增加 nested leaf read API。 +- 去掉 `max_repetition_level == 0 && max_definition_level <= 1` 的硬限制,改成 flat path 和 nested path 分支。 +- 验证 nullable 
primitive 在 nested path 下的 value/null 对齐。 + +### 阶段 3:Struct reader 完整化 + +- 实现 nullable struct。 +- 保证 null struct row 对所有 children 插入 default/null slot。 +- 增加 required/optional struct UT。 + +### 阶段 4:List reader + +- 实现 list assembler、offset 写入、null/empty list 区分。 +- 实现 overflow child buffer。 +- 实现 list `skip()`。 +- 增加 array、nested array、array of struct UT。 + +### 阶段 5:Map reader + +- 实现 map schema 规约到 key/value children。 +- 直接写 `ColumnMap` keys、values、offsets。 +- 校验 required key。 +- 增加 map UT。 + +### 阶段 6:Selected read 和集成测试 + +- 验证 complex non-predicate column 在 lazy materialization 下正确。 +- 验证 complex projected child 在 lazy materialization 下正确。 +- 增加 sparse selection、empty selection、multi-row-group 测试。 +- 将复杂类型 reader 接入 `ParquetReader` 现有 scan loop,不改 table/global schema 边界。 + +### 阶段 7:优化和扩展 + +- complex child predicate execution。 +- complex column statistics 和 page index 支持。 +- complex predicate fallback。 +- 复杂列 schema change child-level mapping。 + +## 验收标准 + +完成“复杂类型完整支持”至少需要满足: + +- `STRUCT`、nullable `STRUCT`、`LIST`、nested `LIST`、`MAP` 可以正确读入 Doris complex columns。 +- 复杂类型 child projection 可以裁剪未请求 leaf,并输出 projected complex type。 +- null、empty、missing element/value 的语义与 Parquet definition/repetition level 一致。 +- `read()`、`skip()`、`select()` 在复杂类型上均保持 parent-row 语义。 +- flat primitive 现有测试不退化。 +- 新增 BE UT 覆盖复杂类型基础、嵌套、selected read 和 multi-row-group。 +- `ParquetReader` 不引入 table/global schema 语义。 +- schema/path/level 元数据足够后续 `TableColumnMapper` 实现 child-level schema change,不需要重写复杂类型 reader 主体。 diff --git a/docs/doris-arrow-parquet-reader-implementation.md b/docs/doris-arrow-parquet-reader-implementation.md new file mode 100644 index 00000000000000..c3acb5d8f1e7f4 --- /dev/null +++ b/docs/doris-arrow-parquet-reader-implementation.md @@ -0,0 +1,349 @@ +# Doris Arrow Parquet Reader 实现方案与当前状态 + +本文档描述 `be/src/format/new_parquet/` 下新 Parquet reader 的设计、当前实现状态和后续缺口。 + +当前目标不是替换旧 `vparquet` 路径,而是在新 reader API 下先实现一个 file-local Parquet reader: + +- 底层复用 Arrow C++ 
Parquet core API 解析文件、row group 和 column chunk。 +- 输出仍然是 Doris 自己的 `Block` 和 `Column`。 +- 不使用 `parquet::arrow::FileReader`、`arrow::RecordBatch` 或 `arrow::Table` 作为 scan 输出路径。 +- `ParquetReader` 只理解 Parquet file-local schema,不理解 Iceberg/global schema。 +- schema change、filter localization、default/generated/partition column 等 table-level 语义放在 `TableReader` 和 `TableColumnMapper`。 + +## 分层边界 + +当前分层如下: + +```text +FileScanner / TableReader / IcebergTableReader + -> TableColumnMapper + -> reader::FileScanRequest + -> doris::parquet::ParquetReader + -> DorisRandomAccessFile + -> parquet::ParquetFileReader + -> parquet::RowGroupReader + -> parquet::internal::RecordReader + -> Doris Block / Column +``` + +关键边界: + +- `TableReader` 输出 table/global schema block。 +- `ParquetReader` 输出 file-local block。 +- `TableColumnMapper` 负责把 table projection/filter 转成 file-local projection/filter。 +- `ParquetReader` 不补 default column,不物化 partition column,不处理 generated column,不做 Iceberg schema evolution。 +- 所有 table-level cast/finalize/delete/virtual column 都不能塞回 `ParquetReader`。 + +## FileReader 生命周期 + +`ParquetReader` 继承 `reader::FileReader`,当前生命周期是: + +```text +init(RuntimeState*) + -> get_schema(std::vector<SchemaField>*) + -> open(std::unique_ptr<FileScanRequest>&) + -> get_block(Block* file_block, size_t* rows, bool* eof) + -> close() +``` + +语义约束: + +- `init()` 打开物理文件并解析 Parquet footer metadata。 +- `get_schema()` 在 `init()` 成功后可调用,不要求 `open()`。 +- `open()` 接收已经 localize 的 `FileScanRequest`,并完成 row group pruning 和 reader 游标初始化。 +- `get_block()` 只能在 `open()` 成功后调用,输出 file-local block。 +- `rows` 表示本批 file-local block 输出行数,`eof` 表示当前物理文件是否读完。 + +## 代码布局 + +```text +be/src/format/new_parquet/parquet_reader.h +be/src/format/new_parquet/parquet_reader.cpp +be/src/format/new_parquet/column_reader.h +be/src/format/new_parquet/column_reader.cpp +be/src/format/new_parquet/parquet_column_schema.h +be/src/format/new_parquet/parquet_column_schema.cpp +be/src/format/new_parquet/parquet_type.h 
+be/src/format/new_parquet/parquet_type.cpp +be/src/format/new_parquet/parquet_statistics.h +be/src/format/new_parquet/parquet_statistics.cpp +be/src/format/new_parquet/selection_vector.h +``` + +职责划分: + +- `parquet_reader.*`:文件打开、schema 导出、scan state、row group 调度、谓词列优先读取、file-local block 组装。 +- `column_reader.*`:单个 Parquet 字段到 Doris column 的读取;封装 Arrow internal `RecordReader`。 +- `parquet_column_schema.*`:从 Parquet schema descriptor 构建 file-local schema tree。 +- `parquet_type.*`:解析 Parquet physical/logical/converted type,生成 Doris file-local type 和额外类型信息。 +- `parquet_statistics.*`:基于 row group metadata 做保守的统计信息裁剪。 +- `selection_vector.h`:表达 batch 内被选中的 row offset,用于延时物化。 + +## 核心组件 + +### DorisRandomAccessFile + +`DorisRandomAccessFile` 把 Doris `io::FileReader` 适配成 `arrow::io::RandomAccessFile`。 + +它只处理随机读和文件大小查询,不解析 Parquet schema,不携带 table schema,也不执行 filter。 + +### ParquetReaderScanState + +`ParquetReaderScanState` 是 `parquet_reader.cpp` 内部状态,记录: + +- Arrow random access file; +- Arrow Parquet file reader; +- Parquet footer metadata; +- Parquet schema descriptor; +- file-local schema tree; +- 被 row group statistics 选中的 row group; +- 当前 row group reader; +- 当前 row group 内已读行数; +- predicate column readers; +- non-predicate column readers。 + +该状态不暴露给 table reader。 + +### ParquetColumnSchema 和 ParquetTypeDescriptor + +`ParquetColumnSchema` 描述 file-local schema tree,包括: + +- Parquet node name; +- Parquet field id; +- top-level field id; +- leaf column id; +- Doris file-local type; +- 子列 schema; +- primitive column 的 `ParquetTypeDescriptor`。 + +`ParquetTypeDescriptor` 负责保存 Parquet annotation 解析结果,包括: + +- physical type; +- logical type / converted type 推导后的 Doris type; +- decimal precision/scale; +- time/timestamp unit; +- 是否 string-like; +- 是否支持当前 RecordReader 读取路径。 + +类型解析已经从 `column_reader.cpp` 前移到 `parquet_type.*`,`ColumnReader` 热路径只消费解析结果。 + +### ParquetColumnReader + +`ParquetColumnReader` 是 Doris 自己的 file-local column reader 抽象,不是 Arrow 的 
`parquet::ColumnReader`。 + +当前接口收敛为: + +```text +read(rows, column, rows_read) +skip(rows) +select(selection, selected_rows, batch_rows, column) +``` + +当前实现: + +- `ScalarColumnReader`:基于 Arrow internal `RecordReader` 读取 flat primitive/string/decimal/time/timestamp。 +- `StructColumnReader`:支持 top-level struct 的 scalar child 组装,包含 nullable parent struct、nullable scalar child 和 struct child projection。 +- `ListColumnReader`:支持 scalar element 的 LIST level 组装,包含 null list、empty list、nullable element 和 overflow state。 +- `MapColumnReader`:支持 scalar key/value 的 MAP level 组装,包含 null map、empty map、nullable scalar value 和 overflow state。 + +`select()` 在基类中统一实现:把 `SelectionVector` 合并成连续 row ranges,然后交替调用 `skip()` 和 `read()`。当前不实现整批 read 后再 filter 的 fallback。 + +### ParquetColumnReaderFactory + +`ParquetColumnReaderFactory` 根据当前 row group 和 `ParquetColumnSchema` 创建 column reader。 + +它集中封装 Arrow internal `RecordReader` 的创建和缓存,避免 Arrow internal API 泄露到 `ParquetReader` 主流程。 + +### DataTypeSerDe decoded value 读取接口 + +`ScalarColumnReader` 不直接把 Parquet value switch 到 Doris column,而是构造 `DecodedColumnView`,再调用: + +```text +DataTypeSerDe::read_column_from_decoded_values(...) 
+``` + +当前已接入的 SerDe 包括 number、string、decimal、date/time/datetime、nullable 等类型。这样可以把“Parquet 解码”和“Doris 类型写入”拆开,减少 `ColumnReader` 内部的 Doris 类型分发逻辑。 + +## Scan Request 语义 + +新 reader 消费 `reader::FileScanRequest`。 + +重要字段: + +- `predicate_columns`:需要先读取,用于计算 selection 的 file-local columns。 +- `non_predicate_columns`:selection 确定后再读取的 file-local columns。 +- `column_positions`:file column id 到 file-local output block position 的映射。 +- `local_filters`:已经 localize 到 file schema 的 filter。 +- `reader_expression_map`:table filter 无法安全转换成 file-local predicate 时的 fallback 表达式。 + +输出 block 的列顺序和类型遵守 `column_positions`,不是 table/global schema。 + +## 谓词下推 + +当前已实现: + +- row group 级 min/max 统计信息裁剪; +- null count 驱动的 `IS NULL` / `IS NOT NULL` 裁剪; +- unsupported statistics、缺失 statistics、不安全比较时保守保留 row group。 + +当前未实现: + +- page index pruning; +- bloom filter pruning; +- dictionary pruning; +- batch 内直接执行结构化 `ColumnPredicate`; +- `reader_expression_map` fallback 表达式执行。 + +注意:当前 `local_filters.predicates` 已经进入 row group statistics 路径,但在 batch 内过滤阶段,`ParquetReader::_read_filter_columns()` 主要处理 `local_filter.conjunct`。因此如果某个谓词只以 `ColumnPredicate` 形式存在,目前还缺少 batch 内二次过滤闭环。 + +## 延时物化当前状态 + +当前 scan loop 是 predicate-first 模型: + +1. 读取 `predicate_columns`。 +2. 执行表达式 filter,生成 `SelectionVector`。 +3. 如果谓词列也在 output block 中,则复用已经解码的谓词列,并按 selection filter。 +4. 对 `non_predicate_columns` 调用 `ColumnReader::select()`,只读取被选中的行。 +5. 
返回 file-local block。 + +已有能力: + +- flat primitive/string/decimal/time/timestamp 的基础 selected read; +- empty selection 时跳过整批 non-predicate columns; +- sparse selection 会被合并成多个连续 ranges; +- predicate column 同时是 projection 时,不会重新读取该列。 + +主要缺口: + +- batch 内 `ColumnPredicate` 执行未接入 selection; +- `reader_expression_map` 仍是 TODO; +- selection index 当前是 `uint16_t`,需要显式约束 batch size; +- selected read 依赖 Arrow internal `RecordReader::SkipRecords` 和 `ReadRecords`,需要继续隔离在 `column_reader.*`; +- 没有 page-level row range selection; +- LIST/MAP 的 `select()` 已经复用 `skip() + read()` range 策略,并通过 nested overflow state 保持 cursor 正确; +- Struct 的 complex child selected read 仍依赖 child reader 自身能力,后续需要补多 stream assembler。 + +## Schema Change 当前状态 + +当前原则是:`ParquetReader` 不理解 schema change,schema change 由 `TableColumnMapper` 和 `TableReader` 处理。 + +已有能力: + +- `TableReader` 初始化时默认使用 `TableColumnMappingMode::BY_FIELD_ID`。 +- `TableColumnMapper` 可以根据 table column 和 file schema 建立 `ColumnMapping`。 +- 缺失 partition column 可以用 partition value 生成 constant mapping。 +- 缺失普通列可以使用 `default_expr`。 +- file type 与 table type 不同的时候,可以生成 finalize cast projection。 +- virtual column 有 `ROW_ID` 和 `LAST_UPDATED_SEQUENCE_NUMBER` 的 mapping 标记。 + +主要缺口: + +- 当前 `SchemaField::id` 同时承担 file-local column id 和 mapping id,边界还不够清晰。尤其 top-level primitive 目前会使用 leaf column id,Iceberg field id 映射还需要重新梳理。 +- `_is_same_type()` 只是 `DataTypePtr` 指针比较,不能可靠表达类型等价。 +- filter localization 仍是 stub,没有完整实现 trivial mapping、safe cast、reader expression fallback、finalize-only filter。 +- `reader_filter_expr` 没有真正生成或执行。 +- 复杂列 schema change 没有 child-level mapping。 +- `IcebergTableReader` 的 equality delete、position delete、virtual column、finalize 仍是框架 stub。 + +## 复杂列当前状态 + +已有能力: + +- schema builder 能识别 `STRUCT`、`LIST`、`MAP`。 +- 可以把复杂 Parquet schema 组合成 Doris `DataTypeStruct`、`DataTypeArray`、`DataTypeMap`。 +- `ParquetColumnSchema` 已记录 file path、field id path、name path、definition level、repetition level、nullable definition level 和 repeated 
repetition level,为后续 child-level mapping/schema change 留入口。 +- `TableColumnMapper` 可以为 struct child 生成 `FieldProjection`,`ParquetReader` 会把 projected file-local schema 暴露给上层。 +- `StructColumnReader` 支持 top-level struct 的 scalar children: + - required struct; + - nullable struct; + - required scalar child; + - nullable scalar child; + - projected scalar child,例如只读 `s.b` 时仍能根据该 leaf 的 definition level 还原 parent null map。 +- `LIST` 支持 scalar element: + - required / nullable list; + - null list; + - empty list; + - required / nullable scalar element; + - 小批量 read 下跨 batch 的 overflow; + - `skip()` / `select()` 通过同一个 level assembler 推进。 +- `MAP` 支持 scalar key/value: + - required / nullable map; + - null map; + - empty map; + - required key; + - required / nullable scalar value; + - key leaf 作为 shape driver,value leaf 校验 row count、level count 和 repetition level 对齐; + - `skip()` / `select()` 通过同一个 level assembler 推进。 +- `NestedScalarBatch` 在每次 `RecordReader::ReadRecords()` 后复制 def/rep levels,并把 defined values materialize 到 Doris-owned 临时列,避免保存 Arrow buffer 或 `StringRef`。 +- `NestedScalarOverflow` 保存未消费的 level tail 和 compact 后的 Doris-owned value column,LIST/MAP read-ahead 不再假设 child records 等于 output rows。 +- `RepeatedLevelAssembler` 统一折叠 repeated level stream,生成 parent row、entry count、parent null map,并由 sink 写入 list/map child column。 + +主要缺口: + +- `Array(Struct)`、`Map` 还未实现。当前 Struct reader 可以组装 scalar child,但 LIST/MAP assembler 还没有接 complex child sink。 +- 嵌套 list/map 还未实现,例如 `Array(Array)`、`Map>`。 +- nullable struct 如果包含 complex child,目前仍返回 `NotSupported`,避免在缺少多 stream assembler 时误读。 +- LIST/MAP 的 nested projection 还未实现。当前只支持完整读取 scalar element/value,不支持只投影 `array.element.x` 或 `map.value.y`。 +- 复杂类型 schema change 还未实现 child-level remap/default/cast。当前 schema/path/projection 结构按后续扩展预留,但缺失 child、rename、field id remap、default child、nested cast 都还没有接入。 +- primitive reader 的 flat scalar 路径仍只支持 `max_repetition_level == 0 && max_definition_level <= 1`;nested scalar 只能通过 complex 
reader 使用。 +- complex child 的 lazy materialization 还不完整,尤其是 Struct complex child 和未来多 leaf value 需要统一 cursor/overflow。 + +结论:当前复杂列已经从“schema 可见”推进到“scalar-child LIST/MAP/STRUCT 可读”。下一阶段重点不是再补单个特殊 case,而是把 Struct child 接入 LIST/MAP assembler,并建立多 leaf stream 的统一 cursor/overflow 模型。 + +## 当前可用能力总结 + +当前新 reader 已经具备: + +- 打开 Parquet 文件并解析 footer; +- 导出 file-local schema; +- 基于 row group statistics 做保守裁剪; +- 读取 flat required/nullable primitive; +- 读取 string/binary; +- 读取 decimal precision <= 38 的常见物理编码; +- 读取 date/time/datetime 的部分编码; +- 通过 `DataTypeSerDe::read_column_from_decoded_values()` 写入 Doris column; +- 基础 predicate-first scan; +- flat column selected read; +- non-nullable / nullable struct 的 scalar child 读取; +- struct scalar child projection; +- scalar LIST / MAP 读取; +- LIST / MAP 的 skip/select overflow 推进。 + +当前还不具备完整生产能力,尤其缺少: + +- schema change 的完整 field id 语义; +- filter localization 的完整实现; +- batch 内 `ColumnPredicate` 执行; +- `reader_expression_map`; +- page index / bloom filter / dictionary pruning; +- `Array(Struct)` / `Map`; +- nested list/map; +- LIST/MAP child projection; +- 复杂类型 schema change; +- complex child nested lazy materialization; +- 充分单测覆盖。 + +最近验证状态: + +- `git diff --check` 通过。 +- Fedora `/home/socrates/code/doris` 上 `BUILD_TYPE=DEBUG ./build.sh --be` 通过。 +- 本地 macOS 运行 `./run-be-ut.sh --run '--filter=ParquetColumnReaderTest.*'` 被环境阻断,CMake 检查 clang++ 时失败:`ld: library 'c++' not found`,未进入测试体。 + +## 下一步优先级 + +建议按以下顺序推进: + +1. 抽象 Struct child sink,把 `Array(Struct)` 和 `Map` 接到现有 LIST/MAP level assembler。 +2. 将 LIST/MAP projection 从 top-level projection 扩展到 child projection,先支持 `array.element.` 和 `map.value.` 这类 Struct child 裁剪。 +3. 为多 leaf stream 引入统一 cursor/overflow 状态,避免 Struct、Array、Map 各自维护不兼容的 read-ahead。 +4. 收敛 `SchemaField` 和 `ColumnMapping` 的 id 语义,区分 Iceberg field id、Parquet leaf column id 和 file-local output position。 +5. 设计复杂类型 schema change 的 child-level mapping 接口,先预留缺失 child/default/null/cast sink,不立即实现完整语义。 +6. 
补齐 batch 内 `ColumnPredicate` 执行,让 row group pruning 之后仍有正确 residual filter。 +7. 实现 `reader_expression_map`,支撑 schema change 下无法安全下推的 filter fallback。 +8. 在复杂列 assembler 稳定后,再做 nested pruning、nested lazy materialization、page index、bloom filter、dictionary pruning。 + +## 核心规则 + +`ParquetReader` 必须保持 file-local reader。 + +只要某个功能需要 table schema、Iceberg schema evolution、partition value、default/generated column、delete file 或最终 table block 语义,就应该放在 `TableColumnMapper`、`TableReader` 或具体 table reader 中,而不是放进 `be/src/format/new_parquet/`。 diff --git a/docs/doris-iceberg-parquet-api-design.md b/docs/doris-iceberg-parquet-api-design.md new file mode 100644 index 00000000000000..6518043b40dc6f --- /dev/null +++ b/docs/doris-iceberg-parquet-api-design.md @@ -0,0 +1,511 @@ +# Doris Iceberg + Parquet 新架构 API 设计 + +本文档用于描述 Doris 中 Iceberg + Parquet 新架构的 API 设计。本文档作为后续从 +`master` 新开重构分支时的起点,只定义 API 形状、职责边界、依赖方向和兼容原则, +不定义函数实现细节,不提供伪代码,不包含迁移 patch。 + +## 架构总览 + +目标架构包含 table 调度层、表格式语义层、schema 映射层、文件通用层和文件格式实现层: + +```text +FileScanner / split producer + -> +TableReader + -> +IcebergTableReader + -> +TableColumnMapper + FileReader + -> +ParquetReader +``` + +核心职责如下: + +- `TableReader` + 负责多文件、多 split 的上层调度,统一 scan 生命周期,对外输出 table block, + 并承接动态分区裁剪等 table-level 通用逻辑。 +- `IcebergTableReader` + 负责 Iceberg 表语义,包括 schema 绑定、scan task、delete file、虚拟列和 table + block finalize。 +- `TableColumnMapper` + 负责 table schema 到 file schema 的映射,负责 filter localization 和 schema + change 映射。 +- `FileReader` + 负责文件层通用读取接口,只理解 file-local schema 和 file-local scan request。 +- `ParquetReader` + 作为 `FileReader` 的 Parquet 实现,负责 Parquet 文件物理读取。 + +依赖方向必须保持单向: + +```text +TableReader + -> IcebergTableReader + -> TableColumnMapper + -> FileReader + -> ParquetReader +``` + +低层不反向理解高层语义,尤其 `ParquetReader` 不得反向理解 Iceberg/global schema。 + +## 核心 API 设计 + +### TableReader + +`TableReader` 是最上层读取接口,作为 `IcebergTableReader` 的基类,负责多 split / +多 file 调度,并承接 table-level 的通用裁剪逻辑,不下沉文件格式语义。 + +实际 API 文件: + +```text 
+be/src/format/reader/table_reader.h +``` + +实际命名空间: + +```cpp +namespace doris::reader +``` + +建议职责: + +- 接收 split 列表或 scan task 列表; +- 控制当前 reader 的创建、切换和关闭; +- 管理 scan 生命周期; +- 承接动态分区裁剪等 table-level 通用过滤逻辑; +- 对外统一输出 table block。 +- `next` 是基类统一入口,内部负责 EOF 后切换 reader;具体表格式只提供打开和读取 + 当前 reader 的 hook。 + +建议接口形状: + +```cpp +namespace doris::reader { + +class TableReader { +public: + virtual ~TableReader() = default; + + virtual Status init(const TableReadOptions& options); + virtual Status filter(const VExprContextSPtr& expr, bool* can_filter_all); + Status next(Block* table_block, size_t* rows, bool* eof); + virtual Status close(); + +protected: + Status next_reader(); + virtual Status open_next_reader(bool* has_reader); + virtual Status read_current(Block* table_block, size_t* rows, bool* eof); + virtual Status close_current_reader(); +}; + +} // namespace doris::reader +``` + +接口约束: + +- `TableReader` 输出的是 table block,不输出 file-local block。 +- `TableReader` 负责多文件编排和 table-level 通用裁剪,不负责 schema mapping,不负责 + Parquet 物理解码。 +- `next_reader` 是 `TableReader` 自己的通用切换逻辑,不作为子类公开 override 接口。 +- 动态分区裁剪这类逻辑应下放到 `TableReader`,而不是散落在具体表格式 reader 中。 +- `TableReader` 不直接依赖旧 `vparquet` 表层语义。 + +### IcebergTableReader + +`IcebergTableReader` 是 Iceberg 表语义层,负责把单个 Iceberg data file 的读取组织成 +table 语义输出。 + +实际 API 文件: + +```text +be/src/format/table/iceberg_reader_v2.h +``` + +实际命名空间: + +```cpp +namespace doris::iceberg +``` + +建议职责: + +- 绑定 Iceberg 当前 table schema; +- 接收 `IcebergScanTask` 列表,并按 `TableReader` 的统一调度打开当前 task; +- 处理 position delete、equality delete、deletion vector; +- 物化 `_row_id`、`_last_updated_sequence_number` 等虚拟列; +- 将 `ParquetReader` 返回的 file-local block finalize 成 table block。 + +建议接口形状: + +```cpp +namespace doris::iceberg { + +class IcebergTableReader : public reader::TableReader { +public: + virtual ~IcebergTableReader() = default; + + Status init(IcebergTableReadParams params); + Status close() override; + +protected: + Status open_next_reader(bool* 
has_reader) override; + Status read_current(Block* table_block, size_t* rows, bool* eof) override; + Status close_current_reader() override; +}; + +} // namespace doris::iceberg +``` + +接口约束: + +- `IcebergTableReader` 继承 `TableReader`,并通过组合使用 `FileReader`。 +- `IcebergTableReader` 不做 Parquet page/column 解码。 +- `IcebergTableReader` 负责 table-level finalize,不负责 file-local pruning 实现。 +- `IcebergTableReader` 的 schema、scan request、scan tasks 和底层 `FileReader` 应通过 + 一个初始化参数对象一次性传入;除非存在明确生命周期差异,不拆成 `bind` / + `init(TableScanRequest)` / `set_scan_tasks` 多阶段接口。 +- `IcebergTableReader` 不重新实现 reader 切换循环,只实现打开 Iceberg task、读取当前 + task 和关闭当前 reader 的 hook。 + +### TableColumnMapper + +`TableColumnMapper` 是 table schema 到 file schema 的通用映射层,不是 +Iceberg-only 组件。 + +实际 API 文件: + +```text +be/src/format/reader/table_reader.h +``` + +实际命名空间: + +```cpp +namespace doris::reader +``` + +建议职责: + +- 输入 table schema、file schema、table scan request; +- 输出 `ColumnMapping` 和通用 `FileScanRequest`; +- 负责 filter localization; +- 负责 schema change 映射; +- 负责复杂列 child mapping; +- 负责缺失列、default、partition、generated 列的 finalize 语义描述。 + +建议接口形状: + +```cpp +namespace doris::reader { + +class TableColumnMapper { +public: + explicit TableColumnMapper(TableColumnMapperOptions options = {}); + + virtual Status create_mapping(const std::vector<TableColumn>& table_schema, + const std::vector<SchemaField>& file_schema, + std::vector<ColumnMapping>* mappings); + + virtual Status create_scan_request(const TableScanRequest& table_request, + const std::vector<ColumnMapping>& mappings, + FileScanRequest* file_request); +}; + +} // namespace doris::reader +``` + +接口约束: + +- `TableColumnMapper` 的输入是 table schema + file schema + table scan request。 +- `TableColumnMapper` 的输出是 `ColumnMapping` + `FileScanRequest`。 +- `TableColumnMapper` 必须是通用层,不做 Iceberg-only 命名。 +- Iceberg 场景默认按 field id 映射;按 name 映射不是本轮默认路径。 + +### FileReader + +`FileReader` 是文件物理读取层的通用接口,为后续 Parquet 之外的文件格式适配预留。 + +实际 API 文件: + +```text +be/src/format/reader/file_reader.h +``` + +实际命名空间: + +```cpp +namespace 
doris::reader +``` + +建议职责: + +- 打开物理文件; +- 暴露 file-local schema; +- 接收 `FileScanRequest`; +- 输出 file-local block; +- 不理解 table/global schema。 + +建议接口形状: + +```cpp +namespace doris::reader { + +class FileReader { +public: + virtual ~FileReader() = default; + + virtual Status open(io::FileReaderSPtr file, io::IOContext* io_ctx = nullptr); + virtual Status get_schema(std::vector<SchemaField>* file_schema) const; + virtual Status init(const FileScanRequest& request); + virtual Status next(Block* file_block, size_t* rows, bool* eof); + virtual Status close(); +}; + +} // namespace doris::reader +``` + +接口约束: + +- `FileReader` 输出的是 file-local block,不输出 table/global schema block。 +- `FileReader` 不处理 Iceberg schema evolution、default/generated/partition 列。 +- `IcebergTableReader` 组合 `FileReader`,不直接绑定具体文件格式 reader。 + +### ParquetReader + +`ParquetReader` 是 `FileReader` 的 Parquet 实现,只负责 Parquet file-local schema +和 Parquet file-local scan request。 + +实际 API 文件: + +```text +be/src/format/parquet/parquet_reader.h +``` + +实际命名空间: + +```cpp +namespace doris::parquet +``` + +建议职责: + +- 打开 Parquet 文件; +- 解析 footer 和 file schema; +- 接收 `ParquetScanRequest` 或通用 `FileScanRequest`; +- 执行 file-local projection 和 file-local filter; +- 输出 file-local block。 + +建议接口形状: + +```cpp +namespace doris::parquet { + +class ParquetReader : public reader::FileReader { +public: + virtual ~ParquetReader() = default; + + virtual Status open(io::FileReaderSPtr file, io::IOContext* io_ctx = nullptr); + virtual Status get_schema(std::vector<SchemaField>* file_schema) const; + virtual Status init(const ParquetScanRequest& request); + virtual Status next(Block* file_block, size_t* rows, bool* eof); + virtual Status close(); +}; + +} // namespace doris::parquet +``` + +接口约束: + +- `ParquetReader` 输出的是 file-local block,不输出 table/global schema block。 +- `ParquetReader` 不理解 Iceberg schema evolution。 +- `ParquetReader` 不负责 default/generated/partition 列。 +- 任何 table-level cast/default/generated/partition 语义都不能重新塞回 + `ParquetReader`。 + +## 
关键类型 + +### SchemaField + +`SchemaField` 表示文件层 schema 中的列定义。 + +建议包含的信息: + +- file-local column id; +- 列名; +- 类型; +- child fields。 + +它服务于 `TableColumnMapper` 做 schema matching,不携带 table-level 语义。 + +### TableColumn + +`TableColumn` 表示 table/global schema 中的列定义。 + +建议包含的信息: + +- table column id; +- 列名; +- 类型; +- child columns。 + +Iceberg 场景下,column id 默认对应 field id。 + +### TableFilter + +`TableFilter` 表示 table 层过滤条件。 + +建议包含的信息: + +- `table_column_id` +- `conjunct` +- `predicates` + +职责约束: + +- `conjunct` 偏表达式过滤,适合表达 cast、复杂表达式、复杂列提取等语义; +- `predicates` 偏结构化单列下推,适合驱动 row group stats、page index、dictionary、 + bloom filter 等文件层优化。 + +### FileLocalFilter + +`FileLocalFilter` 表示已经 localize 到 file-local schema 的过滤条件。 + +建议包含的信息: + +- `file_column_id` +- `conjunct` +- `predicates` + +职责约束: + +- `conjunct` 用于 file-local 表达式过滤; +- `predicates` 用于 file-local 结构化下推; +- 其输入必须来自 `TableColumnMapper`,不能由具体文件 reader 自己推导 table 语义。 + +### ColumnMapping + +`ColumnMapping` 是 table schema 与 file schema 之间的核心边界对象。 + +建议包含的信息: + +- `table_column_id` +- `file_column_id` +- `file_type` +- `table_type` +- `finalize_expr` +- `reader_filter_expr` +- `child_mappings` + +职责约束: + +- `finalize_expr` 服务最终输出,把 file-local value 转成 table/global value; +- `reader_filter_expr` 服务读时 filter fallback; +- 二者语义不同,不能混用; +- `child_mappings` 用于复杂列 remap、复杂列裁剪和复杂列 schema change。 + +### TableScanRequest + +`TableScanRequest` 描述 table 层 scan 请求。 + +建议包含的信息: + +- projected table columns; +- table filters。 + +它由 `IcebergTableReader` 接收,再交给 `TableColumnMapper` 生成 file-local request。 + +### ParquetScanRequest + +`ParquetScanRequest` 继承 `FileScanRequest`,描述 Parquet file-local scan 请求。 + +### FileScanRequest + +`FileScanRequest` 描述通用 file-local scan 请求。 + +建议包含的信息: + +- projected file columns; +- local filters; +- reader expression map。 + +它是 `FileReader` 的唯一 scan 输入,不包含 table/global schema 语义。 + +### IcebergScanTask + +`IcebergScanTask` 表示一次 Iceberg data file 读取任务。 + +建议包含的信息: + +- data file 信息; +- position delete 
文件; +- equality delete 文件; +- deletion vector 信息。 + +它是 `IcebergTableReader` 的输入,不应直接传给 `ParquetReader`。 + +### IcebergTableReadParams + +`IcebergTableReadParams` 表示一次 Iceberg table scan 的完整初始化输入。 + +建议包含的信息: + +- Iceberg read options; +- Iceberg table schema; +- table scan request; +- Iceberg scan task 列表; +- 底层 `FileReader`。 + +它用于避免 `IcebergTableReader` 暴露多个半初始化阶段。调用方应一次性构造完整 +参数并调用 `init`。 + +## 设计原则 + +### 边界原则 + +- `FileReader` 不理解 global schema,不直接处理 Iceberg schema evolution。 +- `ParquetReader` 是 `FileReader` 的 Parquet 实现。 +- `TableColumnMapper` 是 schema mapping 和 filter localization 的唯一入口。 +- `IcebergTableReader` 不做 Parquet 解码,只负责 table-level finalize、delete、 + virtual columns。 +- `TableReader` 只负责多文件编排和 table-level 通用裁剪,不下沉文件格式语义。 +- 任何 table-level cast/default/generated/partition 语义都不能重新塞回 + `ParquetReader`。 + +### 依赖原则 + +- 低层不能反向依赖高层语义。 +- `FileReader` 只依赖 file-local request。 +- `IcebergTableReader` 继承 `TableReader`,复用其多文件编排和通用裁剪能力。 +- `IcebergTableReader` 通过组合使用 `FileReader`。 +- `TableColumnMapper` 可以被 Iceberg 之外的其他表格式复用。 + +### 命名原则 + +- 表层抽象使用 `TableReader`、`IcebergTableReader`、`TableColumnMapper`、 + `FileReader`、`ParquetReader` 命名。 +- `TableColumnMapper` 不使用 Iceberg-only 命名。 +- file schema 类型使用 `SchemaField`,table schema 类型使用 `TableColumn`。 + +## 兼容原则 + +新架构重构期间,新旧代码允许并存,但必须遵守以下约束: + +- 旧 `vparquet` / Hive / Hudi / Paimon 路径在新架构稳定前允许保留。 +- 新架构实现不得继续向旧 `vparquet` 表层语义回灌依赖。 +- 先搭新框架 API,再逐步迁移调用点。 +- 不允许边改 API 边混入临时裸逻辑、实验性草稿或未收敛命名。 +- 兼容层可能需要存在,但本文档不定义兼容层的具体实现方案。 + +## 验收标准 + +该文档应满足以下目标: + +- 不引用错误实验代码作为既成事实; +- 不出现实现性草稿、裸伪代码、未收敛命名混用; +- 让另一个工程师从 `master` 新开分支时,可以直接按本文档搭 API 骨架; +- 读完文档后,不需要再讨论以下问题: + - 新架构分几层; + - 每层负责什么; + - 哪层理解 global schema; + - 哪层做 schema change / filter localization / finalize; + - 哪层允许依赖旧实现,哪层不允许。 diff --git a/docs/doris-new-parquet-dictionary-pushdown.md b/docs/doris-new-parquet-dictionary-pushdown.md new file mode 100644 index 00000000000000..7ce6b1a12c3ff6 --- /dev/null +++ b/docs/doris-new-parquet-dictionary-pushdown.md @@ 
-0,0 +1,359 @@ +# Doris New Parquet Reader Dictionary Predicate Pushdown 方案 + +## 背景 + +当前 new parquet reader 位于 `be/src/format/new_parquet/`,读取路径基于 Arrow +Parquet core API,并输出 Doris `Block` / `Column`。 + +当前已经实现的谓词相关能力主要有两类: + +- row group 级 min/max/null statistics 裁剪; +- 读取谓词列后,用 Doris `ColumnPredicate` 生成 `SelectionVector`,再对非谓词列做延时物化。 + +但当前还没有实现 dictionary predicate pushdown。主要原因是 +`ParquetColumnReaderFactory` 创建 Arrow `RecordReader` 时使用: + +```cpp +_row_group->RecordReader(leaf_column_id, /*read_dictionary=*/false); +``` + +因此底层会把字典编码列直接解码成普通值。等 `ParquetReader` 执行 +`ColumnPredicate::evaluate()` 时,已经看不到 dictionary page,也看不到 dictionary id。 + +本文档描述后续在 new parquet reader 中实现字典列谓词下推的设计方案。 + +## 目标 + +字典谓词下推的目标不是替代现有 statistics pruning,而是补充一类更强的过滤能力: + +```sql +where c = 'abc' +where c in ('a', 'b', 'c') +where c != 'x' +``` + +如果 Parquet column chunk 是全字典编码,可以只检查 dictionary values 或 dictionary +ids,而不必先把整列解码成字符串列。 + +预期收益: + +- 在 row group 级提前跳过不可能命中的 row group; +- 在 batch 级避免谓词列 string materialization; +- 和现有 `SelectionVector` / 延时物化路径结合,减少非谓词列读取量。 + +## 当前实现状态 + +### 已具备 + +- `ParquetStatisticsUtils` 已经有 file-local `ParquetColumnPredicate` 计划结构。 +- `ParquetReader` 已经有谓词列优先读取流程。 +- `SelectionVector` 已经能表示 batch 内选中 row offset。 +- `ParquetColumnReader::select()` 已经能按 selection 对非谓词列做 selected read。 + +### 不具备 + +- 没有判断 column chunk 是否全字典编码。 +- 没有读取 dictionary page 并转换成 Doris Column 的接口。 +- 没有 dictionary id reader。 +- 没有 dictionary value 到 dict id 的谓词重写。 +- 没有把 dictionary id selection 接入当前 `SelectionVector`。 + +因此当前实现不能利用字典列谓词下推。 + +## 分层原则 + +字典谓词下推必须保持 file-local 语义: + +- `TableColumnMapper` 负责把 table filter 转换成 file-local `ColumnPredicate`。 +- `ParquetReader` 只消费 file-local `FileScanRequest`。 +- 字典页、encoding、dictionary id 都属于 Parquet 文件格式层,不能泄露到 + Iceberg/table schema 层。 + +建议放置位置: + +```text +be/src/format/new_parquet/parquet_statistics.* + row group 级 dictionary pruning + +be/src/format/new_parquet/column_reader.* + dictionary values / dictionary ids 读取能力 + 
+be/src/format/new_parquet/parquet_reader.cpp + 将 dictionary selection 接入现有 predicate-first scan loop +``` + +## 方案一:Row Group 级字典裁剪 + +### 思路 + +对于全字典编码的 column chunk,dictionary page 包含该 row group 中所有可能出现的非 +NULL 值。如果所有 dictionary values 都不能满足谓词,则整个 row group 可以跳过。 + +例子: + +```text +predicate: name = 'Bob' +dictionary values: ['Alice', 'Cindy'] + +=> dictionary 中没有任何值满足 name = 'Bob' +=> row group 可以跳过 +``` + +### 流程 + +```text +FileScanRequest.local_filters + -> Build ParquetColumnPredicate + -> 对每个 row group / column chunk: + 1. 判断 column chunk 是否全字典编码 + 2. 读取 dictionary page + 3. 将 dictionary values materialize 成 Doris Column + 4. 对 dictionary values 执行 ColumnPredicate + 5. 如果没有任何 dictionary value 命中,则跳过 row group +``` + +### 全字典编码判断 + +Parquet 允许同一个 column chunk 先使用字典编码,后续 fallback 到 plain encoding。 +这种 mixed encoding 不能用于 row group 级字典裁剪,否则会漏读 plain page 中的值。 + +判断方式可以参考旧 `vparquet`: + +- 优先使用 `encoding_stats`: + - 所有 `DATA_PAGE` 必须是 `PLAIN_DICTIONARY` 或 `RLE_DICTIONARY`; + - 不能存在 count > 0 的非字典 data page。 +- 如果没有 `encoding_stats`,退化检查 `encodings`: + - 必须包含 dictionary encoding; + - 除 dictionary encoding、`RLE`、`BIT_PACKED` 外,不能包含其它 data encoding。 + +需要注意:`RLE` / `BIT_PACKED` 可能用于 definition/repetition levels,不代表 value +不是字典编码。 + +### 支持的谓词 + +第一阶段建议只支持结构化 `ColumnPredicate`: + +- `EQ` +- `IN` +- `NE` +- `NOT IN` +- `IS NULL` +- `IS NOT NULL` + +其中 null 语义需要谨慎: + +- dictionary page 不包含 NULL; +- `IS NULL` / `IS NOT NULL` 仍需要结合 column chunk null count; +- 不能仅靠 dictionary values 判断 NULL 谓词。 + +更复杂的表达式型 filter,例如 `lower(name) = 'abc'`,不在第一阶段支持。 + +### 正确性规则 + +row group 级裁剪必须保守: + +- 不能确认全字典编码时,保留 row group; +- 不能读取 dictionary page 时,保留 row group; +- 谓词类型不支持时,保留 row group; +- 类型转换不安全时,保留 row group; +- NULL 语义不能确认时,保留 row group。 + +## 方案二:Batch 级 Dict Id Selection + +### 思路 + +row group 不能整体跳过时,仍可以避免把谓词列完整解码成字符串列。 + +例子: + +```text +dictionary values: + id 0 -> 'Alice' + id 1 -> 'Bob' + id 2 -> 'Cindy' + +predicate: + name = 'Bob' + +matched dict ids: + {1} + +data 
page ids: + [0, 1, 1, 2, 0] + +selection: + [1, 2] +``` + +这时谓词列只需要扫描 dictionary ids,不需要 materialize 成 `ColumnString`。 +非谓词列继续复用当前 `SelectionVector` 做延时物化。 + +### 流程 + +```text +打开 row group + -> 对字典谓词列读取 dictionary values + -> 对 dictionary values 执行 ColumnPredicate + -> 得到 matched dict id set + +读取 batch + -> 读取该 batch 的 dictionary ids + -> 用 matched dict id set 生成 SelectionVector + -> 非谓词列按 SelectionVector selected read + -> 如果字典谓词列也在 projection 中,再按需转换成真实值列 +``` + +### Reader 抽象 + +建议在 `column_reader.*` 增加独立 reader 分支,而不是把逻辑塞进 +`PrimitiveColumnReader::read()`: + +```text +ParquetColumnReader + PrimitiveColumnReader + DictionaryColumnReader +``` + +或者先不新增类,通过内部 strategy 表达: + +```text +PrimitiveColumnReader + decoded reader path + dictionary reader path +``` + +需要暴露的能力: + +```text +read_dictionary_values(MutableColumnPtr* values) +read_dictionary_ids(int64_t rows, MutableColumnPtr* ids, int64_t* rows_read) +select_by_dictionary_ids(...) +materialize_dictionary_ids(...) +``` + +具体命名可以在实现时收敛,但边界应保持: + +- dictionary values / ids 读取属于 `column_reader.*`; +- 用谓词生成 matched dict ids 属于 `parquet_statistics.*` 或新的 filter helper; +- 将 selection 接入 scan loop 属于 `parquet_reader.cpp`。 + +### Arrow RecordReader 的限制 + +Arrow Parquet `RecordReader` 有 `read_dictionary` 参数和 `ReadDictionary()` API。 +但当前代码用的是 `read_dictionary=false`。 + +后续可以尝试: + +```cpp +_row_group->RecordReader(leaf_column_id, /*read_dictionary=*/true) +``` + +需要验证: + +- 只有全字典编码 column chunk 是否才会暴露 dictionary ids; +- mixed encoding 是否自动 fallback 为 decoded values; +- `RecordReader::read_dictionary()` 是否能可靠表示当前 reader 是否真的在读 ids; +- `BYTE_ARRAY` / `FIXED_LEN_BYTE_ARRAY` 之外的类型支持情况; +- nullable column 下 ids 和 def levels 的行对齐方式。 + +从 Arrow 头文件注释看,dictionary expose 主要是 experimental API,且对 fully +dictionary encoded byte array column chunk 更可靠。因此第一版实现应该只针对 string-like +列,并且必须有 fallback。 + +## 和旧 vparquet 的关系 + +旧 `vparquet` 已经实现了一套字典过滤思路: + +1. 判断 column chunk 是否全字典编码; +2. 读取 dictionary values 到临时 string column; +3. 
执行原始谓词; +4. 将命中的 dictionary value 下标重写成 int dict code 谓词; +5. 读取 data page 时输出 dict id column; +6. 最终需要输出该列时再把 dict id 转回 string。 + +new parquet reader 可以复用这个设计思想,但不建议直接复用旧实现代码: + +- 旧实现基于 Doris 自研 page decoder; +- new parquet reader 当前基于 Arrow Parquet core API; +- new reader 已有 `SelectionVector`,可以直接用 dict ids 生成 selection,而不一定要重写成 + `VExprContext`。 + +更适合 new reader 的方式是: + +```text +dictionary values -> ColumnPredicate -> matched dict id set -> SelectionVector +``` + +而不是: + +```text +dictionary values -> VExprContext -> rewrite predicate expression +``` + +## 推荐实施顺序 + +### 阶段一:Metadata 判断和 Row Group 级 Dictionary Pruning + +新增能力: + +- 判断 column chunk 是否全字典编码; +- 为 string-like primitive column 读取 dictionary values; +- 对 dictionary values 执行 `ColumnPredicate`; +- 在 `ParquetStatisticsUtils::SelectRowGroups()` 中额外执行 dictionary pruning。 + +约束: + +- 只支持 `BYTE_ARRAY` / `FIXED_LEN_BYTE_ARRAY` string-like 列; +- 只支持结构化 `ColumnPredicate`; +- 不处理 expression fallback; +- 不处理 mixed encoding; +- 不能确认时保守保留 row group。 + +### 阶段二:Batch 级 Dict Id Selection + +新增能力: + +- 构造 dictionary-aware predicate column reader; +- 读取 batch dictionary ids; +- 用 matched dict id set 生成 `SelectionVector`; +- 和现有延时物化路径合并。 + +约束: + +- 谓词列如果也在 projection 中,需要按 selection materialize 成真实 Doris column; +- dict id column 不应泄露到 `ParquetReader` 输出 block; +- fallback 到 decoded value path 必须保持正确。 + +### 阶段三:扩展类型和复杂谓词 + +后续再考虑: + +- numeric dictionary; +- decimal dictionary; +- timestamp/date dictionary; +- `LIKE` / prefix filter; +- expression fallback; +- page index + dictionary 组合裁剪。 + +## 当前实现是否可以直接做到 + +不能。 + +当前实现缺少以下关键点: + +- `RecordReader` 使用 `read_dictionary=false`; +- 没有 dictionary metadata 判断; +- 没有 dictionary page 读取接口; +- 没有 dict id column 或 dict id selection; +- 谓词过滤发生在已经 materialize 的 Doris Column 上。 + +因此,当前最多只能做 decoded value filter,不能做 dictionary predicate pushdown。 + +## 关键设计结论 + +- 字典优化应该放在 Parquet file-local 层,不进入 table schema / Iceberg 层。 +- 第一阶段优先做 row group 级 dictionary 
pruning,收益明确且风险低。 +- 第二阶段再做 batch 级 dict id selection,与现有 `SelectionVector` 和延时物化结合。 +- 基于 Arrow Parquet API 时,必须明确 fallback 策略,不能假设所有字典编码列都能暴露 + dictionary ids。 +- 输出 block 必须始终是正常 Doris Column,不能把 dict id column 暴露给上层。 diff --git a/gensrc/thrift/Exprs.thrift b/gensrc/thrift/Exprs.thrift index 2644ecec417496..967499aac69d8b 100644 --- a/gensrc/thrift/Exprs.thrift +++ b/gensrc/thrift/Exprs.thrift @@ -88,6 +88,8 @@ enum TExprNodeType { TRY_CAST_EXPR = 41 // for search DSL function SEARCH_EXPR = 42, + // Normal predicate expression + PREDICATE = 43, } //enum TAggregationOp { diff --git a/gensrc/thrift/Opcodes.thrift b/gensrc/thrift/Opcodes.thrift index 1e4002357e7599..a2d709799482eb 100644 --- a/gensrc/thrift/Opcodes.thrift +++ b/gensrc/thrift/Opcodes.thrift @@ -97,4 +97,6 @@ enum TExprOpcode { MATCH_REGEXP = 76, MATCH_PHRASE_EDGE = 77, TRY_CAST = 78, + // Delete operator from Iceberg/Paimon + DELETE = 79, }