From ef45c206aa3b6dc6fe5ac62b5d1694f90d3cc386 Mon Sep 17 00:00:00 2001 From: Socrates Date: Mon, 18 May 2026 15:52:10 +0800 Subject: [PATCH 01/38] Add Iceberg Parquet reader API skeleton --- be/src/format/parquet/parquet_reader.h | 67 ++++ be/src/format/reader/file_reader.h | 141 +++++++ be/src/format/reader/table_reader.h | 256 ++++++++++++ be/src/format/table/iceberg_reader_v2.h | 186 +++++++++ docs/doris-iceberg-parquet-api-design.md | 483 +++++++++++++++++++++++ 5 files changed, 1133 insertions(+) create mode 100644 be/src/format/parquet/parquet_reader.h create mode 100644 be/src/format/reader/file_reader.h create mode 100644 be/src/format/reader/table_reader.h create mode 100644 be/src/format/table/iceberg_reader_v2.h create mode 100644 docs/doris-iceberg-parquet-api-design.md diff --git a/be/src/format/parquet/parquet_reader.h b/be/src/format/parquet/parquet_reader.h new file mode 100644 index 00000000000000..dfac6494cd8e45 --- /dev/null +++ b/be/src/format/parquet/parquet_reader.h @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "common/status.h" +#include "format/reader/file_reader.h" + +namespace doris { +namespace io { +struct IOContext; +} // namespace io +} // namespace doris + +namespace doris::parquet { + +// ParquetReader 的 file-local scan 请求。 +// 当前没有新增 Parquet-only 字段,但保留独立类型,便于后续加入 row group/page index +// 等 Parquet 专属选项。 +struct ParquetScanRequest : public reader::FileScanRequest {}; + +// Parquet 文件物理读取层。 +// 该类只理解 Parquet file-local schema 和 ParquetScanRequest,不理解 Iceberg/global +// schema,不处理 table-level cast/default/generated/partition 语义。 +class ParquetReader : public reader::FileReader { +public: + virtual ~ParquetReader() = default; + + Status get_schema(std::vector* file_schema) const override { + // 真实实现会从 Parquet footer / schema descriptor 展开 file-local schema。 + file_schema->clear(); + return Status::OK(); + } + + Status init(const ParquetScanRequest& request) { + // 真实实现会根据 projected_file_columns、local_filters 和 reader_expression_map + // 初始化 row group、column chunk、page reader 以及延时物化计划。 + return reader::FileReader::init(request); + } + + Status next(Block* file_block, size_t* rows, bool* eof) override { + // 真实实现会输出 file-local block。stub 默认立即 EOF。 + return reader::FileReader::next(file_block, rows, eof); + } + + Status init(const reader::FileScanRequest& request) override { + return reader::FileReader::init(request); + } +}; + +} // namespace doris::parquet diff --git a/be/src/format/reader/file_reader.h b/be/src/format/reader/file_reader.h new file mode 100644 index 00000000000000..af03691b94ecf3 --- /dev/null +++ b/be/src/format/reader/file_reader.h @@ -0,0 +1,141 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "common/status.h" +#include "core/data_type/data_type.h" +#include "exprs/vexpr_fwd.h" +#include "io/fs/file_reader_writer_fwd.h" + +namespace doris { +class Block; +class ColumnPredicate; + +namespace io { +struct IOContext; +} // namespace io +} // namespace doris + +namespace doris::reader { + +using ColumnId = int32_t; + +// 文件本地 schema 字段。 +// 这是 FileReader 暴露给 table 层的 file-local schema 视图,不携带 table/global +// schema 语义。Iceberg field id、name mapping、default/generated/partition 列都不在 +// FileReader 内部解释。 +struct SchemaField { + ColumnId id = -1; + std::string name; + DataTypePtr type; + std::vector children; +}; + +// 已经 localize 到文件 schema 的过滤条件。 +// TableColumnMapper 负责把 table-level filter 转成这个结构;FileReader 只消费 +// file-local column id、表达式和结构化谓词。 +struct FileLocalFilter { + ColumnId file_column_id = -1; + + // 表达式过滤。适合 cast、复杂表达式或 reader_expression_map 生成的临时列过滤。 + // 它通常不能直接驱动 row group stats、page index、dictionary、bloom filter。 + VExprContextSPtr conjunct; + + // 结构化列谓词。适合文件层 pruning,例如 min/max、page index、dictionary、 + // bloom filter 等只理解单列谓词的优化。 + std::vector> predicates; +}; + +// 通用文件层 scan 请求。 +// 该结构描述所有文件格式都可以共享的 file-local 读取输入。这里不出现 table/global +// schema。所有 schema change、filter localization、default/generated/partition +// 列都应在 table 层完成。 +struct FileScanRequest { + virtual 
~FileScanRequest() = default; + + std::vector projected_file_columns; + std::vector local_filters; + std::vector> reader_expression_map; +}; + +// 文件物理读取层通用接口。 +// 该接口只描述 file-local schema、file-local scan request 和 file-local block。 +// TableReader/IcebergTableReader 可以通过它组合不同文件格式 reader。 +class FileReader { +public: + virtual ~FileReader() = default; + + virtual Status open(io::FileReaderSPtr file, io::IOContext* io_ctx = nullptr) { + // 真实实现会保存文件句柄、IO 上下文并读取文件元数据。 + _file = std::move(file); + _io_ctx = io_ctx; + _eof = false; + return Status::OK(); + } + + virtual Status get_schema(std::vector* file_schema) const { + // 真实实现会展开文件格式自己的 file-local schema。 + file_schema->clear(); + return Status::OK(); + } + + virtual Status init(const FileScanRequest& request) { + // 真实实现会根据 projected columns、local filters 和 reader expressions + // 初始化文件格式自己的物理读取计划。 + _request.projected_file_columns = request.projected_file_columns; + _request.local_filters = request.local_filters; + _request.reader_expression_map = request.reader_expression_map; + return Status::OK(); + } + + virtual Status next(Block* file_block, size_t* rows, bool* eof) { + // stub 默认立即 EOF。 + (void)file_block; + if (rows != nullptr) { + *rows = 0; + } + if (eof != nullptr) { + *eof = true; + } + _eof = true; + return Status::OK(); + } + + virtual Status close() { + _file.reset(); + _io_ctx = nullptr; + _request = FileScanRequest {}; + _eof = true; + return Status::OK(); + } + +protected: + io::FileReaderSPtr _file; + io::IOContext* _io_ctx = nullptr; + FileScanRequest _request; + bool _eof = true; +}; + +} // namespace doris::reader diff --git a/be/src/format/reader/table_reader.h b/be/src/format/reader/table_reader.h new file mode 100644 index 00000000000000..422ee3142d1ea9 --- /dev/null +++ b/be/src/format/reader/table_reader.h @@ -0,0 +1,256 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "common/status.h" +#include "core/data_type/data_type.h" +#include "exprs/vexpr_fwd.h" +#include "format/reader/file_reader.h" + +namespace doris { +class Block; +class ColumnPredicate; +} // namespace doris + +namespace doris::reader { + +// table/global schema 中的列视图。 +// Iceberg 场景下,id 默认对应 Iceberg field id。该结构不描述文件中的物理列。 +struct TableColumn { + ColumnId id = -1; + std::string name; + DataTypePtr type; + std::vector children; +}; + +// table-level filter。 +// TableColumnMapper 负责把它转换成 FileLocalFilter 或 reader_expression_map。 +struct TableFilter { + ColumnId table_column_id = -1; + + // 表达式过滤,适合表达 cast、复杂表达式、复杂列提取等语义。 + VExprContextSPtr conjunct; + + // 结构化列谓词,适合下推到文件层做 row group stats、page index、dictionary、 + // bloom filter 等优化。 + std::vector> predicates; +}; + +// 单个 table column 到 file column 的映射结果。 +// 这是 table 层和 file 层的核心边界对象。 +struct ColumnMapping { + ColumnId table_column_id = -1; + std::optional file_column_id; + DataTypePtr file_type; + DataTypePtr table_type; + + // 最终输出表达式。用于把 file-local value 转成 table/global value,例如 cast、 + // default、partition、generated column 或复杂列 remap。 + VExprContextSPtr finalize_expr; + + // 读时过滤 fallback 表达式。只在 table filter 不能安全转换成 
file-local predicate + // 时使用,服务 reader_expression_map,不等价于 finalize_expr。 + VExprContextSPtr reader_filter_expr; + + std::vector child_mappings; + bool is_trivial = false; + bool is_constant = false; +}; + +enum class TableColumnMappingMode { + BY_FIELD_ID, + BY_NAME, +}; + +enum class TableFilterConversion { + COPY_DIRECTLY, + CAST_FILTER, + EVALUATE_EXPRESSION, + FINALIZE_ONLY, +}; + +struct TableColumnMapperOptions { + TableColumnMappingMode mode = TableColumnMappingMode::BY_FIELD_ID; + bool allow_missing_columns = true; + bool enable_reader_expression_fallback = true; +}; + +// table-level scan 请求。 +// 它仍然使用 table/global schema 语义,不能直接传给 FileReader。 +struct TableScanRequest { + std::vector projected_table_columns; + std::vector table_filters; +}; + +// 通用 table schema 到 file schema 映射层。 +// Iceberg 会使用 BY_FIELD_ID;普通 by-name 场景可以复用该组件,但不应把它命名成 +// Iceberg-only 组件。 +class TableColumnMapper { +public: + explicit TableColumnMapper(TableColumnMapperOptions options = {}) : _options(std::move(options)) {} + virtual ~TableColumnMapper() = default; + + virtual Status create_mapping(const std::vector& table_schema, + const std::vector& file_schema, + std::vector* mappings) { + // 真实实现会做 field id/name matching、类型转换、复杂列 child mapping、缺失列 + // default/partition/generated 表达式构造。 + mappings->clear(); + for (const auto& table_column : table_schema) { + ColumnMapping mapping; + mapping.table_column_id = table_column.id; + mapping.table_type = table_column.type; + if (const auto* file_field = find_file_field(table_column, file_schema)) { + mapping.file_column_id = file_field->id; + mapping.file_type = file_field->type; + mapping.is_trivial = is_same_type(mapping.table_type, mapping.file_type); + } else { + mapping.is_constant = true; + } + mappings->push_back(std::move(mapping)); + } + _mappings = *mappings; + return Status::OK(); + } + + virtual Status create_scan_request(const TableScanRequest& table_request, + const std::vector& mappings, + FileScanRequest* file_request) { 
+ // 真实实现会把 table projection/filter 转换成 file-local projection/filter。 + file_request->projected_file_columns.clear(); + file_request->local_filters.clear(); + file_request->reader_expression_map.clear(); + _mappings = mappings; + for (const auto& table_column : table_request.projected_table_columns) { + const auto* mapping = find_mapping(table_column.id); + if (mapping != nullptr && mapping->file_column_id.has_value()) { + file_request->projected_file_columns.push_back(*mapping->file_column_id); + } + } + RETURN_IF_ERROR(localize_filters(table_request.table_filters, file_request)); + return Status::OK(); + } + + virtual Status localize_filters(const std::vector& table_filters, + FileScanRequest* file_request) const { + // 真实实现会处理 trivial mapping、safe cast、reader expression fallback 和 + // finalize-only filter。stub 只复制能够直接定位到 file column 的谓词。 + for (const auto& filter : table_filters) { + const auto* mapping = find_mapping(filter.table_column_id); + if (mapping == nullptr || !mapping->file_column_id.has_value()) { + continue; + } + FileLocalFilter local_filter; + local_filter.file_column_id = *mapping->file_column_id; + local_filter.conjunct = filter.conjunct; + local_filter.predicates = filter.predicates; + file_request->local_filters.push_back(std::move(local_filter)); + } + return Status::OK(); + } + + const std::vector& mappings() const { return _mappings; } + +private: + const SchemaField* find_file_field( + const TableColumn& table_column, + const std::vector& file_schema) const { + for (const auto& field : file_schema) { + if (_options.mode == TableColumnMappingMode::BY_FIELD_ID && field.id == table_column.id) { + return &field; + } + if (_options.mode == TableColumnMappingMode::BY_NAME && field.name == table_column.name) { + return &field; + } + } + return nullptr; + } + + const ColumnMapping* find_mapping(ColumnId table_column_id) const { + for (const auto& mapping : _mappings) { + if (mapping.table_column_id == table_column_id) { + return &mapping; + } + } 
+ return nullptr; + } + + bool is_same_type(const DataTypePtr& table_type, const DataTypePtr& file_type) const { + return table_type == file_type; + } + +private: + TableColumnMapperOptions _options; + std::vector _mappings; +}; + +struct TableReadOptions { + size_t batch_size = 4096; +}; + +// table-level reader 基类。 +// 该层负责多文件编排和动态分区裁剪等通用 table-level 逻辑,对外输出 table block。 +class TableReader { +public: + virtual ~TableReader() = default; + + virtual Status init(const TableReadOptions& options) { + _options = options; + return Status::OK(); + } + + virtual Status filter(const VExprContextSPtr& expr, bool* can_filter_all) { + // 真实实现会基于 split/partition/file stats 判断动态分区裁剪结果。 + (void)expr; + if (can_filter_all != nullptr) { + *can_filter_all = false; + } + return Status::OK(); + } + + virtual Status next_reader() { + // 真实实现会切换到下一个 data file / split reader。 + return Status::OK(); + } + + virtual Status next(Block* table_block, size_t* rows, bool* eof) { + (void)table_block; + if (rows != nullptr) { + *rows = 0; + } + if (eof != nullptr) { + *eof = true; + } + return Status::OK(); + } + + virtual Status close() { return Status::OK(); } + +protected: + TableReadOptions _options; +}; + +} // namespace doris::reader diff --git a/be/src/format/table/iceberg_reader_v2.h b/be/src/format/table/iceberg_reader_v2.h new file mode 100644 index 00000000000000..70ee2bb3ff548a --- /dev/null +++ b/be/src/format/table/iceberg_reader_v2.h @@ -0,0 +1,186 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "common/status.h" +#include "format/reader/file_reader.h" +#include "format/reader/table_reader.h" + +namespace doris { +class Block; +} // namespace doris + +namespace doris::iceberg { + +// Iceberg data file 摘要。它描述当前要读取的物理 data file,不承载列映射逻辑。 +struct IcebergDataFile { + std::string path; + std::string format; + int64_t record_count = 0; + int64_t file_size = 0; + int64_t sequence_number = 0; + int64_t first_row_id = -1; +}; + +// Iceberg delete file 摘要。position/equality/deletion vector 的具体读取在 +// IcebergTableReader 实现阶段补齐。 +struct IcebergDeleteFile { + std::string path; + std::string format; + int64_t sequence_number = 0; + std::vector equality_field_ids; +}; + +// 单个 Iceberg data file 的 scan 输入。 +// 该结构只进入 IcebergTableReader,不直接传给 ParquetReader。 +struct IcebergScanTask { + IcebergDataFile data_file; + std::vector positional_deletes; + std::vector equality_deletes; + std::vector deletion_vectors; +}; + +struct IcebergReadOptions { + reader::TableReadOptions table_options; + bool enable_position_delete = true; + bool enable_equality_delete = true; + bool enable_deletion_vector = true; +}; + +// Iceberg table-level reader。 +// 该层继承 TableReader,复用多文件编排和动态分区裁剪等通用能力;同时组合 +// FileReader 完成 data file 物理读取,不继承具体文件格式 reader。 +class IcebergTableReader : public reader::TableReader { +public: + IcebergTableReader() = default; + + explicit IcebergTableReader(std::unique_ptr data_reader) + : _data_reader(std::move(data_reader)) {} + + ~IcebergTableReader() override = 
default; + + Status init(const IcebergReadOptions& options, + std::unique_ptr data_reader) { + _iceberg_options = options; + _data_reader = std::move(data_reader); + return reader::TableReader::init(options.table_options); + } + + Status bind(const std::vector& iceberg_schema) { + // 真实实现会绑定 Iceberg 当前 schema,并准备 field-id based mapping 输入。 + _iceberg_schema = iceberg_schema; + return Status::OK(); + } + + Status init(const reader::TableScanRequest& request) { + // 保存 table-level projection/filter,后续由 TableColumnMapper 转成 FileScanRequest。 + _table_scan_request = request; + return Status::OK(); + } + + Status open_task(const IcebergScanTask& task) { + // 真实实现会读取 data file schema,创建 field-id mapping,应用 position deletes, + // 并初始化底层 ParquetReader。 + _scan_task = task; + std::vector file_schema; + if (_data_reader) { + RETURN_IF_ERROR(_data_reader->get_schema(&file_schema)); + } + reader::TableColumnMapperOptions mapper_options; + mapper_options.mode = reader::TableColumnMappingMode::BY_FIELD_ID; + _column_mapper = reader::TableColumnMapper(mapper_options); + RETURN_IF_ERROR(_column_mapper.create_mapping(_iceberg_schema, file_schema, &_mappings)); + + reader::FileScanRequest file_request; + RETURN_IF_ERROR(_column_mapper.create_scan_request(_table_scan_request, _mappings, + &file_request)); + RETURN_IF_ERROR(apply_position_deletes(&file_request)); + if (_data_reader) { + RETURN_IF_ERROR(_data_reader->init(file_request)); + } + return Status::OK(); + } + + Status next(Block* table_block, size_t* rows, bool* eof) override { + // 真实实现会读取 file-local block,finalize 成 table block,再应用 equality delete + // 和 Iceberg virtual columns。stub 默认 EOF。 + // 后续实现应在 IcebergTableReader 内部持有 file-local block;这里仅复用输出指针 + // 作为 header-only API 占位,避免在骨架阶段引入 Block 的完整定义。 + Block* file_block = table_block; + if (_data_reader) { + RETURN_IF_ERROR(_data_reader->next(file_block, rows, eof)); + } + RETURN_IF_ERROR(finalize_chunk(file_block, table_block)); + 
RETURN_IF_ERROR(apply_equality_deletes(table_block)); + RETURN_IF_ERROR(materialize_virtual_columns(table_block, rows != nullptr ? *rows : 0)); + return Status::OK(); + } + + Status finalize_chunk(Block* file_block, Block* table_block) { + // 真实实现会根据 ColumnMapping 执行 finalize_expr/default/partition/generated + // expressions,把 file-local block 写成 table block。 + (void)file_block; + (void)table_block; + return Status::OK(); + } + + Status apply_position_deletes(reader::FileScanRequest* request) { + // 真实实现会把 position delete / deletion vector 转换成 file-local delete 信息。 + (void)request; + return Status::OK(); + } + + Status apply_equality_deletes(Block* table_block) { + // 真实实现会在 table block 上应用 equality delete。 + (void)table_block; + return Status::OK(); + } + + Status materialize_virtual_columns(Block* table_block, size_t rows) { + // 真实实现会物化 _row_id、_last_updated_sequence_number 等 Iceberg 虚拟列。 + (void)table_block; + (void)rows; + return Status::OK(); + } + + Status close() override { + if (_data_reader) { + RETURN_IF_ERROR(_data_reader->close()); + } + _data_reader.reset(); + return Status::OK(); + } + +private: + IcebergReadOptions _iceberg_options; + IcebergScanTask _scan_task; + reader::TableScanRequest _table_scan_request; + std::vector _iceberg_schema; + std::vector _mappings; + reader::TableColumnMapper _column_mapper; + std::unique_ptr _data_reader; +}; + +} // namespace doris::iceberg diff --git a/docs/doris-iceberg-parquet-api-design.md b/docs/doris-iceberg-parquet-api-design.md new file mode 100644 index 00000000000000..58036667d44ef6 --- /dev/null +++ b/docs/doris-iceberg-parquet-api-design.md @@ -0,0 +1,483 @@ +# Doris Iceberg + Parquet 新架构 API 设计 + +本文档用于描述 Doris 中 Iceberg + Parquet 新架构的 API 设计。本文档作为后续从 +`master` 新开重构分支时的起点,只定义 API 形状、职责边界、依赖方向和兼容原则, +不定义函数实现细节,不提供伪代码,不包含迁移 patch。 + +## 架构总览 + +目标架构包含 table 调度层、表格式语义层、schema 映射层、文件通用层和文件格式实现层: + +```text +FileScanner / split producer + -> +TableReader + -> +IcebergTableReader + -> +TableColumnMapper + 
FileReader + -> +ParquetReader +``` + +核心职责如下: + +- `TableReader` + 负责多文件、多 split 的上层调度,统一 scan 生命周期,对外输出 table block, + 并承接动态分区裁剪等 table-level 通用逻辑。 +- `IcebergTableReader` + 负责 Iceberg 表语义,包括 schema 绑定、scan task、delete file、虚拟列和 table + block finalize。 +- `TableColumnMapper` + 负责 table schema 到 file schema 的映射,负责 filter localization 和 schema + change 映射。 +- `FileReader` + 负责文件层通用读取接口,只理解 file-local schema 和 file-local scan request。 +- `ParquetReader` + 作为 `FileReader` 的 Parquet 实现,负责 Parquet 文件物理读取。 + +依赖方向必须保持单向: + +```text +TableReader + -> IcebergTableReader + -> TableColumnMapper + -> FileReader + -> ParquetReader +``` + +低层不反向理解高层语义,尤其 `ParquetReader` 不得反向理解 Iceberg/global schema。 + +## 核心 API 设计 + +### TableReader + +`TableReader` 是最上层读取接口,作为 `IcebergTableReader` 的基类,负责多 split / +多 file 调度,并承接 table-level 的通用裁剪逻辑,不下沉文件格式语义。 + +实际 API 文件: + +```text +be/src/format/reader/table_reader.h +``` + +实际命名空间: + +```cpp +namespace doris::reader +``` + +建议职责: + +- 接收 split 列表或 scan task 列表; +- 控制当前 reader 的创建、切换和关闭; +- 管理 scan 生命周期; +- 承接动态分区裁剪等 table-level 通用过滤逻辑; +- 对外统一输出 table block。 + +建议接口形状: + +```cpp +namespace doris::reader { + +class TableReader { +public: + virtual ~TableReader() = default; + + virtual Status init(const TableReadOptions& options); + virtual Status filter(const VExprContextSPtr& expr, bool* can_filter_all); + virtual Status next_reader(); + virtual Status next(Block* table_block, size_t* rows, bool* eof); + virtual Status close(); +}; + +} // namespace doris::reader +``` + +接口约束: + +- `TableReader` 输出的是 table block,不输出 file-local block。 +- `TableReader` 负责多文件编排和 table-level 通用裁剪,不负责 schema mapping,不负责 + Parquet 物理解码。 +- 动态分区裁剪这类逻辑应下放到 `TableReader`,而不是散落在具体表格式 reader 中。 +- `TableReader` 不直接依赖旧 `vparquet` 表层语义。 + +### IcebergTableReader + +`IcebergTableReader` 是 Iceberg 表语义层,负责把单个 Iceberg data file 的读取组织成 +table 语义输出。 + +实际 API 文件: + +```text +be/src/format/table/iceberg_reader_v2.h +``` + +实际命名空间: + +```cpp +namespace doris::iceberg +``` + 
+建议职责: + +- 绑定 Iceberg 当前 table schema; +- 接收 `IcebergScanTask`; +- 处理 position delete、equality delete、deletion vector; +- 物化 `_row_id`、`_last_updated_sequence_number` 等虚拟列; +- 将 `ParquetReader` 返回的 file-local block finalize 成 table block。 + +建议接口形状: + +```cpp +namespace doris::iceberg { + +class IcebergTableReader : public reader::TableReader { +public: + virtual ~IcebergTableReader() = default; + + Status init(const IcebergReadOptions& options, + std::unique_ptr data_reader); + Status bind(const std::vector& iceberg_schema); + Status init(const reader::TableScanRequest& request); + Status open_task(const IcebergScanTask& task); + Status next(Block* table_block, size_t* rows, bool* eof) override; + Status close() override; +}; + +} // namespace doris::iceberg +``` + +接口约束: + +- `IcebergTableReader` 继承 `TableReader`,并通过组合使用 `FileReader`。 +- `IcebergTableReader` 不做 Parquet page/column 解码。 +- `IcebergTableReader` 负责 table-level finalize,不负责 file-local pruning 实现。 + +### TableColumnMapper + +`TableColumnMapper` 是 table schema 到 file schema 的通用映射层,不是 +Iceberg-only 组件。 + +实际 API 文件: + +```text +be/src/format/reader/table_reader.h +``` + +实际命名空间: + +```cpp +namespace doris::reader +``` + +建议职责: + +- 输入 table schema、file schema、table scan request; +- 输出 `ColumnMapping` 和通用 `FileScanRequest`; +- 负责 filter localization; +- 负责 schema change 映射; +- 负责复杂列 child mapping; +- 负责缺失列、default、partition、generated 列的 finalize 语义描述。 + +建议接口形状: + +```cpp +namespace doris::reader { + +class TableColumnMapper { +public: + explicit TableColumnMapper(TableColumnMapperOptions options = {}); + + virtual Status create_mapping(const std::vector& table_schema, + const std::vector& file_schema, + std::vector* mappings); + + virtual Status create_scan_request(const TableScanRequest& table_request, + const std::vector& mappings, + FileScanRequest* file_request); +}; + +} // namespace doris::reader +``` + +接口约束: + +- `TableColumnMapper` 的输入是 table schema + file schema + table scan request。 +- 
`TableColumnMapper` 的输出是 `ColumnMapping` + `FileScanRequest`。 +- `TableColumnMapper` 必须是通用层,不做 Iceberg-only 命名。 +- Iceberg 场景默认按 field id 映射;按 name 映射不是本轮默认路径。 + +### FileReader + +`FileReader` 是文件物理读取层的通用接口,为后续 Parquet 之外的文件格式适配预留。 + +实际 API 文件: + +```text +be/src/format/reader/file_reader.h +``` + +实际命名空间: + +```cpp +namespace doris::reader +``` + +建议职责: + +- 打开物理文件; +- 暴露 file-local schema; +- 接收 `FileScanRequest`; +- 输出 file-local block; +- 不理解 table/global schema。 + +建议接口形状: + +```cpp +namespace doris::reader { + +class FileReader { +public: + virtual ~FileReader() = default; + + virtual Status open(io::FileReaderSPtr file, io::IOContext* io_ctx = nullptr); + virtual Status get_schema(std::vector* file_schema) const; + virtual Status init(const FileScanRequest& request); + virtual Status next(Block* file_block, size_t* rows, bool* eof); + virtual Status close(); +}; + +} // namespace doris::reader +``` + +接口约束: + +- `FileReader` 输出的是 file-local block,不输出 table/global schema block。 +- `FileReader` 不处理 Iceberg schema evolution、default/generated/partition 列。 +- `IcebergTableReader` 组合 `FileReader`,不直接绑定具体文件格式 reader。 + +### ParquetReader + +`ParquetReader` 是 `FileReader` 的 Parquet 实现,只负责 Parquet file-local schema +和 Parquet file-local scan request。 + +实际 API 文件: + +```text +be/src/format/parquet/parquet_reader.h +``` + +实际命名空间: + +```cpp +namespace doris::parquet +``` + +建议职责: + +- 打开 Parquet 文件; +- 解析 footer 和 file schema; +- 接收 `ParquetScanRequest` 或通用 `FileScanRequest`; +- 执行 file-local projection 和 file-local filter; +- 输出 file-local block。 + +建议接口形状: + +```cpp +namespace doris::parquet { + +class ParquetReader : public reader::FileReader { +public: + virtual ~ParquetReader() = default; + + virtual Status open(io::FileReaderSPtr file, io::IOContext* io_ctx = nullptr); + virtual Status get_schema(std::vector* file_schema) const; + virtual Status init(const ParquetScanRequest& request); + virtual Status next(Block* file_block, size_t* rows, bool* eof); + 
virtual Status close(); +}; + +} // namespace doris::parquet +``` + +接口约束: + +- `ParquetReader` 输出的是 file-local block,不输出 table/global schema block。 +- `ParquetReader` 不理解 Iceberg schema evolution。 +- `ParquetReader` 不负责 default/generated/partition 列。 +- 任何 table-level cast/default/generated/partition 语义都不能重新塞回 + `ParquetReader`。 + +## 关键类型 + +### SchemaField + +`SchemaField` 表示文件层 schema 中的列定义。 + +建议包含的信息: + +- file-local column id; +- 列名; +- 类型; +- child fields。 + +它服务于 `TableColumnMapper` 做 schema matching,不携带 table-level 语义。 + +### TableColumn + +`TableColumn` 表示 table/global schema 中的列定义。 + +建议包含的信息: + +- table column id; +- 列名; +- 类型; +- child columns。 + +Iceberg 场景下,column id 默认对应 field id。 + +### TableFilter + +`TableFilter` 表示 table 层过滤条件。 + +建议包含的信息: + +- `table_column_id` +- `conjunct` +- `predicates` + +职责约束: + +- `conjunct` 偏表达式过滤,适合表达 cast、复杂表达式、复杂列提取等语义; +- `predicates` 偏结构化单列下推,适合驱动 row group stats、page index、dictionary、 + bloom filter 等文件层优化。 + +### FileLocalFilter + +`FileLocalFilter` 表示已经 localize 到 file-local schema 的过滤条件。 + +建议包含的信息: + +- `file_column_id` +- `conjunct` +- `predicates` + +职责约束: + +- `conjunct` 用于 file-local 表达式过滤; +- `predicates` 用于 file-local 结构化下推; +- 其输入必须来自 `TableColumnMapper`,不能由具体文件 reader 自己推导 table 语义。 + +### ColumnMapping + +`ColumnMapping` 是 table schema 与 file schema 之间的核心边界对象。 + +建议包含的信息: + +- `table_column_id` +- `file_column_id` +- `file_type` +- `table_type` +- `finalize_expr` +- `reader_filter_expr` +- `child_mappings` + +职责约束: + +- `finalize_expr` 服务最终输出,把 file-local value 转成 table/global value; +- `reader_filter_expr` 服务读时 filter fallback; +- 二者语义不同,不能混用; +- `child_mappings` 用于复杂列 remap、复杂列裁剪和复杂列 schema change。 + +### TableScanRequest + +`TableScanRequest` 描述 table 层 scan 请求。 + +建议包含的信息: + +- projected table columns; +- table filters。 + +它由 `IcebergTableReader` 接收,再交给 `TableColumnMapper` 生成 file-local request。 + +### ParquetScanRequest + +`ParquetScanRequest` 继承 `FileScanRequest`,描述 Parquet file-local scan 请求。 
+ +### FileScanRequest + +`FileScanRequest` 描述通用 file-local scan 请求。 + +建议包含的信息: + +- projected file columns; +- local filters; +- reader expression map。 + +它是 `FileReader` 的唯一 scan 输入,不包含 table/global schema 语义。 + +### IcebergScanTask + +`IcebergScanTask` 表示一次 Iceberg data file 读取任务。 + +建议包含的信息: + +- data file 信息; +- position delete 文件; +- equality delete 文件; +- deletion vector 信息。 + +它是 `IcebergTableReader` 的输入,不应直接传给 `ParquetReader`。 + +## 设计原则 + +### 边界原则 + +- `FileReader` 不理解 global schema,不直接处理 Iceberg schema evolution。 +- `ParquetReader` 是 `FileReader` 的 Parquet 实现。 +- `TableColumnMapper` 是 schema mapping 和 filter localization 的唯一入口。 +- `IcebergTableReader` 不做 Parquet 解码,只负责 table-level finalize、delete、 + virtual columns。 +- `TableReader` 只负责多文件编排和 table-level 通用裁剪,不下沉文件格式语义。 +- 任何 table-level cast/default/generated/partition 语义都不能重新塞回 + `ParquetReader`。 + +### 依赖原则 + +- 低层不能反向依赖高层语义。 +- `FileReader` 只依赖 file-local request。 +- `IcebergTableReader` 继承 `TableReader`,复用其多文件编排和通用裁剪能力。 +- `IcebergTableReader` 通过组合使用 `FileReader`。 +- `TableColumnMapper` 可以被 Iceberg 之外的其他表格式复用。 + +### 命名原则 + +- 表层抽象使用 `TableReader`、`IcebergTableReader`、`TableColumnMapper`、 + `FileReader`、`ParquetReader` 命名。 +- `TableColumnMapper` 不使用 Iceberg-only 命名。 +- file schema 类型使用 `SchemaField`,table schema 类型使用 `TableColumn`。 + +## 兼容原则 + +新架构重构期间,新旧代码允许并存,但必须遵守以下约束: + +- 旧 `vparquet` / Hive / Hudi / Paimon 路径在新架构稳定前允许保留。 +- 新架构实现不得继续向旧 `vparquet` 表层语义回灌依赖。 +- 先搭新框架 API,再逐步迁移调用点。 +- 不允许边改 API 边混入临时裸逻辑、实验性草稿或未收敛命名。 +- 兼容层可能需要存在,但本文档不定义兼容层的具体实现方案。 + +## 验收标准 + +该文档应满足以下目标: + +- 不引用错误实验代码作为既成事实; +- 不出现实现性草稿、裸伪代码、未收敛命名混用; +- 让另一个工程师从 `master` 新开分支时,可以直接按本文档搭 API 骨架; +- 读完文档后,不需要再讨论以下问题: + - 新架构分几层; + - 每层负责什么; + - 哪层理解 global schema; + - 哪层做 schema change / filter localization / finalize; + - 哪层允许依赖旧实现,哪层不允许。 From 57178c889aa1fdaa64efc9ac9e9f54d6753ea2c2 Mon Sep 17 00:00:00 2001 From: Socrates Date: Mon, 18 May 2026 16:16:50 +0800 Subject: [PATCH 02/38] Refine Iceberg reader API boundaries --- 
be/src/format/parquet/parquet_reader.h | 10 +++ be/src/format/reader/file_reader.h | 14 ++++ be/src/format/reader/table_reader.h | 93 ++++++++++++++++++++++-- be/src/format/table/iceberg_reader_v2.h | 89 ++++++++++++++++++----- docs/doris-iceberg-parquet-api-design.md | 46 +++++++++--- 5 files changed, 218 insertions(+), 34 deletions(-) diff --git a/be/src/format/parquet/parquet_reader.h b/be/src/format/parquet/parquet_reader.h index dfac6494cd8e45..65227aba04cf31 100644 --- a/be/src/format/parquet/parquet_reader.h +++ b/be/src/format/parquet/parquet_reader.h @@ -42,23 +42,33 @@ class ParquetReader : public reader::FileReader { public: virtual ~ParquetReader() = default; + // 解析 Parquet footer 并返回 Parquet 文件自身的 schema。 + // 这里不做 Iceberg schema evolution,也不把字段转换成 table/global schema。 Status get_schema(std::vector* file_schema) const override { // 真实实现会从 Parquet footer / schema descriptor 展开 file-local schema。 file_schema->clear(); return Status::OK(); } + // 初始化 Parquet 专属 scan。 + // 后续可以在 ParquetScanRequest 中扩展 row group、page index、bloom filter 等 + // Parquet-only 选项;table-level 语义仍然必须由 TableColumnMapper 提前转换。 Status init(const ParquetScanRequest& request) { // 真实实现会根据 projected_file_columns、local_filters 和 reader_expression_map // 初始化 row group、column chunk、page reader 以及延时物化计划。 return reader::FileReader::init(request); } + // 读取下一批 Parquet file-local block。 + // 返回列必须保持 file-local 语义,不能在这里补 default/generated/partition 列。 Status next(Block* file_block, size_t* rows, bool* eof) override { // 真实实现会输出 file-local block。stub 默认立即 EOF。 return reader::FileReader::next(file_block, rows, eof); } + // 通用 FileReader 初始化入口。 + // 当上层只持有 reader::FileReader 指针时会走该接口;Parquet 专属参数通过 + // ParquetScanRequest 重载表达。 Status init(const reader::FileScanRequest& request) override { return reader::FileReader::init(request); } diff --git a/be/src/format/reader/file_reader.h b/be/src/format/reader/file_reader.h index af03691b94ecf3..fd5bfcf933f63c 100644 --- 
a/be/src/format/reader/file_reader.h +++ b/be/src/format/reader/file_reader.h @@ -87,6 +87,9 @@ class FileReader { public: virtual ~FileReader() = default; + // 打开一个物理文件并加载文件级元数据。 + // 该方法只建立 file-local reader 状态,不接收 table schema,也不做 projection/filter + // 规划;这些输入由 init(FileScanRequest) 提供。 virtual Status open(io::FileReaderSPtr file, io::IOContext* io_ctx = nullptr) { // 真实实现会保存文件句柄、IO 上下文并读取文件元数据。 _file = std::move(file); @@ -95,12 +98,18 @@ class FileReader { return Status::OK(); } + // 返回文件自己的 schema 视图。 + // 返回结果必须是 file-local schema:列 id、类型和 children 都按文件格式展开, + // 不在这里解释 Iceberg field id、缺失列、默认值或 generated column。 virtual Status get_schema(std::vector* file_schema) const { // 真实实现会展开文件格式自己的 file-local schema。 file_schema->clear(); return Status::OK(); } + // 初始化一次 file-local scan。 + // request 由 TableColumnMapper 生成,只包含文件列投影、本地过滤条件和 reader + // expression。FileReader 可以基于它初始化 row group/page/stripe 等文件格式计划。 virtual Status init(const FileScanRequest& request) { // 真实实现会根据 projected columns、local filters 和 reader expressions // 初始化文件格式自己的物理读取计划。 @@ -110,6 +119,9 @@ class FileReader { return Status::OK(); } + // 读取下一批 file-local block。 + // file_block 的列顺序和类型必须遵守 FileScanRequest,而不是 table/global schema。 + // eof 表示当前文件 reader 是否读完;多文件切换由 TableReader 负责。 virtual Status next(Block* file_block, size_t* rows, bool* eof) { // stub 默认立即 EOF。 (void)file_block; @@ -123,6 +135,8 @@ class FileReader { return Status::OK(); } + // 关闭当前物理文件 reader 并释放文件层状态。 + // 该方法不处理 table-level delete/finalize 状态,后者由 TableReader 子类管理。 virtual Status close() { _file.reset(); _io_ctx = nullptr; diff --git a/be/src/format/reader/table_reader.h b/be/src/format/reader/table_reader.h index 422ee3142d1ea9..8d88ce4fe1ba8a 100644 --- a/be/src/format/reader/table_reader.h +++ b/be/src/format/reader/table_reader.h @@ -113,6 +113,9 @@ class TableColumnMapper { explicit TableColumnMapper(TableColumnMapperOptions options = {}) : _options(std::move(options)) {} virtual ~TableColumnMapper() = default; + 
// 建立 table schema 到 file schema 的列映射。 + // 输出的 ColumnMapping 描述 table column 如何从 file column、常量列或表达式得到; + // 后续 projection、filter localization 和 table block finalize 都应复用这份映射。 virtual Status create_mapping(const std::vector& table_schema, const std::vector& file_schema, std::vector* mappings) { @@ -136,6 +139,9 @@ class TableColumnMapper { return Status::OK(); } + // 把 table-level scan 请求转换成 file-local scan 请求。 + // table_request 使用 table/global schema;file_request 只包含 FileReader 能理解的 + // projected_file_columns、local_filters 和 reader_expression_map。 virtual Status create_scan_request(const TableScanRequest& table_request, const std::vector& mappings, FileScanRequest* file_request) { @@ -154,6 +160,9 @@ class TableColumnMapper { return Status::OK(); } + // 将 table-level filter 定位到文件 schema。 + // trivial mapping 可以直接复制结构化谓词;类型变化时可以尝试安全 cast;无法安全 + // 下推的表达式应通过 reader_expression_map 或 table-level finalize/filter fallback 处理。 virtual Status localize_filters(const std::vector& table_filters, FileScanRequest* file_request) const { // 真实实现会处理 trivial mapping、safe cast、reader expression fallback 和 @@ -213,15 +222,21 @@ struct TableReadOptions { // table-level reader 基类。 // 该层负责多文件编排和动态分区裁剪等通用 table-level 逻辑,对外输出 table block。 +// 子类只需要实现“如何打开下一个具体 reader”和“如何读取当前 reader”的表格式语义。 class TableReader { public: virtual ~TableReader() = default; + // 初始化 table reader 的通用运行参数。 + // 子类可以在自己的 init(params) 中调用该方法;这里不接收具体表格式 schema/task。 virtual Status init(const TableReadOptions& options) { _options = options; return Status::OK(); } + // table-level 动态过滤入口。 + // 该方法用于根据 split、partition value 或文件级统计判断是否可以跳过后续 reader。 + // can_filter_all=true 表示当前 table reader 范围内的数据都可以被裁剪。 virtual Status filter(const VExprContextSPtr& expr, bool* can_filter_all) { // 真实实现会基于 split/partition/file stats 判断动态分区裁剪结果。 (void)expr; @@ -231,12 +246,78 @@ class TableReader { return Status::OK(); } - virtual Status next_reader() { - // 真实实现会切换到下一个 data file / split reader。 + // 对外读取 table block 的统一入口。 + // 
基类负责 current reader 的打开、EOF 后切换和关闭;子类只实现 protected hook。 + // table_block 的列必须已经是 table/global schema 语义。 + Status next(Block* table_block, size_t* rows, bool* eof) { + if (rows != nullptr) { + *rows = 0; + } + if (eof != nullptr) { + *eof = false; + } + while (true) { + if (!_has_current_reader) { + RETURN_IF_ERROR(next_reader()); + if (!_has_current_reader) { + if (eof != nullptr) { + *eof = true; + } + return Status::OK(); + } + } + + size_t current_rows = 0; + bool current_eof = false; + RETURN_IF_ERROR(read_current(table_block, ¤t_rows, ¤t_eof)); + if (rows != nullptr) { + *rows = current_rows; + } + if (!current_eof || current_rows > 0) { + return Status::OK(); + } + RETURN_IF_ERROR(close_current_reader()); + _has_current_reader = false; + } + } + + // 关闭 table reader 及当前正在读取的底层 reader。 + // 子类如果持有额外表格式资源,应 override 后先调用 TableReader::close()。 + virtual Status close() { + RETURN_IF_ERROR(close_current_reader()); + _has_current_reader = false; + return Status::OK(); + } + +protected: + // 切换到下一个 reader 的通用流程。 + // 该方法先关闭当前 reader,再调用 open_next_reader;子类不应重复实现这个循环。 + Status next_reader() { + // 多文件切换的公共流程留在基类:关闭当前 reader,然后打开下一个 reader。 + // 子类只通过 open_next_reader 提供具体表格式的 task/split 打开方式。 + RETURN_IF_ERROR(close_current_reader()); + bool has_reader = false; + RETURN_IF_ERROR(open_next_reader(&has_reader)); + _has_current_reader = has_reader; return Status::OK(); } - virtual Status next(Block* table_block, size_t* rows, bool* eof) { + // 打开下一个具体 reader。 + // 子类在这里选择下一个 split/task,创建或重置底层 FileReader,并设置 has_reader。 + // has_reader=false 表示没有更多输入,TableReader::next 会返回 eof=true。 + virtual Status open_next_reader(bool* has_reader) { + // stub 默认没有下一个 reader。 + if (has_reader != nullptr) { + *has_reader = false; + } + return Status::OK(); + } + + // 从当前 reader 读取一批 table block。 + // 子类应在这里读取 file-local block,并完成 delete、virtual column、finalize_expr + // 等 table-level 处理,最终写入 table_block。 + virtual Status read_current(Block* table_block, size_t* rows, bool* eof) { + // 
stub 默认当前 reader 立即 EOF。 (void)table_block; if (rows != nullptr) { *rows = 0; @@ -247,10 +328,12 @@ class TableReader { return Status::OK(); } - virtual Status close() { return Status::OK(); } + // 关闭当前具体 reader。 + // 该 hook 会被 next_reader 和 close 调用;实现应保持幂等。 + virtual Status close_current_reader() { return Status::OK(); } -protected: TableReadOptions _options; + bool _has_current_reader = false; }; } // namespace doris::reader diff --git a/be/src/format/table/iceberg_reader_v2.h b/be/src/format/table/iceberg_reader_v2.h index 70ee2bb3ff548a..29b556f71ed561 100644 --- a/be/src/format/table/iceberg_reader_v2.h +++ b/be/src/format/table/iceberg_reader_v2.h @@ -69,6 +69,17 @@ struct IcebergReadOptions { bool enable_deletion_vector = true; }; +// IcebergTableReader 的完整初始化输入。 +// 这些字段共同决定一次 table scan 的语义,除非后续有明确的生命周期差异,否则不拆成 +// bind/init/set_tasks 多个阶段,避免调用点暴露半初始化状态。 +struct IcebergTableReadParams { + IcebergReadOptions options; + std::vector iceberg_schema; + reader::TableScanRequest scan_request; + std::vector scan_tasks; + std::unique_ptr data_reader; +}; + // Iceberg table-level reader。 // 该层继承 TableReader,复用多文件编排和动态分区裁剪等通用能力;同时组合 // FileReader 完成 data file 物理读取,不继承具体文件格式 reader。 @@ -76,30 +87,36 @@ class IcebergTableReader : public reader::TableReader { public: IcebergTableReader() = default; - explicit IcebergTableReader(std::unique_ptr data_reader) - : _data_reader(std::move(data_reader)) {} - ~IcebergTableReader() override = default; - Status init(const IcebergReadOptions& options, - std::unique_ptr data_reader) { - _iceberg_options = options; - _data_reader = std::move(data_reader); - return reader::TableReader::init(options.table_options); - } - - Status bind(const std::vector& iceberg_schema) { - // 真实实现会绑定 Iceberg 当前 schema,并准备 field-id based mapping 输入。 - _iceberg_schema = iceberg_schema; - return Status::OK(); + // 初始化一次 Iceberg table scan。 + // params 必须一次性提供 schema、projection/filter、scan tasks 和底层 FileReader; + // 这样 IcebergTableReader 不会暴露 
bind/set_tasks 等半初始化阶段。 + Status init(IcebergTableReadParams params) { + // 一次性保存 Iceberg table scan 所需输入。TableReader 负责 reader 切换流程; + // IcebergTableReader 只提供后续要打开的 task 以及 table/file schema 映射语义。 + _iceberg_options = params.options; + _iceberg_schema = std::move(params.iceberg_schema); + _table_scan_request = std::move(params.scan_request); + _scan_tasks = std::move(params.scan_tasks); + _data_reader = std::move(params.data_reader); + _next_task_idx = 0; + return reader::TableReader::init(_iceberg_options.table_options); } - Status init(const reader::TableScanRequest& request) { - // 保存 table-level projection/filter,后续由 TableColumnMapper 转成 FileScanRequest。 - _table_scan_request = request; + // 关闭当前 Iceberg scan。 + // 先让 TableReader 关闭当前 task reader,再释放 IcebergTableReader 持有的底层 + // FileReader。 + Status close() override { + RETURN_IF_ERROR(reader::TableReader::close()); + _data_reader.reset(); return Status::OK(); } +protected: + // 打开单个 Iceberg scan task。 + // 该方法完成当前 data file 的 schema mapping、filter localization、position delete + // 注入,并初始化底层 FileReader;它由 TableReader 的 reader 切换流程调用。 Status open_task(const IcebergScanTask& task) { // 真实实现会读取 data file schema,创建 field-id mapping,应用 position deletes, // 并初始化底层 ParquetReader。 @@ -123,7 +140,27 @@ class IcebergTableReader : public reader::TableReader { return Status::OK(); } - Status next(Block* table_block, size_t* rows, bool* eof) override { + // 打开下一个 Iceberg task。 + // TableReader 负责循环和 EOF 处理;这里仅从 _scan_tasks 中取下一个 task 并调用 + // open_task。 + Status open_next_reader(bool* has_reader) override { + if (_next_task_idx >= _scan_tasks.size()) { + if (has_reader != nullptr) { + *has_reader = false; + } + return Status::OK(); + } + RETURN_IF_ERROR(open_task(_scan_tasks[_next_task_idx++])); + if (has_reader != nullptr) { + *has_reader = true; + } + return Status::OK(); + } + + // 读取当前 Iceberg task 的下一批 table block。 + // 这里组合底层 FileReader 输出的 file-local block,并负责 equality delete、 + // virtual columns 和 finalize,最终输出 
table/global schema block。 + Status read_current(Block* table_block, size_t* rows, bool* eof) override { // 真实实现会读取 file-local block,finalize 成 table block,再应用 equality delete // 和 Iceberg virtual columns。stub 默认 EOF。 // 后续实现应在 IcebergTableReader 内部持有 file-local block;这里仅复用输出指针 @@ -138,6 +175,9 @@ class IcebergTableReader : public reader::TableReader { return Status::OK(); } + // 将 file-local block 转换为 table/global schema block。 + // 这里执行 ColumnMapping 中的 finalize_expr、缺失列填充、partition/generated 列 + // 物化以及复杂列 remap。 Status finalize_chunk(Block* file_block, Block* table_block) { // 真实实现会根据 ColumnMapping 执行 finalize_expr/default/partition/generated // expressions,把 file-local block 写成 table block。 @@ -146,18 +186,24 @@ class IcebergTableReader : public reader::TableReader { return Status::OK(); } + // 将 Iceberg position delete / deletion vector 转换成底层 reader 可消费的删除信息。 + // 这一步发生在读取 data file 前,因此会修改 FileScanRequest。 Status apply_position_deletes(reader::FileScanRequest* request) { // 真实实现会把 position delete / deletion vector 转换成 file-local delete 信息。 (void)request; return Status::OK(); } + // 在 table block 上应用 equality delete。 + // equality delete 依赖 table-level 列语义,因此不能下沉到 ParquetReader。 Status apply_equality_deletes(Block* table_block) { // 真实实现会在 table block 上应用 equality delete。 (void)table_block; return Status::OK(); } + // 物化 Iceberg 虚拟列。 + // 例如 _row_id、_last_updated_sequence_number 等,它们不来自 Parquet 文件物理列。 Status materialize_virtual_columns(Block* table_block, size_t rows) { // 真实实现会物化 _row_id、_last_updated_sequence_number 等 Iceberg 虚拟列。 (void)table_block; @@ -165,17 +211,20 @@ class IcebergTableReader : public reader::TableReader { return Status::OK(); } - Status close() override { + // 关闭当前 task 对应的底层 FileReader。 + // 该方法由 TableReader 在切换 reader 或 close 时调用,要求可重复调用。 + Status close_current_reader() override { if (_data_reader) { RETURN_IF_ERROR(_data_reader->close()); } - _data_reader.reset(); return Status::OK(); } private: IcebergReadOptions _iceberg_options; 
IcebergScanTask _scan_task; + std::vector _scan_tasks; + size_t _next_task_idx = 0; reader::TableScanRequest _table_scan_request; std::vector _iceberg_schema; std::vector _mappings; diff --git a/docs/doris-iceberg-parquet-api-design.md b/docs/doris-iceberg-parquet-api-design.md index 58036667d44ef6..6518043b40dc6f 100644 --- a/docs/doris-iceberg-parquet-api-design.md +++ b/docs/doris-iceberg-parquet-api-design.md @@ -74,6 +74,8 @@ namespace doris::reader - 管理 scan 生命周期; - 承接动态分区裁剪等 table-level 通用过滤逻辑; - 对外统一输出 table block。 +- `next` 是基类统一入口,内部负责 EOF 后切换 reader;具体表格式只提供打开和读取 + 当前 reader 的 hook。 建议接口形状: @@ -86,9 +88,14 @@ public: virtual Status init(const TableReadOptions& options); virtual Status filter(const VExprContextSPtr& expr, bool* can_filter_all); - virtual Status next_reader(); - virtual Status next(Block* table_block, size_t* rows, bool* eof); + Status next(Block* table_block, size_t* rows, bool* eof); virtual Status close(); + +protected: + Status next_reader(); + virtual Status open_next_reader(bool* has_reader); + virtual Status read_current(Block* table_block, size_t* rows, bool* eof); + virtual Status close_current_reader(); }; } // namespace doris::reader @@ -99,6 +106,7 @@ public: - `TableReader` 输出的是 table block,不输出 file-local block。 - `TableReader` 负责多文件编排和 table-level 通用裁剪,不负责 schema mapping,不负责 Parquet 物理解码。 +- `next_reader` 是 `TableReader` 自己的通用切换逻辑,不作为子类公开 override 接口。 - 动态分区裁剪这类逻辑应下放到 `TableReader`,而不是散落在具体表格式 reader 中。 - `TableReader` 不直接依赖旧 `vparquet` 表层语义。 @@ -122,7 +130,7 @@ namespace doris::iceberg 建议职责: - 绑定 Iceberg 当前 table schema; -- 接收 `IcebergScanTask`; +- 接收 `IcebergScanTask` 列表,并按 `TableReader` 的统一调度打开当前 task; - 处理 position delete、equality delete、deletion vector; - 物化 `_row_id`、`_last_updated_sequence_number` 等虚拟列; - 将 `ParquetReader` 返回的 file-local block finalize 成 table block。 @@ -136,13 +144,13 @@ class IcebergTableReader : public reader::TableReader { public: virtual ~IcebergTableReader() = default; - Status init(const 
IcebergReadOptions& options, - std::unique_ptr data_reader); - Status bind(const std::vector& iceberg_schema); - Status init(const reader::TableScanRequest& request); - Status open_task(const IcebergScanTask& task); - Status next(Block* table_block, size_t* rows, bool* eof) override; + Status init(IcebergTableReadParams params); Status close() override; + +protected: + Status open_next_reader(bool* has_reader) override; + Status read_current(Block* table_block, size_t* rows, bool* eof) override; + Status close_current_reader() override; }; } // namespace doris::iceberg @@ -153,6 +161,11 @@ public: - `IcebergTableReader` 继承 `TableReader`,并通过组合使用 `FileReader`。 - `IcebergTableReader` 不做 Parquet page/column 解码。 - `IcebergTableReader` 负责 table-level finalize,不负责 file-local pruning 实现。 +- `IcebergTableReader` 的 schema、scan request、scan tasks 和底层 `FileReader` 应通过 + 一个初始化参数对象一次性传入;除非存在明确生命周期差异,不拆成 `bind` / + `init(TableScanRequest)` / `set_scan_tasks` 多阶段接口。 +- `IcebergTableReader` 不重新实现 reader 切换循环,只实现打开 Iceberg task、读取当前 + task 和关闭当前 reader 的 hook。 ### TableColumnMapper @@ -430,6 +443,21 @@ Iceberg 场景下,column id 默认对应 field id。 它是 `IcebergTableReader` 的输入,不应直接传给 `ParquetReader`。 +### IcebergTableReadParams + +`IcebergTableReadParams` 表示一次 Iceberg table scan 的完整初始化输入。 + +建议包含的信息: + +- Iceberg read options; +- Iceberg table schema; +- table scan request; +- Iceberg scan task 列表; +- 底层 `FileReader`。 + +它用于避免 `IcebergTableReader` 暴露多个半初始化阶段。调用方应一次性构造完整 +参数并调用 `init`。 + ## 设计原则 ### 边界原则 From 1676f2ef64fb4651a15d6e8df8c0b2e06bcb15e8 Mon Sep 17 00:00:00 2001 From: Gabriel Date: Mon, 18 May 2026 20:22:03 +0800 Subject: [PATCH 03/38] fix compiling (#63368) --- be/src/format/reader/file_reader.h | 6 +- be/src/format/reader/table_reader.h | 167 +++++++++++++--------- be/src/format/table/iceberg_reader_v2.cpp | 20 +++ be/src/format/table/iceberg_reader_v2.h | 150 +++---------------- 4 files changed, 141 insertions(+), 202 deletions(-) create mode 100644 
be/src/format/table/iceberg_reader_v2.cpp diff --git a/be/src/format/reader/file_reader.h b/be/src/format/reader/file_reader.h index fd5bfcf933f63c..e9fb0f3c963a93 100644 --- a/be/src/format/reader/file_reader.h +++ b/be/src/format/reader/file_reader.h @@ -122,12 +122,8 @@ class FileReader { // 读取下一批 file-local block。 // file_block 的列顺序和类型必须遵守 FileScanRequest,而不是 table/global schema。 // eof 表示当前文件 reader 是否读完;多文件切换由 TableReader 负责。 - virtual Status next(Block* file_block, size_t* rows, bool* eof) { + virtual Status get_block(Block* file_block, bool* eof) { // stub 默认立即 EOF。 - (void)file_block; - if (rows != nullptr) { - *rows = 0; - } if (eof != nullptr) { *eof = true; } diff --git a/be/src/format/reader/table_reader.h b/be/src/format/reader/table_reader.h index 8d88ce4fe1ba8a..99dcc507e5d503 100644 --- a/be/src/format/reader/table_reader.h +++ b/be/src/format/reader/table_reader.h @@ -17,6 +17,8 @@ #pragma once +#include + #include #include #include @@ -26,6 +28,7 @@ #include #include "common/status.h" +#include "core/block/block.h" #include "core/data_type/data_type.h" #include "exprs/vexpr_fwd.h" #include "format/reader/file_reader.h" @@ -110,7 +113,8 @@ struct TableScanRequest { // Iceberg-only 组件。 class TableColumnMapper { public: - explicit TableColumnMapper(TableColumnMapperOptions options = {}) : _options(std::move(options)) {} + explicit TableColumnMapper(TableColumnMapperOptions options = {}) + : _options(std::move(options)) {} virtual ~TableColumnMapper() = default; // 建立 table schema 到 file schema 的列映射。 @@ -184,14 +188,15 @@ class TableColumnMapper { const std::vector& mappings() const { return _mappings; } private: - const SchemaField* find_file_field( - const TableColumn& table_column, - const std::vector& file_schema) const { + const SchemaField* find_file_field(const TableColumn& table_column, + const std::vector& file_schema) const { for (const auto& field : file_schema) { - if (_options.mode == TableColumnMappingMode::BY_FIELD_ID && field.id == 
table_column.id) { + if (_options.mode == TableColumnMappingMode::BY_FIELD_ID && + field.id == table_column.id) { return &field; } - if (_options.mode == TableColumnMappingMode::BY_NAME && field.name == table_column.name) { + if (_options.mode == TableColumnMappingMode::BY_NAME && + field.name == table_column.name) { return &field; } } @@ -216,8 +221,28 @@ class TableColumnMapper { std::vector _mappings; }; +struct BaseDataFile { + virtual ~BaseDataFile() = default; + + std::string path; + std::string format; + int64_t record_count = 0; + int64_t file_size = 0; +}; + +struct ScanTask { + virtual ~ScanTask() = default; + + std::unique_ptr data_file; +}; + struct TableReadOptions { size_t batch_size = 4096; + // TODO: deleted? SCHEMA should be derived from table metadata and inited by TableReader it self? it shouldn't be part of read options. + std::vector schema; + TableScanRequest scan_request; + // Each task denotes a descriptor of a single file to read, along with file-level metadata such as stats and delete files. 
+ std::vector> scan_tasks; }; // table-level reader 基类。 @@ -228,9 +253,12 @@ class TableReader { virtual ~TableReader() = default; // 初始化 table reader 的通用运行参数。 - // 子类可以在自己的 init(params) 中调用该方法;这里不接收具体表格式 schema/task。 - virtual Status init(const TableReadOptions& options) { - _options = options; + // 子类可以在自己的 init(options) 中调用该方法;这里不接收具体表格式 schema/task。 + virtual Status init(TableReadOptions options) { + _schema = std::move(_options.schema); + _table_scan_request = std::move(_options.scan_request); + _scan_tasks = std::move(_options.scan_tasks); + _next_task_idx = 0; return Status::OK(); } @@ -249,91 +277,102 @@ class TableReader { // 对外读取 table block 的统一入口。 // 基类负责 current reader 的打开、EOF 后切换和关闭;子类只实现 protected hook。 // table_block 的列必须已经是 table/global schema 语义。 - Status next(Block* table_block, size_t* rows, bool* eof) { - if (rows != nullptr) { - *rows = 0; - } - if (eof != nullptr) { - *eof = false; + Status get_block(Block* block, bool* eos) { + if (eos != nullptr) { + *eos = false; } - while (true) { - if (!_has_current_reader) { - RETURN_IF_ERROR(next_reader()); - if (!_has_current_reader) { - if (eof != nullptr) { - *eof = true; - } + while (block->empty() && !*eos) { + if (!_data_reader) { + RETURN_IF_ERROR(create_next_reader(eos)); + if (!_data_reader) { + DCHECK(*eos); return Status::OK(); } } - size_t current_rows = 0; bool current_eof = false; - RETURN_IF_ERROR(read_current(table_block, ¤t_rows, ¤t_eof)); - if (rows != nullptr) { - *rows = current_rows; - } - if (!current_eof || current_rows > 0) { - return Status::OK(); + RETURN_IF_ERROR(_data_reader->get_block(block, ¤t_eof)); + RETURN_IF_ERROR(finalize_chunk(block)); + RETURN_IF_ERROR(materialize_virtual_columns(block)); + if (current_eof) { + RETURN_IF_ERROR(close_current_reader()); } - RETURN_IF_ERROR(close_current_reader()); - _has_current_reader = false; } + return Status::OK(); } // 关闭 table reader 及当前正在读取的底层 reader。 // 子类如果持有额外表格式资源,应 override 后先调用 TableReader::close()。 virtual Status close() 
{ - RETURN_IF_ERROR(close_current_reader()); - _has_current_reader = false; + if (_data_reader) { + RETURN_IF_ERROR(close_current_reader()); + } return Status::OK(); } protected: // 切换到下一个 reader 的通用流程。 - // 该方法先关闭当前 reader,再调用 open_next_reader;子类不应重复实现这个循环。 - Status next_reader() { + // 该方法先关闭当前 reader,再打开下一个具体 reader;子类不应重复实现这个循环。 + Status create_next_reader(bool* eos) { // 多文件切换的公共流程留在基类:关闭当前 reader,然后打开下一个 reader。 - // 子类只通过 open_next_reader 提供具体表格式的 task/split 打开方式。 - RETURN_IF_ERROR(close_current_reader()); - bool has_reader = false; - RETURN_IF_ERROR(open_next_reader(&has_reader)); - _has_current_reader = has_reader; + DCHECK(_data_reader == nullptr); + // TODO: 创建_data_reader + // _data_reader = std::make_unique(...); + if (!_data_reader) { + if (eos != nullptr) { + *eos = true; + } + return Status::OK(); + } + RETURN_IF_ERROR(open_reader()); return Status::OK(); } - // 打开下一个具体 reader。 - // 子类在这里选择下一个 split/task,创建或重置底层 FileReader,并设置 has_reader。 - // has_reader=false 表示没有更多输入,TableReader::next 会返回 eof=true。 - virtual Status open_next_reader(bool* has_reader) { - // stub 默认没有下一个 reader。 - if (has_reader != nullptr) { - *has_reader = false; - } + // 打开当前具体 reader。 + // 子类在这里基于当前 split/task 初始化底层 FileReader。 + virtual Status open_reader() { + std::vector file_schema; + RETURN_IF_ERROR(_data_reader->get_schema(&file_schema)); + TableColumnMapperOptions mapper_options; + mapper_options.mode = TableColumnMappingMode::BY_FIELD_ID; + _column_mapper = TableColumnMapper(mapper_options); + RETURN_IF_ERROR(_column_mapper.create_mapping(_schema, file_schema, &_mappings)); + + FileScanRequest file_request; + RETURN_IF_ERROR( + _column_mapper.create_scan_request(_table_scan_request, _mappings, &file_request)); + RETURN_IF_ERROR(_data_reader->init(file_request)); return Status::OK(); } - // 从当前 reader 读取一批 table block。 - // 子类应在这里读取 file-local block,并完成 delete、virtual column、finalize_expr - // 等 table-level 处理,最终写入 table_block。 - virtual Status read_current(Block* 
table_block, size_t* rows, bool* eof) { - // stub 默认当前 reader 立即 EOF。 - (void)table_block; - if (rows != nullptr) { - *rows = 0; - } - if (eof != nullptr) { - *eof = true; - } + // 关闭当前具体 reader。 + // 该 hook 会被 create_next_reader 和 close 调用;实现应保持幂等。 + virtual Status close_current_reader() { + RETURN_IF_ERROR(_data_reader->close()); + _data_reader.reset(); return Status::OK(); } - // 关闭当前具体 reader。 - // 该 hook 会被 next_reader 和 close 调用;实现应保持幂等。 - virtual Status close_current_reader() { return Status::OK(); } + // 将 file-local block 转换为 table/global schema block。 + // 这里执行 ColumnMapping 中的 finalize_expr、缺失列填充、partition/generated 列 + // 物化以及复杂列 remap。 + virtual Status finalize_chunk(Block* block) { return Status::OK(); } + + // 物化虚拟列。 + // 例如 _row_id、_last_updated_sequence_number 等,它们不来自文件物理列。 + virtual Status materialize_virtual_columns(Block* table_block) { + // 真实实现会物化 _row_id、_last_updated_sequence_number 等 Iceberg 虚拟列。 + return Status::OK(); + } TableReadOptions _options; - bool _has_current_reader = false; + std::unique_ptr _data_reader; + std::vector> _scan_tasks; + TableScanRequest _table_scan_request; + std::vector _schema; + std::vector _mappings; + TableColumnMapper _column_mapper; + size_t _next_task_idx = 0; }; } // namespace doris::reader diff --git a/be/src/format/table/iceberg_reader_v2.cpp b/be/src/format/table/iceberg_reader_v2.cpp new file mode 100644 index 00000000000000..220f153e93fc67 --- /dev/null +++ b/be/src/format/table/iceberg_reader_v2.cpp @@ -0,0 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/table/iceberg_reader_v2.h" + +namespace doris::iceberg {} // namespace doris::iceberg diff --git a/be/src/format/table/iceberg_reader_v2.h b/be/src/format/table/iceberg_reader_v2.h index 29b556f71ed561..3ddadc9f9deb7e 100644 --- a/be/src/format/table/iceberg_reader_v2.h +++ b/be/src/format/table/iceberg_reader_v2.h @@ -35,51 +35,26 @@ class Block; namespace doris::iceberg { // Iceberg data file 摘要。它描述当前要读取的物理 data file,不承载列映射逻辑。 -struct IcebergDataFile { - std::string path; - std::string format; - int64_t record_count = 0; - int64_t file_size = 0; +struct IcebergDataFile final : public reader::BaseDataFile { int64_t sequence_number = 0; int64_t first_row_id = -1; }; // Iceberg delete file 摘要。position/equality/deletion vector 的具体读取在 // IcebergTableReader 实现阶段补齐。 -struct IcebergDeleteFile { - std::string path; - std::string format; +struct IcebergDeleteFile final : public reader::BaseDataFile { int64_t sequence_number = 0; std::vector equality_field_ids; }; // 单个 Iceberg data file 的 scan 输入。 // 该结构只进入 IcebergTableReader,不直接传给 ParquetReader。 -struct IcebergScanTask { - IcebergDataFile data_file; +struct IcebergScanTask final : public reader::ScanTask { std::vector positional_deletes; std::vector equality_deletes; std::vector deletion_vectors; }; -struct IcebergReadOptions { - reader::TableReadOptions table_options; - bool enable_position_delete = true; - bool enable_equality_delete = true; - bool enable_deletion_vector = true; -}; - -// IcebergTableReader 的完整初始化输入。 -// 这些字段共同决定一次 table scan 的语义,除非后续有明确的生命周期差异,否则不拆成 -// 
bind/init/set_tasks 多个阶段,避免调用点暴露半初始化状态。 -struct IcebergTableReadParams { - IcebergReadOptions options; - std::vector iceberg_schema; - reader::TableScanRequest scan_request; - std::vector scan_tasks; - std::unique_ptr data_reader; -}; - // Iceberg table-level reader。 // 该层继承 TableReader,复用多文件编排和动态分区裁剪等通用能力;同时组合 // FileReader 完成 data file 物理读取,不继承具体文件格式 reader。 @@ -90,18 +65,12 @@ class IcebergTableReader : public reader::TableReader { ~IcebergTableReader() override = default; // 初始化一次 Iceberg table scan。 - // params 必须一次性提供 schema、projection/filter、scan tasks 和底层 FileReader; - // 这样 IcebergTableReader 不会暴露 bind/set_tasks 等半初始化阶段。 - Status init(IcebergTableReadParams params) { + // options 必须一次性提供 schema、projection/filter 和 scan tasks,避免暴露 + // bind/set_tasks 等半初始化阶段。 + Status init(reader::TableReadOptions options) override { // 一次性保存 Iceberg table scan 所需输入。TableReader 负责 reader 切换流程; // IcebergTableReader 只提供后续要打开的 task 以及 table/file schema 映射语义。 - _iceberg_options = params.options; - _iceberg_schema = std::move(params.iceberg_schema); - _table_scan_request = std::move(params.scan_request); - _scan_tasks = std::move(params.scan_tasks); - _data_reader = std::move(params.data_reader); - _next_task_idx = 0; - return reader::TableReader::init(_iceberg_options.table_options); + return reader::TableReader::init(std::move(options)); } // 关闭当前 Iceberg scan。 @@ -114,75 +83,20 @@ class IcebergTableReader : public reader::TableReader { } protected: - // 打开单个 Iceberg scan task。 - // 该方法完成当前 data file 的 schema mapping、filter localization、position delete - // 注入,并初始化底层 FileReader;它由 TableReader 的 reader 切换流程调用。 - Status open_task(const IcebergScanTask& task) { - // 真实实现会读取 data file schema,创建 field-id mapping,应用 position deletes, - // 并初始化底层 ParquetReader。 - _scan_task = task; - std::vector file_schema; - if (_data_reader) { - RETURN_IF_ERROR(_data_reader->get_schema(&file_schema)); - } - reader::TableColumnMapperOptions mapper_options; - mapper_options.mode = 
reader::TableColumnMappingMode::BY_FIELD_ID; - _column_mapper = reader::TableColumnMapper(mapper_options); - RETURN_IF_ERROR(_column_mapper.create_mapping(_iceberg_schema, file_schema, &_mappings)); - - reader::FileScanRequest file_request; - RETURN_IF_ERROR(_column_mapper.create_scan_request(_table_scan_request, _mappings, - &file_request)); - RETURN_IF_ERROR(apply_position_deletes(&file_request)); - if (_data_reader) { - RETURN_IF_ERROR(_data_reader->init(file_request)); - } - return Status::OK(); - } - - // 打开下一个 Iceberg task。 - // TableReader 负责循环和 EOF 处理;这里仅从 _scan_tasks 中取下一个 task 并调用 - // open_task。 - Status open_next_reader(bool* has_reader) override { - if (_next_task_idx >= _scan_tasks.size()) { - if (has_reader != nullptr) { - *has_reader = false; - } - return Status::OK(); - } - RETURN_IF_ERROR(open_task(_scan_tasks[_next_task_idx++])); - if (has_reader != nullptr) { - *has_reader = true; - } - return Status::OK(); - } - - // 读取当前 Iceberg task 的下一批 table block。 - // 这里组合底层 FileReader 输出的 file-local block,并负责 equality delete、 - // virtual columns 和 finalize,最终输出 table/global schema block。 - Status read_current(Block* table_block, size_t* rows, bool* eof) override { - // 真实实现会读取 file-local block,finalize 成 table block,再应用 equality delete - // 和 Iceberg virtual columns。stub 默认 EOF。 - // 后续实现应在 IcebergTableReader 内部持有 file-local block;这里仅复用输出指针 - // 作为 header-only API 占位,避免在骨架阶段引入 Block 的完整定义。 - Block* file_block = table_block; - if (_data_reader) { - RETURN_IF_ERROR(_data_reader->next(file_block, rows, eof)); - } - RETURN_IF_ERROR(finalize_chunk(file_block, table_block)); - RETURN_IF_ERROR(apply_equality_deletes(table_block)); - RETURN_IF_ERROR(materialize_virtual_columns(table_block, rows != nullptr ? 
*rows : 0)); - return Status::OK(); - } - // 将 file-local block 转换为 table/global schema block。 // 这里执行 ColumnMapping 中的 finalize_expr、缺失列填充、partition/generated 列 // 物化以及复杂列 remap。 - Status finalize_chunk(Block* file_block, Block* table_block) { + Status finalize_chunk(Block* block) override { // 真实实现会根据 ColumnMapping 执行 finalize_expr/default/partition/generated // expressions,把 file-local block 写成 table block。 - (void)file_block; - (void)table_block; + RETURN_IF_ERROR(apply_equality_deletes(block)); + return Status::OK(); + } + + // 物化 Iceberg 虚拟列。 + // 例如 _row_id、_last_updated_sequence_number 等,它们不来自 Parquet 文件物理列。 + Status materialize_virtual_columns(Block* table_block) override { + // 真实实现会物化 _row_id、_last_updated_sequence_number 等 Iceberg 虚拟列。 return Status::OK(); } @@ -196,40 +110,10 @@ class IcebergTableReader : public reader::TableReader { // 在 table block 上应用 equality delete。 // equality delete 依赖 table-level 列语义,因此不能下沉到 ParquetReader。 - Status apply_equality_deletes(Block* table_block) { + Status apply_equality_deletes(Block* block) { // 真实实现会在 table block 上应用 equality delete。 - (void)table_block; - return Status::OK(); - } - - // 物化 Iceberg 虚拟列。 - // 例如 _row_id、_last_updated_sequence_number 等,它们不来自 Parquet 文件物理列。 - Status materialize_virtual_columns(Block* table_block, size_t rows) { - // 真实实现会物化 _row_id、_last_updated_sequence_number 等 Iceberg 虚拟列。 - (void)table_block; - (void)rows; return Status::OK(); } - - // 关闭当前 task 对应的底层 FileReader。 - // 该方法由 TableReader 在切换 reader 或 close 时调用,要求可重复调用。 - Status close_current_reader() override { - if (_data_reader) { - RETURN_IF_ERROR(_data_reader->close()); - } - return Status::OK(); - } - -private: - IcebergReadOptions _iceberg_options; - IcebergScanTask _scan_task; - std::vector _scan_tasks; - size_t _next_task_idx = 0; - reader::TableScanRequest _table_scan_request; - std::vector _iceberg_schema; - std::vector _mappings; - reader::TableColumnMapper _column_mapper; - std::unique_ptr _data_reader; }; } // 
namespace doris::iceberg From e5d17b881ba4049948a9370db285eaf3f280ed38 Mon Sep 17 00:00:00 2001 From: Gabriel Date: Tue, 19 May 2026 16:32:59 +0800 Subject: [PATCH 04/38] refactor table reader (#63397) --- be/src/exec/scan/file_scanner.cpp | 1 - be/src/exec/scan/file_scanner.h | 4 +- be/src/exprs/vliteral.cpp | 6 - be/src/exprs/vliteral.h | 8 +- be/src/exprs/vslot_ref.h | 4 + be/src/format/reader/column_mapper.cpp | 137 +++++++++++ be/src/format/reader/column_mapper.h | 124 ++++++++++ be/src/format/reader/expr/literal.h | 35 +++ be/src/format/reader/expr/slot_ref.h | 39 ++++ be/src/format/reader/file_reader.h | 27 ++- be/src/format/reader/table_reader.h | 255 ++++++--------------- be/src/format/table/iceberg_reader_mixin.h | 3 - be/src/format/table/iceberg_reader_v2.h | 20 -- 13 files changed, 440 insertions(+), 223 deletions(-) create mode 100644 be/src/format/reader/column_mapper.cpp create mode 100644 be/src/format/reader/column_mapper.h create mode 100644 be/src/format/reader/expr/literal.h create mode 100644 be/src/format/reader/expr/slot_ref.h diff --git a/be/src/exec/scan/file_scanner.cpp b/be/src/exec/scan/file_scanner.cpp index 5f1d248c1e1f4d..0ba7266456e427 100644 --- a/be/src/exec/scan/file_scanner.cpp +++ b/be/src/exec/scan/file_scanner.cpp @@ -1791,7 +1791,6 @@ Status FileScanner::_init_expr_ctxes() { if (is_file_slot) { _is_file_slot.emplace(slot_id); _file_slot_descs.emplace_back(it->second); - _file_col_names.push_back(it->second->col_name()); } _column_descs.push_back(col_desc); diff --git a/be/src/exec/scan/file_scanner.h b/be/src/exec/scan/file_scanner.h index cd4066ec987ad8..7c3d9d08b6ad7b 100644 --- a/be/src/exec/scan/file_scanner.h +++ b/be/src/exec/scan/file_scanner.h @@ -133,8 +133,6 @@ class FileScanner : public Scanner { bool _cur_reader_eof = false; // File source slot descriptors std::vector _file_slot_descs; - // col names from _file_slot_descs - std::vector _file_col_names; // Unified column descriptors for init_reader (includes file, 
partition, missing, synthesized cols) std::vector _column_descs; @@ -147,6 +145,7 @@ class FileScanner : public Scanner { // dest slot name to index in _dest_vexpr_ctx; std::unordered_map _dest_slot_name_to_idx; // col name to default value expr + // TODO: only used by json reader. Could we delete this? std::unordered_map _col_default_value_ctx; // the map values of dest slot id to src slot desc // if there is not key of dest slot id in dest_sid_to_src_sid_without_trans, it will be set to nullptr @@ -193,7 +192,6 @@ class FileScanner : public Scanner { std::unique_ptr _io_ctx; // Whether to fill partition columns from path, default is true. - bool _fill_partition_from_path = true; std::unordered_map> _partition_col_descs; std::unordered_map _partition_value_is_null; diff --git a/be/src/exprs/vliteral.cpp b/be/src/exprs/vliteral.cpp index 551839f699e2e6..9b93d7097274ee 100644 --- a/be/src/exprs/vliteral.cpp +++ b/be/src/exprs/vliteral.cpp @@ -37,12 +37,6 @@ namespace doris { class VExprContext; -void VLiteral::init(const TExprNode& node) { - Field field; - field = _data_type->get_field(node); - _column_ptr = _data_type->create_column_const(1, field); -} - Status VLiteral::prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) { RETURN_IF_ERROR_OR_PREPARED(VExpr::prepare(state, desc, context)); return Status::OK(); diff --git a/be/src/exprs/vliteral.h b/be/src/exprs/vliteral.h index b1b8e89157d420..e5a4c7a5f3dbc4 100644 --- a/be/src/exprs/vliteral.h +++ b/be/src/exprs/vliteral.h @@ -39,7 +39,9 @@ class VLiteral : public VExpr { VLiteral(const TExprNode& node, bool should_init = true) : VExpr(node), _expr_name(_data_type->get_name()) { if (should_init) { - init(node); + Field field; + field = _data_type->get_field(node); + _column_ptr = _data_type->create_column_const(1, field); } } @@ -69,11 +71,9 @@ class VLiteral : public VExpr { uint64_t get_digest(uint64_t seed) const override; protected: + VLiteral(const DataTypePtr& type) : VExpr(type, 
false) {} ColumnPtr _column_ptr; std::string _expr_name; - -private: - void init(const TExprNode& node); }; } // namespace doris diff --git a/be/src/exprs/vslot_ref.h b/be/src/exprs/vslot_ref.h index 21b5735753b83d..3ac9f641c1922e 100644 --- a/be/src/exprs/vslot_ref.h +++ b/be/src/exprs/vslot_ref.h @@ -73,6 +73,10 @@ class VSlotRef MOCK_REMOVE(final) : public VExpr { double execute_cost() const override { return 0.0; } +protected: + VSlotRef(int slot_id, int column_id, int column_uniq_id) + : _slot_id(slot_id), _column_id(column_id), _column_uniq_id(column_uniq_id) {} + private: int _slot_id; int _column_id; diff --git a/be/src/format/reader/column_mapper.cpp b/be/src/format/reader/column_mapper.cpp new file mode 100644 index 00000000000000..7006365b05408b --- /dev/null +++ b/be/src/format/reader/column_mapper.cpp @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "format/reader/column_mapper.h" + +#include + +#include "common/status.h" +#include "expr/slot_ref.h" +#include "format/reader/file_reader.h" +#include "format/reader/table_reader.h" + +namespace doris::reader { + +Status TableColumnMapper::create_mapping(const std::vector& projected_columns, + std::vector block_schema, + const std::map& partition_values, + const std::vector& file_schema) { + // 真实实现会做 field id/name matching、类型转换、复杂列 child mapping、缺失列 + // default/partition/generated 表达式构造。 + _mappings.clear(); + block_schema.clear(); + for (const auto& table_column : projected_columns) { + ColumnMapping mapping; + mapping.table_column_id = table_column.id; + mapping.table_type = table_column.type; + if (const auto* file_field = _find_file_field(table_column, file_schema)) { + mapping.file_column_id = file_field->id; + mapping.file_type = file_field->type; + mapping.is_trivial = _is_same_type(mapping.table_type, mapping.file_type); + if (!mapping.is_trivial) { + // TODO: + return Status::NotSupported( + "column mapping with type conversion is not supported yet: table column " + "'{}' (id={}, type={}) vs file column (id={}, type={})", + table_column.name, mapping.table_column_id, mapping.table_type->get_name(), + mapping.file_column_id.value(), mapping.file_type->get_name()); + } else { + mapping.projection = VExprContext::create_shared(TableSlotRef::create_shared( + *mapping.file_column_id, block_schema.size(), -1, mapping.table_type)); + } + block_schema.push_back(SchemaField { + mapping.file_column_id.value(), table_column.name, mapping.table_type, {}}); + } else if (table_column.default_expr != nullptr) { + mapping.is_constant = true; + mapping.default_expr = table_column.default_expr; + } else if (table_column.is_partition_key && partition_values.count(table_column.name) > 0) { + mapping.default_expr = VExprContext::create_shared(TableLiteral::create_shared( + mapping.table_type, partition_values.at(table_column.name))); + } else { + if 
(table_column.is_partition_key) { + return Status::InvalidArgument( + "Table column '%s' (id=%d) does not have a matching partition value", + table_column.name); + } + if (!_options.allow_missing_columns) { + return Status::InvalidArgument( + "Table column '%s' (id=%d) does not have a matching file column", + table_column.name, table_column.id); + } + } + _mappings.push_back(std::move(mapping)); + } + return Status::OK(); +} + +Status TableColumnMapper::create_scan_request(const std::map& table_filters, + const std::vector& projected_columns, + FileScanRequest* file_request) { + // 真实实现会把 table projection/filter 转换成 file-local projection/filter。 + file_request->predicate_columns.clear(); + file_request->non_predicate_columns.clear(); + file_request->local_filters.clear(); + file_request->reader_expression_map.clear(); + for (const auto& table_column : projected_columns) { + const auto* mapping = _find_mapping(table_column.id); + if (mapping != nullptr && mapping->file_column_id.has_value() && + table_filters.count(table_column.id) == 0) { + file_request->non_predicate_columns.push_back(*mapping->file_column_id); + } + } + RETURN_IF_ERROR(localize_filters(table_filters, file_request)); + return Status::OK(); +} + +Status TableColumnMapper::localize_filters(const std::map& table_filters, + FileScanRequest* file_request) const { + // 真实实现会处理 trivial mapping、safe cast、reader expression fallback 和 + // finalize-only filter。stub 只复制能够直接定位到 file column 的谓词。 + for (const auto& it : table_filters) { + const auto* mapping = _find_mapping(it.first); + if (mapping == nullptr || !mapping->file_column_id.has_value()) { + continue; + } + if (!it.second.can_be_localized()) { + // TODO: Rewrite table filter to reader_expression_map + // file_request->reader_expression_map.emplace_back(mapping->table_column_id, it.second.conjunct); + } else { + FileLocalFilter local_filter; + local_filter.file_column_id = *mapping->file_column_id; + local_filter.conjunct = it.second.conjunct; + 
local_filter.predicates = it.second.predicates; + file_request->local_filters.push_back(std::move(local_filter)); + } + file_request->predicate_columns.push_back(*mapping->file_column_id); + } + return Status::OK(); +} + +const SchemaField* TableColumnMapper::_find_file_field( + const TableColumn& table_column, const std::vector& file_schema) const { + for (const auto& field : file_schema) { + if (_options.mode == TableColumnMappingMode::BY_FIELD_ID && field.id == table_column.id) { + return &field; + } + if (field.name == table_column.name) { + return &field; + } + } + return nullptr; +} + +} // namespace doris::reader diff --git a/be/src/format/reader/column_mapper.h b/be/src/format/reader/column_mapper.h new file mode 100644 index 00000000000000..4c6b510ff0e48a --- /dev/null +++ b/be/src/format/reader/column_mapper.h @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "common/status.h" +#include "core/data_type/data_type.h" +#include "exprs/vexpr_fwd.h" +#include "format/reader/expr/literal.h" +namespace doris::reader { + +struct TableColumn; +struct TableFilter; +struct SchemaField; +struct FileScanRequest; + +enum class TableColumnMappingMode { + BY_FIELD_ID, + BY_NAME, +}; + +// 单个 table column 到 file column 的映射结果。 +// 这是 table 层和 file 层的核心边界对象。 +struct ColumnMapping { + int32_t table_column_id = -1; + std::optional file_column_id; + DataTypePtr file_type; + DataTypePtr table_type; + + // 最终输出表达式。用于把 file-local value 转成 table/global value,例如 cast、 + // default、partition、generated column 或复杂列 remap。 + VExprContextSPtr projection; + + // 读时过滤 fallback 表达式。只在 table filter 不能安全转换成 file-local predicate + // 时使用,服务 reader_expression_map,不等价于 finalize_expr。 + VExprContextSPtr reader_filter_expr; + + std::vector child_mappings; + bool is_trivial = false; + bool is_constant = false; + VExprContextSPtr default_expr; +}; + +struct TableColumnMapperOptions { + TableColumnMappingMode mode = TableColumnMappingMode::BY_FIELD_ID; + bool allow_missing_columns = true; + bool enable_reader_expression_fallback = true; +}; + +// 通用 table schema 到 file schema 映射层。 +// Iceberg 会使用 BY_FIELD_ID;普通 by-name 场景可以复用该组件,但不应把它命名成 +// Iceberg-only 组件。 +class TableColumnMapper { +public: + explicit TableColumnMapper(TableColumnMapperOptions options = {}) + : _options(std::move(options)) {} + virtual ~TableColumnMapper() = default; + + // 建立 table schema 到 file schema 的列映射。 + // 输出的 ColumnMapping 描述 table column 如何从 file column、常量列或表达式得到; + // 后续 projection、filter localization 和 table block finalize 都应复用这份映射。 + virtual Status create_mapping(const std::vector& projected_columns, + std::vector block_schema, + const std::map& partition_values, + const std::vector& file_schema); + + // 把 table-level scan 请求转换成 file-local scan 请求。 + // table_request 使用 table/global 
schema;file_request 只包含 FileReader 能理解的 + // projected_file_columns、local_filters 和 reader_expression_map。 + virtual Status create_scan_request(const std::map& table_filters, + const std::vector& projected_columns, + FileScanRequest* file_request); + + // 将 table-level filter 定位到文件 schema。 + // trivial mapping 可以直接复制结构化谓词;类型变化时可以尝试安全 cast;无法安全 + // 下推的表达式应通过 reader_expression_map 或 table-level finalize/filter fallback 处理。 + virtual Status localize_filters(const std::map& table_filters, + FileScanRequest* file_request) const; + void clear() { _mappings.clear(); } + const std::vector& mappings() const { return _mappings; } + +private: + const SchemaField* _find_file_field(const TableColumn& table_column, + const std::vector& file_schema) const; + + const ColumnMapping* _find_mapping(ColumnId table_column_id) const { + for (const auto& mapping : _mappings) { + if (mapping.table_column_id == table_column_id) { + return &mapping; + } + } + return nullptr; + } + + bool _is_same_type(const DataTypePtr& table_type, const DataTypePtr& file_type) const { + return table_type == file_type; + } + + TableColumnMapperOptions _options; + std::vector _mappings; +}; + +} // namespace doris::reader diff --git a/be/src/format/reader/expr/literal.h b/be/src/format/reader/expr/literal.h new file mode 100644 index 00000000000000..9c4202994ee0ab --- /dev/null +++ b/be/src/format/reader/expr/literal.h @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "core/data_type/data_type.h" +#include "exprs/vliteral.h" + +namespace doris { + +class TableLiteral : public VLiteral { + ENABLE_FACTORY_CREATOR(TableLiteral); + +public: + TableLiteral(const DataTypePtr& type, const Field& field) : VLiteral(type) { + _data_type = type; + _column_ptr = _data_type->create_column_const(1, field); + } +}; + +} // namespace doris diff --git a/be/src/format/reader/expr/slot_ref.h b/be/src/format/reader/expr/slot_ref.h new file mode 100644 index 00000000000000..6b5d027602ee18 --- /dev/null +++ b/be/src/format/reader/expr/slot_ref.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "core/data_type/data_type.h" +#include "exprs/vslot_ref.h" + +namespace doris { + +class TableSlotRef : public VSlotRef { + ENABLE_FACTORY_CREATOR(TableSlotRef); + +public: + TableSlotRef(int slot_id, int column_id, int column_uniq_id, const DataTypePtr& type) + : VSlotRef(slot_id, column_id, column_uniq_id) { + _data_type = type; + } + + Status prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) override { + return Status::OK(); + } +}; + +} // namespace doris diff --git a/be/src/format/reader/file_reader.h b/be/src/format/reader/file_reader.h index e9fb0f3c963a93..edebdcaff423e0 100644 --- a/be/src/format/reader/file_reader.h +++ b/be/src/format/reader/file_reader.h @@ -47,7 +47,7 @@ using ColumnId = int32_t; // schema 语义。Iceberg field id、name mapping、default/generated/partition 列都不在 // FileReader 内部解释。 struct SchemaField { - ColumnId id = -1; + int32_t id = -1; std::string name; DataTypePtr type; std::vector children; @@ -65,9 +65,16 @@ struct FileLocalFilter { // 结构化列谓词。适合文件层 pruning,例如 min/max、page index、dictionary、 // bloom filter 等只理解单列谓词的优化。 + // TODO: conjunct 支持表达所有 filter 语义之后删除。 std::vector> predicates; }; +enum class FileFormat { + PARQUET, + ORC, + CSV, +}; + // 通用文件层 scan 请求。 // 该结构描述所有文件格式都可以共享的 file-local 读取输入。这里不出现 table/global // schema。所有 schema change、filter localization、default/generated/partition @@ -75,9 +82,23 @@ struct FileLocalFilter { struct FileScanRequest { virtual ~FileScanRequest() = default; - std::vector projected_file_columns; + std::vector predicate_columns; + std::vector non_predicate_columns; std::vector local_filters; + // fallback path if filters cannot be localized to file-local predicates. The expression can reference projected_file_columns and partition columns. std::vector> reader_expression_map; + // partition key -> value + std::map partition_values; + + // projected_columns' id is file-local column id, and they are all from file schema. 
+ // For example, + // file schema: [0: id (int), 1: name (string), 2: age (int)] + // predicate: age > 30 + // table-level projection: [name, id] + // predicate_columns: [2] + // non_predicate_columns: [1, 0] + // projected_columns are columns in blocks returned to table reader: [1, 0] means only name and id are projected, + std::vector projected_columns; }; // 文件物理读取层通用接口。 @@ -113,7 +134,7 @@ class FileReader { virtual Status init(const FileScanRequest& request) { // 真实实现会根据 projected columns、local filters 和 reader expressions // 初始化文件格式自己的物理读取计划。 - _request.projected_file_columns = request.projected_file_columns; + // _request.projected_file_columns = request.projected_file_columns; _request.local_filters = request.local_filters; _request.reader_expression_map = request.reader_expression_map; return Status::OK(); diff --git a/be/src/format/reader/table_reader.h b/be/src/format/reader/table_reader.h index 99dcc507e5d503..4d8fe0620c8f83 100644 --- a/be/src/format/reader/table_reader.h +++ b/be/src/format/reader/table_reader.h @@ -30,7 +30,10 @@ #include "common/status.h" #include "core/block/block.h" #include "core/data_type/data_type.h" +#include "exprs/vexpr_context.h" #include "exprs/vexpr_fwd.h" +#include "format/reader/column_mapper.h" +#include "format/reader/expr/literal.h" #include "format/reader/file_reader.h" namespace doris { @@ -47,45 +50,22 @@ struct TableColumn { std::string name; DataTypePtr type; std::vector children; + VExprContextSPtr default_expr; + bool is_partition_key = false; }; // table-level filter。 // TableColumnMapper 负责把它转换成 FileLocalFilter 或 reader_expression_map。 struct TableFilter { - ColumnId table_column_id = -1; - // 表达式过滤,适合表达 cast、复杂表达式、复杂列提取等语义。 VExprContextSPtr conjunct; // 结构化列谓词,适合下推到文件层做 row group stats、page index、dictionary、 // bloom filter 等优化。 + // TODO: conjunct 支持表达所有 filter 语义之后删除。 std::vector> predicates; -}; - -// 单个 table column 到 file column 的映射结果。 -// 这是 table 层和 file 层的核心边界对象。 -struct ColumnMapping { - ColumnId 
table_column_id = -1; - std::optional file_column_id; - DataTypePtr file_type; - DataTypePtr table_type; - - // 最终输出表达式。用于把 file-local value 转成 table/global value,例如 cast、 - // default、partition、generated column 或复杂列 remap。 - VExprContextSPtr finalize_expr; - - // 读时过滤 fallback 表达式。只在 table filter 不能安全转换成 file-local predicate - // 时使用,服务 reader_expression_map,不等价于 finalize_expr。 - VExprContextSPtr reader_filter_expr; - - std::vector child_mappings; - bool is_trivial = false; - bool is_constant = false; -}; -enum class TableColumnMappingMode { - BY_FIELD_ID, - BY_NAME, + bool can_be_localized() const { return true; } }; enum class TableFilterConversion { @@ -95,132 +75,6 @@ enum class TableFilterConversion { FINALIZE_ONLY, }; -struct TableColumnMapperOptions { - TableColumnMappingMode mode = TableColumnMappingMode::BY_FIELD_ID; - bool allow_missing_columns = true; - bool enable_reader_expression_fallback = true; -}; - -// table-level scan 请求。 -// 它仍然使用 table/global schema 语义,不能直接传给 FileReader。 -struct TableScanRequest { - std::vector projected_table_columns; - std::vector table_filters; -}; - -// 通用 table schema 到 file schema 映射层。 -// Iceberg 会使用 BY_FIELD_ID;普通 by-name 场景可以复用该组件,但不应把它命名成 -// Iceberg-only 组件。 -class TableColumnMapper { -public: - explicit TableColumnMapper(TableColumnMapperOptions options = {}) - : _options(std::move(options)) {} - virtual ~TableColumnMapper() = default; - - // 建立 table schema 到 file schema 的列映射。 - // 输出的 ColumnMapping 描述 table column 如何从 file column、常量列或表达式得到; - // 后续 projection、filter localization 和 table block finalize 都应复用这份映射。 - virtual Status create_mapping(const std::vector& table_schema, - const std::vector& file_schema, - std::vector* mappings) { - // 真实实现会做 field id/name matching、类型转换、复杂列 child mapping、缺失列 - // default/partition/generated 表达式构造。 - mappings->clear(); - for (const auto& table_column : table_schema) { - ColumnMapping mapping; - mapping.table_column_id = table_column.id; - mapping.table_type = 
table_column.type; - if (const auto* file_field = find_file_field(table_column, file_schema)) { - mapping.file_column_id = file_field->id; - mapping.file_type = file_field->type; - mapping.is_trivial = is_same_type(mapping.table_type, mapping.file_type); - } else { - mapping.is_constant = true; - } - mappings->push_back(std::move(mapping)); - } - _mappings = *mappings; - return Status::OK(); - } - - // 把 table-level scan 请求转换成 file-local scan 请求。 - // table_request 使用 table/global schema;file_request 只包含 FileReader 能理解的 - // projected_file_columns、local_filters 和 reader_expression_map。 - virtual Status create_scan_request(const TableScanRequest& table_request, - const std::vector& mappings, - FileScanRequest* file_request) { - // 真实实现会把 table projection/filter 转换成 file-local projection/filter。 - file_request->projected_file_columns.clear(); - file_request->local_filters.clear(); - file_request->reader_expression_map.clear(); - _mappings = mappings; - for (const auto& table_column : table_request.projected_table_columns) { - const auto* mapping = find_mapping(table_column.id); - if (mapping != nullptr && mapping->file_column_id.has_value()) { - file_request->projected_file_columns.push_back(*mapping->file_column_id); - } - } - RETURN_IF_ERROR(localize_filters(table_request.table_filters, file_request)); - return Status::OK(); - } - - // 将 table-level filter 定位到文件 schema。 - // trivial mapping 可以直接复制结构化谓词;类型变化时可以尝试安全 cast;无法安全 - // 下推的表达式应通过 reader_expression_map 或 table-level finalize/filter fallback 处理。 - virtual Status localize_filters(const std::vector& table_filters, - FileScanRequest* file_request) const { - // 真实实现会处理 trivial mapping、safe cast、reader expression fallback 和 - // finalize-only filter。stub 只复制能够直接定位到 file column 的谓词。 - for (const auto& filter : table_filters) { - const auto* mapping = find_mapping(filter.table_column_id); - if (mapping == nullptr || !mapping->file_column_id.has_value()) { - continue; - } - FileLocalFilter local_filter; - 
local_filter.file_column_id = *mapping->file_column_id; - local_filter.conjunct = filter.conjunct; - local_filter.predicates = filter.predicates; - file_request->local_filters.push_back(std::move(local_filter)); - } - return Status::OK(); - } - - const std::vector& mappings() const { return _mappings; } - -private: - const SchemaField* find_file_field(const TableColumn& table_column, - const std::vector& file_schema) const { - for (const auto& field : file_schema) { - if (_options.mode == TableColumnMappingMode::BY_FIELD_ID && - field.id == table_column.id) { - return &field; - } - if (_options.mode == TableColumnMappingMode::BY_NAME && - field.name == table_column.name) { - return &field; - } - } - return nullptr; - } - - const ColumnMapping* find_mapping(ColumnId table_column_id) const { - for (const auto& mapping : _mappings) { - if (mapping.table_column_id == table_column_id) { - return &mapping; - } - } - return nullptr; - } - - bool is_same_type(const DataTypePtr& table_type, const DataTypePtr& file_type) const { - return table_type == file_type; - } - -private: - TableColumnMapperOptions _options; - std::vector _mappings; -}; - struct BaseDataFile { virtual ~BaseDataFile() = default; @@ -236,13 +90,21 @@ struct ScanTask { std::unique_ptr data_file; }; +struct ReadProfile {}; + struct TableReadOptions { - size_t batch_size = 4096; - // TODO: deleted? SCHEMA should be derived from table metadata and inited by TableReader it self? it shouldn't be part of read options. - std::vector schema; - TableScanRequest scan_request; + const std::vector projected_columns; + // All conjuncts from scan operator + const VExprContext conjuncts; + const FileFormat format; // Each task denotes a descriptor of a single file to read, along with file-level metadata such as stats and delete files. 
std::vector> scan_tasks; + + std::unique_ptr profile; +}; + +struct SplitReadOptions { + std::map partition_values; }; // table-level reader 基类。 @@ -255,10 +117,20 @@ class TableReader { // 初始化 table reader 的通用运行参数。 // 子类可以在自己的 init(options) 中调用该方法;这里不接收具体表格式 schema/task。 virtual Status init(TableReadOptions options) { - _schema = std::move(_options.schema); - _table_scan_request = std::move(_options.scan_request); _scan_tasks = std::move(_options.scan_tasks); _next_task_idx = 0; + _profile = std::move(options.profile); + TableColumnMapperOptions mapper_options; + mapper_options.mode = TableColumnMappingMode::BY_FIELD_ID; + _data_reader.column_mapper = TableColumnMapper(mapper_options); + // TODO: + // _table_filters = build_table_filters_from_conjuncts(options.conjuncts); + return Status::OK(); + } + + // 读取当前 split/partition 之前初始化。 + virtual Status prepare_split(SplitReadOptions options) { + _partition_values = std::move(options.partition_values); return Status::OK(); } @@ -278,20 +150,30 @@ class TableReader { // 基类负责 current reader 的打开、EOF 后切换和关闭;子类只实现 protected hook。 // table_block 的列必须已经是 table/global schema 语义。 Status get_block(Block* block, bool* eos) { - if (eos != nullptr) { - *eos = false; - } while (block->empty() && !*eos) { - if (!_data_reader) { + if (!_data_reader.reader) { RETURN_IF_ERROR(create_next_reader(eos)); - if (!_data_reader) { + if (!_data_reader.reader) { DCHECK(*eos); return Status::OK(); } } bool current_eof = false; - RETURN_IF_ERROR(_data_reader->get_block(block, ¤t_eof)); + Block current_block; + for (const auto& field : _data_reader.block_schema) { + // TODO: reuse column's memory + current_block.insert({field.type->create_column(), field.type, field.name}); + } + RETURN_IF_ERROR(_data_reader.reader->get_block(¤t_block, ¤t_eof)); + + size_t idx = 0; + for (const auto& mapping : _data_reader.column_mapper.mappings()) { + int res_id; + RETURN_IF_ERROR(mapping.projection->execute(¤t_block, &res_id)); + block->replace_by_position(idx, 
current_block.get_columns()[res_id]); + idx++; + } RETURN_IF_ERROR(finalize_chunk(block)); RETURN_IF_ERROR(materialize_virtual_columns(block)); if (current_eof) { @@ -304,7 +186,7 @@ class TableReader { // 关闭 table reader 及当前正在读取的底层 reader。 // 子类如果持有额外表格式资源,应 override 后先调用 TableReader::close()。 virtual Status close() { - if (_data_reader) { + if (_data_reader.reader) { RETURN_IF_ERROR(close_current_reader()); } return Status::OK(); @@ -315,10 +197,10 @@ class TableReader { // 该方法先关闭当前 reader,再打开下一个具体 reader;子类不应重复实现这个循环。 Status create_next_reader(bool* eos) { // 多文件切换的公共流程留在基类:关闭当前 reader,然后打开下一个 reader。 - DCHECK(_data_reader == nullptr); + DCHECK(_data_reader.reader == nullptr); // TODO: 创建_data_reader // _data_reader = std::make_unique(...); - if (!_data_reader) { + if (!_data_reader.reader) { if (eos != nullptr) { *eos = true; } @@ -332,24 +214,25 @@ class TableReader { // 子类在这里基于当前 split/task 初始化底层 FileReader。 virtual Status open_reader() { std::vector file_schema; - RETURN_IF_ERROR(_data_reader->get_schema(&file_schema)); - TableColumnMapperOptions mapper_options; - mapper_options.mode = TableColumnMappingMode::BY_FIELD_ID; - _column_mapper = TableColumnMapper(mapper_options); - RETURN_IF_ERROR(_column_mapper.create_mapping(_schema, file_schema, &_mappings)); + RETURN_IF_ERROR(_data_reader.reader->get_schema(&file_schema)); + RETURN_IF_ERROR(_data_reader.column_mapper.create_mapping(_options.projected_columns, + _data_reader.block_schema, + _partition_values, file_schema)); FileScanRequest file_request; - RETURN_IF_ERROR( - _column_mapper.create_scan_request(_table_scan_request, _mappings, &file_request)); - RETURN_IF_ERROR(_data_reader->init(file_request)); + RETURN_IF_ERROR(_data_reader.column_mapper.create_scan_request( + _table_filters, _options.projected_columns, &file_request)); + RETURN_IF_ERROR(_data_reader.reader->init(file_request)); return Status::OK(); } // 关闭当前具体 reader。 // 该 hook 会被 create_next_reader 和 close 调用;实现应保持幂等。 virtual Status 
close_current_reader() { - RETURN_IF_ERROR(_data_reader->close()); - _data_reader.reset(); + RETURN_IF_ERROR(_data_reader.reader->close()); + _data_reader.reader.reset(); + _data_reader.column_mapper.clear(); + _data_reader.block_schema.clear(); return Status::OK(); } @@ -365,14 +248,20 @@ class TableReader { return Status::OK(); } + struct DataReader { + std::unique_ptr reader; + TableColumnMapper column_mapper; + // Schema of blocks from file reader. + std::vector block_schema; + }; + DataReader _data_reader; TableReadOptions _options; - std::unique_ptr _data_reader; std::vector> _scan_tasks; - TableScanRequest _table_scan_request; - std::vector _schema; - std::vector _mappings; - TableColumnMapper _column_mapper; + // partition key -> value + std::map _partition_values; size_t _next_task_idx = 0; + std::map _table_filters; + std::unique_ptr _profile; }; } // namespace doris::reader diff --git a/be/src/format/table/iceberg_reader_mixin.h b/be/src/format/table/iceberg_reader_mixin.h index 42c80c9b7d4ddc..c9c84639b8faf0 100644 --- a/be/src/format/table/iceberg_reader_mixin.h +++ b/be/src/format/table/iceberg_reader_mixin.h @@ -341,9 +341,6 @@ class IcebergReaderMixin : public BaseReader, public TableSchemaChangeHelper { // id -> block column name std::unordered_map _id_to_block_column_name; - // File column names used during init - std::vector _file_col_names; - std::function()> _create_topn_row_id_column_iterator; diff --git a/be/src/format/table/iceberg_reader_v2.h b/be/src/format/table/iceberg_reader_v2.h index 3ddadc9f9deb7e..fc957eda12448e 100644 --- a/be/src/format/table/iceberg_reader_v2.h +++ b/be/src/format/table/iceberg_reader_v2.h @@ -60,28 +60,8 @@ struct IcebergScanTask final : public reader::ScanTask { // FileReader 完成 data file 物理读取,不继承具体文件格式 reader。 class IcebergTableReader : public reader::TableReader { public: - IcebergTableReader() = default; - ~IcebergTableReader() override = default; - // 初始化一次 Iceberg table scan。 - // options 必须一次性提供 
schema、projection/filter 和 scan tasks,避免暴露 - // bind/set_tasks 等半初始化阶段。 - Status init(reader::TableReadOptions options) override { - // 一次性保存 Iceberg table scan 所需输入。TableReader 负责 reader 切换流程; - // IcebergTableReader 只提供后续要打开的 task 以及 table/file schema 映射语义。 - return reader::TableReader::init(std::move(options)); - } - - // 关闭当前 Iceberg scan。 - // 先让 TableReader 关闭当前 task reader,再释放 IcebergTableReader 持有的底层 - // FileReader。 - Status close() override { - RETURN_IF_ERROR(reader::TableReader::close()); - _data_reader.reset(); - return Status::OK(); - } - protected: // 将 file-local block 转换为 table/global schema block。 // 这里执行 ColumnMapping 中的 finalize_expr、缺失列填充、partition/generated 列 From 783e74028cb516ef064abd11ede59a9c6a7a187a Mon Sep 17 00:00:00 2001 From: Gabriel Date: Wed, 20 May 2026 09:37:28 +0800 Subject: [PATCH 05/38] Add unit tests for expr (#63415) --- .gitignore | 1 + .../format/reader/expr/table_expr_test.cpp | 119 ++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 be/test/format/reader/expr/table_expr_test.cpp diff --git a/.gitignore b/.gitignore index edb37019d8fd2d..a93b4957c14bf7 100644 --- a/.gitignore +++ b/.gitignore @@ -151,3 +151,4 @@ compile_commands.json .github .worktrees/ +.worktree_initialized diff --git a/be/test/format/reader/expr/table_expr_test.cpp b/be/test/format/reader/expr/table_expr_test.cpp new file mode 100644 index 00000000000000..df41c1482e309c --- /dev/null +++ b/be/test/format/reader/expr/table_expr_test.cpp @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/reader/expr/literal.h" +#include "format/reader/expr/slot_ref.h" + +#include + +#include +#include + +#include "core/block/block.h" +#include "core/column/column_const.h" +#include "core/data_type/data_type_number.h" +#include "core/data_type/primitive_type.h" +#include "core/field.h" +#include "runtime/descriptors.h" +#include "testutil/column_helper.h" + +namespace doris { + +TEST(TableLiteralTest, CreatesConstColumnWithGivenTypeAndField) { + auto type = std::make_shared(); + auto literal = TableLiteral::create_shared(type, Field::create_field(123)); + + ASSERT_EQ(literal->data_type(), type); + ASSERT_TRUE(literal->is_literal()); + + const auto& column = literal->get_column_ptr(); + ASSERT_EQ(column->size(), 1); + ASSERT_TRUE(is_column_const(*column)); + EXPECT_EQ(column->get_int(0), 123); +} + +TEST(TableLiteralTest, ExecutesAsConstColumn) { + auto type = std::make_shared(); + auto literal = TableLiteral::create_shared(type, Field::create_field(456)); + + ColumnPtr result_column; + ASSERT_TRUE(literal->execute_column(nullptr, nullptr, nullptr, 3, result_column).ok()); + + ASSERT_EQ(result_column->size(), 3); + ASSERT_TRUE(is_column_const(*result_column)); + EXPECT_EQ(result_column->get_int(0), 456); + EXPECT_EQ(result_column->get_int(2), 456); +} + +TEST(TableLiteralTest, ExecuteAppendsConstColumnToBlock) { + auto type = std::make_shared(); + auto literal = TableLiteral::create_shared(type, Field::create_field(789)); + Block block; + block.insert(ColumnHelper::create_column_with_name({1, 2, 3})); + + int result_column_id = 
-1; + ASSERT_TRUE(literal->execute(nullptr, &block, &result_column_id).ok()); + + ASSERT_EQ(result_column_id, 1); + ASSERT_EQ(block.columns(), 2); + const auto& result_column = block.get_by_position(result_column_id).column; + ASSERT_EQ(result_column->size(), 3); + ASSERT_TRUE(is_column_const(*result_column)); + EXPECT_EQ(result_column->get_int(0), 789); + EXPECT_EQ(result_column->get_int(2), 789); + EXPECT_EQ(block.get_by_position(result_column_id).type, type); +} + +TEST(TableSlotRefTest, KeepsSlotColumnIdsAndType) { + auto type = std::make_shared(); + auto slot_ref = TableSlotRef::create_shared(10, 20, 30, type); + + EXPECT_EQ(slot_ref->slot_id(), 10); + EXPECT_EQ(slot_ref->column_id(), 20); + EXPECT_EQ(slot_ref->data_type(), type); + EXPECT_FALSE(slot_ref->is_constant()); + + std::set column_ids; + slot_ref->collect_slot_column_ids(column_ids); + ASSERT_EQ(column_ids.size(), 1); + EXPECT_EQ(*column_ids.begin(), 20); +} + +TEST(TableSlotRefTest, PrepareDoesNotRequireRowDescriptor) { + auto type = std::make_shared(); + auto slot_ref = TableSlotRef::create_shared(10, 20, 30, type); + + EXPECT_TRUE(slot_ref->prepare(nullptr, RowDescriptor(), nullptr).ok()); +} + +TEST(TableSlotRefTest, ExecuteReturnsReferencedColumnId) { + auto type = std::make_shared(); + auto slot_ref = TableSlotRef::create_shared(10, 1, 30, type); + Block block; + block.insert(ColumnHelper::create_column_with_name({1, 2, 3})); + block.insert(ColumnHelper::create_column_with_name({4, 5, 6})); + + int result_column_id = -1; + ASSERT_TRUE(slot_ref->execute(nullptr, &block, &result_column_id).ok()); + + EXPECT_EQ(result_column_id, 1); + EXPECT_EQ(block.columns(), 2); + EXPECT_EQ(block.get_by_position(result_column_id).column->get_int(0), 4); + EXPECT_EQ(block.get_by_position(result_column_id).column->get_int(2), 6); +} + +} // namespace doris From dae05ba674cac4c737e25521ef5b46551d855539 Mon Sep 17 00:00:00 2001 From: Gabriel Date: Wed, 20 May 2026 16:25:47 +0800 Subject: [PATCH 06/38] Framework to 
do delete filtering (#63442) --- be/src/format/csv/csv_reader.cpp | 2 +- be/src/format/json/new_json_reader.cpp | 2 +- be/src/format/native/native_reader.cpp | 2 +- be/src/format/orc/vorc_reader.cpp | 2 +- be/src/format/parquet/vparquet_reader.cpp | 2 +- .../format/reader/expr/delete_predicate.cpp | 77 +++++++++++++ be/src/format/reader/expr/delete_predicate.h | 57 ++++++++++ be/src/format/reader/file_reader.h | 2 + be/src/format/reader/table/paimon_reader.cpp | 42 +++++++ be/src/format/reader/table/paimon_reader.h | 36 ++++++ be/src/format/reader/table_reader.cpp | 103 ++++++++++++++++++ be/src/format/reader/table_reader.h | 39 ++++++- .../format/table/deletion_vector_reader.cpp | 19 +--- be/src/format/table/deletion_vector_reader.h | 29 ++++- be/src/io/file_factory.cpp | 13 +-- be/src/io/file_factory.h | 3 +- gensrc/thrift/Exprs.thrift | 2 + gensrc/thrift/Opcodes.thrift | 2 + 18 files changed, 401 insertions(+), 33 deletions(-) create mode 100644 be/src/format/reader/expr/delete_predicate.cpp create mode 100644 be/src/format/reader/expr/delete_predicate.h create mode 100644 be/src/format/reader/table/paimon_reader.cpp create mode 100644 be/src/format/reader/table/paimon_reader.h create mode 100644 be/src/format/reader/table_reader.cpp diff --git a/be/src/format/csv/csv_reader.cpp b/be/src/format/csv/csv_reader.cpp index 539132c7c9f003..4231b8eb20c8e5 100644 --- a/be/src/format/csv/csv_reader.cpp +++ b/be/src/format/csv/csv_reader.cpp @@ -638,7 +638,7 @@ Status CsvReader::_create_file_reader(bool need_schema) { } else { _file_description.mtime = _range.__isset.modification_time ? 
_range.modification_time : 0; io::FileReaderOptions reader_options = - FileFactory::get_reader_options(_state, _file_description); + FileFactory::get_reader_options(_state->query_options(), _file_description); io::FileReaderSPtr file_reader; if (_io_ctx_holder) { file_reader = DORIS_TRY(io::DelegateReader::create_file_reader( diff --git a/be/src/format/json/new_json_reader.cpp b/be/src/format/json/new_json_reader.cpp index da141437fcf200..89992105cb87fd 100644 --- a/be/src/format/json/new_json_reader.cpp +++ b/be/src/format/json/new_json_reader.cpp @@ -478,7 +478,7 @@ Status NewJsonReader::_open_file_reader(bool need_schema) { } else { _file_description.mtime = _range.__isset.modification_time ? _range.modification_time : 0; io::FileReaderOptions reader_options = - FileFactory::get_reader_options(_state, _file_description); + FileFactory::get_reader_options(_state->query_options(), _file_description); io::FileReaderSPtr file_reader; if (_io_ctx_holder) { file_reader = DORIS_TRY(io::DelegateReader::create_file_reader( diff --git a/be/src/format/native/native_reader.cpp b/be/src/format/native/native_reader.cpp index 565bab20231125..32fb7d660ad97b 100644 --- a/be/src/format/native/native_reader.cpp +++ b/be/src/format/native/native_reader.cpp @@ -125,7 +125,7 @@ Status NativeReader::init_reader() { } io::FileReaderOptions reader_options = - FileFactory::get_reader_options(_state, file_description); + FileFactory::get_reader_options(_state->query_options(), file_description); auto reader_res = io::DelegateReader::create_file_reader( _profile, system_properties, file_description, reader_options, io::DelegateReader::AccessMode::RANDOM, _io_ctx); diff --git a/be/src/format/orc/vorc_reader.cpp b/be/src/format/orc/vorc_reader.cpp index bcb1a8d70f4b3f..25db29c49625af 100644 --- a/be/src/format/orc/vorc_reader.cpp +++ b/be/src/format/orc/vorc_reader.cpp @@ -348,7 +348,7 @@ Status OrcReader::_create_file_reader() { _file_description.mtime = 
_scan_range.__isset.modification_time ? _scan_range.modification_time : 0; io::FileReaderOptions reader_options = - FileFactory::get_reader_options(_state, _file_description); + FileFactory::get_reader_options(_state->query_options(), _file_description); io::FileReaderSPtr inner_reader; if (_io_ctx_holder != nullptr) { inner_reader = DORIS_TRY(io::DelegateReader::create_file_reader( diff --git a/be/src/format/parquet/vparquet_reader.cpp b/be/src/format/parquet/vparquet_reader.cpp index a2f2356085b171..35cb3b1944a22b 100644 --- a/be/src/format/parquet/vparquet_reader.cpp +++ b/be/src/format/parquet/vparquet_reader.cpp @@ -311,7 +311,7 @@ Status ParquetReader::_open_file() { _file_description.mtime = _scan_range.__isset.modification_time ? _scan_range.modification_time : 0; io::FileReaderOptions reader_options = - FileFactory::get_reader_options(_state, _file_description); + FileFactory::get_reader_options(_state->query_options(), _file_description); _file_reader = DORIS_TRY(io::DelegateReader::create_file_reader( _profile, _system_properties, _file_description, reader_options, io::DelegateReader::AccessMode::RANDOM, _io_ctx)); diff --git a/be/src/format/reader/expr/delete_predicate.cpp b/be/src/format/reader/expr/delete_predicate.cpp new file mode 100644 index 00000000000000..8a4ac54102f515 --- /dev/null +++ b/be/src/format/reader/expr/delete_predicate.cpp @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/reader/expr/delete_predicate.h" + +#include +#include +#include + +#include +#include +#include + +#include "common/status.h" +#include "core/block/block.h" +#include "core/block/column_numbers.h" +#include "core/block/column_with_type_and_name.h" +#include "core/block/columns_with_type_and_name.h" + +namespace doris { + +DeletePredicate::DeletePredicate(const std::vector& deleted_rows) + : VExpr(), _deleted_rows(deleted_rows) { + _node_type = TExprNodeType::PREDICATE; + _opcode = TExprOpcode::DELETE; + _data_type = std::make_shared(); +} + +Status DeletePredicate::prepare(RuntimeState* state, const RowDescriptor& desc, + VExprContext* context) { + RETURN_IF_ERROR_OR_PREPARED(VExpr::prepare(state, desc, context)); + _expr_name = "DeletePredicate"; + _prepare_finished = true; + return Status::OK(); +} + +Status DeletePredicate::open(RuntimeState* state, VExprContext* context, + FunctionContext::FunctionStateScope scope) { + DCHECK(_prepare_finished); + RETURN_IF_ERROR_OR_PREPARED(VExpr::open(state, context, scope)); + _open_finished = true; + return Status::OK(); +} + +void DeletePredicate::close(VExprContext* context, FunctionContext::FunctionStateScope scope) { + VExpr::close(context, scope); +} + +Status DeletePredicate::execute_column_impl(VExprContext* context, const Block* block, + const Selector* selector, size_t count, + ColumnPtr& result_column) const { + DCHECK(_open_finished || block == nullptr); + + static_cast(_deleted_rows.size()); + // TODO: implement delete predicate logic here, currently we just return a column 
with all 0 (false) + return Status::OK(); +} + +std::string DeletePredicate::debug_string() const { + return _expr_name; +} + +} // namespace doris diff --git a/be/src/format/reader/expr/delete_predicate.h b/be/src/format/reader/expr/delete_predicate.h new file mode 100644 index 00000000000000..feb8093ea5c981 --- /dev/null +++ b/be/src/format/reader/expr/delete_predicate.h @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "common/object_pool.h" +#include "common/status.h" +#include "exprs/function_context.h" +#include "exprs/vexpr.h" + +namespace doris { +class RowDescriptor; +class RuntimeState; +class TExprNode; +class Block; +class VExprContext; +} // namespace doris + +namespace doris { + +class DeletePredicate final : public VExpr { + ENABLE_FACTORY_CREATOR(DeletePredicate); + +public: + DeletePredicate(const std::vector& deleted_rows); + ~DeletePredicate() override = default; + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override; + Status prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) override; + Status open(RuntimeState* state, VExprContext* context, + FunctionContext::FunctionStateScope scope) override; + void close(VExprContext* context, FunctionContext::FunctionStateScope scope) override; + std::string debug_string() const override; + uint64_t get_digest(uint64_t seed) const override { return 0; } + const std::string& expr_name() const override { return _expr_name; } + +private: + std::string _expr_name; + const std::vector& _deleted_rows; +}; +} // namespace doris \ No newline at end of file diff --git a/be/src/format/reader/file_reader.h b/be/src/format/reader/file_reader.h index edebdcaff423e0..6dfbb4a8420cb8 100644 --- a/be/src/format/reader/file_reader.h +++ b/be/src/format/reader/file_reader.h @@ -62,6 +62,8 @@ struct FileLocalFilter { // 表达式过滤。适合 cast、复杂表达式或 reader_expression_map 生成的临时列过滤。 // 它通常不能直接驱动 row group stats、page index、dictionary、bloom filter。 VExprContextSPtr conjunct; + // DeletePredicate + VExprContextSPtr delete_conjunct; // 结构化列谓词。适合文件层 pruning,例如 min/max、page index、dictionary、 // bloom filter 等只理解单列谓词的优化。 diff --git a/be/src/format/reader/table/paimon_reader.cpp b/be/src/format/reader/table/paimon_reader.cpp new file mode 100644 index 00000000000000..713d1a97e68983 --- /dev/null 
+++ b/be/src/format/reader/table/paimon_reader.cpp @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/reader/table/paimon_reader.h" + +#include "format/table/deletion_vector_reader.h" + +namespace doris::paimon { + +bool PaimonReader::_parse_delete_file(const TTableFormatFileDesc& t_desc, DeleteFileDesc& desc) { + const auto& table_desc = t_desc.paimon_params; + if (!table_desc.__isset.deletion_file) { + return false; + } + const auto& deletion_file = table_desc.deletion_file; + + desc.key.resize(deletion_file.path.size() + sizeof(deletion_file.offset)); + memcpy(desc.key.data(), deletion_file.path.data(), deletion_file.path.size()); + memcpy(desc.key.data() + deletion_file.path.size(), &deletion_file.offset, + sizeof(deletion_file.offset)); + desc.path = deletion_file.path; + desc.start_offset = deletion_file.offset; + desc.size = deletion_file.length + 4; + desc.file_size = -1; + return true; +} + +} // namespace doris::paimon diff --git a/be/src/format/reader/table/paimon_reader.h b/be/src/format/reader/table/paimon_reader.h new file mode 100644 index 00000000000000..d0f33c7a90c0b6 --- /dev/null +++ b/be/src/format/reader/table/paimon_reader.h @@ -0,0 +1,36 @@ +// Licensed to the 
Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "format/reader/table_reader.h" + +namespace doris { +struct DeleteFileDesc; +} +namespace doris::paimon { + +class PaimonReader final : public reader::TableReader { +public: + ENABLE_FACTORY_CREATOR(PaimonReader); + ~PaimonReader() final = default; + +protected: + bool _parse_delete_file(const TTableFormatFileDesc& t_desc, DeleteFileDesc& desc) override; +}; + +} // namespace doris::paimon diff --git a/be/src/format/reader/table_reader.cpp b/be/src/format/reader/table_reader.cpp new file mode 100644 index 00000000000000..b89641c0bd2ee1 --- /dev/null +++ b/be/src/format/reader/table_reader.cpp @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/reader/table_reader.h" + +#include + +#include "common/status.h" +#include "format/reader/column_mapper.h" +#include "format/table/deletion_vector_reader.h" + +namespace doris::reader { + +Status TableReader::prepare_split(const SplitReadOptions& options) { + _partition_values = std::move(options.partition_values); + return _parse_delete_predicates(options); +} + +Status TableReader::_parse_delete_predicates(const SplitReadOptions& options) { + DeleteFileDesc desc {.fs_name = options.current_range.fs_name}; + if (_parse_delete_file(options.current_range.table_format_params, desc)) { + Status create_status = Status::OK(); + + _delete_rows = options.cache->get(desc.key, [&]() -> DeleteRows* { + auto* delete_rows = new DeleteRows; + + DeletionVectorReader dv_reader(_runtime_state, _scanner_profile, *_scan_params, desc, + _io_ctx); + create_status = dv_reader.open(); + if (!create_status.ok()) [[unlikely]] { + return nullptr; + } + + size_t bytes_read = desc.size; + std::vector buffer(bytes_read); + create_status = dv_reader.read_at(desc.start_offset, {buffer.data(), bytes_read}); + if (!create_status.ok()) [[unlikely]] { + return nullptr; + } + + const char* buf = buffer.data(); + uint32_t actual_length; + std::memcpy(reinterpret_cast(&actual_length), buf, 4); + std::reverse(reinterpret_cast(&actual_length), + reinterpret_cast(&actual_length) + 4); + buf += 4; + if (actual_length != bytes_read - 4) [[unlikely]] { + create_status = Status::RuntimeError( + "DeletionVector deserialize error: length not match, " + "actual length: {}, 
expect length: {}", + actual_length, bytes_read - 4); + return nullptr; + } + uint32_t magic_number; + std::memcpy(reinterpret_cast(&magic_number), buf, 4); + std::reverse(reinterpret_cast(&magic_number), + reinterpret_cast(&magic_number) + 4); + buf += 4; + const static uint32_t MAGIC_NUMBER = 1581511376; + if (magic_number != MAGIC_NUMBER) [[unlikely]] { + create_status = Status::RuntimeError( + "DeletionVector deserialize error: invalid magic number {}", magic_number); + return nullptr; + } + + roaring::Roaring roaring_bitmap; + SCOPED_TIMER(_profile->parse_delete_file_time); + try { + roaring_bitmap = roaring::Roaring::readSafe(buf, bytes_read - 4); + } catch (const std::runtime_error& e) { + create_status = Status::RuntimeError( + "DeletionVector deserialize error: failed to deserialize roaring bitmap, " + "{}", + e.what()); + return nullptr; + } + delete_rows->reserve(roaring_bitmap.cardinality()); + for (auto it = roaring_bitmap.begin(); it != roaring_bitmap.end(); it++) { + delete_rows->push_back(*it); + } + COUNTER_UPDATE(_profile->num_delete_rows, delete_rows->size()); + return delete_rows; + }); + RETURN_IF_ERROR(create_status); + } + + return Status::OK(); +} +} // namespace doris::reader diff --git a/be/src/format/reader/table_reader.h b/be/src/format/reader/table_reader.h index 4d8fe0620c8f83..d14e1e782618ad 100644 --- a/be/src/format/reader/table_reader.h +++ b/be/src/format/reader/table_reader.h @@ -33,16 +33,20 @@ #include "exprs/vexpr_context.h" #include "exprs/vexpr_fwd.h" #include "format/reader/column_mapper.h" +#include "format/reader/expr/delete_predicate.h" #include "format/reader/expr/literal.h" #include "format/reader/file_reader.h" namespace doris { class Block; class ColumnPredicate; +struct DeleteFileDesc; } // namespace doris namespace doris::reader { +using DeleteRows = std::vector; + // table/global schema 中的列视图。 // Iceberg 场景下,id 默认对应 Iceberg field id。该结构不描述文件中的物理列。 struct TableColumn { @@ -90,13 +94,21 @@ struct ScanTask { 
std::unique_ptr data_file; }; -struct ReadProfile {}; +struct ReadProfile { + RuntimeProfile::Counter* num_delete_files; + RuntimeProfile::Counter* num_delete_rows; + RuntimeProfile::Counter* parse_delete_file_time; +}; struct TableReadOptions { const std::vector projected_columns; // All conjuncts from scan operator const VExprContext conjuncts; const FileFormat format; + TFileScanRangeParams* scan_params; + io::IOContext* io_ctx; + RuntimeState* runtime_state; + RuntimeProfile* scanner_profile; // Each task denotes a descriptor of a single file to read, along with file-level metadata such as stats and delete files. std::vector> scan_tasks; @@ -105,6 +117,8 @@ struct TableReadOptions { struct SplitReadOptions { std::map partition_values; + ShardedKVCache* cache; + TFileRangeDesc current_range; }; // table-level reader 基类。 @@ -117,6 +131,11 @@ class TableReader { // 初始化 table reader 的通用运行参数。 // 子类可以在自己的 init(options) 中调用该方法;这里不接收具体表格式 schema/task。 virtual Status init(TableReadOptions options) { + _scan_params = options.scan_params; + _format = options.format; + _io_ctx = options.io_ctx; + _runtime_state = options.runtime_state; + _scanner_profile = options.scanner_profile; _scan_tasks = std::move(_options.scan_tasks); _next_task_idx = 0; _profile = std::move(options.profile); @@ -129,10 +148,7 @@ class TableReader { } // 读取当前 split/partition 之前初始化。 - virtual Status prepare_split(SplitReadOptions options) { - _partition_values = std::move(options.partition_values); - return Status::OK(); - } + virtual Status prepare_split(const SplitReadOptions& options); // table-level 动态过滤入口。 // 该方法用于根据 split、partition value 或文件级统计判断是否可以跳过后续 reader。 @@ -193,6 +209,9 @@ class TableReader { } protected: + virtual bool _parse_delete_file(const TTableFormatFileDesc& t_desc, DeleteFileDesc& desc) { + return false; + } // 切换到下一个 reader 的通用流程。 // 该方法先关闭当前 reader,再打开下一个具体 reader;子类不应重复实现这个循环。 Status create_next_reader(bool* eos) { @@ -262,6 +281,16 @@ class TableReader { size_t 
_next_task_idx = 0; std::map _table_filters; std::unique_ptr _profile; + // Parsed from DELETION_VECTOR in Iceberg and Paimon + DeleteRows* _delete_rows; + TFileScanRangeParams* _scan_params; + io::IOContext* _io_ctx; + RuntimeState* _runtime_state; + RuntimeProfile* _scanner_profile; + FileFormat _format; + +private: + Status _parse_delete_predicates(const SplitReadOptions& options); }; } // namespace doris::reader diff --git a/be/src/format/table/deletion_vector_reader.cpp b/be/src/format/table/deletion_vector_reader.cpp index bfe34a5f555f94..d7e33c923d95b7 100644 --- a/be/src/format/table/deletion_vector_reader.cpp +++ b/be/src/format/table/deletion_vector_reader.cpp @@ -54,9 +54,9 @@ Status DeletionVectorReader::_create_file_reader() { return Status::EndOfFile("stop read."); } - _file_description.mtime = _range.__isset.modification_time ? _range.modification_time : 0; + _file_description.mtime = _desc.modification_time; io::FileReaderOptions reader_options = - FileFactory::get_reader_options(_state, _file_description); + FileFactory::get_reader_options(_state->query_options(), _file_description); _file_reader = DORIS_TRY(io::DelegateReader::create_file_reader( _profile, _system_properties, _file_description, reader_options, io::DelegateReader::AccessMode::RANDOM, _io_ctx)); @@ -64,20 +64,13 @@ Status DeletionVectorReader::_create_file_reader() { } void DeletionVectorReader::_init_file_description() { - _file_description.path = _range.path; - _file_description.file_size = _range.__isset.file_size ? 
_range.file_size : -1; - if (_range.__isset.fs_name) { - _file_description.fs_name = _range.fs_name; - } + _file_description.path = _desc.path; + _file_description.file_size = _desc.file_size; + _file_description.fs_name = _desc.fs_name; } void DeletionVectorReader::_init_system_properties() { - if (_range.__isset.file_type) { - // for compatibility - _system_properties.system_type = _range.file_type; - } else { - _system_properties.system_type = _params.file_type; - } + _system_properties.system_type = _params.file_type; _system_properties.properties = _params.properties; _system_properties.hdfs_params = _params.hdfs_params; if (_params.__isset.broker_addresses) { diff --git a/be/src/format/table/deletion_vector_reader.h b/be/src/format/table/deletion_vector_reader.h index 0663f3b28490ef..b030f048415bf1 100644 --- a/be/src/format/table/deletion_vector_reader.h +++ b/be/src/format/table/deletion_vector_reader.h @@ -36,6 +36,16 @@ struct IOContext; } // namespace io namespace doris { +struct DeleteFileDesc { + std::string key = ""; + std::string path = ""; + std::string fs_name = ""; + int64_t start_offset = 0; + int64_t size = 0; + int64_t file_size = -1; + int64_t modification_time = 0; +}; + class DeletionVectorReader { ENABLE_FACTORY_CREATOR(DeletionVectorReader); @@ -43,7 +53,22 @@ class DeletionVectorReader { DeletionVectorReader(RuntimeState* state, RuntimeProfile* profile, const TFileScanRangeParams& params, const TFileRangeDesc& range, io::IOContext* io_ctx) - : _state(state), _profile(profile), _range(range), _params(params), _io_ctx(io_ctx) {} + : _state(state), _profile(profile), _params(params), _io_ctx(io_ctx) { + _desc = DeleteFileDesc { + .key = "", + .path = range.path, + .fs_name = range.__isset.fs_name ? range.fs_name : "", + .start_offset = range.start_offset, + .size = range.size, + .file_size = range.__isset.file_size ? range.file_size : -1, + .modification_time = range.__isset.modification_time ? 
range.modification_time : 0}; + } + DeletionVectorReader(RuntimeState* state, RuntimeProfile* profile, + const TFileScanRangeParams& params, const DeleteFileDesc& desc, + io::IOContext* io_ctx) + : _state(state), _profile(profile), _params(params), _io_ctx(io_ctx) { + _desc = desc; + } ~DeletionVectorReader() = default; Status open(); Status read_at(size_t offset, Slice result); @@ -56,7 +81,7 @@ class DeletionVectorReader { private: RuntimeState* _state = nullptr; RuntimeProfile* _profile = nullptr; - const TFileRangeDesc& _range; + DeleteFileDesc _desc; const TFileScanRangeParams& _params; io::IOContext* _io_ctx = nullptr; diff --git a/be/src/io/file_factory.cpp b/be/src/io/file_factory.cpp index 553cdc4460e15c..9610bc028595ec 100644 --- a/be/src/io/file_factory.cpp +++ b/be/src/io/file_factory.cpp @@ -57,21 +57,20 @@ namespace doris { constexpr std::string_view RANDOM_CACHE_BASE_PATH = "random"; -io::FileReaderOptions FileFactory::get_reader_options(RuntimeState* state, +io::FileReaderOptions FileFactory::get_reader_options(const TQueryOptions& option, const io::FileDescription& fd) { io::FileReaderOptions opts { .cache_base_path {}, .file_size = fd.file_size, .mtime = fd.mtime, }; - if (config::enable_file_cache && state != nullptr && - state->query_options().__isset.enable_file_cache && - state->query_options().enable_file_cache && fd.file_cache_admission) { + if (config::enable_file_cache && option.__isset.enable_file_cache && option.enable_file_cache && + fd.file_cache_admission) { opts.cache_type = io::FileCachePolicy::FILE_BLOCK_CACHE; } - if (state != nullptr && state->query_options().__isset.file_cache_base_path && - state->query_options().file_cache_base_path != RANDOM_CACHE_BASE_PATH) { - opts.cache_base_path = state->query_options().file_cache_base_path; + if (option.__isset.file_cache_base_path && + option.file_cache_base_path != RANDOM_CACHE_BASE_PATH) { + opts.cache_base_path = option.file_cache_base_path; } return opts; } diff --git 
a/be/src/io/file_factory.h b/be/src/io/file_factory.h index 7d662e4fdde469..a32c8077c48e03 100644 --- a/be/src/io/file_factory.h +++ b/be/src/io/file_factory.h @@ -16,6 +16,7 @@ // under the License. #pragma once +#include #include #include #include @@ -83,7 +84,7 @@ class FileFactory { ENABLE_FACTORY_CREATOR(FileFactory); public: - static io::FileReaderOptions get_reader_options(RuntimeState* state, + static io::FileReaderOptions get_reader_options(const TQueryOptions& option, const io::FileDescription& fd); /// Create a temporary FileSystem for accessing file corresponding to `file_description` diff --git a/gensrc/thrift/Exprs.thrift b/gensrc/thrift/Exprs.thrift index 2644ecec417496..967499aac69d8b 100644 --- a/gensrc/thrift/Exprs.thrift +++ b/gensrc/thrift/Exprs.thrift @@ -88,6 +88,8 @@ enum TExprNodeType { TRY_CAST_EXPR = 41 // for search DSL function SEARCH_EXPR = 42, + // Normal predicate expression + PREDICATE = 43, } //enum TAggregationOp { diff --git a/gensrc/thrift/Opcodes.thrift b/gensrc/thrift/Opcodes.thrift index 1e4002357e7599..a2d709799482eb 100644 --- a/gensrc/thrift/Opcodes.thrift +++ b/gensrc/thrift/Opcodes.thrift @@ -97,4 +97,6 @@ enum TExprOpcode { MATCH_REGEXP = 76, MATCH_PHRASE_EDGE = 77, TRY_CAST = 78, + // Delete operator from Iceberg/Paimon + DELETE = 79, } From 2539b0a4a7b1996017c5c5b000366a3374b9892d Mon Sep 17 00:00:00 2001 From: Gabriel Date: Thu, 21 May 2026 12:33:58 +0800 Subject: [PATCH 07/38] [test](be) Add DeletePredicate unit tests (#63455) --- .../format/reader/expr/delete_predicate.cpp | 50 +++++- be/src/format/reader/expr/delete_predicate.h | 5 +- .../reader/expr/delete_predicate_test.cpp | 155 ++++++++++++++++++ .../format/reader/expr/table_expr_test.cpp | 5 +- 4 files changed, 206 insertions(+), 9 deletions(-) create mode 100644 be/test/format/reader/expr/delete_predicate_test.cpp diff --git a/be/src/format/reader/expr/delete_predicate.cpp b/be/src/format/reader/expr/delete_predicate.cpp index 8a4ac54102f515..d1ca03a5201d72 
100644 --- a/be/src/format/reader/expr/delete_predicate.cpp +++ b/be/src/format/reader/expr/delete_predicate.cpp @@ -60,13 +60,53 @@ void DeletePredicate::close(VExprContext* context, FunctionContext::FunctionStat VExpr::close(context, scope); } -Status DeletePredicate::execute_column_impl(VExprContext* context, const Block* block, - const Selector* selector, size_t count, - ColumnPtr& result_column) const { +/** + * DeletePredicate is derived from 2 cases: + * 1. A list of row IDs identifies the deleted rows (e.g. delete the rows whose row_id is in (1, 2, 3)). + * 2. A bit vector indicates whether each row is deleted (e.g. bit vector [0,1,0,0,1] indicates that row 1 and row 4 are deleted). + * + * So DeletePredicate should have exactly 1 child expr, which is the slot of the row id. + * Row IDs should be generated by the file reader as a virtual column in `block`. NOTE(review): the binary searches below appear to assume that both the row-id column and the deleted-row list are sorted in ascending order - confirm. + **/ +Status DeletePredicate::execute(VExprContext* context, Block* block, int* result_column_id) const { + if (block->empty()) { + return Status::OK(); + } DCHECK(_open_finished || block == nullptr); + if (_children.size() != 1) { + return Status::InternalError(fmt::format( + "DeletePredicate should have exactly 1 child expr, but got {}", _children.size())); + } + int slot = -1; + RETURN_IF_ERROR(_children[0]->execute(context, block, &slot)); + const auto count = block->rows(); + auto res_col = ColumnBool::create(block->rows(), 0); + const auto& row_ids = + assert_cast(*block->get_by_position(slot).column).get_data(); + DCHECK_EQ(row_ids.size(), count); + if (_deleted_rows.empty()) { + block->insert({std::move(res_col), std::make_shared(), expr_name()}); + *result_column_id = block->get_columns().size() - 1; + return Status::OK(); + } + const int64_t* delete_rows = _deleted_rows.data(); + const int64_t* delete_rows_end = delete_rows + _deleted_rows.size(); + const int64_t* start_pos = std::lower_bound(delete_rows, delete_rows_end, row_ids[0]); + int64_t start_index = start_pos - delete_rows; + const int64_t* end_pos = 
std::upper_bound(start_pos, delete_rows_end, row_ids[count - 1]); + const int64_t end_index = end_pos - delete_rows; - static_cast(_deleted_rows.size()); - // TODO: implement delete predicate logic here, currently we just return a column with all 0 (false) + while (start_index < end_index) { + int64_t delete_row = delete_rows[start_index]; + if (const auto it = std::ranges::lower_bound(row_ids, delete_row); + it != row_ids.end() && *it == delete_row) { + const size_t index = it - row_ids.begin(); + res_col->get_data()[index] = true; + } + ++start_index; + } + block->insert({std::move(res_col), std::make_shared(), expr_name()}); + *result_column_id = block->get_columns().size() - 1; return Status::OK(); } diff --git a/be/src/format/reader/expr/delete_predicate.h b/be/src/format/reader/expr/delete_predicate.h index feb8093ea5c981..3a95c31d8bfe14 100644 --- a/be/src/format/reader/expr/delete_predicate.h +++ b/be/src/format/reader/expr/delete_predicate.h @@ -40,8 +40,11 @@ class DeletePredicate final : public VExpr { public: DeletePredicate(const std::vector& deleted_rows); ~DeletePredicate() override = default; + Status execute(VExprContext* context, Block* block, int* result_column_id) const override; Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, - size_t count, ColumnPtr& result_column) const override; + size_t count, ColumnPtr& result_column) const override { + return Status::InternalError("Not implement DeletePredicate::execute_column_impl"); + } Status prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) override; Status open(RuntimeState* state, VExprContext* context, FunctionContext::FunctionStateScope scope) override; diff --git a/be/test/format/reader/expr/delete_predicate_test.cpp b/be/test/format/reader/expr/delete_predicate_test.cpp new file mode 100644 index 00000000000000..9d9f7387a2267a --- /dev/null +++ b/be/test/format/reader/expr/delete_predicate_test.cpp @@ -0,0 +1,155 @@ 
+// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/reader/expr/delete_predicate.h" + +#include + +#include +#include +#include + +#include "common/status.h" +#include "core/block/block.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_number.h" +#include "exprs/vexpr_context.h" +#include "runtime/descriptors.h" +#include "testutil/mock/mock_slot_ref.h" + +namespace doris { + +class DeletePredicateTest : public testing::Test { +protected: + static Block make_block(const std::vector& row_ids) { + auto column = ColumnInt64::create(); + for (auto row_id : row_ids) { + column->insert_value(row_id); + } + + Block block; + block.insert({std::move(column), std::make_shared(), "row_id"}); + return block; + } + + static std::vector result_column_data(const Block& block, int result_column_id) { + const auto& result_column = + assert_cast(*block.get_by_position(result_column_id).column); + return {result_column.get_data().begin(), result_column.get_data().end()}; + } + + static Status execute_delete_predicate(const std::vector& deleted_rows, Block* block, + int* result_column_id) { + auto delete_predicate = std::make_shared(deleted_rows); + delete_predicate->_open_finished = true; + 
delete_predicate->add_child( + std::make_shared(0, std::make_shared())); + + VExprContext context(delete_predicate); + return delete_predicate->execute(&context, block, result_column_id); + } +}; + +TEST_F(DeletePredicateTest, MatchDeletedRowsInInputRange) { + const std::vector deleted_rows {-3, 1, 4, 8, 12, 20}; + auto block = make_block({0, 1, 2, 3, 4, 5, 8, 12}); + + int result_column_id = -1; + auto status = execute_delete_predicate(deleted_rows, &block, &result_column_id); + ASSERT_TRUE(status.ok()) << status; + + EXPECT_EQ(result_column_id, 1); + EXPECT_EQ(result_column_data(block, result_column_id), + std::vector({0, 1, 0, 0, 1, 0, 1, 1})); +} + +TEST_F(DeletePredicateTest, EmptyDeletedRowsReturnAllFalse) { + const std::vector deleted_rows; + auto block = make_block({1, 2, 3}); + + int result_column_id = -1; + auto status = execute_delete_predicate(deleted_rows, &block, &result_column_id); + ASSERT_TRUE(status.ok()) << status; + + EXPECT_EQ(result_column_data(block, result_column_id), std::vector({0, 0, 0})); +} + +TEST_F(DeletePredicateTest, DeletedRowsOutsideInputRangeReturnAllFalse) { + const std::vector deleted_rows {-10, -1, 10, 11}; + auto block = make_block({1, 2, 3}); + + int result_column_id = -1; + auto status = execute_delete_predicate(deleted_rows, &block, &result_column_id); + ASSERT_TRUE(status.ok()) << status; + + EXPECT_EQ(result_column_data(block, result_column_id), std::vector({0, 0, 0})); +} + +TEST_F(DeletePredicateTest, EmptyBlockDoesNotAppendResultColumn) { + const std::vector deleted_rows {1, 2, 3}; + Block block; + + int result_column_id = -1; + auto status = execute_delete_predicate(deleted_rows, &block, &result_column_id); + ASSERT_TRUE(status.ok()) << status; + + EXPECT_EQ(block.columns(), 0); + EXPECT_EQ(result_column_id, -1); +} + +TEST_F(DeletePredicateTest, MissingRowIdChildReturnsError) { + const std::vector deleted_rows {1}; + auto block = make_block({1}); + auto delete_predicate = std::make_shared(deleted_rows); + 
delete_predicate->_open_finished = true; + VExprContext context(delete_predicate); + + int result_column_id = -1; + auto status = delete_predicate->execute(&context, &block, &result_column_id); + ASSERT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find("exactly 1 child expr"), std::string::npos); +} + +TEST_F(DeletePredicateTest, ExecuteColumnImplReturnsError) { + const std::vector deleted_rows {1}; + DeletePredicate delete_predicate(deleted_rows); + VExprContext context(std::make_shared(deleted_rows)); + ColumnPtr result_column; + + auto status = + delete_predicate.execute_column_impl(&context, nullptr, nullptr, 0, result_column); + ASSERT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find("DeletePredicate::execute_column_impl"), std::string::npos); +} + +TEST_F(DeletePredicateTest, LifecycleAndDebugString) { + const std::vector deleted_rows {1}; + DeletePredicate delete_predicate(deleted_rows); + VExprContext context(std::make_shared(deleted_rows)); + RowDescriptor row_desc; + + auto status = delete_predicate.prepare(nullptr, row_desc, &context); + ASSERT_TRUE(status.ok()) << status; + EXPECT_EQ(delete_predicate.expr_name(), "DeletePredicate"); + EXPECT_EQ(delete_predicate.debug_string(), "DeletePredicate"); + + status = delete_predicate.open(nullptr, &context, FunctionContext::THREAD_LOCAL); + ASSERT_TRUE(status.ok()) << status; + delete_predicate.close(&context, FunctionContext::THREAD_LOCAL); +} + +} // namespace doris diff --git a/be/test/format/reader/expr/table_expr_test.cpp b/be/test/format/reader/expr/table_expr_test.cpp index df41c1482e309c..dd831071483fda 100644 --- a/be/test/format/reader/expr/table_expr_test.cpp +++ b/be/test/format/reader/expr/table_expr_test.cpp @@ -15,9 +15,6 @@ // specific language governing permissions and limitations // under the License. 
-#include "format/reader/expr/literal.h" -#include "format/reader/expr/slot_ref.h" - #include #include @@ -28,6 +25,8 @@ #include "core/data_type/data_type_number.h" #include "core/data_type/primitive_type.h" #include "core/field.h" +#include "format/reader/expr/literal.h" +#include "format/reader/expr/slot_ref.h" #include "runtime/descriptors.h" #include "testutil/column_helper.h" From 0fb11e4e0c3751baeec63421d37cbec6bd7dd479 Mon Sep 17 00:00:00 2001 From: Gabriel Date: Thu, 21 May 2026 17:18:54 +0800 Subject: [PATCH 08/38] cast for schema change (#63477) --- be/src/exprs/vslot_ref.h | 2 +- be/src/format/reader/column_mapper.cpp | 41 ++-- be/src/format/reader/column_mapper.h | 8 +- be/src/format/reader/expr/cast.cpp | 131 +++++++++++ be/src/format/reader/expr/cast.h | 59 +++++ be/src/format/reader/expr/slot_ref.h | 16 +- be/src/format/reader/file_reader.h | 7 + be/src/format/reader/table_reader.h | 10 +- be/test/format/reader/expr/cast_test.cpp | 210 ++++++++++++++++++ .../format/reader/expr/table_expr_test.cpp | 11 +- 10 files changed, 465 insertions(+), 30 deletions(-) create mode 100644 be/src/format/reader/expr/cast.cpp create mode 100644 be/src/format/reader/expr/cast.h create mode 100644 be/test/format/reader/expr/cast_test.cpp diff --git a/be/src/exprs/vslot_ref.h b/be/src/exprs/vslot_ref.h index 3ac9f641c1922e..ceb702728ebb75 100644 --- a/be/src/exprs/vslot_ref.h +++ b/be/src/exprs/vslot_ref.h @@ -31,7 +31,7 @@ class TExprNode; class Block; class VExprContext; -class VSlotRef MOCK_REMOVE(final) : public VExpr { +class VSlotRef : public VExpr { ENABLE_FACTORY_CREATOR(VSlotRef); public: diff --git a/be/src/format/reader/column_mapper.cpp b/be/src/format/reader/column_mapper.cpp index 7006365b05408b..7510413d07fbac 100644 --- a/be/src/format/reader/column_mapper.cpp +++ b/be/src/format/reader/column_mapper.cpp @@ -20,20 +20,20 @@ #include #include "common/status.h" -#include "expr/slot_ref.h" +#include "format/reader/expr/cast.h" +#include 
"format/reader/expr/slot_ref.h" #include "format/reader/file_reader.h" #include "format/reader/table_reader.h" namespace doris::reader { +static constexpr const char* ROW_LINEAGE_ROW_ID = "_row_id"; +static constexpr const char* ROW_LINEAGE_LAST_UPDATED_SEQ_NUMBER = "_last_updated_sequence_number"; + Status TableColumnMapper::create_mapping(const std::vector& projected_columns, - std::vector block_schema, const std::map& partition_values, const std::vector& file_schema) { - // 真实实现会做 field id/name matching、类型转换、复杂列 child mapping、缺失列 - // default/partition/generated 表达式构造。 _mappings.clear(); - block_schema.clear(); for (const auto& table_column : projected_columns) { ColumnMapping mapping; mapping.table_column_id = table_column.id; @@ -43,24 +43,31 @@ Status TableColumnMapper::create_mapping(const std::vector& project mapping.file_type = file_field->type; mapping.is_trivial = _is_same_type(mapping.table_type, mapping.file_type); if (!mapping.is_trivial) { - // TODO: - return Status::NotSupported( - "column mapping with type conversion is not supported yet: table column " - "'{}' (id={}, type={}) vs file column (id={}, type={})", - table_column.name, mapping.table_column_id, mapping.table_type->get_name(), - mapping.file_column_id.value(), mapping.file_type->get_name()); + // 1. Data type mismatch (caused by schema evolution) and casting is needed. + auto expr = Cast::create_shared(mapping.table_type); + expr->add_child(TableSlotRef::create_shared(mapping.file_column_id.value(), + mapping.file_column_id.value(), -1, + mapping.file_type, file_field->name)); + mapping.projection = VExprContext::create_shared(expr); } else { + // 2. Data type matches, trivial mapping. 
+ mapping.projection = VExprContext::create_shared(TableSlotRef::create_shared( - *mapping.file_column_id, block_schema.size(), -1, mapping.table_type)); + mapping.file_column_id.value(), mapping.file_column_id.value(), -1, + mapping.file_type, file_field->name)); } - block_schema.push_back(SchemaField { - mapping.file_column_id.value(), table_column.name, mapping.table_type, {}}); - } else if (table_column.default_expr != nullptr) { - mapping.is_constant = true; - mapping.default_expr = table_column.default_expr; } else if (table_column.is_partition_key && partition_values.count(table_column.name) > 0) { + // 3. Partition column: use the partition value as a constant mapping. A partition column may also have a default expression, but the partition value takes precedence when it exists. mapping.default_expr = VExprContext::create_shared(TableLiteral::create_shared( mapping.table_type, partition_values.at(table_column.name))); + } else if (table_column.default_expr != nullptr) { + // 4. The table column does not exist in the file (it was added by schema evolution) but has a default expression; use that expression as a constant mapping. + mapping.is_constant = true; + mapping.default_expr = table_column.default_expr; + } else if (table_column.name == ROW_LINEAGE_ROW_ID) { + // 5. Virtual column: use a special mapping to indicate that it should be materialized by the table reader instead of being read from the file or evaluated from an expression. 
+ mapping.virtual_column_type = TableVirtualColumnType::ROW_ID; + } else if (table_column.name == ROW_LINEAGE_LAST_UPDATED_SEQ_NUMBER) { + mapping.virtual_column_type = TableVirtualColumnType::LAST_UPDATED_SEQUENCE_NUMBER; } else { if (table_column.is_partition_key) { return Status::InvalidArgument( diff --git a/be/src/format/reader/column_mapper.h b/be/src/format/reader/column_mapper.h index 4c6b510ff0e48a..d0d8076798bfcf 100644 --- a/be/src/format/reader/column_mapper.h +++ b/be/src/format/reader/column_mapper.h @@ -40,6 +40,12 @@ enum class TableColumnMappingMode { BY_NAME, }; +enum TableVirtualColumnType { + INVALID = 0, // not a virtual column + ROW_ID = 1, + LAST_UPDATED_SEQUENCE_NUMBER = 2, +}; + // 单个 table column 到 file column 的映射结果。 // 这是 table 层和 file 层的核心边界对象。 struct ColumnMapping { @@ -59,6 +65,7 @@ struct ColumnMapping { std::vector child_mappings; bool is_trivial = false; bool is_constant = false; + TableVirtualColumnType virtual_column_type = TableVirtualColumnType::INVALID; VExprContextSPtr default_expr; }; @@ -81,7 +88,6 @@ class TableColumnMapper { // 输出的 ColumnMapping 描述 table column 如何从 file column、常量列或表达式得到; // 后续 projection、filter localization 和 table block finalize 都应复用这份映射。 virtual Status create_mapping(const std::vector& projected_columns, - std::vector block_schema, const std::map& partition_values, const std::vector& file_schema); diff --git a/be/src/format/reader/expr/cast.cpp b/be/src/format/reader/expr/cast.cpp new file mode 100644 index 00000000000000..69af83c9e77ffe --- /dev/null +++ b/be/src/format/reader/expr/cast.cpp @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/reader/expr/cast.h" + +#include +#include +#include + +#include + +#include "common/status.h" +#include "core/block/block.h" +#include "core/block/column_with_type_and_name.h" +#include "core/block/columns_with_type_and_name.h" +#include "exprs/function/simple_function_factory.h" +#include "exprs/vexpr_context.h" +#include "exprs/vliteral.h" + +namespace doris { + +Status Cast::prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) { + RETURN_IF_ERROR_OR_PREPARED(VExpr::prepare(state, desc, context)); + if (_children.size() != 1) { + return Status::InternalError( + fmt::format("Cast should have exactly 1 child expr, but got {}", _children.size())); + } + ColumnsWithTypeAndName argument_template; + argument_template.reserve(_children.size()); + if (_children[0]->is_literal()) { + // Some functions need the literal column itself to derive the return type. + auto literal_node = std::dynamic_pointer_cast(_children[0]); + argument_template.emplace_back(literal_node->get_column_ptr(), _children[0]->data_type(), + _children[0]->expr_name()); + } else { + argument_template.emplace_back(nullptr, _children[0]->data_type(), + _children[0]->expr_name()); + } + + _expr_name = fmt::format("CAST(arguments={},return={})", _children[0]->data_type()->get_name(), + _data_type->get_name()); + // Look up the function; this does not prepare it. 
+ _function = SimpleFunctionFactory::instance().get_function( + "CAST", argument_template, _data_type, + {.new_version_unix_timestamp = state->query_options().new_version_unix_timestamp}, + state->be_exec_version()); + if (_function == nullptr) { + return Status::InternalError("Could not find function {} ", _expr_name); + } + VExpr::register_function_context(state, context); + _prepare_finished = true; + return Status::OK(); +} + +Status Cast::open(RuntimeState* state, VExprContext* context, + FunctionContext::FunctionStateScope scope) { + DCHECK(_prepare_finished); + for (auto& i : _children) { + RETURN_IF_ERROR(i->open(state, context, scope)); + } + RETURN_IF_ERROR(VExpr::init_function_context(state, context, scope, _function)); + if (scope == FunctionContext::FRAGMENT_LOCAL) { + RETURN_IF_ERROR(VExpr::get_const_col(context, nullptr)); + } + _open_finished = true; + return Status::OK(); +} + +void Cast::close(VExprContext* context, FunctionContext::FunctionStateScope scope) { + VExpr::close_function_context(context, scope, _function); + VExpr::close(context, scope); +} + +Status Cast::execute_column_impl(VExprContext* context, const Block* block, + const Selector* selector, size_t count, + ColumnPtr& result_column) const { + return _do_execute(context, block, selector, count, result_column); +} + +std::string Cast::debug_string() const { + return _expr_name; +} + +Status Cast::_do_execute(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const { + DCHECK(_open_finished || block == nullptr) << debug_string(); + if (_children.size() != 1) { + return Status::InternalError( + fmt::format("Cast should have exactly 1 child expr, but got {}", _children.size())); + } + if (is_const_and_have_executed()) { // const have executed in open function + result_column = get_result_from_const(count); + return Status::OK(); + } + + Block temp_block; + ColumnNumbers args(1); + + ColumnPtr tmp_arg_column; + 
RETURN_IF_ERROR(_children[0]->execute_column(context, block, selector, count, tmp_arg_column)); + auto arg_type = _children[0]->execute_type(block); + temp_block.insert({tmp_arg_column, arg_type, _children[0]->expr_name()}); + args[0] = 0; + + uint32_t num_columns_without_result = temp_block.columns(); + // prepare a column to save result + temp_block.insert({nullptr, _data_type, _expr_name}); + + RETURN_IF_ERROR(_function->execute(context->fn_context(_fn_context_index), temp_block, args, + num_columns_without_result, count)); + result_column = temp_block.get_by_position(num_columns_without_result).column; + DCHECK_EQ(result_column->size(), count); + RETURN_IF_ERROR(result_column->column_self_check()); + return Status::OK(); +} + +} // namespace doris diff --git a/be/src/format/reader/expr/cast.h b/be/src/format/reader/expr/cast.h new file mode 100644 index 00000000000000..7d8ca437ba3fb0 --- /dev/null +++ b/be/src/format/reader/expr/cast.h @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "common/object_pool.h" +#include "common/status.h" +#include "exprs/function_context.h" +#include "exprs/vexpr.h" + +namespace doris { +class RowDescriptor; +class RuntimeState; +class TExprNode; +class Block; +class VExprContext; +} // namespace doris + +namespace doris { + +class Cast final : public VExpr { + ENABLE_FACTORY_CREATOR(Cast); + +public: + Cast(const DataTypePtr& type) { _data_type = type; } + ~Cast() override = default; + Status prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) override; + Status open(RuntimeState* state, VExprContext* context, + FunctionContext::FunctionStateScope scope) override; + void close(VExprContext* context, FunctionContext::FunctionStateScope scope) override; + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override; + std::string debug_string() const override; + uint64_t get_digest(uint64_t seed) const override { return 0; } + const std::string& expr_name() const override { return _expr_name; } + +private: + Status _do_execute(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const; + std::string _expr_name; + FunctionBasePtr _function; +}; +} // namespace doris diff --git a/be/src/format/reader/expr/slot_ref.h b/be/src/format/reader/expr/slot_ref.h index 6b5d027602ee18..fd4782a1bdde54 100644 --- a/be/src/format/reader/expr/slot_ref.h +++ b/be/src/format/reader/expr/slot_ref.h @@ -26,14 +26,26 @@ class TableSlotRef : public VSlotRef { ENABLE_FACTORY_CREATOR(TableSlotRef); public: - TableSlotRef(int slot_id, int column_id, int column_uniq_id, const DataTypePtr& type) - : VSlotRef(slot_id, column_id, column_uniq_id) { + TableSlotRef(int slot_id, int column_id, int column_uniq_id, const DataTypePtr& type, + const std::string& column_name) + : VSlotRef(slot_id, column_id, column_uniq_id), 
_cname(column_name) { _data_type = type; } Status prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) override { + if (_prepared) { + return Status::OK(); + } + _prepared = true; + _prepare_finished = true; return Status::OK(); } + + const std::string& expr_name() const override { return _cname; } + const std::string& column_name() const override { return _cname; } + +private: + const std::string _cname; }; } // namespace doris diff --git a/be/src/format/reader/file_reader.h b/be/src/format/reader/file_reader.h index 6dfbb4a8420cb8..96ace67d8defd0 100644 --- a/be/src/format/reader/file_reader.h +++ b/be/src/format/reader/file_reader.h @@ -42,6 +42,12 @@ namespace doris::reader { using ColumnId = int32_t; +enum ColumnType { + DATA_COLUMN = 0, // normal data column + ROW_NUMBER = 1, // row number in a file + FILE_NAME = 2, // file name +}; + // 文件本地 schema 字段。 // 这是 FileReader 暴露给 table 层的 file-local schema 视图,不携带 table/global // schema 语义。Iceberg field id、name mapping、default/generated/partition 列都不在 @@ -51,6 +57,7 @@ struct SchemaField { std::string name; DataTypePtr type; std::vector children; + ColumnType column_type = ColumnType::DATA_COLUMN; }; // 已经 localize 到文件 schema 的过滤条件。 diff --git a/be/src/format/reader/table_reader.h b/be/src/format/reader/table_reader.h index d14e1e782618ad..c3744427aa093d 100644 --- a/be/src/format/reader/table_reader.h +++ b/be/src/format/reader/table_reader.h @@ -232,11 +232,10 @@ class TableReader { // 打开当前具体 reader。 // 子类在这里基于当前 split/task 初始化底层 FileReader。 virtual Status open_reader() { - std::vector file_schema; - RETURN_IF_ERROR(_data_reader.reader->get_schema(&file_schema)); - RETURN_IF_ERROR(_data_reader.column_mapper.create_mapping(_options.projected_columns, - _data_reader.block_schema, - _partition_values, file_schema)); + _data_reader.block_schema.clear(); + RETURN_IF_ERROR(_data_reader.reader->get_schema(&_data_reader.block_schema)); + RETURN_IF_ERROR(_data_reader.column_mapper.create_mapping( 
+ _options.projected_columns, _partition_values, _data_reader.block_schema)); FileScanRequest file_request; RETURN_IF_ERROR(_data_reader.column_mapper.create_scan_request( @@ -270,7 +269,6 @@ class TableReader { struct DataReader { std::unique_ptr reader; TableColumnMapper column_mapper; - // Schema of blocks from file reader. std::vector block_schema; }; DataReader _data_reader; diff --git a/be/test/format/reader/expr/cast_test.cpp b/be/test/format/reader/expr/cast_test.cpp new file mode 100644 index 00000000000000..4f2154189532e9 --- /dev/null +++ b/be/test/format/reader/expr/cast_test.cpp @@ -0,0 +1,210 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "format/reader/expr/cast.h" + +#include + +#include +#include +#include + +#include "common/status.h" +#include "core/block/block.h" +#include "core/column/column_nullable.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_number.h" +#include "core/data_type/data_type_string.h" +#include "core/field.h" +#include "exprs/vexpr_context.h" +#include "format/reader/column_mapper.h" +#include "format/reader/expr/literal.h" +#include "format/reader/expr/slot_ref.h" +#include "format/reader/file_reader.h" +#include "format/reader/table_reader.h" +#include "runtime/descriptors.h" +#include "testutil/column_helper.h" +#include "testutil/mock/mock_runtime_state.h" + +namespace doris { + +class CastTest : public testing::Test { +protected: + void SetUp() override { state.set_enable_strict_cast(true); } + + static VExprContextSPtr create_context(const DataTypePtr& return_type, + const DataTypePtr& child_type, int child_column_id = 0) { + auto cast = Cast::create_shared(return_type); + cast->add_child(TableSlotRef::create_shared(child_column_id, child_column_id, -1, + child_type, "source_column")); + return VExprContext::create_shared(cast); + } + + Status prepare_open_execute(VExprContext* context, Block* block, int* result_column_id) { + RETURN_IF_ERROR(context->prepare(&state, RowDescriptor())); + RETURN_IF_ERROR(context->open(&state)); + return context->execute(block, result_column_id); + } + + MockRuntimeState state; +}; + +TEST_F(CastTest, CastIntSlotToBigInt) { + auto source_type = std::make_shared(); + auto return_type = std::make_shared(); + auto context = create_context(return_type, source_type); + Block block; + block.insert(ColumnHelper::create_column_with_name({1, -2, 3})); + + int result_column_id = -1; + auto status = prepare_open_execute(context.get(), &block, &result_column_id); + ASSERT_TRUE(status.ok()) << status; + + ASSERT_EQ(result_column_id, 1); + 
ASSERT_EQ(block.columns(), 2); + EXPECT_EQ(block.get_by_position(result_column_id).type, return_type); + const auto& result_column = + assert_cast(*block.get_by_position(result_column_id).column); + EXPECT_EQ(result_column.get_data()[0], 1); + EXPECT_EQ(result_column.get_data()[1], -2); + EXPECT_EQ(result_column.get_data()[2], 3); + + context->close(); +} + +TEST_F(CastTest, CastStringSlotToNullableInt) { + state.set_enable_strict_cast(false); + auto source_type = std::make_shared(); + auto return_type = std::make_shared(std::make_shared()); + auto context = create_context(return_type, source_type); + Block block; + block.insert(ColumnHelper::create_column_with_name({"10", "bad", "-3"})); + + int result_column_id = -1; + auto status = prepare_open_execute(context.get(), &block, &result_column_id); + ASSERT_TRUE(status.ok()) << status; + + const auto& nullable_column = + assert_cast(*block.get_by_position(result_column_id).column); + const auto& result_column = + assert_cast(nullable_column.get_nested_column()); + const auto& null_map = nullable_column.get_null_map_data(); + EXPECT_EQ(result_column.get_data()[0], 10); + EXPECT_EQ(result_column.get_data()[2], -3); + EXPECT_EQ(null_map[0], 0); + EXPECT_EQ(null_map[1], 1); + EXPECT_EQ(null_map[2], 0); + + context->close(); +} + +TEST_F(CastTest, CastLiteralToString) { + auto source_type = std::make_shared(); + auto return_type = std::make_shared(); + auto cast = Cast::create_shared(return_type); + cast->add_child(TableLiteral::create_shared(source_type, Field::create_field(123))); + auto context = VExprContext::create_shared(cast); + Block block; + block.insert(ColumnHelper::create_column_with_name({1, 2, 3})); + + int result_column_id = -1; + auto status = prepare_open_execute(context.get(), &block, &result_column_id); + ASSERT_TRUE(status.ok()) << status; + + const auto& result = block.get_by_position(result_column_id); + EXPECT_EQ(result.type->to_string(*result.column, 0), "123"); + 
EXPECT_EQ(result.type->to_string(*result.column, 1), "123"); + EXPECT_EQ(result.type->to_string(*result.column, 2), "123"); + + context->close(); +} + +TEST_F(CastTest, EmptyBlockAppendsEmptyResultColumn) { + auto source_type = std::make_shared(); + auto return_type = std::make_shared(); + auto context = create_context(return_type, source_type); + Block block; + block.insert(ColumnHelper::create_column_with_name({})); + + int result_column_id = -1; + auto status = prepare_open_execute(context.get(), &block, &result_column_id); + ASSERT_TRUE(status.ok()) << status; + + ASSERT_EQ(result_column_id, 1); + EXPECT_EQ(block.get_by_position(result_column_id).column->size(), 0); + + context->close(); +} + +TEST_F(CastTest, PrepareRejectsMissingChild) { + auto cast = Cast::create_shared(std::make_shared()); + VExprContext context(cast); + + auto status = context.prepare(&state, RowDescriptor()); + ASSERT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find("exactly 1 child expr"), std::string::npos); +} + +TEST_F(CastTest, PrepareRejectsMultipleChildren) { + auto child_type = std::make_shared(); + auto cast = Cast::create_shared(std::make_shared()); + cast->add_child(TableSlotRef::create_shared(0, 0, -1, child_type, "c0")); + cast->add_child(TableSlotRef::create_shared(1, 1, -1, child_type, "c1")); + VExprContext context(cast); + + auto status = context.prepare(&state, RowDescriptor()); + ASSERT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find("exactly 1 child expr"), std::string::npos); +} + +TEST_F(CastTest, ColumnMapperBuildsCastProjectionForTypeMismatch) { + reader::TableColumnMapper mapper; + reader::TableColumn table_column; + table_column.id = 7; + table_column.name = "value"; + table_column.type = std::make_shared(); + std::vector projected_columns {table_column}; + + reader::SchemaField file_field; + file_field.id = 0; + file_field.name = "value"; + file_field.type = std::make_shared(); + std::vector file_schema {file_field}; + + auto status = 
mapper.create_mapping(projected_columns, {}, file_schema); + ASSERT_TRUE(status.ok()) << status; + ASSERT_EQ(mapper.mappings().size(), 1); + const auto& mapping = mapper.mappings()[0]; + EXPECT_FALSE(mapping.is_trivial); + ASSERT_NE(mapping.projection, nullptr); + + Block block; + block.insert(ColumnHelper::create_column_with_name({11, 22})); + int result_column_id = -1; + status = prepare_open_execute(mapping.projection.get(), &block, &result_column_id); + ASSERT_TRUE(status.ok()) << status; + + const auto& result_column = + assert_cast(*block.get_by_position(result_column_id).column); + EXPECT_EQ(result_column.get_data()[0], 11); + EXPECT_EQ(result_column.get_data()[1], 22); + + mapping.projection->close(); +} + +} // namespace doris diff --git a/be/test/format/reader/expr/table_expr_test.cpp b/be/test/format/reader/expr/table_expr_test.cpp index dd831071483fda..3caca73c6c5d13 100644 --- a/be/test/format/reader/expr/table_expr_test.cpp +++ b/be/test/format/reader/expr/table_expr_test.cpp @@ -79,11 +79,14 @@ TEST(TableLiteralTest, ExecuteAppendsConstColumnToBlock) { TEST(TableSlotRefTest, KeepsSlotColumnIdsAndType) { auto type = std::make_shared(); - auto slot_ref = TableSlotRef::create_shared(10, 20, 30, type); + std::string name = "file_col"; + auto slot_ref = TableSlotRef::create_shared(10, 20, 30, type, name); EXPECT_EQ(slot_ref->slot_id(), 10); EXPECT_EQ(slot_ref->column_id(), 20); EXPECT_EQ(slot_ref->data_type(), type); + EXPECT_EQ(slot_ref->expr_name(), "file_col"); + EXPECT_EQ(slot_ref->column_name(), "file_col"); EXPECT_FALSE(slot_ref->is_constant()); std::set column_ids; @@ -94,14 +97,16 @@ TEST(TableSlotRefTest, KeepsSlotColumnIdsAndType) { TEST(TableSlotRefTest, PrepareDoesNotRequireRowDescriptor) { auto type = std::make_shared(); - auto slot_ref = TableSlotRef::create_shared(10, 20, 30, type); + std::string name = ""; + auto slot_ref = TableSlotRef::create_shared(10, 20, 30, type, name); EXPECT_TRUE(slot_ref->prepare(nullptr, RowDescriptor(), 
nullptr).ok()); } TEST(TableSlotRefTest, ExecuteReturnsReferencedColumnId) { auto type = std::make_shared(); - auto slot_ref = TableSlotRef::create_shared(10, 1, 30, type); + std::string name = ""; + auto slot_ref = TableSlotRef::create_shared(10, 1, 30, type, name); Block block; block.insert(ColumnHelper::create_column_with_name({1, 2, 3})); block.insert(ColumnHelper::create_column_with_name({4, 5, 6})); From 3dbfc4c596dc9997db4f1211765c02872ae5252d Mon Sep 17 00:00:00 2001 From: Gabriel Date: Tue, 26 May 2026 17:45:41 +0800 Subject: [PATCH 09/38] Complete basic parquet reader (#63659) Co-authored-by: Socrates --- .../data_type_datetimev2_serde.cpp | 30 +- .../data_type_datetimev2_serde.h | 2 + .../data_type_datev2_serde.cpp | 20 +- .../data_type_serde/data_type_datev2_serde.h | 2 + .../data_type_decimal_serde.cpp | 64 ++ .../data_type_serde/data_type_decimal_serde.h | 2 + .../data_type_nullable_serde.cpp | 20 +- .../data_type_nullable_serde.h | 2 + .../data_type_number_serde.cpp | 52 ++ .../data_type_serde/data_type_number_serde.h | 3 + .../core/data_type_serde/data_type_serde.cpp | 6 + be/src/core/data_type_serde/data_type_serde.h | 7 + .../data_type_string_serde.cpp | 28 + .../data_type_serde/data_type_string_serde.h | 3 + .../data_type_serde/data_type_time_serde.cpp | 44 ++ .../data_type_serde/data_type_time_serde.h | 2 + .../data_type_serde/decoded_column_view.h | 62 ++ be/src/exprs/vslot_ref.h | 2 +- be/src/format/new_parquet/column_reader.cpp | 562 ++++++++++++++++ be/src/format/new_parquet/column_reader.h | 116 ++++ .../new_parquet/parquet_column_schema.cpp | 158 +++++ .../new_parquet/parquet_column_schema.h | 67 ++ be/src/format/new_parquet/parquet_reader.cpp | 601 ++++++++++++++++++ be/src/format/new_parquet/parquet_reader.h | 138 ++++ .../format/new_parquet/parquet_statistics.cpp | 254 ++++++++ .../format/new_parquet/parquet_statistics.h | 88 +++ be/src/format/new_parquet/parquet_type.cpp | 349 ++++++++++ be/src/format/new_parquet/parquet_type.h | 84 
+++ be/src/format/new_parquet/selection_vector.h | 116 ++++ be/src/format/parquet/parquet_reader.h | 77 --- be/src/format/reader/column_mapper.cpp | 7 +- .../format/reader/expr/delete_predicate.cpp | 4 +- be/src/format/reader/file_reader.cpp | 42 ++ be/src/format/reader/file_reader.h | 118 ++-- be/src/format/reader/table_reader.h | 22 +- .../data_type_serde_decoded_values_test.cpp | 278 ++++++++ .../parquet_column_reader_test.cpp | 562 ++++++++++++++++ .../new_parquet/parquet_reader_test.cpp | 341 ++++++++++ ...ris-arrow-parquet-reader-implementation.md | 291 +++++++++ 39 files changed, 4485 insertions(+), 141 deletions(-) create mode 100644 be/src/core/data_type_serde/decoded_column_view.h create mode 100644 be/src/format/new_parquet/column_reader.cpp create mode 100644 be/src/format/new_parquet/column_reader.h create mode 100644 be/src/format/new_parquet/parquet_column_schema.cpp create mode 100644 be/src/format/new_parquet/parquet_column_schema.h create mode 100644 be/src/format/new_parquet/parquet_reader.cpp create mode 100644 be/src/format/new_parquet/parquet_reader.h create mode 100644 be/src/format/new_parquet/parquet_statistics.cpp create mode 100644 be/src/format/new_parquet/parquet_statistics.h create mode 100644 be/src/format/new_parquet/parquet_type.cpp create mode 100644 be/src/format/new_parquet/parquet_type.h create mode 100644 be/src/format/new_parquet/selection_vector.h delete mode 100644 be/src/format/parquet/parquet_reader.h create mode 100644 be/src/format/reader/file_reader.cpp create mode 100644 be/test/core/data_type_serde/data_type_serde_decoded_values_test.cpp create mode 100644 be/test/format/new_parquet/parquet_column_reader_test.cpp create mode 100644 be/test/format/new_parquet/parquet_reader_test.cpp create mode 100644 docs/doris-arrow-parquet-reader-implementation.md diff --git a/be/src/core/data_type_serde/data_type_datetimev2_serde.cpp b/be/src/core/data_type_serde/data_type_datetimev2_serde.cpp index 92a5106b4815a8..fc2c14d1829049 
100644 --- a/be/src/core/data_type_serde/data_type_datetimev2_serde.cpp +++ b/be/src/core/data_type_serde/data_type_datetimev2_serde.cpp @@ -22,9 +22,9 @@ #include // IWYU pragma: keep #include - #include "common/status.h" #include "core/column/column_const.h" +#include "core/data_type_serde/decoded_column_view.h" #include "core/data_type/data_type_decimal.h" #include "core/data_type/data_type_number.h" #include "core/data_type/primitive_type.h" @@ -451,6 +451,34 @@ Status DataTypeDateTimeV2SerDe::read_column_from_arrow(IColumn& column, return Status::OK(); } +Status DataTypeDateTimeV2SerDe::read_column_from_decoded_values( + IColumn& column, const DecodedColumnView& view) const { + if (view.value_kind != DecodedValueKind::INT64) { + return Status::NotSupported("DATETIMEV2 decoded reader expects INT64 source"); + } + if (view.values == nullptr && view.row_count > 0) { + return Status::Corruption("Decoded value buffer is null for {}", column.get_name()); + } + auto& data = assert_cast(column).get_data(); + const auto* values = reinterpret_cast(view.values); + static const cctz::time_zone utc_time_zone = cctz::utc_time_zone(); + const int64_t second_mask = view.time_unit == DecodedTimeUnit::MILLIS ? 
1000 : 1000000; + for (int64_t row = 0; row < view.row_count; ++row) { + int64_t epoch_seconds = values[row] / second_mask; + int64_t sub_second = values[row] % second_mask; + if (sub_second < 0) { + sub_second += second_mask; + --epoch_seconds; + } + const int32_t microsecond = static_cast(sub_second * (1000000 / second_mask)); + DateV2Value datetime_value; + datetime_value.from_unixtime(epoch_seconds, utc_time_zone); + datetime_value.set_microsecond(static_cast(microsecond)); + data.push_back(datetime_value); + } + return Status::OK(); +} + Status DataTypeDateTimeV2SerDe::write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& result, int64_t row_idx, bool col_const, diff --git a/be/src/core/data_type_serde/data_type_datetimev2_serde.h b/be/src/core/data_type_serde/data_type_datetimev2_serde.h index 0389432a621730..34d0373eba1c34 100644 --- a/be/src/core/data_type_serde/data_type_datetimev2_serde.h +++ b/be/src/core/data_type_serde/data_type_datetimev2_serde.h @@ -88,6 +88,8 @@ class DataTypeDateTimeV2SerDe : public DataTypeNumberSerDe #include - #include "core/column/column_const.h" +#include "core/data_type_serde/decoded_column_view.h" #include "core/data_type/data_type_decimal.h" #include "core/data_type/data_type_number.h" #include "core/data_type/define_primitive_type.h" @@ -125,6 +125,24 @@ Status DataTypeDateV2SerDe::read_column_from_arrow(IColumn& column, const arrow: return Status::OK(); } +Status DataTypeDateV2SerDe::read_column_from_decoded_values( + IColumn& column, const DecodedColumnView& view) const { + if (view.value_kind != DecodedValueKind::INT32) { + return Status::NotSupported("DATEV2 decoded reader expects INT32 source"); + } + if (view.values == nullptr && view.row_count > 0) { + return Status::Corruption("Decoded value buffer is null for {}", column.get_name()); + } + auto& data = assert_cast(column).get_data(); + const auto* values = reinterpret_cast(view.values); + for (int64_t row = 0; row < view.row_count; ++row) { + 
DateV2Value date_v2; + date_v2.get_date_from_daynr(values[row] + date_threshold); + data.push_back(date_v2); + } + return Status::OK(); +} + Status DataTypeDateV2SerDe::write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& result, int64_t row_idx, bool col_const, diff --git a/be/src/core/data_type_serde/data_type_datev2_serde.h b/be/src/core/data_type_serde/data_type_datev2_serde.h index 0375f9be4b4b23..ff985d61345d5a 100644 --- a/be/src/core/data_type_serde/data_type_datev2_serde.h +++ b/be/src/core/data_type_serde/data_type_datev2_serde.h @@ -86,6 +86,8 @@ class DataTypeDateV2SerDe : public DataTypeNumberSerDe +NativeType decode_big_endian_signed_integer(const uint8_t* data, int length) { + using UnsignedNativeType = + std::conditional_t, unsigned __int128, + std::make_unsigned_t>; + UnsignedNativeType value = data != nullptr && length > 0 && (data[0] & 0x80) != 0 + ? static_cast(-1) + : 0; + for (int i = 0; i < length; ++i) { + value = static_cast((value << 8) | data[i]); + } + return static_cast(value); +} + +template +typename PrimitiveTypeTraits::CppType read_decimal_decoded_value( + const DecodedColumnView& view, int64_t row) { + using FieldType = typename PrimitiveTypeTraits::CppType; + if (view.value_kind == DecodedValueKind::INT32) { + const auto* values = reinterpret_cast(view.values); + return FieldType {static_cast(values[row])}; + } + if (view.value_kind == DecodedValueKind::INT64) { + const auto* values = reinterpret_cast(view.values); + return FieldType {static_cast(values[row])}; + } + const auto& value = (*view.binary_values)[row]; + const auto length = view.value_kind == DecodedValueKind::FIXED_BINARY + ? 
view.fixed_length + : cast_set(value.size); + return FieldType {static_cast( + decode_big_endian_signed_integer(reinterpret_cast(value.data), + length))}; +} + +template +Status read_decimal_decoded_values(IColumn& column, const DecodedColumnView& view) { + auto& data = assert_cast&>(column).get_data(); + for (int64_t row = 0; row < view.row_count; ++row) { + data.push_back(read_decimal_decoded_value(view, row)); + } + return Status::OK(); +} + +} // namespace template Status DataTypeDecimalSerDe::from_string_batch(const ColumnString& str, ColumnNullable& column, @@ -381,6 +429,22 @@ Status DataTypeDecimalSerDe::read_column_from_arrow(IColumn& column, return Status::OK(); } +template +Status DataTypeDecimalSerDe::read_column_from_decoded_values( + IColumn& column, const DecodedColumnView& view) const { + if constexpr (T == TYPE_DECIMAL32 || T == TYPE_DECIMAL64 || T == TYPE_DECIMAL128I || + T == TYPE_DECIMAL256) { + if (view.value_kind == DecodedValueKind::INT32 || + view.value_kind == DecodedValueKind::INT64 || + view.value_kind == DecodedValueKind::BINARY || + view.value_kind == DecodedValueKind::FIXED_BINARY) { + return read_decimal_decoded_values(column, view); + } + } + return Status::NotSupported("Unsupported decoded values for {} from source kind {}", + get_name(), static_cast(view.value_kind)); +} + template Status DataTypeDecimalSerDe::write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& result, diff --git a/be/src/core/data_type_serde/data_type_decimal_serde.h b/be/src/core/data_type_serde/data_type_decimal_serde.h index 0185672e024718..089835a21be955 100644 --- a/be/src/core/data_type_serde/data_type_decimal_serde.h +++ b/be/src/core/data_type_serde/data_type_decimal_serde.h @@ -107,6 +107,8 @@ class DataTypeDecimalSerDe : public DataTypeSerDe { const cctz::time_zone& ctz) const override; Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, int64_t end, const cctz::time_zone& ctz) const 
override; + Status read_column_from_decoded_values(IColumn& column, + const DecodedColumnView& view) const override; Status write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& row_buffer, int64_t row_idx, bool col_const, const FormatOptions& options) const override; diff --git a/be/src/core/data_type_serde/data_type_nullable_serde.cpp b/be/src/core/data_type_serde/data_type_nullable_serde.cpp index a93f8d6126c7d5..6b15b29c63ad67 100644 --- a/be/src/core/data_type_serde/data_type_nullable_serde.cpp +++ b/be/src/core/data_type_serde/data_type_nullable_serde.cpp @@ -22,13 +22,14 @@ #include #include -#include +#include #include "core/assert_cast.h" #include "core/column/column.h" #include "core/column/column_const.h" #include "core/column/column_nullable.h" #include "core/column/column_vector.h" +#include "core/data_type_serde/decoded_column_view.h" #include "core/data_type_serde/data_type_serde.h" #include "core/data_type_serde/data_type_string_serde.h" #include "exprs/function/cast/cast_base.h" @@ -350,6 +351,23 @@ Status DataTypeNullableSerDe::read_column_from_arrow(IColumn& column, ctz); } +Status DataTypeNullableSerDe::read_column_from_decoded_values( + IColumn& column, const DecodedColumnView& view) const { + auto& nullable_column = assert_cast(column); + auto& null_map = nullable_column.get_null_map_data(); + const auto old_size = null_map.size(); + null_map.resize(null_map.size() + view.row_count); + if (view.null_map != nullptr) { + // TODO: skip if no null in map + auto* dst = null_map.data() + old_size; + memcpy(dst, view.null_map, view.row_count); + } + DecodedColumnView nested_view = view; + nested_view.null_map = nullptr; + return nested_serde->read_column_from_decoded_values(nullable_column.get_nested_column(), + nested_view); +} + bool DataTypeNullableSerDe::write_column_to_mysql_text(const IColumn& column, BufferWritable& bw, int64_t row_idx, const FormatOptions& options) const { diff --git 
a/be/src/core/data_type_serde/data_type_nullable_serde.h b/be/src/core/data_type_serde/data_type_nullable_serde.h index cfb4e1e3bca198..376f3692dc1814 100644 --- a/be/src/core/data_type_serde/data_type_nullable_serde.h +++ b/be/src/core/data_type_serde/data_type_nullable_serde.h @@ -86,6 +86,8 @@ class DataTypeNullableSerDe : public DataTypeSerDe { const cctz::time_zone& ctz) const override; Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, int64_t end, const cctz::time_zone& ctz) const override; + Status read_column_from_decoded_values(IColumn& column, + const DecodedColumnView& view) const override; Status write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& row_buffer, int64_t row_idx, bool col_const, const FormatOptions& options) const override; diff --git a/be/src/core/data_type_serde/data_type_number_serde.cpp b/be/src/core/data_type_serde/data_type_number_serde.cpp index 39e9c0726c498a..131e6d059417f7 100644 --- a/be/src/core/data_type_serde/data_type_number_serde.cpp +++ b/be/src/core/data_type_serde/data_type_number_serde.cpp @@ -26,6 +26,7 @@ #include "core/column/column_nullable.h" #include "core/data_type/define_primitive_type.h" #include "core/data_type/primitive_type.h" +#include "core/data_type_serde/decoded_column_view.h" #include "core/data_type_serde/data_type_serde.h" #include "core/packed_int128.h" #include "core/types.h" @@ -42,6 +43,29 @@ #include "util/to_string.h" namespace doris { +namespace { + +template +const NativeType* decoded_values_as(const DecodedColumnView& view) { + return reinterpret_cast(view.values); +} + +template +Status read_number_decoded_values(IColumn& column, const DecodedColumnView& view) { + if (view.values == nullptr && view.row_count > 0) { + return Status::Corruption("Decoded value buffer is null for {}", column.get_name()); + } + auto& data = assert_cast::ColumnType&>(column) + .get_data(); + const auto* values = decoded_values_as(view); + for 
(int64_t row = 0; row < view.row_count; ++row) { + using DorisCppType = typename PrimitiveTypeTraits::CppType; + data.push_back(static_cast(values[row])); + } + return Status::OK(); +} + +} // namespace // Type map的基本结构 template struct TypeMap { @@ -156,6 +180,34 @@ Status DataTypeNumberSerDe::write_column_to_arrow(const IColumn& column, cons return Status::OK(); } +template +Status DataTypeNumberSerDe::read_column_from_decoded_values( + IColumn& column, const DecodedColumnView& view) const { + if constexpr (T == TYPE_BOOLEAN) { + if (view.value_kind == DecodedValueKind::BOOL) { + return read_number_decoded_values(column, view); + } + } else if constexpr (T == TYPE_INT) { + if (view.value_kind == DecodedValueKind::INT32) { + return read_number_decoded_values(column, view); + } + } else if constexpr (T == TYPE_BIGINT) { + if (view.value_kind == DecodedValueKind::INT64) { + return read_number_decoded_values(column, view); + } + } else if constexpr (T == TYPE_FLOAT) { + if (view.value_kind == DecodedValueKind::FLOAT) { + return read_number_decoded_values(column, view); + } + } else if constexpr (T == TYPE_DOUBLE) { + if (view.value_kind == DecodedValueKind::DOUBLE) { + return read_number_decoded_values(column, view); + } + } + return Status::NotSupported("Unsupported decoded values for {} from source kind {}", + get_name(), static_cast(view.value_kind)); +} + template Status DataTypeNumberSerDe::deserialize_one_cell_from_json(IColumn& column, Slice& slice, const FormatOptions& options) const { diff --git a/be/src/core/data_type_serde/data_type_number_serde.h b/be/src/core/data_type_serde/data_type_number_serde.h index b57f9f9d21298d..0e0a3acfc1aed7 100644 --- a/be/src/core/data_type_serde/data_type_number_serde.h +++ b/be/src/core/data_type_serde/data_type_number_serde.h @@ -117,6 +117,9 @@ class DataTypeNumberSerDe : public DataTypeSerDe { Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, int64_t end, const 
cctz::time_zone& ctz) const override; + Status read_column_from_decoded_values(IColumn& column, + const DecodedColumnView& view) const override; + Status write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& row_buffer, int64_t row_idx, bool col_const, const FormatOptions& options) const override; diff --git a/be/src/core/data_type_serde/data_type_serde.cpp b/be/src/core/data_type_serde/data_type_serde.cpp index ac688ae6c307a3..b6a49524887087 100644 --- a/be/src/core/data_type_serde/data_type_serde.cpp +++ b/be/src/core/data_type_serde/data_type_serde.cpp @@ -34,6 +34,12 @@ namespace doris { DataTypeSerDe::~DataTypeSerDe() = default; +Status DataTypeSerDe::read_column_from_decoded_values(IColumn& column, + const DecodedColumnView& view) const { + return Status::NotSupported("read_column_from_decoded_values is not supported for {}", + get_name()); +} + DataTypeSerDeSPtrs create_data_type_serdes(const DataTypes& types) { DataTypeSerDeSPtrs serdes; serdes.reserve(types.size()); diff --git a/be/src/core/data_type_serde/data_type_serde.h b/be/src/core/data_type_serde/data_type_serde.h index 7c007c6558ddf3..07ad5d5d1b02d6 100644 --- a/be/src/core/data_type_serde/data_type_serde.h +++ b/be/src/core/data_type_serde/data_type_serde.h @@ -27,6 +27,7 @@ #include "common/cast_set.h" #include "common/status.h" #include "core/column/column_nullable.h" +#include "core/data_type_serde/decoded_column_view.h" #include "core/field.h" #include "core/string_buffer.hpp" #include "core/types.h" @@ -485,6 +486,12 @@ class DataTypeSerDe { int64_t start, int64_t end, const cctz::time_zone& ctz) const = 0; + // Read already decoded column values into a Doris column. The input view is format-neutral: + // file readers translate their decoder output into DecodedColumnView, while SerDe owns + // the Doris-type-specific materialization into IColumn. 
+ virtual Status read_column_from_decoded_values(IColumn& column, + const DecodedColumnView& view) const; + // ORC serializer virtual Status write_column_to_orc(const std::string& timezone, const IColumn& column, const NullMap* null_map, diff --git a/be/src/core/data_type_serde/data_type_string_serde.cpp b/be/src/core/data_type_serde/data_type_string_serde.cpp index b7a59b3c07e42a..478cdf3b5e6f1a 100644 --- a/be/src/core/data_type_serde/data_type_string_serde.cpp +++ b/be/src/core/data_type_serde/data_type_string_serde.cpp @@ -18,12 +18,29 @@ #include "core/data_type_serde/data_type_string_serde.h" #include "core/column/column_string.h" +#include "core/data_type_serde/decoded_column_view.h" #include "core/data_type/define_primitive_type.h" #include "util/jsonb_document_cast.h" #include "util/jsonb_utils.h" #include "util/jsonb_writer.h" namespace doris { +namespace { + +template +Status read_string_decoded_values(IColumn& column, const DecodedColumnView& view) { + if (view.binary_values == nullptr && view.row_count > 0) { + return Status::Corruption("Decoded binary values are null for {}", column.get_name()); + } + auto& string_column = assert_cast(column); + for (int64_t row = 0; row < view.row_count; ++row) { + const auto& value = (*view.binary_values)[row]; + string_column.insert_data(value.data, value.size); + } + return Status::OK(); +} + +} // namespace template Status DataTypeStringSerDeBase::serialize_column_to_json(const IColumn& column, @@ -313,6 +330,17 @@ Status DataTypeStringSerDeBase::read_column_from_arrow( return Status::OK(); } +template +Status DataTypeStringSerDeBase::read_column_from_decoded_values( + IColumn& column, const DecodedColumnView& view) const { + if (view.value_kind != DecodedValueKind::BINARY && + view.value_kind != DecodedValueKind::FIXED_BINARY) { + return Status::NotSupported("Unsupported decoded values for {} from source kind {}", + get_name(), static_cast(view.value_kind)); + } + return read_string_decoded_values(column, view); 
+} + template Status DataTypeStringSerDeBase::write_column_to_orc( const std::string& timezone, const IColumn& column, const NullMap* null_map, diff --git a/be/src/core/data_type_serde/data_type_string_serde.h b/be/src/core/data_type_serde/data_type_string_serde.h index 79c8450835d39c..81b80eab4a5cbf 100644 --- a/be/src/core/data_type_serde/data_type_string_serde.h +++ b/be/src/core/data_type_serde/data_type_string_serde.h @@ -203,6 +203,9 @@ class DataTypeStringSerDeBase : public DataTypeSerDe { Status read_column_from_arrow(IColumn& column, const arrow::Array* arrow_array, int64_t start, int64_t end, const cctz::time_zone& ctz) const override; + Status read_column_from_decoded_values(IColumn& column, + const DecodedColumnView& view) const override; + Status write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& result, int64_t row_idx, bool col_const, const FormatOptions& options) const override { diff --git a/be/src/core/data_type_serde/data_type_time_serde.cpp b/be/src/core/data_type_serde/data_type_time_serde.cpp index e57fd08a271339..65e1afa577d0ed 100644 --- a/be/src/core/data_type_serde/data_type_time_serde.cpp +++ b/be/src/core/data_type_serde/data_type_time_serde.cpp @@ -17,6 +17,7 @@ #include "core/data_type_serde/data_type_time_serde.h" +#include "core/data_type_serde/decoded_column_view.h" #include "core/data_type/data_type_decimal.h" #include "core/data_type/data_type_number.h" #include "core/data_type/primitive_type.h" @@ -25,6 +26,33 @@ #include "exprs/function/cast/cast_to_time_impl.hpp" namespace doris { +namespace { + +TimeValue::TimeType read_time_decoded_value(const DecodedColumnView& view, int64_t row) { + int64_t micros = 0; + if (view.value_kind == DecodedValueKind::INT32) { + const auto* values = reinterpret_cast(view.values); + micros = static_cast(values[row]) * 1000; + } else { + const auto* values = reinterpret_cast(view.values); + micros = values[row]; + if (view.time_unit == DecodedTimeUnit::MILLIS) { + micros *= 
1000; + } else if (view.time_unit == DecodedTimeUnit::NANOS) { + micros /= 1000; + } + } + const bool negative = micros < 0; + const int64_t abs_micros = std::abs(micros); + return TimeValue::make_time(abs_micros / TimeValue::ONE_HOUR_MICROSECONDS, + (abs_micros % TimeValue::ONE_HOUR_MICROSECONDS) / + TimeValue::ONE_MINUTE_MICROSECONDS, + (abs_micros % TimeValue::ONE_MINUTE_MICROSECONDS) / + TimeValue::ONE_SECOND_MICROSECONDS, + abs_micros % TimeValue::ONE_SECOND_MICROSECONDS, negative); +} + +} // namespace Status DataTypeTimeV2SerDe::write_column_to_mysql_binary(const IColumn& column, MysqlRowBinaryBuffer& result, @@ -145,6 +173,22 @@ Status DataTypeTimeV2SerDe::from_string_strict_mode(StringRef& str, IColumn& col return Status::OK(); } +Status DataTypeTimeV2SerDe::read_column_from_decoded_values( + IColumn& column, const DecodedColumnView& view) const { + if (view.value_kind != DecodedValueKind::INT32 && + view.value_kind != DecodedValueKind::INT64) { + return Status::NotSupported("TIMEV2 decoded reader expects INT32 or INT64 source"); + } + if (view.values == nullptr && view.row_count > 0) { + return Status::Corruption("Decoded value buffer is null for {}", column.get_name()); + } + auto& data = assert_cast(column).get_data(); + for (int64_t row = 0; row < view.row_count; ++row) { + data.push_back(read_time_decoded_value(view, row)); + } + return Status::OK(); +} + template Status DataTypeTimeV2SerDe::from_int_batch(const typename IntDataType::ColumnType& int_col, ColumnNullable& target_col) const { diff --git a/be/src/core/data_type_serde/data_type_time_serde.h b/be/src/core/data_type_serde/data_type_time_serde.h index db703616b497cf..e3fccf379c913a 100644 --- a/be/src/core/data_type_serde/data_type_time_serde.h +++ b/be/src/core/data_type_serde/data_type_time_serde.h @@ -67,6 +67,8 @@ class DataTypeTimeV2SerDe : public DataTypeNumberSerDe Status from_decimal_strict_mode_batch(const typename DecimalDataType::ColumnType& decimal_col, IColumn& target_col) const; 
+ Status read_column_from_decoded_values(IColumn& column, + const DecodedColumnView& view) const override; int get_scale() const override { return _scale; } protected: diff --git a/be/src/core/data_type_serde/decoded_column_view.h b/be/src/core/data_type_serde/decoded_column_view.h new file mode 100644 index 00000000000000..9b0b14b17c777d --- /dev/null +++ b/be/src/core/data_type_serde/decoded_column_view.h @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "common/status.h" +#include "core/string_ref.h" + +namespace doris { + +class IColumn; + +// 已解码 column batch 的物理值来源类型。 +// 该枚举只描述通用内存布局,不包含 Parquet/ORC/Arrow 等格式专有类型。 +enum class DecodedValueKind { + BOOL, + INT32, + INT64, + FLOAT, + DOUBLE, + BINARY, + FIXED_BINARY, +}; + +enum class DecodedTimeUnit { + UNKNOWN, + MILLIS, + MICROS, + NANOS, +}; + +struct DecodedColumnView { + DecodedValueKind value_kind = DecodedValueKind::INT32; + DecodedTimeUnit time_unit = DecodedTimeUnit::UNKNOWN; + int64_t row_count = 0; + int decimal_precision = -1; + int decimal_scale = -1; + int fixed_length = -1; + const uint8_t* values = nullptr; + const uint8_t* null_map = nullptr; + const std::vector* binary_values = nullptr; +}; + +} // namespace doris diff --git a/be/src/exprs/vslot_ref.h b/be/src/exprs/vslot_ref.h index ceb702728ebb75..6e7197f4cf6876 100644 --- a/be/src/exprs/vslot_ref.h +++ b/be/src/exprs/vslot_ref.h @@ -67,7 +67,7 @@ class VSlotRef : public VExpr { column_ids.insert(_column_id); } - MOCK_FUNCTION const std::string& column_name() const { return *_column_name; } + virtual const std::string& column_name() const { return *_column_name; } uint64_t get_digest(uint64_t seed) const override; diff --git a/be/src/format/new_parquet/column_reader.cpp b/be/src/format/new_parquet/column_reader.cpp new file mode 100644 index 00000000000000..bb101b5c5fc910 --- /dev/null +++ b/be/src/format/new_parquet/column_reader.cpp @@ -0,0 +1,562 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "format/new_parquet/column_reader.h"
+
+// NOTE(review): the targets of the bare #include lines below, and several template argument
+// lists further down (for example `std::vector>` or `static_cast(rows)`), appear to have been
+// lost when this text was extracted -- restore them from the original commit before building.
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "core/column/column.h"
+#include "core/column/column_struct.h"
+#include "core/data_type/data_type_nullable.h"
+#include "core/data_type_serde/decoded_column_view.h"
+#include "format/new_parquet/parquet_column_schema.h"
+
+namespace doris::parquet {
+namespace {
+
+// Reader for one primitive Parquet leaf column. read() pulls records from the RecordReader it
+// holds, describes them in a DecodedColumnView and hands that view to the serde of the column's
+// data type to fill the destination column.
+class ScalarColumnReader final : public ParquetColumnReader {
+public:
+    // Both the file-local column id and the leaf column id are initialised from
+    // parquet_leaf_column_id.
+    ScalarColumnReader(int parquet_leaf_column_id, const ::parquet::ColumnDescriptor* descriptor,
+                       ParquetTypeDescriptor type_descriptor, DataTypePtr type, std::string name,
+                       std::shared_ptr<::parquet::internal::RecordReader> record_reader)
+            : _file_column_id(parquet_leaf_column_id),
+              _parquet_leaf_column_id(parquet_leaf_column_id),
+              _descriptor(descriptor),
+              _type_descriptor(std::move(type_descriptor)),
+              _type(std::move(type)),
+              _name(std::move(name)),
+              _record_reader(std::move(record_reader)) {}
+
+    int file_column_id() const override { return _file_column_id; }
+    int parquet_leaf_column_id() const override { return _parquet_leaf_column_id; }
+    const DataTypePtr& type() const override { return _type; }
+    const std::string& name() const override { return _name; }
+
+    Status read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) override;
+    Status skip(int64_t rows) override;
+
+    // Accessors used by the free helper functions in this file.
+    const ::parquet::ColumnDescriptor* descriptor() const { return _descriptor; }
+    const std::shared_ptr<::parquet::internal::RecordReader>& record_reader() const {
+        return _record_reader;
+    }
+
+private:
+    int _file_column_id = -1;
+    int _parquet_leaf_column_id = -1;
+    const ::parquet::ColumnDescriptor* _descriptor = nullptr;
+    ParquetTypeDescriptor _type_descriptor;
+    DataTypePtr _type;
+    std::string _name;
+    std::shared_ptr<::parquet::internal::RecordReader> _record_reader;
+};
+
+// Reader for a STRUCT column. It owns one child reader per entry of `children` and forwards
+// read()/skip() to each of them. file_column_id() reports the schema's field_id, and
+// parquet_leaf_column_id() is always -1.
+class StructColumnReader final : public ParquetColumnReader {
+public:
+    StructColumnReader(const ParquetColumnSchema& schema,
+                       std::vector> children)
+            : _field_id(schema.field_id),
+              _type(schema.type),
+              _name(schema.name),
+              _children(std::move(children)) {}
+
+    int file_column_id() const override { return _field_id; }
+    int parquet_leaf_column_id() const override { return -1; }
+    const DataTypePtr& type() const override { return _type; }
+    const std::string& name() const override { return _name; }
+
+    Status read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) override;
+    Status skip(int64_t rows) override;
+
+private:
+    int _field_id = -1;
+    DataTypePtr _type;
+    std::string _name;
+    std::vector> _children;
+};
+
+// Resets the column's RecordReader, reserves room for batch_rows and reads up to batch_rows
+// records from it.
+// On success *record_reader receives the raw reader pointer (still owned by column_reader) and
+// *rows_read the number of records read.
+// A ParquetException is reported as Corruption, any other std::exception as InternalError, and
+// a record count outside [0, batch_rows] as Corruption.
+Status read_records(ScalarColumnReader& column_reader, int64_t batch_rows,
+                    ::parquet::internal::RecordReader** record_reader, int64_t* rows_read) {
+    auto reader = column_reader.record_reader();
+    if (reader == nullptr) {
+        return Status::InternalError("Parquet record reader is not initialized for column {}",
+                                     column_reader.name());
+    }
+
+    int64_t records_read = 0;
+    try {
+        reader->Reset();
+        reader->Reserve(batch_rows);
+        records_read = reader->ReadRecords(batch_rows);
+    } catch (const ::parquet::ParquetException& e) {
+        return Status::Corruption("Failed to read parquet records for column {}: {}",
+                                  column_reader.name(), e.what());
+    } catch (const std::exception& e) {
+        return Status::InternalError("Failed to read parquet records for column {}: {}",
+                                     column_reader.name(), e.what());
+    }
+    if (records_read < 0 || records_read > batch_rows) {
+        return Status::Corruption("Invalid parquet record read result for column {}: {}",
+                                  column_reader.name(), records_read);
+    }
+    *record_reader = reader.get();
+    *rows_read = records_read;
+    return Status::OK();
+}
+
+// A run of `length` consecutive rows beginning at row index `start`.
+struct RowRange {
+    int64_t start = 0;
+    int64_t length = 0;
+};
+
+// Collapses the first selected_rows indices of `selection` into maximal runs of consecutive
+// row indices. Returns an empty vector when selected_rows is 0.
+// The indices are expected to be strictly increasing; this is only enforced by a DCHECK.
+std::vector selection_to_ranges(const SelectionVector& selection,
+                                uint16_t selected_rows) {
+    std::vector ranges;
+    if (selected_rows == 0) {
+        return ranges;
+    }
+
+    int64_t range_start = selection.get_index(0);
+    int64_t previous = selection.get_index(0);
+    for (uint16_t selection_idx = 1; selection_idx < selected_rows; ++selection_idx) {
+        const int64_t current = selection.get_index(selection_idx);
+        DCHECK_GT(current, previous);
+        if (current == previous + 1) {
+            // Still inside the current run.
+            previous = current;
+            continue;
+        }
+        // Gap found: close the current run and start a new one at `current`.
+        ranges.push_back(RowRange {range_start, previous - range_start + 1});
+        range_start = current;
+        previous = current;
+    }
+    ranges.push_back(RowRange {range_start, previous - range_start + 1});
+    return ranges;
+}
+
+// Maps a ParquetTimeUnit to the DecodedTimeUnit of the same name; UNKNOWN and any other value
+// map to DecodedTimeUnit::UNKNOWN.
+DecodedTimeUnit decoded_time_unit(ParquetTimeUnit time_unit) {
+    switch (time_unit) {
+    case ParquetTimeUnit::MILLIS:
+        return DecodedTimeUnit::MILLIS;
+    case ParquetTimeUnit::MICROS:
+        return DecodedTimeUnit::MICROS;
+    case ParquetTimeUnit::NANOS:
+        return DecodedTimeUnit::NANOS;
+    case ParquetTimeUnit::UNKNOWN:
+    default:
+        return DecodedTimeUnit::UNKNOWN;
+    }
+}
+
+// Maps the Parquet physical type of a column to the DecodedValueKind used in DecodedColumnView.
+// Every physical type that is not listed explicitly falls through to BINARY.
+DecodedValueKind decoded_value_kind(const ParquetTypeDescriptor& type_descriptor) {
+    switch (type_descriptor.physical_type) {
+    case ::parquet::Type::BOOLEAN:
+        return DecodedValueKind::BOOL;
+    case ::parquet::Type::INT32:
+        return DecodedValueKind::INT32;
+    case ::parquet::Type::INT64:
+        return DecodedValueKind::INT64;
+    case ::parquet::Type::FLOAT:
+        return DecodedValueKind::FLOAT;
+    case ::parquet::Type::DOUBLE:
+        return DecodedValueKind::DOUBLE;
+    case ::parquet::Type::FIXED_LEN_BYTE_ARRAY:
+        return DecodedValueKind::FIXED_BINARY;
+    case ::parquet::Type::BYTE_ARRAY:
+    default:
+        return DecodedValueKind::BINARY;
+    }
+}
+
+// Builds the null map for the records just read: entry i is non-zero when the definition level
+// of record i differs from the column's max definition level.
+// null_map is left untouched when the column's max definition level is 0.
+// Returns NotSupported when the record reader reports read_dense_for_nullable(), and Corruption
+// when records were read but the reader exposes no definition levels.
+Status build_null_map(const ScalarColumnReader& column_reader,
+                      ::parquet::internal::RecordReader& record_reader, int64_t records_read,
+                      NullMap* null_map) {
+    if (column_reader.descriptor()->max_definition_level() == 0) {
+        return Status::OK();
+    }
+    if (record_reader.read_dense_for_nullable()) {
+        return Status::NotSupported(
+                "Dense nullable parquet record reader is not supported for column {}",
+                column_reader.name());
+    }
+    auto* def_levels = record_reader.def_levels();
+    if (def_levels == nullptr && records_read > 0) {
+        return Status::Corruption(
+                "Parquet record reader returned null definition levels for nullable column {}",
+                column_reader.name());
+    }
+    const int16_t max_definition_level = column_reader.descriptor()->max_definition_level();
+    null_map->resize(records_read);
+    auto* __restrict dst = null_map->data();
+    const auto* __restrict src = def_levels;
+    for (int64_t record_idx = 0; record_idx < records_read; ++record_idx) {
+        dst[record_idx] = src[record_idx] != max_definition_level;
+    }
+    return Status::OK();
+}
+
+// Fetches the builder chunks of a binary record reader into *chunks.
+// Returns InternalError when record_reader is not a BinaryRecordReader.
+Status get_binary_chunks(const ScalarColumnReader& column_reader,
+                         ::parquet::internal::RecordReader& record_reader,
+                         std::vector>* chunks) {
+    auto* binary_reader = dynamic_cast<::parquet::internal::BinaryRecordReader*>(&record_reader);
+    if (binary_reader == nullptr) {
+        return Status::InternalError("Parquet binary record reader is not available for column {}",
+                                     column_reader.name());
+    }
+    *chunks = binary_reader->GetBuilderChunks();
+    return Status::OK();
+}
+
+// Flattens the Arrow chunks into one (pointer, length) entry per row, appended to
+// *binary_values. A null row becomes (nullptr, 0).
+// BinaryArray and FixedSizeBinaryArray chunks are accepted; a null chunk is Corruption and any
+// other array type is InternalError. The final size of *binary_values must equal records_read,
+// otherwise Corruption is returned.
+Status build_binary_values(const ScalarColumnReader& column_reader,
+                           const std::vector>& chunks,
+                           int64_t records_read, std::vector* binary_values) {
+    binary_values->reserve(records_read);
+    for (const auto& chunk : chunks) {
+        if (chunk == nullptr) {
+            return Status::Corruption(
+                    "Parquet binary record reader returned null chunk for column {}",
+                    column_reader.name());
+        }
+        if (auto* binary_array = dynamic_cast<::arrow::BinaryArray*>(chunk.get())) {
+            for (int64_t row_idx = 0; row_idx < binary_array->length(); ++row_idx) {
+                if (binary_array->IsNull(row_idx)) {
+                    binary_values->emplace_back(static_cast(nullptr), 0);
+                    continue;
+                }
+                int32_t length = 0;
+                const uint8_t* value = binary_array->GetValue(row_idx, &length);
+                binary_values->emplace_back(reinterpret_cast(value), length);
+            }
+        } else if (auto* fixed_array = dynamic_cast<::arrow::FixedSizeBinaryArray*>(chunk.get())) {
+            for (int64_t row_idx = 0; row_idx < fixed_array->length(); ++row_idx) {
+                if (fixed_array->IsNull(row_idx)) {
+                    binary_values->emplace_back(static_cast(nullptr), 0);
+                    continue;
+                }
+                binary_values->emplace_back(
+                        reinterpret_cast(fixed_array->GetValue(row_idx)),
+                        fixed_array->byte_width());
+            }
+        } else {
+            return Status::InternalError("Unexpected Arrow binary array type for column {}",
+                                         column_reader.name());
+        }
+    }
+    if (binary_values->size() != static_cast(records_read)) {
+        return Status::Corruption(
+                "Invalid parquet binary record read result for column {}: rows={}, records={}",
+                column_reader.name(), binary_values->size(), records_read);
+    }
+    return Status::OK();
+}
+
+} // namespace
+
+// Reads up to `rows` records into `column` and stores the number read in *rows_read.
+// Returns InvalidArgument for a null column or rows_read, InternalError when no record reader
+// is set, and Corruption when values_written() disagrees with the number of records read.
+Status ScalarColumnReader::read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) {
+    if (column.get() == nullptr || rows_read == nullptr) {
+        return Status::InvalidArgument("Invalid parquet column read result pointer for column {}",
+                                       _name);
+    }
+    if (_record_reader == nullptr) {
+        return Status::InternalError("Parquet record reader is not initialized for column {}",
+                                     _name);
+    }
+    ::parquet::internal::RecordReader* record_reader = nullptr;
+    RETURN_IF_ERROR(read_records(*this, rows, &record_reader, rows_read));
+    if (record_reader->values_written() != *rows_read) {
+        return Status::Corruption(
+                "Invalid parquet record read result for column {}: values={}, records={}", _name,
+                record_reader->values_written(), *rows_read);
+    }
+
+    NullMap null_map;
+    RETURN_IF_ERROR(build_null_map(*this, *record_reader, *rows_read, &null_map));
+
+    // binary_values and binary_chunks are declared here so they stay alive while `view`
+    // points at them.
+    std::vector binary_values;
+    std::vector> binary_chunks;
+    DecodedColumnView view;
+    view.value_kind = decoded_value_kind(_type_descriptor);
+    view.time_unit = decoded_time_unit(_type_descriptor.time_unit);
+    view.row_count = *rows_read;
+    view.decimal_precision = _type_descriptor.decimal_precision;
+    view.decimal_scale = _type_descriptor.decimal_scale;
+    view.fixed_length = _type_descriptor.fixed_length;
+    // An empty null map (nothing was built for this column) is passed as nullptr.
+    view.null_map = null_map.empty() ? nullptr : null_map.data();
+    if (view.value_kind == DecodedValueKind::BINARY ||
+        view.value_kind == DecodedValueKind::FIXED_BINARY) {
+        RETURN_IF_ERROR(get_binary_chunks(*this, *record_reader, &binary_chunks));
+        RETURN_IF_ERROR(build_binary_values(*this, binary_chunks, *rows_read, &binary_values));
+        view.binary_values = &binary_values;
+    } else {
+        view.values = record_reader->values();
+    }
+
+    RETURN_IF_ERROR(_type->get_serde()->read_column_from_decoded_values(*column, view));
+    return Status::OK();
+}
+
+// Skips `rows` records; a non-positive count is a no-op.
+// SkipRecords is called repeatedly until `rows` records are skipped. A call that skips nothing
+// is reported as Corruption, as is a ParquetException; other std::exceptions become
+// InternalError.
+Status ScalarColumnReader::skip(int64_t rows) {
+    if (rows <= 0) {
+        return Status::OK();
+    }
+
+    if (_record_reader == nullptr) {
+        return Status::InternalError("Parquet record reader is not initialized for column {}",
+                                     _name);
+    }
+    int64_t skipped_rows = 0;
+    try {
+        _record_reader->Reset();
+        while (skipped_rows < rows) {
+            const int64_t skipped = _record_reader->SkipRecords(rows - skipped_rows);
+            if (skipped <= 0) {
+                return Status::Corruption(
+                        "Failed to skip parquet records for column {}: skipped {} of {} rows",
+                        _name, skipped_rows, rows);
+            }
+            skipped_rows += skipped;
+        }
+    } catch (const ::parquet::ParquetException& e) {
+        return Status::Corruption("Failed to skip parquet records for column {}: {}", _name,
+                                  e.what());
+    } catch (const std::exception& e) {
+        return Status::InternalError("Failed to skip parquet records for column {}: {}", _name,
+                                     e.what());
+    }
+    return Status::OK();
+}
+
+// Reads `rows` rows of every child into the matching sub-column of `column`.
+// With no children the column is simply resized to `rows` and *rows_read is set to `rows`.
+// Otherwise all children must report the same row count (Corruption if not), and that count
+// becomes *rows_read.
+Status StructColumnReader::read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) {
+    if (column.get() == nullptr || rows_read == nullptr) {
+        return Status::InvalidArgument("Invalid parquet struct read result pointer for column {}",
+                                       _name);
+    }
+    if (_children.empty()) {
+        column->resize(static_cast(rows));
+        *rows_read = rows;
+        return Status::OK();
+    }
+
+    // -1 means "no child has been read yet".
+    int64_t expected_rows = -1;
+    size_t child_idx = 0;
+    DCHECK_EQ(assert_cast(*column).get_columns().size(), _children.size());
+    for (auto& child_reader : _children) {
+        int64_t child_rows = 0;
+        auto child_column =
+                assert_cast(*column).get_column_ptr(child_idx)->assume_mutable();
+        RETURN_IF_ERROR(child_reader->read(rows, child_column, &child_rows));
+        if (expected_rows < 0) {
+            expected_rows = child_rows;
+        } else if (child_rows != expected_rows) {
+            return Status::Corruption(
+                    "Parquet struct children returned different row counts in column {}: {} vs {}",
+                    _name, expected_rows, child_rows);
+        }
+        child_idx++;
+    }
+
+    *rows_read = std::max(expected_rows, 0);
+    return Status::OK();
+}
+
+// Skips `rows` rows in every child reader; a non-positive count is a no-op.
+Status StructColumnReader::skip(int64_t rows) {
+    if (rows <= 0) {
+        return Status::OK();
+    }
+    for (auto& child_reader : _children) {
+        RETURN_IF_ERROR(child_reader->skip(rows));
+    }
+    return Status::OK();
+}
+
+// Default skip for readers that do not override it: always NotSupported.
+Status ParquetColumnReader::skip(int64_t rows) {
+    return Status::NotSupported("Parquet column skip is not implemented, rows={}", rows);
+}
+
+// Reads only the selected rows of a batch of batch_rows rows into `column`.
+// The selection is verified, turned into runs of consecutive rows, and each run is reached
+// with skip() and loaded with read(). Rows after the last run are skipped so that the reader
+// has advanced by batch_rows on success.
+// Returns InvalidArgument for a null column or a run outside the batch, and Corruption when a
+// read returns fewer rows than the run length.
+Status ParquetColumnReader::select(const SelectionVector& sel, uint16_t selected_rows,
+                                   int64_t batch_rows, MutableColumnPtr& column) {
+    if (column.get() == nullptr) {
+        return Status::InvalidArgument("Parquet selected read result is null for column {}",
+                                       name());
+    }
+    RETURN_IF_ERROR(sel.verify(selected_rows, batch_rows));
+
+    const auto ranges = selection_to_ranges(sel, selected_rows);
+    // Number of batch rows already consumed (read or skipped).
+    int64_t cursor = 0;
+    for (const auto& range : ranges) {
+        if (range.start < cursor || range.start + range.length > batch_rows) {
+            return Status::InvalidArgument("Invalid parquet selection range [{}, {}) for column {}",
+                                           range.start, range.start + range.length, name());
+        }
+        RETURN_IF_ERROR(skip(range.start - cursor));
+
+        int64_t range_rows_read = 0;
+        RETURN_IF_ERROR(read(range.length, column, &range_rows_read));
+        if (range_rows_read != range.length) {
+            return Status::Corruption(
+                    "Parquet selected read returned {} rows, expected {} rows for column {}",
+                    range_rows_read, range.length, name());
+        }
+        cursor = range.start + range.length;
+    }
+    RETURN_IF_ERROR(skip(batch_rows - cursor));
+    return Status::OK();
+}
+
+// Binds the factory to a row group.
+// NOTE(review): _record_readers looks like it is sized to num_leaf_columns (one cache slot per
+// leaf column), but the cast's target type is missing in this text -- confirm.
+ParquetColumnReaderFactory::ParquetColumnReaderFactory(
+        std::shared_ptr<::parquet::RowGroupReader> row_group, int num_leaf_columns)
+        : _row_group(std::move(row_group)),
+          _record_readers(static_cast(num_leaf_columns)) {}
+
+// Wraps an already created RecordReader in a ScalarColumnReader.
+// Returns InvalidArgument when reader, descriptor, type or record_reader is null.
+Status ParquetColumnReaderFactory::create_scalar_reader(
+        int parquet_leaf_column_id, const ParquetTypeDescriptor& type_descriptor,
+        const ::parquet::ColumnDescriptor* descriptor, DataTypePtr type, std::string name,
+        std::shared_ptr<::parquet::internal::RecordReader> record_reader,
+        std::unique_ptr* reader) const {
+    if (reader == nullptr) {
+        return Status::InvalidArgument("reader is null");
+    }
+    if (descriptor == nullptr || type == nullptr || record_reader == nullptr) {
+        return Status::InvalidArgument("Invalid parquet column reader arguments for column {}",
+                                       name);
+    }
+    *reader = std::make_unique(parquet_leaf_column_id, descriptor,
+                               type_descriptor, std::move(type),
+                               std::move(name), std::move(record_reader));
+    return Status::OK();
+}
+
+// Creates the reader for a PRIMITIVE schema node: validates the leaf column id, checks that
+// the type is supported by the record reader path, obtains the RecordReader for the leaf and
+// wraps it.
+Status ParquetColumnReaderFactory::create_scalar_column_reader(
+        const ParquetColumnSchema& column_schema,
+        std::unique_ptr* reader) const {
+    if (reader == nullptr) {
+        return Status::InvalidArgument("reader is null");
+    }
+    if (column_schema.leaf_column_id < 0 ||
+        column_schema.leaf_column_id >= static_cast(_record_readers.size())) {
+        return Status::InvalidArgument("Invalid parquet leaf column id {} for column {}",
+                                       column_schema.leaf_column_id, column_schema.name);
+    }
+    if (!supports_record_reader(column_schema.type_descriptor)) {
+        return Status::NotSupported(
+                "Current parquet reader only supports primitive columns without repetition; "
+                "column {} is not supported",
+                column_schema.name);
+    }
+    std::shared_ptr<::parquet::internal::RecordReader> record_reader;
+    RETURN_IF_ERROR(get_record_reader(column_schema.leaf_column_id, column_schema.descriptor,
+                                      column_schema.name, &record_reader));
+    return create_scalar_reader(column_schema.leaf_column_id, column_schema.type_descriptor,
+                                column_schema.descriptor, column_schema.type, column_schema.name,
+                                std::move(record_reader), reader);
+}
+
+// Returns the RecordReader for a leaf column, creating it on first use and caching it in
+// _record_readers so later calls for the same leaf get the same instance.
+// Columns with a non-zero max repetition level or a max definition level above 1 are rejected
+// with NotSupported. The reader is created with read_dictionary=false.
+Status ParquetColumnReaderFactory::get_record_reader(
+        int leaf_column_id, const ::parquet::ColumnDescriptor* descriptor, const std::string& name,
+        std::shared_ptr<::parquet::internal::RecordReader>* reader) const {
+    if (reader == nullptr) {
+        return Status::InvalidArgument("reader is null");
+    }
+    if (_row_group == nullptr) {
+        return Status::InternalError("Parquet row group reader is not initialized for column {}",
+                                     name);
+    }
+    if (leaf_column_id < 0 || leaf_column_id >= static_cast(_record_readers.size())) {
+        return Status::InvalidArgument("Invalid parquet leaf column id {} for column {}",
+                                       leaf_column_id, name);
+    }
+    if (descriptor == nullptr) {
+        return Status::InvalidArgument("Parquet column descriptor is null for column {}", name);
+    }
+    if (descriptor->max_repetition_level() != 0 || descriptor->max_definition_level() > 1) {
+        return Status::NotSupported(
+                "Current parquet reader only supports RecordReader-backed columns; column {} is "
+                "not supported",
+                name);
+    }
+    if (_record_readers[leaf_column_id] == nullptr) {
+        try {
+            _record_readers[leaf_column_id] =
+                    _row_group->RecordReader(leaf_column_id, /*read_dictionary=*/false);
+        } catch (const ::parquet::ParquetException& e) {
+            return Status::Corruption("Failed to create parquet record reader for column {}: {}",
+                                      name, e.what());
+        } catch (const std::exception& e) {
+            return Status::InternalError("Failed to create parquet record reader for column {}: {}",
+                                         name, e.what());
+        }
+    }
+    // Creation returned without throwing but still produced no reader.
+    if (_record_readers[leaf_column_id] == nullptr) {
+        return Status::Corruption("Failed to create parquet record reader for column {}", name);
+    }
+    *reader = _record_readers[leaf_column_id];
+    return Status::OK();
+}
+
+// Creates the reader for a STRUCT schema node by recursively creating one reader per child.
+// A nullable struct type is rejected with NotSupported.
+Status ParquetColumnReaderFactory::create_struct_column_reader(
+        const ParquetColumnSchema& column_schema,
+        std::unique_ptr* reader) const {
+    if (reader == nullptr) {
+        return Status::InvalidArgument("reader is null");
+    }
+    if (column_schema.type != nullptr && column_schema.type->is_nullable()) {
+        return Status::NotSupported(
+                "Nullable parquet STRUCT reader is not implemented for column {}",
+                column_schema.name);
+    }
+    std::vector> child_readers;
+    child_readers.reserve(column_schema.children.size());
+    for (const auto& child_schema : column_schema.children) {
+        std::unique_ptr child_reader;
+        RETURN_IF_ERROR(create(*child_schema, &child_reader));
+        child_readers.push_back(std::move(child_reader));
+    }
+    *reader = std::make_unique(column_schema, std::move(child_readers));
+    return Status::OK();
+}
+
+// Entry point: dispatches on the schema node kind. PRIMITIVE and STRUCT are implemented;
+// LIST, MAP and any other kind return NotSupported.
+Status ParquetColumnReaderFactory::create(const ParquetColumnSchema& column_schema,
+                                          std::unique_ptr* reader) const {
+    if (reader == nullptr) {
+        return Status::InvalidArgument("reader is null");
+    }
+    switch (column_schema.kind) {
+    case ParquetColumnSchemaKind::PRIMITIVE:
+        return create_scalar_column_reader(column_schema, reader);
+    case ParquetColumnSchemaKind::STRUCT:
+        return create_struct_column_reader(column_schema, reader);
+    case ParquetColumnSchemaKind::LIST:
+        return Status::NotSupported("Parquet LIST reader is not implemented for column {}",
+                                    column_schema.name);
+    case ParquetColumnSchemaKind::MAP:
+        return Status::NotSupported("Parquet MAP reader is not implemented for column {}",
+                                    column_schema.name);
+    }
+    return Status::NotSupported("Unsupported parquet column schema kind for column {}",
+                                column_schema.name);
+}
+
+} // namespace doris::parquet
diff --git
a/be/src/format/new_parquet/column_reader.h b/be/src/format/new_parquet/column_reader.h new file mode 100644 index 00000000000000..cd59fc3960a7db --- /dev/null +++ b/be/src/format/new_parquet/column_reader.h @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#pragma once
+
+// NOTE(review): the targets of the bare #include lines below, and the template arguments of
+// several declarations (for example `std::unique_ptr* reader`), appear to have been lost when
+// this text was extracted -- restore them from the original commit before building.
+#include
+#include
+#include
+#include
+
+#include "common/status.h"
+#include "core/data_type/data_type.h"
+#include "format/new_parquet/parquet_type.h"
+#include "format/new_parquet/selection_vector.h"
+
+namespace parquet {
+class ColumnDescriptor;
+class RowGroupReader;
+
+namespace internal {
+class RecordReader;
+} // namespace internal
+} // namespace parquet
+
+namespace doris {
+class IColumn;
+
+namespace parquet {
+struct ParquetColumnSchema;
+
+// Doris's Parquet column reader abstraction.
+// This class wraps the Arrow Parquet RecordReader and reads a file-local Parquet leaf column
+// into a Doris-owned column. It has no knowledge of the Iceberg/global schema and does not
+// handle table-level cast/default/generated/partition semantics.
+class ParquetColumnReader {
+public:
+    virtual ~ParquetColumnReader() = default;
+
+    // The file-local column id that FileReader exposes to the upper-level scan request.
+    // For a top-level primitive column it usually equals the Parquet leaf column id; for
+    // complex columns such as struct/list/map it is the logical field id in the file schema tree.
+    virtual int file_column_id() const = 0;
+
+    // The leaf column id inside the Parquet file, used to access Parquet physical column
+    // structures such as RowGroupReader::RecordReader, ColumnChunk metadata and
+    // statistics/page index. Only primitive leaf readers have a valid value; a complex column
+    // reader has no single leaf column and returns -1.
+    virtual int parquet_leaf_column_id() const = 0;
+
+    virtual const DataTypePtr& type() const = 0;
+    virtual const std::string& name() const = 0;
+
+    // Reads one batch of a file-local column.
+    virtual Status read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) = 0;
+
+    // Skips the given number of rows. This must be a row-level skip; it must not fall back to a
+    // value-level Skip.
+    virtual Status skip(int64_t rows);
+
+    // Reads the rows of the current batch that the selection asks for and skips the remaining
+    // rows of the batch at the end. This method may only advance the reader cursor through
+    // skip + read; it must not degrade into reading the whole batch and filtering afterwards.
+    virtual Status select(const SelectionVector& sel, uint16_t selected_rows, int64_t batch_rows,
+                          MutableColumnPtr& column);
+};
+
+// Parquet column reader factory.
+// The factory is bound to the current row group and creates Doris's own column readers from
+// the file-local schema tree. Creation and caching of the Arrow internal RecordReader must be
+// encapsulated here so that it does not leak into the main ParquetReader flow. Future reader
+// options, the Dremel assembler and lazy-materialization cache/skip strategies should all hang
+// off this factory context instead of further extending free-function parameters.
+class ParquetColumnReaderFactory {
+public:
+    ParquetColumnReaderFactory(std::shared_ptr<::parquet::RowGroupReader> row_group,
+                               int num_leaf_columns);
+
+    // Creates a column reader from the file-local schema tree. Children of complex types are
+    // created recursively here. This entry point only understands the Parquet file schema and
+    // does not handle the table/global schema.
+    Status create(const ParquetColumnSchema& column_schema,
+                  std::unique_ptr* reader) const;
+
+private:
+    // Creates the reader for a primitive schema node.
+    Status create_scalar_column_reader(const ParquetColumnSchema& column_schema,
+                                       std::unique_ptr* reader) const;
+
+    // Creates the reader for a struct schema node.
+    Status create_struct_column_reader(const ParquetColumnSchema& column_schema,
+                                       std::unique_ptr* reader) const;
+
+    // Outputs the RecordReader for the given leaf column through `reader`.
+    Status get_record_reader(int leaf_column_id, const ::parquet::ColumnDescriptor* descriptor,
+                             const std::string& name,
+                             std::shared_ptr<::parquet::internal::RecordReader>* reader) const;
+
+    // Builds a scalar reader around an already obtained RecordReader.
+    Status create_scalar_reader(int parquet_leaf_column_id,
+                                const ParquetTypeDescriptor& type_descriptor,
+                                const ::parquet::ColumnDescriptor* descriptor, DataTypePtr type,
+                                std::string name,
+                                std::shared_ptr<::parquet::internal::RecordReader> record_reader,
+                                std::unique_ptr* reader) const;
+
+    std::shared_ptr<::parquet::RowGroupReader> _row_group;
+    // Declared mutable, so the const member functions above are able to modify it.
+    mutable std::vector> _record_readers;
+};
+
+} // namespace parquet
+} // namespace doris
diff --git a/be/src/format/new_parquet/parquet_column_schema.cpp b/be/src/format/new_parquet/parquet_column_schema.cpp
new file mode 100644
index 00000000000000..131bf9f22c0e18
--- /dev/null
+++ b/be/src/format/new_parquet/parquet_column_schema.cpp
@@ -0,0 +1,158 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "format/new_parquet/parquet_column_schema.h"
+
+// NOTE(review): the targets of the bare #include lines below, and several template argument
+// lists further down (for example `std::make_unique()` or `std::make_shared(child->type)`),
+// appear to have been lost when this text was extracted -- restore them from the original
+// commit before building.
+#include
+
+#include
+#include
+#include
+
+#include "core/data_type/data_type_array.h"
+#include "core/data_type/data_type_map.h"
+#include "core/data_type/data_type_nullable.h"
+#include "core/data_type/data_type_struct.h"
+#include "format/new_parquet/parquet_type.h"
+
+namespace doris::parquet {
+namespace {
+
+// True when the node is annotated as a LIST, either through its converted type or through a
+// valid logical type.
+bool is_list_node(const ::parquet::schema::Node& node) {
+    const auto& logical_type = node.logical_type();
+    return node.converted_type() == ::parquet::ConvertedType::LIST ||
+           (logical_type != nullptr && logical_type->is_valid() && logical_type->is_list());
+}
+
+// True when the node is annotated as a MAP: converted type MAP or MAP_KEY_VALUE, or a valid
+// logical type that is a map.
+bool is_map_node(const ::parquet::schema::Node& node) {
+    const auto& logical_type = node.logical_type();
+    return node.converted_type() == ::parquet::ConvertedType::MAP ||
+           node.converted_type() == ::parquet::ConvertedType::MAP_KEY_VALUE ||
+           (logical_type != nullptr && logical_type->is_valid() && logical_type->is_map());
+}
+
+// Wraps `type` in a nullable type when the node is optional; otherwise returns it unchanged.
+DataTypePtr nullable_if_needed(DataTypePtr type, const ::parquet::schema::Node& node) {
+    return node.is_optional() ? make_nullable(type) : type;
+}
+
+// Recursively converts one Parquet schema node into a ParquetColumnSchema stored in *result.
+//  - primitive node: resolves the leaf column id, descriptor and Doris type; NotSupported when
+//    no Doris type could be resolved.
+//  - LIST node: must have exactly one child; the type is an array of that child's type.
+//  - MAP node: must have exactly one child, which itself must have exactly two children
+//    (used as key and value, in that order).
+//  - any other group node: treated as a STRUCT of all its fields.
+// Group types are wrapped with nullable_if_needed(). The primitive branch uses the resolved
+// type as is.
+Status build_node_schema(const ::parquet::SchemaDescriptor& schema,
+                         const ::parquet::schema::Node& node,
+                         std::unique_ptr* result) {
+    if (result == nullptr) {
+        return Status::InvalidArgument("result is null");
+    }
+    auto column_schema = std::make_unique();
+    column_schema->field_id = node.field_id();
+    column_schema->name = node.name();
+    column_schema->node = &node;
+
+    if (node.is_primitive()) {
+        const int leaf_column_id = schema.ColumnIndex(node);
+        if (leaf_column_id < 0) {
+            return Status::InvalidArgument("Cannot find leaf column id for parquet column {}",
+                                           node.name());
+        }
+        column_schema->kind = ParquetColumnSchemaKind::PRIMITIVE;
+        column_schema->leaf_column_id = leaf_column_id;
+        column_schema->descriptor = schema.Column(leaf_column_id);
+        column_schema->type_descriptor = resolve_parquet_type(column_schema->descriptor);
+        column_schema->type = column_schema->type_descriptor.doris_type;
+        if (column_schema->type == nullptr) {
+            return Status::NotSupported("Unsupported parquet column type for column {}",
+                                        node.name());
+        }
+        *result = std::move(column_schema);
+        return Status::OK();
+    }
+
+    // Not primitive, so the node is handled as a group from here on.
+    const auto& group = static_cast(node);
+    if (is_list_node(node)) {
+        column_schema->kind = ParquetColumnSchemaKind::LIST;
+        if (group.field_count() != 1) {
+            return Status::NotSupported("Unsupported parquet LIST encoding for column {}",
+                                        node.name());
+        }
+        std::unique_ptr child;
+        RETURN_IF_ERROR(build_node_schema(schema, *group.field(0), &child));
+        column_schema->type =
+                nullable_if_needed(std::make_shared(child->type), node);
+        column_schema->children.push_back(std::move(child));
+        *result = std::move(column_schema);
+        return Status::OK();
+    }
+
+    if (is_map_node(node)) {
+        column_schema->kind = ParquetColumnSchemaKind::MAP;
+        if (group.field_count() != 1) {
+            return Status::NotSupported("Unsupported parquet MAP encoding for column {}",
+                                        node.name());
+        }
+        std::unique_ptr key_value;
+        RETURN_IF_ERROR(build_node_schema(schema, *group.field(0), &key_value));
+        if (key_value->children.size() != 2) {
+            return Status::NotSupported("Unsupported parquet MAP key_value layout for column {}",
+                                        node.name());
+        }
+        auto key_type = key_value->children[0]->type;
+        auto value_type = key_value->children[1]->type;
+        column_schema->type =
+                nullable_if_needed(std::make_shared(key_type, value_type), node);
+        column_schema->children.push_back(std::move(key_value));
+        *result = std::move(column_schema);
+        return Status::OK();
+    }
+
+    column_schema->kind = ParquetColumnSchemaKind::STRUCT;
+    DataTypes child_types;
+    Strings child_names;
+    child_types.reserve(group.field_count());
+    child_names.reserve(group.field_count());
+    for (int child_idx = 0; child_idx < group.field_count(); ++child_idx) {
+        std::unique_ptr child;
+        RETURN_IF_ERROR(build_node_schema(schema, *group.field(child_idx), &child));
+        child_types.push_back(child->type);
+        child_names.push_back(child->name);
+        column_schema->children.push_back(std::move(child));
+    }
+    column_schema->type =
+            nullable_if_needed(std::make_shared(child_types, child_names), node);
+    *result = std::move(column_schema);
+    return Status::OK();
+}
+
+} // namespace
+
+// Builds one ParquetColumnSchema per top-level field of the schema's root group.
+// *fields is cleared first. Returns InvalidArgument when fields or the root group is null, and
+// forwards the first error reported while converting a field.
+Status build_parquet_column_schema(const ::parquet::SchemaDescriptor& schema,
+                                   std::vector>* fields) {
+    if (fields == nullptr) {
+        return Status::InvalidArgument("fields is null");
+    }
+    fields->clear();
+    const auto* root = schema.group_node();
+    if (root == nullptr) {
+        return Status::InvalidArgument("Parquet schema root is null");
+    }
+    fields->reserve(root->field_count());
+    for (int field_idx = 0; field_idx < root->field_count(); ++field_idx) {
+        std::unique_ptr field;
+        RETURN_IF_ERROR(build_node_schema(schema, *root->field(field_idx), &field));
+        fields->push_back(std::move(field));
+    }
+    return Status::OK();
+}
+
+} // namespace doris::parquet
diff --git a/be/src/format/new_parquet/parquet_column_schema.h
b/be/src/format/new_parquet/parquet_column_schema.h new file mode 100644 index 00000000000000..0d089a0f9cdcbf --- /dev/null +++ b/be/src/format/new_parquet/parquet_column_schema.h @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#pragma once
+
+// NOTE(review): the targets of the bare #include lines below, and the template arguments of
+// `std::vector>`, appear to have been lost when this text was extracted -- restore them from
+// the original commit before building.
+#include
+#include
+#include
+
+#include "common/status.h"
+#include "core/data_type/data_type.h"
+#include "format/new_parquet/parquet_type.h"
+
+namespace parquet {
+class ColumnDescriptor;
+class SchemaDescriptor;
+
+namespace schema {
+class Node;
+} // namespace schema
+} // namespace parquet
+
+namespace doris::parquet {
+
+// Kind of a node in the file-local schema tree.
+enum class ParquetColumnSchemaKind {
+    PRIMITIVE,
+    STRUCT,
+    LIST,
+    MAP,
+};
+
+// The file-local schema tree of the new Parquet reader.
+// It describes how Parquet logical fields map to leaf column ordinals and carries no
+// table/global schema semantics.
+struct ParquetColumnSchema {
+    int field_id = -1;
+    // Ordinal of the primitive leaf column in the Parquet schema.
+    // This id is used to access ColumnDescriptor, RowGroupReader::RecordReader, ColumnChunk
+    // metadata and statistics. A complex-type node has no single leaf column, so it is -1.
+    int leaf_column_id = -1;
+    std::string name;
+    DataTypePtr type;
+    ParquetTypeDescriptor type_descriptor;
+    ParquetColumnSchemaKind kind = ParquetColumnSchemaKind::PRIMITIVE;
+    const ::parquet::schema::Node* node = nullptr;
+    const ::parquet::ColumnDescriptor* descriptor = nullptr;
+    std::vector> children;
+};
+
+// Builds the file-local schema tree from the Arrow Parquet core schema.
+Status build_parquet_column_schema(const ::parquet::SchemaDescriptor& schema,
+                                   std::vector>* fields);
+
+} // namespace doris::parquet
diff --git a/be/src/format/new_parquet/parquet_reader.cpp b/be/src/format/new_parquet/parquet_reader.cpp
new file mode 100644
index 00000000000000..fc00484758ecf3
--- /dev/null
+++ b/be/src/format/new_parquet/parquet_reader.cpp
@@ -0,0 +1,601 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "format/new_parquet/parquet_reader.h"
+
+// NOTE(review): the targets of the bare #include lines below, and several template argument
+// lists further down (for example `arrow::Result Tell()` or `static_cast(position)`), appear to
+// have been lost when this text was extracted -- restore them from the original commit before
+// building.
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include "common/exception.h"
+#include "core/block/block.h"
+#include "core/data_type/data_type_nullable.h"
+#include "format/new_parquet/column_reader.h"
+#include "format/new_parquet/parquet_column_schema.h"
+#include "format/new_parquet/parquet_statistics.h"
+#include "format/new_parquet/selection_vector.h"
+#include "io/fs/file_reader.h"
+#include "storage/predicate/column_predicate.h"
+#include "util/slice.h"
+
+namespace doris::parquet {
+
+constexpr int64_t DEFAULT_PARQUET_READ_BATCH_SIZE = 4096;
+
+// Converts an arrow::Status into a Doris Status: OK stays OK, IOError maps to IOError, Invalid
+// maps to InvalidArgument, and everything else maps to InternalError. The message is
+// status.ToString().
+Status arrow_status_to_doris_status(const arrow::Status& status) {
+    if (status.ok()) {
+        return Status::OK();
+    }
+    if (status.IsIOError()) {
+        return Status::IOError(status.ToString());
+    }
+    if (status.IsInvalid()) {
+        return Status::InvalidArgument(status.ToString());
+    }
+    return Status::InternalError(status.ToString());
+}
+
+// Adapts a Doris io::FileReader to the arrow::io::RandomAccessFile interface.
+// The object keeps its own read position (_pos): Seek() only updates it, the position-less
+// Read() overloads start from it and advance it, and the ReadAt() overloads leave it untouched.
+// Close() only marks the adapter as closed.
+class DorisRandomAccessFile final : public arrow::io::RandomAccessFile {
+public:
+    DorisRandomAccessFile(io::FileReaderSPtr file_reader, io::IOContext* io_ctx)
+            : _file_reader(std::move(file_reader)), _io_ctx(io_ctx) {
+        set_mode(arrow::io::FileMode::READ);
+    }
+
+    arrow::Status Close() override {
+        _closed = true;
+        return arrow::Status::OK();
+    }
+
+    bool closed() const override { return _closed; }
+
+    arrow::Result Tell() const override { return _pos; }
+
+    // Rejects negative positions; no upper bound is checked here.
+    arrow::Status Seek(int64_t position) override {
+        if (position < 0) {
+            return arrow::Status::Invalid("negative seek position");
+        }
+        _pos = position;
+        return arrow::Status::OK();
+    }
+
+    arrow::Result GetSize() override {
+        if (!_file_reader) {
+            return arrow::Status::IOError("Doris file reader is not open");
+        }
+        return static_cast(_file_reader->size());
+    }
+
+    // Reads from the current position into `out` and advances the position by the number of
+    // bytes actually read.
+    arrow::Result Read(int64_t nbytes, void* out) override {
+        ARROW_ASSIGN_OR_RAISE(auto bytes_read, ReadAt(_pos, nbytes, out));
+        _pos += bytes_read;
+        return bytes_read;
+    }
+
+    // Buffer-returning variant: the buffer is shrunk to the number of bytes actually read.
+    arrow::Result> Read(int64_t nbytes) override {
+        ARROW_ASSIGN_OR_RAISE(auto buffer, arrow::AllocateResizableBuffer(nbytes));
+        ARROW_ASSIGN_OR_RAISE(auto bytes_read, Read(nbytes, buffer->mutable_data()));
+        ARROW_RETURN_NOT_OK(buffer->Resize(bytes_read, false));
+        buffer->ZeroPadding();
+        return buffer;
+    }
+
+    // Positional read through the Doris file reader. A failing Doris Status is returned as an
+    // Arrow IOError carrying the status text.
+    arrow::Result ReadAt(int64_t position, int64_t nbytes, void* out) override {
+        if (!_file_reader) {
+            return arrow::Status::IOError("Doris file reader is not open");
+        }
+        if (position < 0 || nbytes < 0) {
+            return arrow::Status::Invalid("negative read position or length");
+        }
+        size_t bytes_read = 0;
+        Status st = _file_reader->read_at(
+                static_cast(position),
+                Slice(static_cast(out), static_cast(nbytes)), &bytes_read,
+                _io_ctx);
+        if (!st.ok()) {
+            return arrow::Status::IOError(st.to_string_no_stack());
+        }
+        return static_cast(bytes_read);
+    }
+
+    // Buffer-returning positional read; the buffer is shrunk to the number of bytes read.
+    arrow::Result> ReadAt(int64_t position,
+                          int64_t nbytes) override {
+        ARROW_ASSIGN_OR_RAISE(auto buffer, arrow::AllocateResizableBuffer(nbytes));
+        ARROW_ASSIGN_OR_RAISE(auto bytes_read, ReadAt(position, nbytes, buffer->mutable_data()));
+        ARROW_RETURN_NOT_OK(buffer->Resize(bytes_read, false));
+        buffer->ZeroPadding();
+        return buffer;
+    }
+
+private:
+    io::FileReaderSPtr _file_reader;
+    io::IOContext* _io_ctx = nullptr;
+    int64_t _pos = 0;
+    bool _closed = false;
+};
+
+struct ParquetReaderScanState {
+    // The Doris file handle adapted to an Arrow RandomAccessFile. This object only performs
+    // random reads and carries no table/global schema semantics.
+    std::shared_ptr arrow_file;
+
+    // The Arrow Parquet core reader and footer metadata. ParquetReader relies only on the core
+    // API; it does not use the parquet::arrow reader and does not output Arrow Array/RecordBatch.
+    std::unique_ptr<::parquet::ParquetFileReader> file_reader;
+    std::shared_ptr<::parquet::FileMetaData> metadata;
+    const ::parquet::SchemaDescriptor* schema = nullptr;
+    std::vector> file_schema;
+
+    // The top-level file-local projection and row group list of the current scan.
+    // projected_fields decides the output block; the concrete leaf column readers are created
+    // on demand by ParquetColumnReaderFactory.
+    std::vector predicate_fields;
+    std::vector non_predicate_fields;
+    std::vector selected_row_groups;
+    size_t next_row_group_idx = 0;
+    std::shared_ptr<::parquet::RowGroupReader> current_row_group;
+    std::vector> current_predicate_columns;
+    std::vector> current_non_predicate_columns;
+    int64_t current_row_group_rows = 0;
+    int64_t current_row_group_rows_read = 0;
+};
+
+// Rewinds the scan to the first selected row group and drops all state of the row group that
+// is currently open. Always returns OK.
+Status ParquetReader::_reset_reader_position() {
+    _state->next_row_group_idx = 0;
+    _state->current_row_group.reset();
+    _state->current_predicate_columns.clear();
+    _state->current_non_predicate_columns.clear();
+    _state->current_row_group_rows = 0;
+    _state->current_row_group_rows_read = 0;
+    return Status::OK();
+}
+
+// Drops the state of the row group that is currently open; next_row_group_idx is left as is.
+void ParquetReader::_reset_current_row_group() {
+    _state->current_row_group.reset();
+    _state->current_predicate_columns.clear();
+    _state->current_non_predicate_columns.clear();
+    _state->current_row_group_rows = 0;
+    _state->current_row_group_rows_read = 0;
+}
+
+// Copies one schema tree node, and recursively its children, into a reader::SchemaField.
+// The field id is the leaf column id when that is non-negative, otherwise the node's field_id.
+void ParquetReader::_fill_schema_field(const ParquetColumnSchema& column_schema,
+                                       reader::SchemaField* field) const {
+    field->id = column_schema.leaf_column_id >= 0 ? column_schema.leaf_column_id
+                                                  : column_schema.field_id;
+    field->name = column_schema.name;
+    field->type = column_schema.type;
+    field->children.clear();
+    field->children.reserve(column_schema.children.size());
+    for (const auto& child : column_schema.children) {
+        reader::SchemaField child_field;
+        _fill_schema_field(*child, &child_field);
+        field->children.push_back(std::move(child_field));
+    }
+}
+
+// True when the filter holds at least one non-null predicate.
+bool ParquetReader::_has_structured_filter(const reader::FileLocalFilter& local_filter) {
+    for (const auto& predicate : local_filter.predicates) {
+        if (predicate != nullptr) {
+            return true;
+        }
+    }
+    return false;
+}
+
+Status ParquetReader::_read_filter_columns(int64_t batch_rows, Block* file_block,
+                                           SelectionVector* selection, uint16_t* selected_rows) {
+    selection->resize(static_cast(batch_rows));
+    for (size_t filter_idx = 0; filter_idx < _request->predicate_columns.size(); ++filter_idx) {
+        const int file_field_id = _request->predicate_columns[filter_idx];
+        auto& column_reader = _state->current_predicate_columns[filter_idx];
+        auto column = file_block->get_by_position(column_reader->file_column_id())
+                              .column->assume_mutable();
+        DCHECK_EQ(file_block->get_by_position(column_reader->file_column_id())
+                          .type->get_primitive_type(),
+                  column_reader->type()->get_primitive_type());
+        int64_t column_rows = 0;
+        RETURN_IF_ERROR(column_reader->read(batch_rows, column, &column_rows));
+        if (column_rows != batch_rows) {
+            return Status::Corruption("Parquet filter column {} returned {} rows, expected {} rows",
+                                      column_reader->name(), column_rows, batch_rows);
+        }
+
+        for (const auto& local_filter : _request->local_filters) {
+            if (local_filter.file_column_id != file_field_id ||
+                !_has_structured_filter(local_filter)) {
+                continue;
+            }
+            if (*selected_rows == 0) {
+                break;
+            }
+            for (const auto& predicate : local_filter.predicates) {
+                *selected_rows = predicate->evaluate(*column, selection->data(), *selected_rows);
+                if (*selected_rows == 0) {
+                    break;
+                }
+            }
+
break; + } + file_block->replace_by_position(file_field_id, std::move(column)); + if (*selected_rows == 0) { + break; + } + } + return Status::OK(); +} + +Status ParquetReader::_validate_supported_local_filters( + const std::vector& local_filters) { + for (const auto& local_filter : local_filters) { + if (local_filter.conjunct != nullptr) { + return Status::NotSupported( + "Parquet expression filter fallback is not implemented for field {}", + local_filter.file_column_id); + } + } + return Status::OK(); +} + +IColumn::Filter ParquetReader::_selection_to_filter(const SelectionVector& selection, + uint16_t selected_rows, int64_t batch_rows) { + IColumn::Filter filter(static_cast(batch_rows), 0); + for (uint16_t selection_idx = 0; selection_idx < selected_rows; ++selection_idx) { + filter[selection.get_index(selection_idx)] = 1; + } + return filter; +} + +Status ParquetReader::_open_next_row_group(bool* has_row_group) { + *has_row_group = false; + while (_state->next_row_group_idx < _state->selected_row_groups.size()) { + const int row_group_idx = _state->selected_row_groups[_state->next_row_group_idx++]; + try { + _state->current_row_group = _state->file_reader->RowGroup(row_group_idx); + } catch (const ::parquet::ParquetException& e) { + return Status::Corruption("Failed to open parquet row group {}: {}", row_group_idx, + e.what()); + } catch (const std::exception& e) { + return Status::InternalError("Failed to open parquet row group {}: {}", row_group_idx, + e.what()); + } + + auto row_group_metadata = _state->metadata->RowGroup(row_group_idx); + _state->current_row_group_rows = + row_group_metadata == nullptr ? 
0 : row_group_metadata->num_rows(); + if (_state->current_row_group_rows < 0) { + return Status::Corruption("Invalid negative row count in parquet row group {}", + row_group_idx); + } else if (_state->current_row_group_rows == 0) { + _reset_current_row_group(); + continue; + } + _state->current_row_group_rows_read = 0; + _state->current_predicate_columns.clear(); + _state->current_non_predicate_columns.clear(); + + ParquetColumnReaderFactory column_reader_factory(_state->current_row_group, + _state->schema->num_columns()); + for (const auto file_field_id : _request->predicate_columns) { + const auto& column_schema = _state->file_schema[file_field_id]; + std::unique_ptr column_reader; + RETURN_IF_ERROR(column_reader_factory.create(*column_schema, &column_reader)); + _state->current_predicate_columns.push_back(std::move(column_reader)); + } + for (const auto file_field_id : _request->non_predicate_columns) { + const auto& column_schema = _state->file_schema[file_field_id]; + std::unique_ptr column_reader; + RETURN_IF_ERROR(column_reader_factory.create(*column_schema, &column_reader)); + _state->current_non_predicate_columns.push_back(std::move(column_reader)); + } + *has_row_group = true; + break; + } + return Status::OK(); +} + +// `file_block` has a complete struct derived from the file's schema. +Status ParquetReader::_read_current_row_group_batch(int64_t batch_rows, Block* file_block, + size_t* rows) { + if (_state->current_predicate_columns.empty() && + _state->current_non_predicate_columns.empty()) { + *rows = static_cast(batch_rows); + return Status::OK(); + } + SelectionVector selection; + uint16_t selected_rows = batch_rows; + // 1. Read all predicate columns and evaluate selection vector. + RETURN_IF_ERROR(_read_filter_columns(batch_rows, file_block, &selection, &selected_rows)); + + // 2. Materialize all predicate columns after filtering. 
+ const bool need_filter_output = selected_rows != batch_rows; + if (need_filter_output) { + IColumn::Filter output_filter = _selection_to_filter(selection, selected_rows, batch_rows); + for (const auto file_field_id : _request->predicate_columns) { + RETURN_IF_CATCH_EXCEPTION(file_block->replace_by_position( + file_field_id, file_block->get_by_position(file_field_id) + .column->filter(output_filter, selected_rows))); + } + } + + // 3. Materialize all non-predicate columns with selection. + for (size_t output_idx = 0; output_idx < _state->current_non_predicate_columns.size(); + ++output_idx) { + auto& column_reader = _state->current_non_predicate_columns[output_idx]; + auto col = file_block->get_columns()[column_reader->file_column_id()]->assume_mutable(); + DCHECK_EQ(file_block->get_by_position(column_reader->file_column_id()) + .type->get_primitive_type(), + column_reader->type()->get_primitive_type()); + if (need_filter_output) { + [[maybe_unused]] auto old_size = col->size(); + RETURN_IF_ERROR(column_reader->select(selection, selected_rows, batch_rows, col)); + if (col->size() != old_size + selected_rows) { + return Status::Corruption( + "Parquet selected output column {} returned {} rows, expected {} rows", + column_reader->name(), col->size(), old_size + selected_rows); + } + } else { + int64_t column_rows = 0; + RETURN_IF_ERROR(column_reader->read(batch_rows, col, &column_rows)); + if (column_rows != batch_rows) { + return Status::Corruption( + "Parquet output column {} returned {} rows, expected {} rows", + column_reader->name(), column_rows, batch_rows); + } + } + } + + *rows = static_cast(selected_rows); + return Status::OK(); +} + +ParquetReader::ParquetReader(std::unique_ptr& system_properties, + std::unique_ptr& file_description, + std::shared_ptr io_ctx, RuntimeProfile* profile) + : FileReader(system_properties, file_description, io_ctx, profile) {} + +ParquetReader::~ParquetReader() = default; + +Status ParquetReader::init(RuntimeState* state) { + 
RETURN_IF_ERROR(reader::FileReader::init(state)); + _state = std::make_unique(); + _state->arrow_file = + std::make_shared(_tracing_file_reader, _io_ctx.get()); + + try { + _state->file_reader = ::parquet::ParquetFileReader::Open( + _state->arrow_file, ::parquet::default_reader_properties()); + _state->metadata = _state->file_reader->metadata(); + _state->schema = _state->metadata != nullptr ? _state->metadata->schema() : nullptr; + } catch (const ::parquet::ParquetException& e) { + return Status::Corruption("Failed to open parquet file: {}", e.what()); + } catch (const std::exception& e) { + return Status::InternalError("Failed to open parquet file: {}", e.what()); + } + + if (_state->metadata == nullptr || _state->schema == nullptr) { + return Status::Corruption("Failed to read parquet metadata"); + } + RETURN_IF_ERROR(build_parquet_column_schema(*_state->schema, &_state->file_schema)); + return Status::OK(); +} + +Status ParquetReader::get_schema(std::vector* file_schema) const { + if (file_schema == nullptr) { + return Status::InvalidArgument("file_schema is null"); + } + file_schema->clear(); + if (_state == nullptr || _state->schema == nullptr) { + return Status::Uninitialized("ParquetReader is not open"); + } + + file_schema->reserve(_state->file_schema.size()); + for (size_t column_idx = 0; column_idx < _state->file_schema.size(); ++column_idx) { + reader::SchemaField field; + _fill_schema_field(*_state->file_schema[column_idx], &field); + field.id = static_cast(column_idx); + file_schema->push_back(std::move(field)); + } + return Status::OK(); +} + +Status ParquetReader::open(std::unique_ptr& request) { + if (_state == nullptr || _state->metadata == nullptr || _state->schema == nullptr) { + return Status::Uninitialized("ParquetReader is not open"); + } + RETURN_IF_ERROR(reader::FileReader::open(request)); + + const int num_fields = static_cast(_state->file_schema.size()); + for (const auto& local_filter : _request->local_filters) { + if 
(local_filter.file_column_id < 0 || local_filter.file_column_id >= num_fields) { + return Status::InvalidArgument("Invalid parquet filter top-level field id {}", + local_filter.file_column_id); + } + } + RETURN_IF_ERROR(_validate_supported_local_filters(_request->local_filters)); + + RETURN_IF_ERROR(select_row_groups_by_statistics(*_state->metadata, _state->file_schema, + *_request, &_state->selected_row_groups)); + RETURN_IF_ERROR(_reset_reader_position()); + _eof = _state->selected_row_groups.empty(); + return Status::OK(); +} + +Status ParquetReader::get_block(Block* file_block, size_t* rows, bool* eof) { + if (_state == nullptr || _state->file_reader == nullptr || _state->schema == nullptr) { + return Status::Uninitialized("ParquetReader is not open"); + } + *rows = 0; + if (_eof) { + *eof = true; + return Status::OK(); + } + + while (true) { + if (_state->current_row_group == nullptr) { + bool has_row_group = false; + RETURN_IF_ERROR(_open_next_row_group(&has_row_group)); + if (!has_row_group) { + _eof = true; + *eof = true; + return Status::OK(); + } + } + + const int64_t remaining_rows = + _state->current_row_group_rows - _state->current_row_group_rows_read; + if (remaining_rows <= 0) { + _reset_current_row_group(); + continue; + } + + const int64_t batch_rows = + std::min(DEFAULT_PARQUET_READ_BATCH_SIZE, remaining_rows); + const int64_t physical_rows_read = batch_rows; + RETURN_IF_ERROR(_read_current_row_group_batch(batch_rows, file_block, rows)); + _state->current_row_group_rows_read += physical_rows_read; + if (_state->current_row_group_rows_read >= _state->current_row_group_rows) { + _reset_current_row_group(); + } + if (*rows == 0) { + continue; + } + *eof = false; + // TODO: Compute _request->reader_expression_map to filter file_block + return Status::OK(); + } +} + +Status ParquetReader::close() { + if (_state != nullptr) { + if (_state->file_reader != nullptr) { + try { + _state->file_reader->Close(); + } catch (const std::exception&) { + // close 
需要保持幂等;这里不覆盖此前 scan 路径上的真实错误。 + } + } + if (_state->arrow_file != nullptr) { + static_cast(arrow_status_to_doris_status(_state->arrow_file->Close())); + } + _state = std::make_unique(); + } + return FileReader::close(); +} + +void ParquetReader::_init_profile() { + if (_profile != nullptr) { + static const char* parquet_profile = "ParquetReader"; + ADD_TIMER_WITH_LEVEL(_profile, parquet_profile, 1); + + _parquet_profile.filtered_row_groups = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "RowGroupsFiltered", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.filtered_row_groups_by_min_max = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "RowGroupsFilteredByMinMax", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.filtered_row_groups_by_bloom_filter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "RowGroupsFilteredByBloomFilter", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.to_read_row_groups = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "RowGroupsReadNum", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.total_row_groups = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "RowGroupsTotalNum", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.filtered_group_rows = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "FilteredRowsByGroup", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.filtered_page_rows = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "FilteredRowsByPage", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.lazy_read_filtered_rows = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "FilteredRowsByLazyRead", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.filtered_bytes = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "FilteredBytes", TUnit::BYTES, parquet_profile, 1); + _parquet_profile.raw_rows_read = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "RawRowsRead", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.column_read_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "ColumnReadTime", parquet_profile, 1); + _parquet_profile.parse_meta_time = + 
ADD_CHILD_TIMER_WITH_LEVEL(_profile, "ParseMetaTime", parquet_profile, 1); + _parquet_profile.parse_footer_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "ParseFooterTime", parquet_profile, 1); + _parquet_profile.file_reader_create_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "FileReaderCreateTime", parquet_profile, 1); + _parquet_profile.open_file_num = + ADD_CHILD_COUNTER_WITH_LEVEL(_profile, "FileNum", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_index_read_calls = + ADD_COUNTER_WITH_LEVEL(_profile, "PageIndexReadCalls", TUnit::UNIT, 1); + _parquet_profile.page_index_filter_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "PageIndexFilterTime", parquet_profile, 1); + _parquet_profile.read_page_index_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "PageIndexReadTime", parquet_profile, 1); + _parquet_profile.parse_page_index_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "PageIndexParseTime", parquet_profile, 1); + _parquet_profile.row_group_filter_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "RowGroupFilterTime", parquet_profile, 1); + _parquet_profile.file_footer_read_calls = + ADD_COUNTER_WITH_LEVEL(_profile, "FileFooterReadCalls", TUnit::UNIT, 1); + _parquet_profile.file_footer_hit_cache = + ADD_COUNTER_WITH_LEVEL(_profile, "FileFooterHitCache", TUnit::UNIT, 1); + _parquet_profile.decompress_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "DecompressTime", parquet_profile, 1); + _parquet_profile.decompress_cnt = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "DecompressCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_read_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageReadCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_cache_write_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheWriteCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_cache_compressed_write_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheCompressedWriteCount", TUnit::UNIT, parquet_profile, 1); + 
_parquet_profile.page_cache_decompressed_write_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheDecompressedWriteCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_cache_hit_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheHitCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_cache_missing_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheMissingCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_cache_compressed_hit_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheCompressedHitCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.page_cache_decompressed_hit_counter = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "PageCacheDecompressedHitCount", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.decode_header_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "PageHeaderDecodeTime", parquet_profile, 1); + _parquet_profile.read_page_header_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "PageHeaderReadTime", parquet_profile, 1); + _parquet_profile.decode_value_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "DecodeValueTime", parquet_profile, 1); + _parquet_profile.decode_dict_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "DecodeDictTime", parquet_profile, 1); + _parquet_profile.decode_level_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "DecodeLevelTime", parquet_profile, 1); + _parquet_profile.decode_null_map_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "DecodeNullMapTime", parquet_profile, 1); + _parquet_profile.skip_page_header_num = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "SkipPageHeaderNum", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.parse_page_header_num = ADD_CHILD_COUNTER_WITH_LEVEL( + _profile, "ParsePageHeaderNum", TUnit::UNIT, parquet_profile, 1); + _parquet_profile.predicate_filter_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "PredicateFilterTime", parquet_profile, 1); + _parquet_profile.dict_filter_rewrite_time = + 
ADD_CHILD_TIMER_WITH_LEVEL(_profile, "DictFilterRewriteTime", parquet_profile, 1); + _parquet_profile.convert_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "ConvertTime", parquet_profile, 1); + _parquet_profile.bloom_filter_read_time = + ADD_CHILD_TIMER_WITH_LEVEL(_profile, "BloomFilterReadTime", parquet_profile, 1); + } +} + +} // namespace doris::parquet diff --git a/be/src/format/new_parquet/parquet_reader.h b/be/src/format/new_parquet/parquet_reader.h new file mode 100644 index 00000000000000..426960a4dfd042 --- /dev/null +++ b/be/src/format/new_parquet/parquet_reader.h @@ -0,0 +1,138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "common/status.h" +#include "format/reader/file_reader.h" +#include "parquet_column_schema.h" +#include "selection_vector.h" + +namespace doris { +namespace io { +struct IOContext; +} // namespace io +} // namespace doris + +namespace doris::parquet { + +struct ParquetReaderScanState; + +// ParquetReader 的 file-local scan 请求。 +// 当前没有新增 Parquet-only 字段,但保留独立类型,便于后续加入 row group/page index +// 等 Parquet 专属选项。 +struct ParquetScanRequest : public reader::FileScanRequest {}; + +// Parquet 文件物理读取层。 +// 该类只理解 Parquet file-local schema 和 ParquetScanRequest,不理解 Iceberg/global +// schema,不处理 table-level cast/default/generated/partition 语义。 +class ParquetReader : public reader::FileReader { +public: + ParquetReader(std::unique_ptr& system_properties, + std::unique_ptr& file_description, + std::shared_ptr io_ctx, RuntimeProfile* profile); + ~ParquetReader() override; + + // 打开 Parquet 文件并解析 footer metadata。 + // init 成功后可以调用 get_schema() 获取 Parquet file-local schema。 + Status init(RuntimeState* state) override; + + // 解析 Parquet footer 并返回 Parquet 文件自身的 schema。 + // 该方法只能在 init() 成功后调用,不要求 open() 已经执行。 + // 这里不做 Iceberg schema evolution,也不把字段转换成 table/global schema。 + Status get_schema(std::vector* file_schema) const override; + + Status open(std::unique_ptr& request) override; + // 读取下一批 Parquet file-local block。 + // 该方法只能在 open() 成功后调用。 + // 返回列必须保持 file-local 语义,不能在这里补 default/generated/partition 列。 + Status get_block(Block* file_block, size_t* rows, bool* eof) override; + + Status close() override; + +protected: + void _init_profile() override; + +private: + struct ParquetProfile { + RuntimeProfile::Counter* filtered_row_groups = nullptr; + RuntimeProfile::Counter* filtered_row_groups_by_min_max = nullptr; + RuntimeProfile::Counter* filtered_row_groups_by_bloom_filter = nullptr; + RuntimeProfile::Counter* to_read_row_groups = nullptr; + RuntimeProfile::Counter* total_row_groups = nullptr; + RuntimeProfile::Counter* 
filtered_group_rows = nullptr; + RuntimeProfile::Counter* filtered_page_rows = nullptr; + RuntimeProfile::Counter* lazy_read_filtered_rows = nullptr; + RuntimeProfile::Counter* filtered_bytes = nullptr; + RuntimeProfile::Counter* raw_rows_read = nullptr; + RuntimeProfile::Counter* column_read_time = nullptr; + RuntimeProfile::Counter* parse_meta_time = nullptr; + RuntimeProfile::Counter* parse_footer_time = nullptr; + RuntimeProfile::Counter* file_reader_create_time = nullptr; + RuntimeProfile::Counter* open_file_num = nullptr; + RuntimeProfile::Counter* row_group_filter_time = nullptr; + RuntimeProfile::Counter* page_index_read_calls = nullptr; + RuntimeProfile::Counter* page_index_filter_time = nullptr; + RuntimeProfile::Counter* read_page_index_time = nullptr; + RuntimeProfile::Counter* parse_page_index_time = nullptr; + RuntimeProfile::Counter* file_footer_read_calls = nullptr; + RuntimeProfile::Counter* file_footer_hit_cache = nullptr; + RuntimeProfile::Counter* decompress_time = nullptr; + RuntimeProfile::Counter* decompress_cnt = nullptr; + RuntimeProfile::Counter* page_read_counter = nullptr; + RuntimeProfile::Counter* page_cache_write_counter = nullptr; + RuntimeProfile::Counter* page_cache_compressed_write_counter = nullptr; + RuntimeProfile::Counter* page_cache_decompressed_write_counter = nullptr; + RuntimeProfile::Counter* page_cache_hit_counter = nullptr; + RuntimeProfile::Counter* page_cache_missing_counter = nullptr; + RuntimeProfile::Counter* page_cache_compressed_hit_counter = nullptr; + RuntimeProfile::Counter* page_cache_decompressed_hit_counter = nullptr; + RuntimeProfile::Counter* decode_header_time = nullptr; + RuntimeProfile::Counter* read_page_header_time = nullptr; + RuntimeProfile::Counter* decode_value_time = nullptr; + RuntimeProfile::Counter* decode_dict_time = nullptr; + RuntimeProfile::Counter* decode_level_time = nullptr; + RuntimeProfile::Counter* decode_null_map_time = nullptr; + RuntimeProfile::Counter* skip_page_header_num = 
nullptr; + RuntimeProfile::Counter* parse_page_header_num = nullptr; + RuntimeProfile::Counter* predicate_filter_time = nullptr; + RuntimeProfile::Counter* dict_filter_rewrite_time = nullptr; + RuntimeProfile::Counter* convert_time = nullptr; + RuntimeProfile::Counter* bloom_filter_read_time = nullptr; + }; + Status _reset_reader_position(); + void _reset_current_row_group(); + void _fill_schema_field(const ParquetColumnSchema& column_schema, + reader::SchemaField* field) const; + bool _has_structured_filter(const reader::FileLocalFilter& local_filter); + Status _read_filter_columns(int64_t batch_rows, Block* file_block, SelectionVector* selection, + uint16_t* selected_rows); + Status _validate_supported_local_filters( + const std::vector& local_filters); + IColumn::Filter _selection_to_filter(const SelectionVector& selection, uint16_t selected_rows, + int64_t batch_rows); + Status _open_next_row_group(bool* has_row_group); + Status _read_current_row_group_batch(int64_t batch_rows, Block* file_block, size_t* rows); + + std::unique_ptr _state; + ParquetProfile _parquet_profile; +}; + +} // namespace doris::parquet diff --git a/be/src/format/new_parquet/parquet_statistics.cpp b/be/src/format/new_parquet/parquet_statistics.cpp new file mode 100644 index 00000000000000..aebc6d4e04d9fb --- /dev/null +++ b/be/src/format/new_parquet/parquet_statistics.cpp @@ -0,0 +1,254 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/new_parquet/parquet_statistics.h" + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "core/data_type/data_type.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/primitive_type.h" +#include "core/field.h" +#include "format/new_parquet/parquet_column_schema.h" +#include "storage/index/zone_map/zone_map_index.h" +#include "storage/predicate/column_predicate.h" + +namespace doris::parquet { +namespace { + +PrimitiveType physical_filter_type(const ParquetColumnSchema& column_schema) { + if (column_schema.type == nullptr) { + return INVALID_TYPE; + } + switch (remove_nullable(column_schema.type)->get_primitive_type()) { + case TYPE_BOOLEAN: + case TYPE_INT: + case TYPE_BIGINT: + case TYPE_FLOAT: + case TYPE_DOUBLE: + case TYPE_STRING: + return remove_nullable(column_schema.type)->get_primitive_type(); + default: + return INVALID_TYPE; + } +} + +template +bool set_typed_min_max(const std::shared_ptr<::parquet::Statistics>& statistics, ConvertFn convert, + ParquetColumnStatistics* column_statistics) { + auto typed_statistics = + std::static_pointer_cast<::parquet::TypedStatistics>(statistics); + column_statistics->min_value = Field::create_field(convert(typed_statistics->min())); + column_statistics->max_value = Field::create_field(convert(typed_statistics->max())); + return true; +} + +bool set_string_min_max(const std::shared_ptr<::parquet::Statistics>& statistics, + const ::parquet::ColumnDescriptor* descriptor, + ParquetColumnStatistics* column_statistics) { + 
switch (statistics->physical_type()) { + case ::parquet::Type::BYTE_ARRAY: { + auto typed_statistics = + std::static_pointer_cast<::parquet::TypedStatistics<::parquet::ByteArrayType>>( + statistics); + column_statistics->min_value = Field::create_field( + ::parquet::ByteArrayToString(typed_statistics->min())); + column_statistics->max_value = Field::create_field( + ::parquet::ByteArrayToString(typed_statistics->max())); + return true; + } + case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: { + if (descriptor == nullptr || descriptor->type_length() <= 0) { + return false; + } + auto typed_statistics = + std::static_pointer_cast<::parquet::TypedStatistics<::parquet::FLBAType>>( + statistics); + const int type_length = descriptor->type_length(); + column_statistics->min_value = Field::create_field(std::string( + reinterpret_cast(typed_statistics->min().ptr), type_length)); + column_statistics->max_value = Field::create_field(std::string( + reinterpret_cast(typed_statistics->max().ptr), type_length)); + return true; + } + default: + return false; + } +} + +bool is_null_only_predicate(const ColumnPredicate& predicate) { + return predicate.type() == PredicateType::IS_NULL || + predicate.type() == PredicateType::IS_NOT_NULL; +} + +segment_v2::ZoneMap to_column_predicate_statistics(const ParquetColumnStatistics& statistics) { + segment_v2::ZoneMap predicate_statistics; + predicate_statistics.min_value = statistics.min_value; + predicate_statistics.max_value = statistics.max_value; + predicate_statistics.has_null = statistics.has_null; + predicate_statistics.has_not_null = statistics.has_not_null; + return predicate_statistics; +} + +} // namespace + +ParquetColumnStatistics ParquetStatisticsUtils::TransformColumnStatistics( + const ParquetColumnSchema& column_schema, + const std::shared_ptr<::parquet::Statistics>& statistics) { + ParquetColumnStatistics result; + if (statistics == nullptr) { + return result; + } + + result.has_null = statistics->HasNullCount() && 
statistics->null_count() > 0; + result.has_not_null = statistics->num_values() > 0 || statistics->HasMinMax(); + result.has_null_count = statistics->HasNullCount(); + if (!result.has_not_null || !statistics->HasMinMax()) { + return result; + } + + switch (statistics->physical_type()) { + case ::parquet::Type::BOOLEAN: + result.has_min_max = set_typed_min_max<::parquet::BooleanType, TYPE_BOOLEAN>( + statistics, [](bool value) { return static_cast(value); }, &result); + return result; + case ::parquet::Type::INT32: + result.has_min_max = set_typed_min_max<::parquet::Int32Type, TYPE_INT>( + statistics, [](int32_t value) { return value; }, &result); + return result; + case ::parquet::Type::INT64: + result.has_min_max = set_typed_min_max<::parquet::Int64Type, TYPE_BIGINT>( + statistics, [](int64_t value) { return value; }, &result); + return result; + case ::parquet::Type::FLOAT: + result.has_min_max = set_typed_min_max<::parquet::FloatType, TYPE_FLOAT>( + statistics, [](float value) { return value; }, &result); + return result; + case ::parquet::Type::DOUBLE: + result.has_min_max = set_typed_min_max<::parquet::DoubleType, TYPE_DOUBLE>( + statistics, [](double value) { return value; }, &result); + return result; + case ::parquet::Type::BYTE_ARRAY: + case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: + result.has_min_max = set_string_min_max(statistics, column_schema.descriptor, &result); + return result; + default: + return result; + } +} + +bool ParquetStatisticsUtils::CheckStatistics(const reader::FileLocalFilter& local_filter, + const ParquetColumnStatistics& statistics) { + if (!statistics.has_any_statistics()) { + return false; + } + + // TODO: replace local_filter.predicates by local_filter.conjuncts + for (const auto& column_predicate : local_filter.predicates) { + if (is_null_only_predicate(*column_predicate)) { + if (!statistics.has_null_count) { + continue; + } + } else if (!statistics.has_any_statistics()) { + continue; + } + if 
(!column_predicate->evaluate_and(to_column_predicate_statistics(statistics))) { + return true; + } + } + return false; +} + +bool ParquetStatisticsUtils::RowGroupExcludes( + const ::parquet::RowGroupMetaData& row_group, + const std::vector>& schema, + const reader::FileLocalFilter& local_filter) { + DCHECK(local_filter.file_column_id >= 0 && + local_filter.file_column_id < row_group.num_columns()); + DCHECK_LT(local_filter.file_column_id, schema.size()); + auto column_chunk = row_group.ColumnChunk(local_filter.file_column_id); + if (column_chunk == nullptr) { + return false; + } + return CheckStatistics(local_filter, + TransformColumnStatistics(*schema[local_filter.file_column_id], + column_chunk->statistics())); +} + +Status ParquetStatisticsUtils::SelectRowGroups( + const ::parquet::FileMetaData& metadata, + const std::vector>& file_schema, + const reader::FileScanRequest& request, std::vector* selected_row_groups) { + if (selected_row_groups == nullptr) { + return Status::InvalidArgument("selected_row_groups is null"); + } + selected_row_groups->clear(); + + const int num_row_groups = metadata.num_row_groups(); + selected_row_groups->reserve(num_row_groups); + for (int row_group_idx = 0; row_group_idx < num_row_groups; ++row_group_idx) { + auto row_group = metadata.RowGroup(row_group_idx); + if (row_group == nullptr) { + selected_row_groups->push_back(row_group_idx); + continue; + } + bool drop = false; + for (const auto& local_filter : request.local_filters) { + if (RowGroupExcludes(*row_group, file_schema, local_filter)) { + drop = true; + break; + } + } + if (drop) { + continue; + } + selected_row_groups->push_back(row_group_idx); + } + return Status::OK(); +} + +bool ParquetStatisticsUtils::BloomFilterSupported(const ParquetColumnSchema& column_schema) { + switch (physical_filter_type(column_schema)) { + case TYPE_BOOLEAN: + case TYPE_INT: + case TYPE_BIGINT: + case TYPE_FLOAT: + case TYPE_DOUBLE: + case TYPE_STRING: + return true; + default: + return false; 
+ } +} + +Status select_row_groups_by_statistics( + const ::parquet::FileMetaData& metadata, + const std::vector>& file_schema, + const reader::FileScanRequest& request, std::vector* selected_row_groups) { + return ParquetStatisticsUtils::SelectRowGroups(metadata, file_schema, request, + selected_row_groups); +} + +} // namespace doris::parquet diff --git a/be/src/format/new_parquet/parquet_statistics.h b/be/src/format/new_parquet/parquet_statistics.h new file mode 100644 index 00000000000000..0def08d4b084df --- /dev/null +++ b/be/src/format/new_parquet/parquet_statistics.h @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
// Doris-side view of the statistics of one Parquet row-group column chunk.
// DuckDB converts Parquet stats into BaseStatistics and lets the TableFilter
// decide; the new Doris reader first stores the file-local min/max/null
// information here and then hands it to ColumnPredicate for the decision.
struct ParquetColumnStatistics {
    // Lower / upper bound of the column chunk. Only meaningful when has_min_max
    // is true; otherwise they stay default-constructed.
    Field min_value;
    Field max_value;
    // True when a null count is present and greater than zero.
    bool has_null = false;
    // True when the chunk reports at least one value or carries min/max.
    bool has_not_null = false;
    // True when the Parquet statistics carry a null count at all.
    bool has_null_count = false;
    // True when min_value / max_value were successfully converted.
    bool has_min_max = false;

    // True when at least one kind of statistic (null count or min/max) is usable.
    bool has_any_statistics() const { return has_null_count || has_min_max; }
};
+ static bool CheckStatistics(const reader::FileLocalFilter& local_filter, + const ParquetColumnStatistics& statistics); + + static bool RowGroupExcludes(const ::parquet::RowGroupMetaData& row_group, + const std::vector>& schema, + const reader::FileLocalFilter& local_filter); + + static Status SelectRowGroups( + const ::parquet::FileMetaData& metadata, + const std::vector>& file_schema, + const reader::FileScanRequest& request, std::vector* selected_row_groups); + + static bool BloomFilterSupported(const ParquetColumnSchema& column_schema); +}; + +// Parquet file-local statistics/page index/bloom filter 裁剪入口。 +// 这里只消费已经 localize 到 file schema 的 FileScanRequest,不理解 table/global schema。 +// 后续 page index、dictionary、bloom filter 等文件格式优化也应继续收敛在这一层,避免污染 +// ParquetReader 的 scan 调度代码。 +Status select_row_groups_by_statistics( + const ::parquet::FileMetaData& metadata, + const std::vector>& file_schema, + const reader::FileScanRequest& request, std::vector* selected_row_groups); + +} // namespace doris::parquet diff --git a/be/src/format/new_parquet/parquet_type.cpp b/be/src/format/new_parquet/parquet_type.cpp new file mode 100644 index 00000000000000..53c7b4f2ed93ce --- /dev/null +++ b/be/src/format/new_parquet/parquet_type.cpp @@ -0,0 +1,349 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/new_parquet/parquet_type.h" + +#include + +#include +#include + +#include "core/data_type/data_type_factory.hpp" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_number.h" +#include "core/data_type/data_type_string.h" +#include "core/data_type/primitive_type.h" + +namespace doris::parquet { +namespace { + +DataTypePtr create_type(PrimitiveType type, bool nullable, int precision = 0, int scale = 0) { + return DataTypeFactory::instance().create_data_type(type, nullable, precision, scale); +} + +PrimitiveType decimal_primitive_type(int precision) { + return precision > 38 ? TYPE_DECIMAL256 : TYPE_DECIMAL128I; +} + +bool has_non_physical_annotation(const ::parquet::ColumnDescriptor* column) { + if (column == nullptr) { + return false; + } + const auto& logical_type = column->logical_type(); + return column->converted_type() != ::parquet::ConvertedType::NONE || + (logical_type != nullptr && logical_type->is_valid() && !logical_type->is_none()); +} + +void mark_decimal(const ::parquet::ColumnDescriptor* column, int precision, int scale, + ParquetTypeDescriptor* result) { + result->is_decimal = true; + result->decimal_precision = precision; + result->decimal_scale = scale; + switch (column->physical_type()) { + case ::parquet::Type::INT32: + result->extra_type_info = ParquetExtraTypeInfo::DECIMAL_INT32; + break; + case ::parquet::Type::INT64: + result->extra_type_info = ParquetExtraTypeInfo::DECIMAL_INT64; + break; + case ::parquet::Type::BYTE_ARRAY: + case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: + result->extra_type_info = ParquetExtraTypeInfo::DECIMAL_BYTE_ARRAY; + break; + default: + result->extra_type_info = ParquetExtraTypeInfo::NONE; + break; + } +} + +DataTypePtr converted_type_to_doris_type(const ::parquet::ColumnDescriptor* column, + ParquetTypeDescriptor* result) { + const bool nullable = 
column->max_definition_level() > 0; + switch (column->converted_type()) { + case ::parquet::ConvertedType::UTF8: + case ::parquet::ConvertedType::ENUM: + case ::parquet::ConvertedType::JSON: + case ::parquet::ConvertedType::BSON: + return create_type(TYPE_STRING, nullable); + case ::parquet::ConvertedType::DECIMAL: + mark_decimal(column, column->type_precision(), column->type_scale(), result); + return create_type(decimal_primitive_type(column->type_precision()), nullable, + column->type_precision(), column->type_scale()); + case ::parquet::ConvertedType::DATE: + return create_type(TYPE_DATEV2, nullable); + case ::parquet::ConvertedType::TIME_MILLIS: + result->time_unit = ParquetTimeUnit::MILLIS; + result->extra_type_info = ParquetExtraTypeInfo::UNIT_MS; + return create_type(TYPE_TIMEV2, nullable, 0, 3); + case ::parquet::ConvertedType::TIME_MICROS: + result->time_unit = ParquetTimeUnit::MICROS; + result->extra_type_info = ParquetExtraTypeInfo::UNIT_MICROS; + return create_type(TYPE_TIMEV2, nullable, 0, 6); + case ::parquet::ConvertedType::TIMESTAMP_MILLIS: + result->is_timestamp = true; + result->time_unit = ParquetTimeUnit::MILLIS; + result->extra_type_info = ParquetExtraTypeInfo::UNIT_MS; + return create_type(TYPE_DATETIMEV2, nullable, 0, 3); + case ::parquet::ConvertedType::TIMESTAMP_MICROS: + result->is_timestamp = true; + result->time_unit = ParquetTimeUnit::MICROS; + result->extra_type_info = ParquetExtraTypeInfo::UNIT_MICROS; + return create_type(TYPE_DATETIMEV2, nullable, 0, 6); + case ::parquet::ConvertedType::INT_8: + return create_type(TYPE_TINYINT, nullable); + case ::parquet::ConvertedType::UINT_8: + case ::parquet::ConvertedType::INT_16: + return create_type(TYPE_SMALLINT, nullable); + case ::parquet::ConvertedType::UINT_16: + case ::parquet::ConvertedType::INT_32: + return create_type(TYPE_INT, nullable); + case ::parquet::ConvertedType::UINT_32: + case ::parquet::ConvertedType::INT_64: + return create_type(TYPE_BIGINT, nullable); + case 
::parquet::ConvertedType::UINT_64: + return create_type(TYPE_LARGEINT, nullable); + case ::parquet::ConvertedType::NONE: + default: + return nullptr; + } +} + +DataTypePtr logical_type_to_doris_type(const ::parquet::ColumnDescriptor* column, + ParquetTypeDescriptor* result) { + const auto& logical_type = column->logical_type(); + if (logical_type == nullptr || !logical_type->is_valid() || logical_type->is_none()) { + return nullptr; + } + const bool nullable = column->max_definition_level() > 0; + if (logical_type->is_string() || logical_type->is_enum() || logical_type->is_JSON() || + logical_type->is_BSON() || logical_type->is_UUID()) { + return create_type(TYPE_STRING, nullable); + } + if (logical_type->is_decimal()) { + const auto& decimal_type = static_cast(*logical_type); + mark_decimal(column, decimal_type.precision(), decimal_type.scale(), result); + return create_type(decimal_primitive_type(decimal_type.precision()), nullable, + decimal_type.precision(), decimal_type.scale()); + } + if (logical_type->is_date()) { + return create_type(TYPE_DATEV2, nullable); + } + if (logical_type->is_time()) { + const auto& time_type = static_cast(*logical_type); + int scale = 0; + if (time_type.time_unit() == ::parquet::LogicalType::TimeUnit::MILLIS) { + scale = 3; + result->time_unit = ParquetTimeUnit::MILLIS; + result->extra_type_info = ParquetExtraTypeInfo::UNIT_MS; + } else if (time_type.time_unit() == ::parquet::LogicalType::TimeUnit::MICROS) { + scale = 6; + result->time_unit = ParquetTimeUnit::MICROS; + result->extra_type_info = ParquetExtraTypeInfo::UNIT_MICROS; + } else { + return nullptr; + } + return create_type(TYPE_TIMEV2, nullable, 0, scale); + } + if (logical_type->is_timestamp()) { + const auto& timestamp_type = + static_cast(*logical_type); + int scale = 0; + if (timestamp_type.time_unit() == ::parquet::LogicalType::TimeUnit::MILLIS) { + scale = 3; + result->time_unit = ParquetTimeUnit::MILLIS; + result->extra_type_info = ParquetExtraTypeInfo::UNIT_MS; + } 
else if (timestamp_type.time_unit() == ::parquet::LogicalType::TimeUnit::MICROS) { + scale = 6; + result->time_unit = ParquetTimeUnit::MICROS; + result->extra_type_info = ParquetExtraTypeInfo::UNIT_MICROS; + } else { + return nullptr; + } + result->is_timestamp = true; + return create_type(TYPE_DATETIMEV2, nullable, 0, scale); + } + if (logical_type->is_int()) { + const auto& int_type = static_cast(*logical_type); + switch (int_type.bit_width()) { + case 8: + return create_type(int_type.is_signed() ? TYPE_TINYINT : TYPE_SMALLINT, nullable); + case 16: + return create_type(int_type.is_signed() ? TYPE_SMALLINT : TYPE_INT, nullable); + case 32: + return create_type(int_type.is_signed() ? TYPE_INT : TYPE_BIGINT, nullable); + case 64: + return create_type(int_type.is_signed() ? TYPE_BIGINT : TYPE_LARGEINT, nullable); + default: + return nullptr; + } + } + return nullptr; +} + +DataTypePtr physical_type_to_doris_type(const ::parquet::ColumnDescriptor* column) { + const bool nullable = column->max_definition_level() > 0; + DataTypePtr type; + switch (column->physical_type()) { + case ::parquet::Type::BOOLEAN: + type = std::make_shared(); + break; + case ::parquet::Type::INT32: + type = std::make_shared(); + break; + case ::parquet::Type::INT64: + type = std::make_shared(); + break; + case ::parquet::Type::FLOAT: + type = std::make_shared(); + break; + case ::parquet::Type::DOUBLE: + type = std::make_shared(); + break; + case ::parquet::Type::BYTE_ARRAY: + case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: + type = std::make_shared(); + break; + case ::parquet::Type::INT96: + type = std::make_shared(); + break; + default: + return nullptr; + } + return nullable ? 
make_nullable(type) : type; +} + +DataTypePtr direct_flat_primitive_doris_type(const ::parquet::ColumnDescriptor* column) { + if (column == nullptr || column->max_repetition_level() != 0 || + column->max_definition_level() > 1 || has_non_physical_annotation(column)) { + return nullptr; + } + + const bool nullable = column->max_definition_level() > 0; + switch (column->physical_type()) { + case ::parquet::Type::BOOLEAN: + return create_type(TYPE_BOOLEAN, nullable); + case ::parquet::Type::INT32: + return create_type(TYPE_INT, nullable); + case ::parquet::Type::INT64: + return create_type(TYPE_BIGINT, nullable); + case ::parquet::Type::FLOAT: + return create_type(TYPE_FLOAT, nullable); + case ::parquet::Type::DOUBLE: + return create_type(TYPE_DOUBLE, nullable); + default: + return nullptr; + } +} + +bool record_reader_physical_type_supported(::parquet::Type::type physical_type) { + switch (physical_type) { + case ::parquet::Type::BOOLEAN: + case ::parquet::Type::INT32: + case ::parquet::Type::INT64: + case ::parquet::Type::FLOAT: + case ::parquet::Type::DOUBLE: + case ::parquet::Type::BYTE_ARRAY: + case ::parquet::Type::FIXED_LEN_BYTE_ARRAY: + return true; + default: + return false; + } +} + +bool record_reader_integer_annotation_supported(const ::parquet::ColumnDescriptor* column, + const DataTypePtr& doris_type) { + const auto& logical_type = column->logical_type(); + const bool has_int_logical_type = + logical_type != nullptr && logical_type->is_valid() && logical_type->is_int(); + const bool has_int_converted_type = + column->converted_type() == ::parquet::ConvertedType::INT_8 || + column->converted_type() == ::parquet::ConvertedType::UINT_8 || + column->converted_type() == ::parquet::ConvertedType::INT_16 || + column->converted_type() == ::parquet::ConvertedType::UINT_16 || + column->converted_type() == ::parquet::ConvertedType::INT_32 || + column->converted_type() == ::parquet::ConvertedType::UINT_32 || + column->converted_type() == 
::parquet::ConvertedType::INT_64 || + column->converted_type() == ::parquet::ConvertedType::UINT_64; + auto primitive_type = remove_nullable(doris_type)->get_primitive_type(); + return (has_int_logical_type || has_int_converted_type) && + (primitive_type == TYPE_TINYINT || primitive_type == TYPE_SMALLINT || + primitive_type == TYPE_INT || primitive_type == TYPE_BIGINT); +} + +} // namespace + +std::string parquet_column_name(const ::parquet::ColumnDescriptor* column) { + if (column == nullptr) { + return {}; + } + auto path = column->path(); + if (path) { + return path->ToDotString(); + } + return column->name(); +} + +ParquetTypeDescriptor resolve_parquet_type(const ::parquet::ColumnDescriptor* column) { + ParquetTypeDescriptor result; + if (column == nullptr) { + return result; + } + + result.physical_type = column->physical_type(); + result.converted_type = column->converted_type(); + result.fixed_length = column->type_length(); + + if (auto logical_type = logical_type_to_doris_type(column, &result); logical_type != nullptr) { + result.doris_type = logical_type; + } else if (auto converted_type = converted_type_to_doris_type(column, &result); + converted_type != nullptr) { + result.doris_type = converted_type; + } else { + result.doris_type = physical_type_to_doris_type(column); + if (result.physical_type == ::parquet::Type::INT96) { + result.extra_type_info = ParquetExtraTypeInfo::IMPALA_TIMESTAMP; + } + } + + result.is_string_like = + !result.is_decimal && (result.physical_type == ::parquet::Type::BYTE_ARRAY || + result.physical_type == ::parquet::Type::FIXED_LEN_BYTE_ARRAY); + + if (column->max_repetition_level() != 0 || column->max_definition_level() > 1) { + result.supports_record_reader = false; + return result; + } + if (!record_reader_physical_type_supported(result.physical_type)) { + result.supports_record_reader = false; + return result; + } + if (direct_flat_primitive_doris_type(column) != nullptr || result.is_string_like || + (result.is_decimal && 
result.decimal_precision <= 38) || + (result.is_timestamp && result.physical_type == ::parquet::Type::INT64) || + record_reader_integer_annotation_supported(column, result.doris_type) || + remove_nullable(result.doris_type)->get_primitive_type() == TYPE_DATEV2 || + remove_nullable(result.doris_type)->get_primitive_type() == TYPE_TIMEV2) { + result.supports_record_reader = true; + } + return result; +} + +bool supports_record_reader(const ParquetTypeDescriptor& type_descriptor) { + return type_descriptor.supports_record_reader; +} + +} // namespace doris::parquet diff --git a/be/src/format/new_parquet/parquet_type.h b/be/src/format/new_parquet/parquet_type.h new file mode 100644 index 00000000000000..1404f84bc362d6 --- /dev/null +++ b/be/src/format/new_parquet/parquet_type.h @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
// Result of resolving the type of one Parquet file-local column descriptor.
// It only interprets the Parquet physical/logical/converted type; it carries no
// table/global schema-evolution information and does not depend on Arrow's
// internal RecordReader API.
struct ParquetTypeDescriptor {
    // Doris type chosen for the column (file-local, before any table-level cast).
    DataTypePtr doris_type;
    // How decimal/time/timestamp values are physically encoded in the file.
    ParquetExtraTypeInfo extra_type_info = ParquetExtraTypeInfo::NONE;
    // Unit of TIME/TIMESTAMP values; UNKNOWN when not applicable.
    ParquetTimeUnit time_unit = ParquetTimeUnit::UNKNOWN;
    // Physical and converted type copied from the column descriptor.
    ::parquet::Type::type physical_type = ::parquet::Type::UNDEFINED;
    ::parquet::ConvertedType::type converted_type = ::parquet::ConvertedType::UNDEFINED;
    // Decimal precision/scale; -1 until the column is recognised as a decimal.
    int decimal_precision = -1;
    int decimal_scale = -1;
    // Type length reported by the column descriptor; -1 when not resolved.
    int fixed_length = -1;
    bool is_decimal = false;
    bool is_timestamp = false;
    // True for BYTE_ARRAY / FIXED_LEN_BYTE_ARRAY columns that are not decimals.
    bool is_string_like = false;
    // True when the column can currently be read through the record reader.
    bool supports_record_reader = false;
};
a/be/src/format/new_parquet/selection_vector.h b/be/src/format/new_parquet/selection_vector.h new file mode 100644 index 00000000000000..22a9d3507e27dd --- /dev/null +++ b/be/src/format/new_parquet/selection_vector.h @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include + +#include "common/status.h" + +namespace doris::parquet { + +// 类似 DuckDB SelectionVector 的轻量行号视图。 +// 它只表达一个 batch 内被选中的 row offset,不持有 table/global schema 语义。 +// 未绑定 data 时表示 identity selection:get_index(i) == i。 +class SelectionVector { +public: + using Index = uint16_t; + + SelectionVector() = default; + + explicit SelectionVector(size_t count) { resize(count); } + + SelectionVector(Index* data, size_t count) { initialize(data, count); } + + void initialize(Index* data, size_t count) { + _owned.clear(); + _data = data; + _size = count; + } + + void resize(size_t count) { + _owned.resize(count); + _data = _owned.data(); + _size = count; + for (size_t idx = 0; idx < count; ++idx) { + _data[idx] = static_cast(idx); + } + } + + void clear() { + _owned.clear(); + _data = nullptr; + _size = 0; + } + + size_t size() const { return _size; } + + bool is_set() const { return _data != nullptr; } + + Index* data() { return _data; } + + const Index* data() const { return _data; } + + size_t get_index(size_t idx) const { + if (_data == nullptr) { + return idx; + } + return _data[idx]; + } + + void set_index(size_t idx, Index value) { _data[idx] = value; } + + Status verify(size_t count, int64_t batch_rows) const { + if (batch_rows < 0) { + return Status::InvalidArgument("Negative parquet selection batch rows {}", batch_rows); + } + if (count > static_cast(batch_rows)) { + return Status::InvalidArgument("Parquet selection count {} exceeds batch rows {}", + count, batch_rows); + } + if (_data != nullptr && count > _size) { + return Status::InvalidArgument("Parquet selection count {} exceeds vector size {}", + count, _size); + } + size_t previous = 0; + for (size_t idx = 0; idx < count; ++idx) { + const size_t current = get_index(idx); + if (current >= static_cast(batch_rows)) { + return Status::InvalidArgument( + "Parquet selection index {} out of range [0, {}) at position {}", current, + batch_rows, idx); + } + if (idx > 0 && 
current <= previous) { + return Status::InvalidArgument( + "Parquet selection index {} is not strictly greater than previous {} at " + "position {}", + current, previous, idx); + } + previous = current; + } + return Status::OK(); + } + +private: + std::vector _owned; + Index* _data = nullptr; + size_t _size = 0; +}; + +} // namespace doris::parquet diff --git a/be/src/format/parquet/parquet_reader.h b/be/src/format/parquet/parquet_reader.h deleted file mode 100644 index 65227aba04cf31..00000000000000 --- a/be/src/format/parquet/parquet_reader.h +++ /dev/null @@ -1,77 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include - -#include "common/status.h" -#include "format/reader/file_reader.h" - -namespace doris { -namespace io { -struct IOContext; -} // namespace io -} // namespace doris - -namespace doris::parquet { - -// ParquetReader 的 file-local scan 请求。 -// 当前没有新增 Parquet-only 字段,但保留独立类型,便于后续加入 row group/page index -// 等 Parquet 专属选项。 -struct ParquetScanRequest : public reader::FileScanRequest {}; - -// Parquet 文件物理读取层。 -// 该类只理解 Parquet file-local schema 和 ParquetScanRequest,不理解 Iceberg/global -// schema,不处理 table-level cast/default/generated/partition 语义。 -class ParquetReader : public reader::FileReader { -public: - virtual ~ParquetReader() = default; - - // 解析 Parquet footer 并返回 Parquet 文件自身的 schema。 - // 这里不做 Iceberg schema evolution,也不把字段转换成 table/global schema。 - Status get_schema(std::vector* file_schema) const override { - // 真实实现会从 Parquet footer / schema descriptor 展开 file-local schema。 - file_schema->clear(); - return Status::OK(); - } - - // 初始化 Parquet 专属 scan。 - // 后续可以在 ParquetScanRequest 中扩展 row group、page index、bloom filter 等 - // Parquet-only 选项;table-level 语义仍然必须由 TableColumnMapper 提前转换。 - Status init(const ParquetScanRequest& request) { - // 真实实现会根据 projected_file_columns、local_filters 和 reader_expression_map - // 初始化 row group、column chunk、page reader 以及延时物化计划。 - return reader::FileReader::init(request); - } - - // 读取下一批 Parquet file-local block。 - // 返回列必须保持 file-local 语义,不能在这里补 default/generated/partition 列。 - Status next(Block* file_block, size_t* rows, bool* eof) override { - // 真实实现会输出 file-local block。stub 默认立即 EOF。 - return reader::FileReader::next(file_block, rows, eof); - } - - // 通用 FileReader 初始化入口。 - // 当上层只持有 reader::FileReader 指针时会走该接口;Parquet 专属参数通过 - // ParquetScanRequest 重载表达。 - Status init(const reader::FileScanRequest& request) override { - return reader::FileReader::init(request); - } -}; - -} // namespace doris::parquet diff --git a/be/src/format/reader/column_mapper.cpp b/be/src/format/reader/column_mapper.cpp 
index 7510413d07fbac..0eed9d3e566b1b 100644 --- a/be/src/format/reader/column_mapper.cpp +++ b/be/src/format/reader/column_mapper.cpp @@ -95,9 +95,10 @@ Status TableColumnMapper::create_scan_request(const std::mapreader_expression_map.clear(); for (const auto& table_column : projected_columns) { const auto* mapping = _find_mapping(table_column.id); - if (mapping != nullptr && mapping->file_column_id.has_value() && - table_filters.count(table_column.id) == 0) { - file_request->non_predicate_columns.push_back(*mapping->file_column_id); + if (mapping != nullptr && mapping->file_column_id.has_value()) { + if (table_filters.count(table_column.id) == 0) { + file_request->non_predicate_columns.push_back(*mapping->file_column_id); + } } } RETURN_IF_ERROR(localize_filters(table_filters, file_request)); diff --git a/be/src/format/reader/expr/delete_predicate.cpp b/be/src/format/reader/expr/delete_predicate.cpp index d1ca03a5201d72..01844fa8a07069 100644 --- a/be/src/format/reader/expr/delete_predicate.cpp +++ b/be/src/format/reader/expr/delete_predicate.cpp @@ -86,7 +86,7 @@ Status DeletePredicate::execute(VExprContext* context, Block* block, int* result DCHECK_EQ(row_ids.size(), count); if (_deleted_rows.empty()) { block->insert({std::move(res_col), std::make_shared(), expr_name()}); - *result_column_id = block->get_columns().size() - 1; + *result_column_id = static_cast(block->get_columns().size() - 1); return Status::OK(); } const int64_t* delete_rows = _deleted_rows.data(); @@ -106,7 +106,7 @@ Status DeletePredicate::execute(VExprContext* context, Block* block, int* result ++start_index; } block->insert({std::move(res_col), std::make_shared(), expr_name()}); - *result_column_id = block->get_columns().size() - 1; + *result_column_id = static_cast(block->get_columns().size() - 1); return Status::OK(); } diff --git a/be/src/format/reader/file_reader.cpp b/be/src/format/reader/file_reader.cpp new file mode 100644 index 00000000000000..daf9e2cf4f82a7 --- /dev/null +++ 
b/be/src/format/reader/file_reader.cpp @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "format/reader/file_reader.h" + +#include "io/fs/buffered_reader.h" +#include "io/fs/tracing_file_reader.h" +#include "runtime/runtime_state.h" + +namespace doris::reader { + +Status FileReader::init(RuntimeState* state) { + _init_profile(); + SCOPED_RAW_TIMER(&_reader_statistics.file_reader_create_time); + ++_reader_statistics.open_file_num; + io::FileReaderOptions reader_options = + FileFactory::get_reader_options(state->query_options(), *_file_description); + _file_reader = DORIS_TRY(io::DelegateReader::create_file_reader( + _profile, *_system_properties, *_file_description, reader_options, + io::DelegateReader::AccessMode::RANDOM, _io_ctx)); + _tracing_file_reader = _io_ctx ? 
std::make_shared( + _file_reader, _io_ctx->file_reader_stats) + : _file_reader; + _eof = false; + return Status::OK(); +} + +} // namespace doris::reader diff --git a/be/src/format/reader/file_reader.h b/be/src/format/reader/file_reader.h index 96ace67d8defd0..7fbdb9b576c38a 100644 --- a/be/src/format/reader/file_reader.h +++ b/be/src/format/reader/file_reader.h @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include "common/status.h" #include "core/data_type/data_type.h" #include "exprs/vexpr_fwd.h" +#include "io/file_factory.h" #include "io/fs/file_reader_writer_fwd.h" namespace doris { @@ -96,64 +98,75 @@ struct FileScanRequest { std::vector local_filters; // fallback path if filters cannot be localized to file-local predicates. The expression can reference projected_file_columns and partition columns. std::vector> reader_expression_map; - // partition key -> value - std::map partition_values; - - // projected_columns' id is file-local column id, and they are all from file schema. 
- // For example, - // file schema: [0: id (int), 1: name (string), 2: age (int)] - // predicate: age > 30 - // table-level projection: [name, id] - // predicate_columns: [2] - // non_predicate_columns: [1, 0] - // projected_columns are columns in blocks returned to table reader: [1, 0] means only name and id are projected, - std::vector projected_columns; }; // 文件物理读取层通用接口。 // 该接口只描述 file-local schema、file-local scan request 和 file-local block。 // TableReader/IcebergTableReader 可以通过它组合不同文件格式 reader。 +/** + * +-----> get_schema() -----------------+ + * FileReader() -----> init() ----| -----> close() + * +-----> open() -----> get_block() ----+ + */ class FileReader { public: + struct ReaderStatistics { + int32_t filtered_row_groups = 0; + int32_t filtered_row_groups_by_min_max = 0; + int32_t filtered_row_groups_by_bloom_filter = 0; + int32_t read_row_groups = 0; + int64_t filtered_group_rows = 0; + int64_t filtered_page_rows = 0; + int64_t lazy_read_filtered_rows = 0; + int64_t read_rows = 0; + int64_t filtered_bytes = 0; + int64_t column_read_time = 0; + int64_t parse_meta_time = 0; + int64_t parse_footer_time = 0; + int64_t file_footer_read_calls = 0; + int64_t file_footer_hit_cache = 0; + int64_t file_reader_create_time = 0; + int64_t open_file_num = 0; + int64_t row_group_filter_time = 0; + int64_t page_index_filter_time = 0; + int64_t read_page_index_time = 0; + int64_t parse_page_index_time = 0; + int64_t predicate_filter_time = 0; + int64_t dict_filter_rewrite_time = 0; + int64_t bloom_filter_read_time = 0; + }; + + FileReader(std::unique_ptr& system_properties, + std::unique_ptr& file_description, + std::shared_ptr io_ctx, RuntimeProfile* profile) + : _system_properties(std::move(system_properties)), + _file_description(std::move(file_description)), + _io_ctx(io_ctx), + _profile(profile) {} virtual ~FileReader() = default; - // 打开一个物理文件并加载文件级元数据。 - // 该方法只建立 file-local reader 状态,不接收 table schema,也不做 projection/filter - // 规划;这些输入由 init(FileScanRequest) 提供。 - 
virtual Status open(io::FileReaderSPtr file, io::IOContext* io_ctx = nullptr) { - // 真实实现会保存文件句柄、IO 上下文并读取文件元数据。 - _file = std::move(file); - _io_ctx = io_ctx; - _eof = false; - return Status::OK(); - } + // Initialize the file reader and parse file metadata. + virtual Status init(RuntimeState* state); - // 返回文件自己的 schema 视图。 - // 返回结果必须是 file-local schema:列 id、类型和 children 都按文件格式展开, - // 不在这里解释 Iceberg field id、缺失列、默认值或 generated column。 - virtual Status get_schema(std::vector* file_schema) const { - // 真实实现会展开文件格式自己的 file-local schema。 - file_schema->clear(); - return Status::OK(); - } + // Get the file-local schema from file metadata. The file schema is determined by the file format and file content, and does not contain table/global schema semantics. For example, Iceberg field id, name mapping, and default/generated/partition columns are not interpreted in the file reader. This method can only be called after init() has succeeded, but does not require open() to be called. + virtual Status get_schema(std::vector* file_schema) const = 0; - // 初始化一次 file-local scan。 - // request 由 TableColumnMapper 生成,只包含文件列投影、本地过滤条件和 reader - // expression。FileReader 可以基于它初始化 row group/page/stripe 等文件格式计划。 - virtual Status init(const FileScanRequest& request) { - // 真实实现会根据 projected columns、local filters 和 reader expressions - // 初始化文件格式自己的物理读取计划。 - // _request.projected_file_columns = request.projected_file_columns; - _request.local_filters = request.local_filters; - _request.reader_expression_map = request.reader_expression_map; + // Open the file reader with a file-local scan request. The file reader should initialize its internal state according to the request, but does not need to interpret table/global schema semantics. For example, all schema changes, filter localization, and default/generated/partition columns should be handled in the table reader layer. This method can only be called after init() has succeeded. 
+ virtual Status open(std::unique_ptr& request) { + _request = std::move(request); return Status::OK(); } // 读取下一批 file-local block。 + // This method may only be called after open() has succeeded. // file_block 的列顺序和类型必须遵守 FileScanRequest,而不是 table/global schema。 - // eof 表示当前文件 reader 是否读完;多文件切换由 TableReader 负责。 - virtual Status get_block(Block* file_block, bool* eof) { + // rows receives the number of rows produced by this batch; eof reports whether this file reader + // is exhausted. Switching between files is handled by TableReader. + virtual Status get_block(Block* file_block, size_t* rows, bool* eof) { // stub 默认立即 EOF。 + if (rows != nullptr) { + *rows = 0; + } if (eof != nullptr) { *eof = true; } @@ -164,18 +177,29 @@ class FileReader { // 关闭当前物理文件 reader 并释放文件层状态。 // 该方法不处理 table-level delete/finalize 状态,后者由 TableReader 子类管理。 virtual Status close() { - _file.reset(); - _io_ctx = nullptr; - _request = FileScanRequest {}; + _file_reader.reset(); + _tracing_file_reader.reset(); + _io_ctx.reset(); + _request.reset(); _eof = true; return Status::OK(); } protected: - io::FileReaderSPtr _file; - io::IOContext* _io_ctx = nullptr; - FileScanRequest _request; + virtual void _init_profile() {} + io::FileReaderSPtr _file_reader; + // _tracing_file_reader wraps _file_reader. + // _file_reader is the original file reader. + // _tracing_file_reader is the tracing file reader with io context. + // If _io_ctx is null, _tracing_file_reader is the same object as _file_reader. 
+ io::FileReaderSPtr _tracing_file_reader = nullptr; + std::unique_ptr _request; bool _eof = true; + ReaderStatistics _reader_statistics; + std::unique_ptr _system_properties; + std::unique_ptr _file_description; + std::shared_ptr _io_ctx; + RuntimeProfile* _profile = nullptr; }; } // namespace doris::reader diff --git a/be/src/format/reader/table_reader.h b/be/src/format/reader/table_reader.h index c3744427aa093d..7572383b8ad213 100644 --- a/be/src/format/reader/table_reader.h +++ b/be/src/format/reader/table_reader.h @@ -181,7 +181,12 @@ class TableReader { // TODO: reuse column's memory current_block.insert({field.type->create_column(), field.type, field.name}); } - RETURN_IF_ERROR(_data_reader.reader->get_block(¤t_block, ¤t_eof)); + size_t current_rows = 0; + RETURN_IF_ERROR( + _data_reader.reader->get_block(¤t_block, ¤t_rows, ¤t_eof)); + if (current_rows == 0 && !current_eof) { + continue; + } size_t idx = 0; for (const auto& mapping : _data_reader.column_mapper.mappings()) { @@ -232,15 +237,16 @@ class TableReader { // 打开当前具体 reader。 // 子类在这里基于当前 split/task 初始化底层 FileReader。 virtual Status open_reader() { - _data_reader.block_schema.clear(); - RETURN_IF_ERROR(_data_reader.reader->get_schema(&_data_reader.block_schema)); - RETURN_IF_ERROR(_data_reader.column_mapper.create_mapping( - _options.projected_columns, _partition_values, _data_reader.block_schema)); + std::vector file_schema; + RETURN_IF_ERROR(_data_reader.reader->get_schema(&file_schema)); + _data_reader.block_schema = file_schema; + RETURN_IF_ERROR(_data_reader.column_mapper.create_mapping(_options.projected_columns, + _partition_values, file_schema)); - FileScanRequest file_request; + auto file_request = std::make_unique(); RETURN_IF_ERROR(_data_reader.column_mapper.create_scan_request( - _table_filters, _options.projected_columns, &file_request)); - RETURN_IF_ERROR(_data_reader.reader->init(file_request)); + _table_filters, _options.projected_columns, file_request.get())); + 
RETURN_IF_ERROR(_data_reader.reader->open(file_request)); return Status::OK(); } diff --git a/be/test/core/data_type_serde/data_type_serde_decoded_values_test.cpp b/be/test/core/data_type_serde/data_type_serde_decoded_values_test.cpp new file mode 100644 index 00000000000000..10f15bb28b1c10 --- /dev/null +++ b/be/test/core/data_type_serde/data_type_serde_decoded_values_test.cpp @@ -0,0 +1,278 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include +#include +#include + +#include "core/assert_cast.h" +#include "core/column/column_decimal.h" +#include "core/column/column_nullable.h" +#include "core/column/column_string.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_date_or_datetime_v2.h" +#include "core/data_type/data_type_decimal.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_number.h" +#include "core/data_type/data_type_string.h" +#include "core/data_type_serde/decoded_column_view.h" +#include "core/string_ref.h" + +namespace doris { + +TEST(DataTypeSerDeDecodedValuesTest, ReadInt32Values) { + auto type = std::make_shared(); + auto column = type->create_column(); + const int32_t values[] = {10, -20, 30}; + + DecodedColumnView view; + view.value_kind = DecodedValueKind::INT32; + view.row_count = 3; + view.values = reinterpret_cast(values); + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + ASSERT_TRUE(st.ok()) << st; + + const auto& int_column = assert_cast(*column); + ASSERT_EQ(int_column.size(), 3); + EXPECT_EQ(int_column.get_element(0), 10); + EXPECT_EQ(int_column.get_element(1), -20); + EXPECT_EQ(int_column.get_element(2), 30); +} + +TEST(DataTypeSerDeDecodedValuesTest, ReadPrimitiveNumberValues) { + { + auto type = std::make_shared(); + auto column = type->create_column(); + const bool values[] = {true, false, true}; + + DecodedColumnView view; + view.value_kind = DecodedValueKind::BOOL; + view.row_count = 3; + view.values = reinterpret_cast(values); + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + ASSERT_TRUE(st.ok()) << st; + + const auto& bool_column = assert_cast(*column); + ASSERT_EQ(bool_column.size(), 3); + EXPECT_EQ(bool_column.get_element(0), 1); + EXPECT_EQ(bool_column.get_element(1), 0); + EXPECT_EQ(bool_column.get_element(2), 1); + } + { + auto type = std::make_shared(); + auto column = type->create_column(); + const int64_t values[] 
= {10000000000L, -9L, 42L}; + + DecodedColumnView view; + view.value_kind = DecodedValueKind::INT64; + view.row_count = 3; + view.values = reinterpret_cast(values); + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + ASSERT_TRUE(st.ok()) << st; + + const auto& int_column = assert_cast(*column); + ASSERT_EQ(int_column.size(), 3); + EXPECT_EQ(int_column.get_element(0), 10000000000L); + EXPECT_EQ(int_column.get_element(1), -9L); + EXPECT_EQ(int_column.get_element(2), 42L); + } + { + auto type = std::make_shared(); + auto column = type->create_column(); + const float values[] = {1.5F, -2.25F}; + + DecodedColumnView view; + view.value_kind = DecodedValueKind::FLOAT; + view.row_count = 2; + view.values = reinterpret_cast(values); + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + ASSERT_TRUE(st.ok()) << st; + + const auto& float_column = assert_cast(*column); + ASSERT_EQ(float_column.size(), 2); + EXPECT_FLOAT_EQ(float_column.get_element(0), 1.5F); + EXPECT_FLOAT_EQ(float_column.get_element(1), -2.25F); + } + { + auto type = std::make_shared(); + auto column = type->create_column(); + const double values[] = {3.5, -4.75}; + + DecodedColumnView view; + view.value_kind = DecodedValueKind::DOUBLE; + view.row_count = 2; + view.values = reinterpret_cast(values); + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + ASSERT_TRUE(st.ok()) << st; + + const auto& double_column = assert_cast(*column); + ASSERT_EQ(double_column.size(), 2); + EXPECT_DOUBLE_EQ(double_column.get_element(0), 3.5); + EXPECT_DOUBLE_EQ(double_column.get_element(1), -4.75); + } +} + +TEST(DataTypeSerDeDecodedValuesTest, ReadStringValues) { + auto type = std::make_shared(); + auto column = type->create_column(); + std::vector values = { + StringRef("alpha", 5), + StringRef("beta", 4), + StringRef("gamma", 5), + }; + + DecodedColumnView view; + view.value_kind = DecodedValueKind::BINARY; + view.row_count = 
values.size(); + view.binary_values = &values; + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + ASSERT_TRUE(st.ok()) << st; + + const auto& string_column = assert_cast(*column); + ASSERT_EQ(string_column.size(), 3); + EXPECT_EQ(string_column.get_data_at(0).to_string(), "alpha"); + EXPECT_EQ(string_column.get_data_at(1).to_string(), "beta"); + EXPECT_EQ(string_column.get_data_at(2).to_string(), "gamma"); +} + +TEST(DataTypeSerDeDecodedValuesTest, ReadDateAndDateTimeValues) { + { + auto type = std::make_shared(); + auto column = type->create_column(); + const int32_t values[] = {0, 1, 18628}; + + DecodedColumnView view; + view.value_kind = DecodedValueKind::INT32; + view.row_count = 3; + view.values = reinterpret_cast(values); + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + ASSERT_TRUE(st.ok()) << st; + + ASSERT_EQ(column->size(), 3); + EXPECT_EQ(type->to_string(*column, 0), "1970-01-01"); + EXPECT_EQ(type->to_string(*column, 1), "1970-01-02"); + EXPECT_EQ(type->to_string(*column, 2), "2021-01-01"); + } + { + auto type = std::make_shared(6); + auto column = type->create_column(); + const int64_t values[] = {0, 1234567, -1}; + + DecodedColumnView view; + view.value_kind = DecodedValueKind::INT64; + view.time_unit = DecodedTimeUnit::MICROS; + view.row_count = 3; + view.values = reinterpret_cast(values); + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + ASSERT_TRUE(st.ok()) << st; + + ASSERT_EQ(column->size(), 3); + EXPECT_EQ(type->to_string(*column, 0), "1970-01-01 00:00:00.000000"); + EXPECT_EQ(type->to_string(*column, 1), "1970-01-01 00:00:01.234567"); + EXPECT_EQ(type->to_string(*column, 2), "1969-12-31 23:59:59.999999"); + } +} + +TEST(DataTypeSerDeDecodedValuesTest, ReadDecimalValues) { + auto type = std::make_shared(18, 2); + auto column = type->create_column(); + const int64_t values[] = {12345, -67, 0}; + + DecodedColumnView view; + view.value_kind = 
DecodedValueKind::INT64; + view.row_count = 3; + view.values = reinterpret_cast(values); + view.decimal_precision = 18; + view.decimal_scale = 2; + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + ASSERT_TRUE(st.ok()) << st; + + const auto& decimal_column = assert_cast(*column); + ASSERT_EQ(decimal_column.size(), 3); + EXPECT_EQ(decimal_column.get_element(0), Decimal128V3(12345)); + EXPECT_EQ(decimal_column.get_element(1), Decimal128V3(-67)); + EXPECT_EQ(decimal_column.get_element(2), Decimal128V3(0)); + EXPECT_EQ(type->to_string(*column, 0), "123.45"); + EXPECT_EQ(type->to_string(*column, 1), "-0.67"); +} + +TEST(DataTypeSerDeDecodedValuesTest, ReadNullableInt32Values) { + auto type = std::make_shared(std::make_shared()); + auto column = type->create_column(); + const int32_t values[] = {1, 2, 3, 4}; + const uint8_t null_map[] = {0, 1, 0, 1}; + + DecodedColumnView view; + view.value_kind = DecodedValueKind::INT32; + view.row_count = 4; + view.values = reinterpret_cast(values); + view.null_map = null_map; + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + ASSERT_TRUE(st.ok()) << st; + + const auto& nullable_column = assert_cast(*column); + const auto& nested_column = assert_cast(nullable_column.get_nested_column()); + ASSERT_EQ(nullable_column.size(), 4); + EXPECT_FALSE(nullable_column.is_null_at(0)); + EXPECT_TRUE(nullable_column.is_null_at(1)); + EXPECT_FALSE(nullable_column.is_null_at(2)); + EXPECT_TRUE(nullable_column.is_null_at(3)); + EXPECT_EQ(nested_column.get_element(0), 1); + EXPECT_EQ(nested_column.get_element(1), 2); + EXPECT_EQ(nested_column.get_element(2), 3); + EXPECT_EQ(nested_column.get_element(3), 4); +} + +TEST(DataTypeSerDeDecodedValuesTest, RejectMismatchedValueKind) { + auto type = std::make_shared(); + auto column = type->create_column(); + const int64_t values[] = {1}; + + DecodedColumnView view; + view.value_kind = DecodedValueKind::INT64; + view.row_count = 1; + view.values = 
reinterpret_cast(values); + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + EXPECT_FALSE(st.ok()); +} + +TEST(DataTypeSerDeDecodedValuesTest, RejectMissingValueBuffer) { + auto type = std::make_shared(); + auto column = type->create_column(); + + DecodedColumnView view; + view.value_kind = DecodedValueKind::INT32; + view.row_count = 1; + + auto st = type->get_serde()->read_column_from_decoded_values(*column, view); + EXPECT_FALSE(st.ok()); +} + +} // namespace doris diff --git a/be/test/format/new_parquet/parquet_column_reader_test.cpp b/be/test/format/new_parquet/parquet_column_reader_test.cpp new file mode 100644 index 00000000000000..e4a0841f5af168 --- /dev/null +++ b/be/test/format/new_parquet/parquet_column_reader_test.cpp @@ -0,0 +1,562 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "core/assert_cast.h" +#include "core/column/column_decimal.h" +#include "core/column/column_nullable.h" +#include "core/column/column_string.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type.h" +#include "core/data_type/data_type_nullable.h" +#include "core/types.h" +#include "format/new_parquet/column_reader.h" +#include "format/new_parquet/parquet_column_schema.h" +#include "format/new_parquet/selection_vector.h" + +namespace doris::parquet { +namespace { + +constexpr int64_t ROW_COUNT = 5; + +std::shared_ptr finish_array(arrow::ArrayBuilder* builder) { + std::shared_ptr array; + EXPECT_TRUE(builder->Finish(&array).ok()); + return array; +} + +class ParquetColumnReaderTest : public testing::Test { +protected: + void SetUp() override { + _test_dir = std::filesystem::temp_directory_path() / "doris_parquet_column_reader_test"; + std::filesystem::remove_all(_test_dir); + std::filesystem::create_directories(_test_dir); + _file_path = (_test_dir / "reader.parquet").string(); + write_parquet_file(); + _file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); + auto metadata = _file_reader->metadata(); + ASSERT_EQ(metadata->num_row_groups(), 1); + _row_group = _file_reader->RowGroup(0); + ASSERT_NE(_row_group, nullptr); + auto schema_descriptor = _file_reader->metadata()->schema(); + ASSERT_NE(schema_descriptor, nullptr); + auto st = build_parquet_column_schema(*schema_descriptor, &_fields); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(_fields.size(), _expected_by_field.size()); + } + + void TearDown() override { std::filesystem::remove_all(_test_dir); } + + template + std::shared_ptr build_required_array(const std::vector& values) { + Builder builder; + for (const auto& value : values) { + EXPECT_TRUE(builder.Append(value).ok()); + } + return finish_array(&builder); + } + + std::shared_ptr build_string_array(const 
std::vector& values) { + arrow::StringBuilder builder; + for (const auto& value : values) { + EXPECT_TRUE(builder.Append(value).ok()); + } + return finish_array(&builder); + } + + std::shared_ptr build_binary_array(const std::vector& values) { + arrow::BinaryBuilder builder; + for (const auto& value : values) { + EXPECT_TRUE(builder.Append(reinterpret_cast(value.data()), + static_cast(value.size())) + .ok()); + } + return finish_array(&builder); + } + + std::shared_ptr build_fixed_binary_array( + const std::shared_ptr& type, + const std::vector& values) { + arrow::FixedSizeBinaryBuilder builder(type, arrow::default_memory_pool()); + for (const auto& value : values) { + EXPECT_TRUE(builder.Append(reinterpret_cast(value.data())).ok()); + } + return finish_array(&builder); + } + + std::shared_ptr build_nullable_int32_array() { + arrow::Int32Builder builder; + EXPECT_TRUE(builder.Append(1).ok()); + EXPECT_TRUE(builder.AppendNull().ok()); + EXPECT_TRUE(builder.Append(3).ok()); + EXPECT_TRUE(builder.AppendNull().ok()); + EXPECT_TRUE(builder.Append(5).ok()); + return finish_array(&builder); + } + + std::shared_ptr build_time32_array(const std::shared_ptr& type, + const std::vector& values) { + arrow::Time32Builder builder(type, arrow::default_memory_pool()); + for (const auto value : values) { + EXPECT_TRUE(builder.Append(value).ok()); + } + return finish_array(&builder); + } + + std::shared_ptr build_time64_array(const std::shared_ptr& type, + const std::vector& values) { + arrow::Time64Builder builder(type, arrow::default_memory_pool()); + for (const auto value : values) { + EXPECT_TRUE(builder.Append(value).ok()); + } + return finish_array(&builder); + } + + std::shared_ptr build_timestamp_array( + const std::shared_ptr& type, + const std::vector& values) { + arrow::TimestampBuilder builder(type, arrow::default_memory_pool()); + for (const auto value : values) { + EXPECT_TRUE(builder.Append(value).ok()); + } + return finish_array(&builder); + } + + std::shared_ptr 
build_decimal_array( + const std::shared_ptr& type, + const std::vector& values) { + arrow::Decimal128Builder builder(type, arrow::default_memory_pool()); + for (const auto value : values) { + EXPECT_TRUE(builder.Append(arrow::Decimal128(value)).ok()); + } + return finish_array(&builder); + } + + void add_field(const std::shared_ptr& field, std::shared_ptr array, + std::function validator) { + _arrow_fields.push_back(field); + _arrays.push_back(std::move(array)); + _expected_by_field.push_back(std::move(validator)); + } + + void write_parquet_file() { + add_field(arrow::field("bool_col", arrow::boolean(), false), + build_required_array( + {true, false, true, false, true}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::BOOLEAN); + const auto& values = assert_cast(column); + EXPECT_EQ(values.get_element(0), 1); + EXPECT_EQ(values.get_element(1), 0); + EXPECT_EQ(values.get_element(4), 1); + }); + add_field(arrow::field("int32_col", arrow::int32(), false), + build_required_array({10, 20, 30, 40, 50}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::INT32); + const auto& values = assert_cast(column); + EXPECT_EQ(values.get_element(0), 10); + EXPECT_EQ(values.get_element(4), 50); + }); + add_field(arrow::field("int64_col", arrow::int64(), false), + build_required_array( + {10000000000L, -9L, 42L, 77L, 123L}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::INT64); + const auto& values = assert_cast(column); + EXPECT_EQ(values.get_element(0), 10000000000L); + EXPECT_EQ(values.get_element(1), -9L); + }); + add_field(arrow::field("float_col", arrow::float32(), false), + build_required_array( + {1.5F, -2.25F, 3.0F, 4.5F, 5.75F}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + 
EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::FLOAT); + const auto& values = assert_cast(column); + EXPECT_FLOAT_EQ(values.get_element(0), 1.5F); + EXPECT_FLOAT_EQ(values.get_element(1), -2.25F); + }); + add_field(arrow::field("double_col", arrow::float64(), false), + build_required_array( + {3.5, -4.75, 6.0, 7.25, 8.5}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::DOUBLE); + const auto& values = assert_cast(column); + EXPECT_DOUBLE_EQ(values.get_element(0), 3.5); + EXPECT_DOUBLE_EQ(values.get_element(1), -4.75); + }); + add_field(arrow::field("binary_col", arrow::binary(), false), + build_binary_array({"bin_a", "bin_b", "bin_c", "bin_d", "bin_e"}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::BYTE_ARRAY); + const auto& values = assert_cast(column); + EXPECT_EQ(values.get_data_at(0).to_string(), "bin_a"); + EXPECT_EQ(values.get_data_at(3).to_string(), "bin_d"); + }); + add_field(arrow::field("string_col", arrow::utf8(), false), + build_string_array({"alpha", "beta", "gamma", "delta", "epsilon"}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_TRUE(schema.type_descriptor.is_string_like); + const auto& values = assert_cast(column); + EXPECT_EQ(values.get_data_at(0).to_string(), "alpha"); + EXPECT_EQ(values.get_data_at(4).to_string(), "epsilon"); + }); + add_field(arrow::field("fixed_binary_col", arrow::fixed_size_binary(4), false), + build_fixed_binary_array(arrow::fixed_size_binary(4), + {"aaaa", "bbbb", "cccc", "dddd", "eeee"}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, + ::parquet::Type::FIXED_LEN_BYTE_ARRAY); + EXPECT_EQ(schema.type_descriptor.fixed_length, 4); + const auto& values = assert_cast(column); + EXPECT_EQ(values.get_data_at(0).to_string(), "aaaa"); + 
EXPECT_EQ(values.get_data_at(2).to_string(), "cccc"); + }); + add_field(arrow::field("date_col", arrow::date32(), false), + build_required_array({0, 1, 18628, 18629, 18630}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::INT32); + EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), TYPE_DATEV2); + EXPECT_EQ(schema.type->to_string(column, 0), "1970-01-01"); + EXPECT_EQ(schema.type->to_string(column, 2), "2021-01-01"); + }); + add_field(arrow::field("time_millis_col", arrow::time32(arrow::TimeUnit::MILLI), false), + build_time32_array(arrow::time32(arrow::TimeUnit::MILLI), + {0, 1000, 3723004, 43200000, 86399000}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::INT32); + EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), TYPE_TIMEV2); + EXPECT_EQ(schema.type->to_string(column, 1), "00:00:01.000"); + EXPECT_EQ(schema.type->to_string(column, 2), "01:02:03.004"); + }); + add_field(arrow::field("time_micros_col", arrow::time64(arrow::TimeUnit::MICRO), false), + build_time64_array(arrow::time64(arrow::TimeUnit::MICRO), + {0, 1000000, 3723004567, 43200000000, 86399000000}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::INT64); + EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), TYPE_TIMEV2); + EXPECT_EQ(schema.type->to_string(column, 1), "00:00:01.000000"); + EXPECT_EQ(schema.type->to_string(column, 2), "01:02:03.004567"); + }); + add_field(arrow::field("timestamp_millis_col", + arrow::timestamp(arrow::TimeUnit::MILLI), false), + build_timestamp_array(arrow::timestamp(arrow::TimeUnit::MILLI), + {0, 1234, 1609459200000, 1609459201000, -1}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::INT64); + 
EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), + TYPE_DATETIMEV2); + EXPECT_EQ(schema.type->to_string(column, 1), + "1970-01-01 00:00:01.234"); + EXPECT_EQ(schema.type->to_string(column, 4), + "1969-12-31 23:59:59.999"); + }); + add_field(arrow::field("timestamp_micros_col", + arrow::timestamp(arrow::TimeUnit::MICRO), false), + build_timestamp_array(arrow::timestamp(arrow::TimeUnit::MICRO), + {0, 1234567, 1609459200000000, 1609459201000000, -1}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::INT64); + EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), + TYPE_DATETIMEV2); + EXPECT_EQ(schema.type->to_string(column, 1), + "1970-01-01 00:00:01.234567"); + EXPECT_EQ(schema.type->to_string(column, 4), + "1969-12-31 23:59:59.999999"); + }); + add_field(arrow::field("decimal_fixed_binary_9_2_col", arrow::decimal128(9, 2), false), + build_decimal_array(arrow::decimal128(9, 2), {12345, -67, 0, 987, 1000}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, + ::parquet::Type::FIXED_LEN_BYTE_ARRAY); + EXPECT_TRUE(schema.type_descriptor.is_decimal); + EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), + TYPE_DECIMAL32); + const auto& values = assert_cast(column); + EXPECT_EQ(values.get_element(0), Decimal32(12345)); + EXPECT_EQ(schema.type->to_string(column, 0), "123.45"); + }); + add_field(arrow::field("decimal_fixed_binary_18_6_col", arrow::decimal128(18, 6), false), + build_decimal_array(arrow::decimal128(18, 6), + {1234567, -670000, 0, 9870000, 1000000}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, + ::parquet::Type::FIXED_LEN_BYTE_ARRAY); + EXPECT_TRUE(schema.type_descriptor.is_decimal); + EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), + TYPE_DECIMAL64); + const auto& values = assert_cast(column); + 
EXPECT_EQ(values.get_element(0), Decimal64(1234567)); + EXPECT_EQ(schema.type->to_string(column, 0), "1.234567"); + }); + add_field(arrow::field("nullable_int_col", arrow::int32(), true), + build_nullable_int32_array(), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_TRUE(schema.type->is_nullable()); + const auto& nullable_column = assert_cast(column); + const auto& nested_column = + assert_cast(nullable_column.get_nested_column()); + ASSERT_EQ(nullable_column.size(), ROW_COUNT); + EXPECT_FALSE(nullable_column.is_null_at(0)); + EXPECT_TRUE(nullable_column.is_null_at(1)); + EXPECT_FALSE(nullable_column.is_null_at(2)); + EXPECT_TRUE(nullable_column.is_null_at(3)); + EXPECT_EQ(nested_column.get_element(0), 1); + EXPECT_EQ(nested_column.get_element(2), 3); + }); + + auto schema = arrow::schema(_arrow_fields); + auto table = arrow::Table::Make(schema, _arrays); + + auto file_result = arrow::io::FileOutputStream::Open(_file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + PARQUET_THROW_NOT_OK( + ::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, ROW_COUNT, + builder.build())); + } + + std::unique_ptr create_reader(size_t field_idx) const { + ParquetColumnReaderFactory factory(_row_group, _file_reader->metadata()->num_columns()); + std::unique_ptr reader; + auto st = factory.create(*_fields[field_idx], &reader); + EXPECT_TRUE(st.ok()) << st; + return reader; + } + + void read_and_validate(size_t field_idx) const { + auto reader = create_reader(field_idx); + MutableColumnPtr column = reader->type()->create_column(); + int64_t rows_read = 0; + auto st = reader->read(ROW_COUNT, column, &rows_read); + ASSERT_TRUE(st.ok()) << st; + 
ASSERT_EQ(rows_read, ROW_COUNT); + ASSERT_EQ(column->size(), ROW_COUNT); + _expected_by_field[field_idx](*_fields[field_idx], *column); + } + + std::filesystem::path _test_dir; + std::string _file_path; + std::unique_ptr<::parquet::ParquetFileReader> _file_reader; + std::shared_ptr<::parquet::RowGroupReader> _row_group; + std::vector> _fields; + std::vector> _arrow_fields; + std::vector> _arrays; + std::vector> _expected_by_field; +}; + +TEST_F(ParquetColumnReaderTest, ReadAllSupportedPhysicalAndLogicalTypes) { + for (size_t field_idx = 0; field_idx < _fields.size(); ++field_idx) { + SCOPED_TRACE(_fields[field_idx]->name); + ASSERT_TRUE(supports_record_reader(_fields[field_idx]->type_descriptor)); + read_and_validate(field_idx); + } +} + +TEST_F(ParquetColumnReaderTest, SkipThenRead) { + auto reader = create_reader(1); + auto st = reader->skip(2); + ASSERT_TRUE(st.ok()) << st; + + MutableColumnPtr column = reader->type()->create_column(); + int64_t rows_read = 0; + st = reader->read(2, column, &rows_read); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(rows_read, 2); + + const auto& int_values = assert_cast(*column); + ASSERT_EQ(int_values.size(), 2); + EXPECT_EQ(int_values.get_element(0), 30); + EXPECT_EQ(int_values.get_element(1), 40); +} + +TEST_F(ParquetColumnReaderTest, SelectReadsOnlySelectedRanges) { + auto reader = create_reader(1); + SelectionVector selection(3); + selection.set_index(0, 0); + selection.set_index(1, 2); + selection.set_index(2, 4); + + MutableColumnPtr column = reader->type()->create_column(); + auto st = reader->select(selection, 3, ROW_COUNT, column); + ASSERT_TRUE(st.ok()) << st; + + const auto& int_values = assert_cast(*column); + ASSERT_EQ(int_values.size(), 3); + EXPECT_EQ(int_values.get_element(0), 10); + EXPECT_EQ(int_values.get_element(1), 30); + EXPECT_EQ(int_values.get_element(2), 50); +} + +TEST_F(ParquetColumnReaderTest, ResolveSupportedPhysicalAndLogicalSchemas) { + std::vector<::parquet::schema::NodePtr> nodes = { + 
::parquet::schema::PrimitiveNode::Make( + "required_bool", ::parquet::Repetition::REQUIRED, ::parquet::Type::BOOLEAN), + ::parquet::schema::PrimitiveNode::Make( + "required_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32), + ::parquet::schema::PrimitiveNode::Make( + "required_int64", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT64), + ::parquet::schema::PrimitiveNode::Make( + "required_float", ::parquet::Repetition::REQUIRED, ::parquet::Type::FLOAT), + ::parquet::schema::PrimitiveNode::Make( + "required_double", ::parquet::Repetition::REQUIRED, ::parquet::Type::DOUBLE), + ::parquet::schema::PrimitiveNode::Make( + "required_binary", ::parquet::Repetition::REQUIRED, ::parquet::Type::BYTE_ARRAY), + ::parquet::schema::PrimitiveNode::Make( + "required_fixed_binary", ::parquet::Repetition::REQUIRED, + ::parquet::Type::FIXED_LEN_BYTE_ARRAY, ::parquet::ConvertedType::NONE, 4), + ::parquet::schema::PrimitiveNode::Make( + "optional_int32", ::parquet::Repetition::OPTIONAL, ::parquet::Type::INT32), + ::parquet::schema::PrimitiveNode::Make( + "utf8_binary", ::parquet::Repetition::REQUIRED, ::parquet::Type::BYTE_ARRAY, + ::parquet::ConvertedType::UTF8), + ::parquet::schema::PrimitiveNode::Make( + "enum_binary", ::parquet::Repetition::REQUIRED, ::parquet::Type::BYTE_ARRAY, + ::parquet::ConvertedType::ENUM), + ::parquet::schema::PrimitiveNode::Make( + "json_binary", ::parquet::Repetition::REQUIRED, ::parquet::Type::BYTE_ARRAY, + ::parquet::ConvertedType::JSON), + ::parquet::schema::PrimitiveNode::Make( + "bson_binary", ::parquet::Repetition::REQUIRED, ::parquet::Type::BYTE_ARRAY, + ::parquet::ConvertedType::BSON), + ::parquet::schema::PrimitiveNode::Make( + "decimal_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32, + ::parquet::ConvertedType::DECIMAL, -1, 9, 2), + ::parquet::schema::PrimitiveNode::Make( + "decimal_int64", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT64, + ::parquet::ConvertedType::DECIMAL, -1, 18, 6), + 
::parquet::schema::PrimitiveNode::Make( + "decimal_binary", ::parquet::Repetition::REQUIRED, ::parquet::Type::BYTE_ARRAY, + ::parquet::ConvertedType::DECIMAL, -1, 18, 6), + ::parquet::schema::PrimitiveNode::Make( + "decimal_fixed_binary", ::parquet::Repetition::REQUIRED, + ::parquet::Type::FIXED_LEN_BYTE_ARRAY, ::parquet::ConvertedType::DECIMAL, 8, + 18, 6), + ::parquet::schema::PrimitiveNode::Make( + "date_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32, + ::parquet::ConvertedType::DATE), + ::parquet::schema::PrimitiveNode::Make( + "time_millis_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32, + ::parquet::ConvertedType::TIME_MILLIS), + ::parquet::schema::PrimitiveNode::Make( + "time_micros_int64", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT64, + ::parquet::ConvertedType::TIME_MICROS), + ::parquet::schema::PrimitiveNode::Make( + "timestamp_millis_int64", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT64, ::parquet::ConvertedType::TIMESTAMP_MILLIS), + ::parquet::schema::PrimitiveNode::Make( + "timestamp_micros_int64", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT64, ::parquet::ConvertedType::TIMESTAMP_MICROS), + ::parquet::schema::PrimitiveNode::Make( + "int8_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32, + ::parquet::ConvertedType::INT_8), + ::parquet::schema::PrimitiveNode::Make( + "uint8_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32, + ::parquet::ConvertedType::UINT_8), + ::parquet::schema::PrimitiveNode::Make( + "int16_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32, + ::parquet::ConvertedType::INT_16), + ::parquet::schema::PrimitiveNode::Make( + "uint16_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32, + ::parquet::ConvertedType::UINT_16), + ::parquet::schema::PrimitiveNode::Make( + "int32_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32, + ::parquet::ConvertedType::INT_32), + ::parquet::schema::PrimitiveNode::Make( + 
"uint32_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32, + ::parquet::ConvertedType::UINT_32), + ::parquet::schema::PrimitiveNode::Make( + "int64_int64", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT64, + ::parquet::ConvertedType::INT_64), + }; + + auto schema = + ::parquet::schema::GroupNode::Make("schema", ::parquet::Repetition::REQUIRED, nodes); + ::parquet::SchemaDescriptor descriptor; + descriptor.Init(schema); + + std::vector> fields; + auto st = build_parquet_column_schema(descriptor, &fields); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(fields.size(), nodes.size()); + + for (const auto& field : fields) { + SCOPED_TRACE(field->name); + ASSERT_TRUE(supports_record_reader(field->type_descriptor)); + ASSERT_NE(field->type, nullptr); + } +} + +TEST_F(ParquetColumnReaderTest, RejectUnsupportedPhysicalAndLogicalTypes) { + auto schema = ::parquet::schema::GroupNode::Make( + "schema", ::parquet::Repetition::REQUIRED, + { + ::parquet::schema::PrimitiveNode::Make( + "int96_col", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT96), + ::parquet::schema::PrimitiveNode::Make( + "repeated_int32_col", ::parquet::Repetition::REPEATED, + ::parquet::Type::INT32), + ::parquet::schema::PrimitiveNode::Make( + "decimal256_fixed_col", ::parquet::Repetition::REQUIRED, + ::parquet::Type::FIXED_LEN_BYTE_ARRAY, ::parquet::ConvertedType::DECIMAL, + 20, 39, 6), + ::parquet::schema::PrimitiveNode::Make( + "uint64_col", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT64, + ::parquet::ConvertedType::UINT_64), + ::parquet::schema::PrimitiveNode::Make( + "time_nanos_col", ::parquet::Repetition::REQUIRED, + ::parquet::LogicalType::Time(false, + ::parquet::LogicalType::TimeUnit::NANOS), + ::parquet::Type::INT64), + ::parquet::schema::PrimitiveNode::Make( + "timestamp_nanos_col", ::parquet::Repetition::REQUIRED, + ::parquet::LogicalType::Timestamp( + false, ::parquet::LogicalType::TimeUnit::NANOS), + ::parquet::Type::INT64), + }); + ::parquet::SchemaDescriptor 
descriptor; + descriptor.Init(schema); + + std::vector> fields; + auto st = build_parquet_column_schema(descriptor, &fields); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(fields.size(), 6); + + for (const auto& field : fields) { + SCOPED_TRACE(field->name); + ASSERT_FALSE(supports_record_reader(field->type_descriptor)); + } +} + +} // namespace +} // namespace doris::parquet diff --git a/be/test/format/new_parquet/parquet_reader_test.cpp b/be/test/format/new_parquet/parquet_reader_test.cpp new file mode 100644 index 00000000000000..2086cd8cfb7dac --- /dev/null +++ b/be/test/format/new_parquet/parquet_reader_test.cpp @@ -0,0 +1,341 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "core/assert_cast.h" +#include "core/block/block.h" +#include "core/column/column_string.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_number.h" +#include "core/data_type/primitive_type.h" +#include "core/field.h" +#include "format/new_parquet/parquet_reader.h" +#include "format/reader/file_reader.h" +#include "gen_cpp/Types_types.h" +#include "io/io_common.h" +#include "runtime/runtime_state.h" +#include "storage/predicate/predicate_creator.h" + +namespace doris { +namespace { + +constexpr int64_t ROW_COUNT = 5; + +std::shared_ptr finish_array(arrow::ArrayBuilder* builder) { + std::shared_ptr array; + EXPECT_TRUE(builder->Finish(&array).ok()); + return array; +} + +std::shared_ptr build_int32_array(const std::vector& values) { + arrow::Int32Builder builder; + for (const auto value : values) { + EXPECT_TRUE(builder.Append(value).ok()); + } + return finish_array(&builder); +} + +std::shared_ptr build_string_array(const std::vector& values) { + arrow::StringBuilder builder; + for (const auto& value : values) { + EXPECT_TRUE(builder.Append(value).ok()); + } + return finish_array(&builder); +} + +void write_parquet_file(const std::string& file_path, int64_t row_group_size = ROW_COUNT) { + auto schema = arrow::schema({ + arrow::field("id", arrow::int32(), false), + arrow::field("value", arrow::utf8(), false), + }); + auto table = arrow::Table::Make( + schema, {build_int32_array({1, 2, 3, 4, 5}), + build_string_array({"one", "two", "three", "four", "five"})}); + + auto file_result = arrow::io::FileOutputStream::Open(file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + 
builder.compression(::parquet::Compression::UNCOMPRESSED); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable( + *table, arrow::default_memory_pool(), out, row_group_size, builder.build())); +} + +Block build_file_block(const std::vector& schema) { + Block block; + for (const auto& field : schema) { + block.insert({field.type->create_column(), field.type, field.name}); + } + return block; +} + +class TestFileReader final : public reader::FileReader { +public: + TestFileReader(std::unique_ptr& system_properties, + std::unique_ptr& file_description, + std::shared_ptr io_ctx) + : reader::FileReader(system_properties, file_description, std::move(io_ctx), nullptr) {} + + Status get_schema(std::vector* file_schema) const override { + file_schema->clear(); + reader::SchemaField field; + field.id = 0; + field.name = "id"; + field.type = std::make_shared(); + file_schema->push_back(std::move(field)); + return Status::OK(); + } + + bool has_request() const { return _request != nullptr; } + + bool eof() const { return _eof; } +}; + +TEST(FileReaderTest, OpenStoresRequestAndCloseClearsState) { + auto system_properties = std::make_unique(); + system_properties->system_type = TFileType::FILE_LOCAL; + auto file_description = std::make_unique(); + auto io_ctx = std::make_shared(); + TestFileReader reader(system_properties, file_description, io_ctx); + + auto request = std::make_unique(); + request->non_predicate_columns.push_back(0); + ASSERT_TRUE(reader.open(request).ok()); + EXPECT_EQ(request, nullptr); + EXPECT_TRUE(reader.has_request()); + + ASSERT_TRUE(reader.close().ok()); + EXPECT_FALSE(reader.has_request()); + EXPECT_TRUE(reader.eof()); +} + +class NewParquetReaderTest : public testing::Test { +protected: + void SetUp() override { + _test_dir = std::filesystem::temp_directory_path() / "doris_new_parquet_reader_test"; + std::filesystem::remove_all(_test_dir); + std::filesystem::create_directories(_test_dir); + _file_path = (_test_dir / "reader.parquet").string(); + 
write_parquet_file(_file_path); + } + + void TearDown() override { std::filesystem::remove_all(_test_dir); } + + std::unique_ptr create_reader() const { + auto system_properties = std::make_unique(); + system_properties->system_type = TFileType::FILE_LOCAL; + auto file_description = std::make_unique(); + file_description->path = _file_path; + file_description->file_size = static_cast(std::filesystem::file_size(_file_path)); + return std::make_unique(system_properties, file_description, + nullptr, nullptr); + } + + std::filesystem::path _test_dir; + std::string _file_path; +}; + +TEST_F(NewParquetReaderTest, GetSchemaReturnsFileLocalColumns) { + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + ASSERT_EQ(schema.size(), 2); + EXPECT_EQ(schema[0].id, 0); + EXPECT_EQ(schema[0].name, "id"); + EXPECT_EQ(schema[0].type->get_primitive_type(), TYPE_INT); + EXPECT_EQ(schema[1].id, 1); + EXPECT_EQ(schema[1].name, "value"); + EXPECT_EQ(schema[1].type->get_primitive_type(), TYPE_STRING); +} + +TEST_F(NewParquetReaderTest, ReadSingleRowGroupThenEof) { + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + Block block = build_file_block(schema); + + auto request = std::make_unique(); + request->non_predicate_columns = {0, 1}; + ASSERT_TRUE(reader->open(request).ok()); + + size_t rows = 0; + bool eof = false; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + EXPECT_FALSE(eof); + ASSERT_EQ(rows, ROW_COUNT); + + const auto& ids = assert_cast(*block.get_by_position(0).column); + const auto& values = assert_cast(*block.get_by_position(1).column); + ASSERT_EQ(ids.size(), ROW_COUNT); + ASSERT_EQ(values.size(), ROW_COUNT); + EXPECT_EQ(ids.get_element(0), 1); + 
EXPECT_EQ(ids.get_element(4), 5); + EXPECT_EQ(values.get_data_at(0).to_string(), "one"); + EXPECT_EQ(values.get_data_at(4).to_string(), "five"); + + rows = 0; + eof = false; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + EXPECT_TRUE(eof); + EXPECT_EQ(rows, 0); +} + +TEST_F(NewParquetReaderTest, ReadMultipleRowGroups) { + write_parquet_file(_file_path, 2); + auto parquet_file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); + ASSERT_EQ(parquet_file_reader->metadata()->num_row_groups(), 3); + + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + auto request = std::make_unique(); + request->non_predicate_columns = {0, 1}; + ASSERT_TRUE(reader->open(request).ok()); + + std::vector ids; + std::vector values; + bool eof = false; + while (!eof) { + Block block = build_file_block(schema); + size_t rows = 0; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + if (rows == 0) { + continue; + } + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& value_column = + assert_cast(*block.get_by_position(1).column); + for (size_t row = 0; row < rows; ++row) { + ids.push_back(id_column.get_element(row)); + values.push_back(value_column.get_data_at(row).to_string()); + } + } + + EXPECT_EQ(ids, std::vector({1, 2, 3, 4, 5})); + EXPECT_EQ(values, std::vector({"one", "two", "three", "four", "five"})); +} + +TEST_F(NewParquetReaderTest, ReadPredicateAndNonPredicateColumnsWithSelection) { + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + Block block = build_file_block(schema); + + auto request = std::make_unique(); + request->predicate_columns = {0}; + request->non_predicate_columns = {1}; + 
reader::FileLocalFilter filter; + filter.file_column_id = 0; + filter.predicates.push_back(create_comparison_predicate( + 0, "id", schema[0].type, Field::create_field(2), false)); + request->local_filters.push_back(std::move(filter)); + ASSERT_TRUE(reader->open(request).ok()); + + size_t rows = 0; + bool eof = false; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + EXPECT_FALSE(eof); + ASSERT_EQ(rows, 3); + + const auto& ids = assert_cast(*block.get_by_position(0).column); + const auto& values = assert_cast(*block.get_by_position(1).column); + ASSERT_EQ(ids.size(), 3); + ASSERT_EQ(values.size(), 3); + EXPECT_EQ(ids.get_element(0), 3); + EXPECT_EQ(ids.get_element(1), 4); + EXPECT_EQ(ids.get_element(2), 5); + EXPECT_EQ(values.get_data_at(0).to_string(), "three"); + EXPECT_EQ(values.get_data_at(1).to_string(), "four"); + EXPECT_EQ(values.get_data_at(2).to_string(), "five"); + + rows = 0; + eof = false; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + EXPECT_TRUE(eof); + EXPECT_EQ(rows, 0); +} + +TEST_F(NewParquetReaderTest, PredicateFiltersRowGroupsByStatistics) { + write_parquet_file(_file_path, 2); + auto parquet_file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); + ASSERT_EQ(parquet_file_reader->metadata()->num_row_groups(), 3); + + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + auto request = std::make_unique(); + request->non_predicate_columns = {0, 1}; + reader::FileLocalFilter filter; + filter.file_column_id = 0; + filter.predicates.push_back(create_comparison_predicate( + 0, "id", schema[0].type, Field::create_field(2), false)); + request->local_filters.push_back(std::move(filter)); + ASSERT_TRUE(reader->open(request).ok()); + + std::vector ids; + std::vector values; + bool eof = false; + while (!eof) { + Block block = build_file_block(schema); + size_t rows 
= 0; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + if (rows == 0) { + continue; + } + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& value_column = + assert_cast(*block.get_by_position(1).column); + for (size_t row = 0; row < rows; ++row) { + ids.push_back(id_column.get_element(row)); + values.push_back(value_column.get_data_at(row).to_string()); + } + } + + EXPECT_EQ(ids, std::vector({3, 4, 5})); + EXPECT_EQ(values, std::vector({"three", "four", "five"})); +} + +} // namespace +} // namespace doris diff --git a/docs/doris-arrow-parquet-reader-implementation.md b/docs/doris-arrow-parquet-reader-implementation.md new file mode 100644 index 00000000000000..b3bfe5c69e04b0 --- /dev/null +++ b/docs/doris-arrow-parquet-reader-implementation.md @@ -0,0 +1,291 @@ +# Doris New Parquet Reader Design And Status + +This document describes the design and current implementation status of the new +Parquet reader under `be/src/format/new_parquet/`. + +The goal of this PR is to build a file-local Parquet reader based on Arrow C++ +Parquet core APIs while keeping Doris-owned `Block` and `Column` as the scan +output. It does not replace the old `vparquet` path yet. + +## Design Goals + +- Use Arrow C++ Parquet core APIs for Parquet file metadata, row group and column + decoding. +- Keep `doris::parquet::ParquetReader` as a file-local reader. +- Do not use `parquet::arrow::FileReader`, `arrow::RecordBatch`, + `arrow::Table` or `arrow::Array` as the scan output path. +- Do not put Iceberg table schema, schema evolution, default columns, generated + columns or partition columns into `ParquetReader`. +- Keep table schema mapping and filter localization in the reader/table layer, + especially `TableColumnMapper`. +- Let the new implementation live in `be/src/format/new_parquet/` so it can + evolve independently from the old `be/src/format/parquet/` implementation. 
+ +## Layering + +```text +TableReader / IcebergTableReader + -> TableColumnMapper + -> reader::FileScanRequest + -> doris::parquet::ParquetReader + -> DorisRandomAccessFile + -> parquet::ParquetFileReader + -> parquet::RowGroupReader + -> parquet::ColumnReader / parquet::internal::RecordReader + -> Doris Block / Column +``` + +`ParquetReader` only consumes file-local information: + +- file-local schema fields; +- file-local projection columns; +- file-local predicate columns; +- file-local `ColumnPredicate` and `VExprContext` filters. + +Any table-level cast, default value, generated column, partition value, Iceberg +field id mapping or schema evolution rule must be handled before or after the +file reader layer. + +## Code Layout + +```text +be/src/format/new_parquet/parquet_reader.h +be/src/format/new_parquet/parquet_reader.cpp +be/src/format/new_parquet/column_reader.h +be/src/format/new_parquet/column_reader.cpp +be/src/format/new_parquet/parquet_statistics.h +be/src/format/new_parquet/parquet_statistics.cpp +``` + +`parquet_reader.*` owns file open, schema export, scan state, row group +scheduling, predicate-first reading and output block assembly. + +`column_reader.*` owns Doris column assembly for one projected Parquet field. It +wraps Arrow Parquet column-level APIs and converts decoded values into Doris +columns. + +`parquet_statistics.*` owns row group statistics pruning. Future page index, +bloom filter and dictionary pruning should also live there rather than being +mixed into the main scan loop. + +## Main Components + +### DorisRandomAccessFile + +`DorisRandomAccessFile` adapts Doris `io::FileReader` to +`arrow::io::RandomAccessFile`. + +It only handles random IO and file size lookup. It does not parse Parquet schema, +does not evaluate filters, and does not carry table-level semantics. + +### ParquetReaderScanState + +`ParquetReaderScanState` is an internal scan state stored in +`parquet_reader.cpp`. 
It tracks: + +- Arrow random access file; +- Arrow Parquet file reader and metadata; +- Parquet schema descriptor; +- selected row groups; +- current row group reader; +- current row group row offset; +- projected file columns; +- predicate columns and non-predicate output columns; +- current row group column readers. + +This state is intentionally private to the Parquet reader implementation. + +### ParquetColumnReader + +`ParquetColumnReader` is Doris's file-local column reader abstraction. It is not +the same as Arrow's `parquet::ColumnReader`. + +Current implementations: + +- `PrimitiveColumnReader` +- `StructColumnReader` + +`PrimitiveColumnReader` supports both the existing Arrow +`parquet::TypedColumnReader` path and the new +`parquet::internal::RecordReader` path for selected primitive reads. + +`StructColumnReader` currently supports basic required struct assembly by +recursively reading child readers. Complex nested selective materialization is +not complete. + +### ParquetColumnReaderFactory + +`ParquetColumnReaderFactory` creates Doris column readers from the current row +group's Arrow Parquet readers and the file-local `ParquetColumnSchema`. + +The factory centralizes reader construction so later work can add reader +options, Dremel assemblers, selected-read policies and cache state without +passing those details through free functions. + +### ParquetStatisticsUtils + +`ParquetStatisticsUtils` compiles file-local predicates into Parquet column +predicate plans and evaluates row group metadata conservatively. + +It only understands Parquet file-local schema and Doris `ColumnPredicate`. It +does not know Iceberg schema, slot descriptors or table schema mapping. + +## Scan Request Semantics + +The new reader consumes `reader::FileScanRequest`. + +Important fields: + +- `predicate_columns`: file-local columns that must be read first to evaluate + filters. 
+- `non_predicate_columns`: file-local projection columns that are only read + after selection is known. +- `projected_columns`: file-local columns that should appear in the output block. +- `local_filters`: file-local filters produced by the table layer. +- `reader_expression_map`: fallback expressions for filters that cannot be + represented as direct file-local predicates. + +The output block is still file-local. It is not a table/global schema block. + +## Predicate Pushdown + +The new Doris reader uses two existing filter representations: + +- `ColumnPredicate`: structured single-column predicates, used for row group + statistics pruning and decoded value filtering. +- `VExprContext`: expression filters, used for fallback expression evaluation + and residual filters. + +Current implementation status: + +- row group min/max pruning is wired through `parquet_statistics.*`; +- supported stats types include boolean, int32, int64, float, double and + string/binary; +- unsupported stats, missing stats or unsafe cases keep the row group; +- `IS NULL` and `IS NOT NULL` pruning use null count when available; +- page index, bloom filter and dictionary pruning are not implemented yet. + +Correctness rule: pruning must be conservative. If the reader cannot prove that +a row group cannot match, it must keep the row group. + +## Lazy Materialization + +The scan loop follows a predicate-first model: + +1. Read predicate columns. +2. Evaluate `ColumnPredicate` and build a selection vector. +3. If a predicate column is also projected, reuse the decoded predicate column. +4. Read non-predicate output columns using the selection. +5. Assemble the file-local output block in projected column order. + +The current selected-read implementation uses Arrow Parquet +`parquet::internal::RecordReader` for supported primitive columns. + +Why this is needed: + +- `parquet::TypedColumnReader::Skip` skips physical values, not SQL rows.
+- For nullable columns, row count and physical value count differ. +- For repeated/nested columns, a row can contain multiple physical values. + +`RecordReader::SkipRecords` and `RecordReader::ReadRecords` provide row-level +movement. Doris compresses the selection vector into row ranges and alternates +skip/read operations. + +Current support: + +- selected read for primitive boolean, int32, int64, float and double when the + RecordReader path is available; +- fallback path reads the whole batch and filters it when selected read is not + supported; +- output columns are skipped when the selection is empty; +- predicate columns are reused when they are also projected. + +Limitations: + +- `parquet::internal::RecordReader` is an Arrow internal/experimental API, so it + must remain hidden behind Doris `ParquetColumnReader`; +- string, decimal and timestamp selected reads still need broader validation; +- nested selected materialization needs a dedicated Dremel assembler. + +## Type Coverage + +Currently implemented: + +- flat required and nullable boolean; +- flat required and nullable int32 / int64; +- flat required and nullable float / double; +- BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY string/binary with Doris-owned memory; +- decimal with precision up to 38 for int32, int64, byte array and fixed-length + byte array physical encodings; +- INT64 timestamp millis and micros into Doris `DateTimeV2`; +- basic required struct assembly. + +Not implemented or incomplete: + +- INT96 timestamp; +- nanosecond timestamp; +- TIMESTAMPTZ semantics; +- DECIMAL256; +- nullable struct; +- list and map; +- complex column pruning; +- complex column lazy materialization. 
+ +## Current Implementation Status + +Implemented in this PR: + +- new `new_parquet` module; +- Arrow-backed Parquet file open and metadata read; +- file-local schema export; +- row group scheduling; +- projected leaf reader creation; +- primitive column decoding into Doris columns; +- string, decimal and INT64 timestamp decoding; +- basic struct reader; +- row group statistics pruning skeleton and initial implementation; +- predicate-first scan flow; +- primitive RecordReader-backed selected materialization; +- Debug BE build fixes. + +Validated: + +- `git diff --check`; +- `BUILD_TYPE=DEBUG ./build.sh --be` on + `fedora:/home/socrates/code/doris`. + +## Future Work + +Near term: + +- add unit tests for primitive required/nullable selected reads; +- validate selection edge cases: empty selection, full selection, sparse + selection and highly fragmented ranges; +- add a selection-rate policy so dense selections can fall back to whole-batch + read plus filter; +- stabilize string, decimal and timestamp selected reads; +- keep Arrow internal API usage isolated in `column_reader.*`. + +Mid term: + +- implement page index pruning in `parquet_statistics.*`; +- implement bloom filter pruning for equality predicates; +- add dictionary-aware filtering where Arrow exposes enough metadata safely; +- expand complex type assembly for nullable struct, list and map; +- add tests for row group pruning correctness and unsupported-type fallback. + +Long term: + +- support nested column pruning; +- support nested lazy materialization; +- support page-level row range selection; +- integrate the new file-local reader with table readers after the API boundary + is stable; +- keep old `vparquet` compatibility until the new path is functionally complete. + +## Key Rule + +`ParquetReader` must remain a file-local reader. 
If a feature requires table +schema, Iceberg schema evolution, partition values, default/generated columns or +final table output semantics, it belongs in `TableColumnMapper` or +`TableReader`, not in `be/src/format/new_parquet/`. From 5dc54d878a2ae257797fecb7400653587bdd0a54 Mon Sep 17 00:00:00 2001 From: Gabriel Date: Wed, 27 May 2026 09:14:47 +0800 Subject: [PATCH 10/38] [improvement](be) Reuse table reader file block (#63704) --- be/src/exec/scan/file_scanner.h | 2 +- be/src/format/new_parquet/parquet_reader.cpp | 49 ++++-- be/src/format/new_parquet/parquet_reader.h | 2 +- be/src/format/reader/column_mapper.cpp | 54 ++++-- be/src/format/reader/column_mapper.h | 1 + be/src/format/reader/file_reader.h | 7 +- be/src/format/reader/table_reader.cpp | 81 ++++++++- be/src/format/reader/table_reader.h | 154 +++++++++++------- .../new_parquet/parquet_reader_test.cpp | 8 +- 9 files changed, 264 insertions(+), 94 deletions(-) diff --git a/be/src/exec/scan/file_scanner.h b/be/src/exec/scan/file_scanner.h index 7c3d9d08b6ad7b..34f59cdee320a7 100644 --- a/be/src/exec/scan/file_scanner.h +++ b/be/src/exec/scan/file_scanner.h @@ -189,7 +189,7 @@ class FileScanner : public Scanner { std::unique_ptr _file_cache_statistics; std::unique_ptr _file_reader_stats; - std::unique_ptr _io_ctx; + std::shared_ptr _io_ctx; // Whether to fill partition columns from path, default is true. 
std::unordered_map> diff --git a/be/src/format/new_parquet/parquet_reader.cpp b/be/src/format/new_parquet/parquet_reader.cpp index fc00484758ecf3..7f442808523ef8 100644 --- a/be/src/format/new_parquet/parquet_reader.cpp +++ b/be/src/format/new_parquet/parquet_reader.cpp @@ -208,10 +208,11 @@ Status ParquetReader::_read_filter_columns(int64_t batch_rows, Block* file_block for (size_t filter_idx = 0; filter_idx < _request->predicate_columns.size(); ++filter_idx) { const int file_field_id = _request->predicate_columns[filter_idx]; auto& column_reader = _state->current_predicate_columns[filter_idx]; - auto column = file_block->get_by_position(column_reader->file_column_id()) - .column->assume_mutable(); - DCHECK_EQ(file_block->get_by_position(column_reader->file_column_id()) - .type->get_primitive_type(), + auto position_it = _request->column_positions.find(file_field_id); + DORIS_CHECK(position_it != _request->column_positions.end()); + const auto block_position = position_it->second; + auto column = file_block->get_by_position(block_position).column->assume_mutable(); + DCHECK_EQ(file_block->get_by_position(block_position).type->get_primitive_type(), column_reader->type()->get_primitive_type()); int64_t column_rows = 0; RETURN_IF_ERROR(column_reader->read(batch_rows, column, &column_rows)); @@ -236,7 +237,7 @@ Status ParquetReader::_read_filter_columns(int64_t batch_rows, Block* file_block } break; } - file_block->replace_by_position(file_field_id, std::move(column)); + file_block->replace_by_position(block_position, std::move(column)); if (*selected_rows == 0) { break; } @@ -313,7 +314,7 @@ Status ParquetReader::_open_next_row_group(bool* has_row_group) { return Status::OK(); } -// `file_block` has a complete struct derived from the file's schema. +// `file_block` has the same layout as FileScanRequest::column_positions. 
Status ParquetReader::_read_current_row_group_batch(int64_t batch_rows, Block* file_block, size_t* rows) { if (_state->current_predicate_columns.empty() && @@ -331,9 +332,12 @@ Status ParquetReader::_read_current_row_group_batch(int64_t batch_rows, Block* f if (need_filter_output) { IColumn::Filter output_filter = _selection_to_filter(selection, selected_rows, batch_rows); for (const auto file_field_id : _request->predicate_columns) { + auto position_it = _request->column_positions.find(file_field_id); + DORIS_CHECK(position_it != _request->column_positions.end()); + const auto block_position = position_it->second; RETURN_IF_CATCH_EXCEPTION(file_block->replace_by_position( - file_field_id, file_block->get_by_position(file_field_id) - .column->filter(output_filter, selected_rows))); + block_position, file_block->get_by_position(block_position) + .column->filter(output_filter, selected_rows))); } } @@ -341,9 +345,12 @@ Status ParquetReader::_read_current_row_group_batch(int64_t batch_rows, Block* f for (size_t output_idx = 0; output_idx < _state->current_non_predicate_columns.size(); ++output_idx) { auto& column_reader = _state->current_non_predicate_columns[output_idx]; - auto col = file_block->get_columns()[column_reader->file_column_id()]->assume_mutable(); - DCHECK_EQ(file_block->get_by_position(column_reader->file_column_id()) - .type->get_primitive_type(), + auto position_it = + _request->column_positions.find(_request->non_predicate_columns[output_idx]); + DORIS_CHECK(position_it != _request->column_positions.end()); + const auto block_position = position_it->second; + auto col = file_block->get_columns()[block_position]->assume_mutable(); + DCHECK_EQ(file_block->get_by_position(block_position).type->get_primitive_type(), column_reader->type()->get_primitive_type()); if (need_filter_output) { [[maybe_unused]] auto old_size = col->size(); @@ -368,7 +375,7 @@ Status ParquetReader::_read_current_row_group_batch(int64_t batch_rows, Block* f return Status::OK(); } 
-ParquetReader::ParquetReader(std::unique_ptr& system_properties, +ParquetReader::ParquetReader(std::shared_ptr& system_properties, std::unique_ptr& file_description, std::shared_ptr io_ctx, RuntimeProfile* profile) : FileReader(system_properties, file_description, io_ctx, profile) {} @@ -424,7 +431,25 @@ Status ParquetReader::open(std::unique_ptr& request) { } RETURN_IF_ERROR(reader::FileReader::open(request)); + // `_request->column_positions.empty()` means all columns are needed by table reader + if (_request->column_positions.empty()) { + for (const auto file_column_id : _request->predicate_columns) { + _request->column_positions.emplace(file_column_id, file_column_id); + } + for (const auto file_column_id : _request->non_predicate_columns) { + _request->column_positions.emplace(file_column_id, file_column_id); + } + } + const int num_fields = static_cast(_state->file_schema.size()); + for (const auto file_column_id : _request->predicate_columns) { + DORIS_CHECK(_request->column_positions.count(file_column_id) > 0); + DORIS_CHECK(file_column_id >= 0 && file_column_id < num_fields); + } + for (const auto file_column_id : _request->non_predicate_columns) { + DORIS_CHECK(_request->column_positions.count(file_column_id) > 0); + DORIS_CHECK(file_column_id >= 0 && file_column_id < num_fields); + } for (const auto& local_filter : _request->local_filters) { if (local_filter.file_column_id < 0 || local_filter.file_column_id >= num_fields) { return Status::InvalidArgument("Invalid parquet filter top-level field id {}", diff --git a/be/src/format/new_parquet/parquet_reader.h b/be/src/format/new_parquet/parquet_reader.h index 426960a4dfd042..6920f8c4d78a61 100644 --- a/be/src/format/new_parquet/parquet_reader.h +++ b/be/src/format/new_parquet/parquet_reader.h @@ -45,7 +45,7 @@ struct ParquetScanRequest : public reader::FileScanRequest {}; // schema,不处理 table-level cast/default/generated/partition 语义。 class ParquetReader : public reader::FileReader { public: - 
ParquetReader(std::unique_ptr& system_properties, + ParquetReader(std::shared_ptr& system_properties, std::unique_ptr& file_description, std::shared_ptr io_ctx, RuntimeProfile* profile); ~ParquetReader() override; diff --git a/be/src/format/reader/column_mapper.cpp b/be/src/format/reader/column_mapper.cpp index 0eed9d3e566b1b..b2453dbbfaf61c 100644 --- a/be/src/format/reader/column_mapper.cpp +++ b/be/src/format/reader/column_mapper.cpp @@ -17,6 +17,7 @@ #include "format/reader/column_mapper.h" +#include #include #include "common/status.h" @@ -30,6 +31,31 @@ namespace doris::reader { static constexpr const char* ROW_LINEAGE_ROW_ID = "_row_id"; static constexpr const char* ROW_LINEAGE_LAST_UPDATED_SEQ_NUMBER = "_last_updated_sequence_number"; +static void add_scan_column(FileScanRequest* file_request, ColumnId file_column_id, + std::vector* scan_columns) { + if (file_request->column_positions.count(file_column_id) == 0) { + file_request->column_positions.emplace(file_column_id, + file_request->column_positions.size()); + scan_columns->push_back(file_column_id); + } +} + +static void rebuild_projection(ColumnMapping* mapping, size_t block_position) { + DORIS_CHECK(mapping->file_column_id.has_value()); + if (mapping->is_trivial) { + mapping->projection = VExprContext::create_shared(TableSlotRef::create_shared( + cast_set(block_position), cast_set(block_position), -1, + mapping->file_type, mapping->file_column_name)); + return; + } + + auto expr = Cast::create_shared(mapping->table_type); + expr->add_child(TableSlotRef::create_shared(cast_set(block_position), + cast_set(block_position), -1, + mapping->file_type, mapping->file_column_name)); + mapping->projection = VExprContext::create_shared(expr); +} + Status TableColumnMapper::create_mapping(const std::vector& projected_columns, const std::map& partition_values, const std::vector& file_schema) { @@ -40,21 +66,9 @@ Status TableColumnMapper::create_mapping(const std::vector& project mapping.table_type = 
table_column.type; if (const auto* file_field = _find_file_field(table_column, file_schema)) { mapping.file_column_id = file_field->id; + mapping.file_column_name = file_field->name; mapping.file_type = file_field->type; mapping.is_trivial = _is_same_type(mapping.table_type, mapping.file_type); - if (!mapping.is_trivial) { - // 1. Data type mismatch (caused by schema evolution) and casting is needed. - auto expr = Cast::create_shared(mapping.table_type); - expr->add_child(TableSlotRef::create_shared(mapping.file_column_id.value(), - mapping.file_column_id.value(), -1, - mapping.file_type, file_field->name)); - mapping.projection = VExprContext::create_shared(expr); - } else { - // 2. Data type matches, trivial mapping. - mapping.projection = VExprContext::create_shared(TableSlotRef::create_shared( - mapping.file_column_id.value(), mapping.file_column_id.value(), -1, - mapping.file_type, file_field->name)); - } } else if (table_column.is_partition_key && partition_values.count(table_column.name) > 0) { // 3. Partition column, use partition value as a constant mapping. Note that partition column may also have default expression, but partition value should take precedence if it exists. 
mapping.default_expr = VExprContext::create_shared(TableLiteral::create_shared( @@ -91,17 +105,27 @@ Status TableColumnMapper::create_scan_request(const std::mappredicate_columns.clear(); file_request->non_predicate_columns.clear(); + file_request->column_positions.clear(); file_request->local_filters.clear(); file_request->reader_expression_map.clear(); for (const auto& table_column : projected_columns) { const auto* mapping = _find_mapping(table_column.id); if (mapping != nullptr && mapping->file_column_id.has_value()) { if (table_filters.count(table_column.id) == 0) { - file_request->non_predicate_columns.push_back(*mapping->file_column_id); + add_scan_column(file_request, *mapping->file_column_id, + &file_request->non_predicate_columns); } } } RETURN_IF_ERROR(localize_filters(table_filters, file_request)); + for (auto& mapping : _mappings) { + if (!mapping.file_column_id.has_value()) { + continue; + } + auto position_it = file_request->column_positions.find(*mapping.file_column_id); + DORIS_CHECK(position_it != file_request->column_positions.end()); + rebuild_projection(&mapping, position_it->second); + } return Status::OK(); } @@ -124,7 +148,7 @@ Status TableColumnMapper::localize_filters(const std::map& local_filter.predicates = it.second.predicates; file_request->local_filters.push_back(std::move(local_filter)); } - file_request->predicate_columns.push_back(*mapping->file_column_id); + add_scan_column(file_request, *mapping->file_column_id, &file_request->predicate_columns); } return Status::OK(); } diff --git a/be/src/format/reader/column_mapper.h b/be/src/format/reader/column_mapper.h index d0d8076798bfcf..4360b23e7de147 100644 --- a/be/src/format/reader/column_mapper.h +++ b/be/src/format/reader/column_mapper.h @@ -51,6 +51,7 @@ enum TableVirtualColumnType { struct ColumnMapping { int32_t table_column_id = -1; std::optional file_column_id; + std::string file_column_name; DataTypePtr file_type; DataTypePtr table_type; diff --git 
a/be/src/format/reader/file_reader.h b/be/src/format/reader/file_reader.h index 7fbdb9b576c38a..0b0527535e5a95 100644 --- a/be/src/format/reader/file_reader.h +++ b/be/src/format/reader/file_reader.h @@ -95,6 +95,7 @@ struct FileScanRequest { std::vector predicate_columns; std::vector non_predicate_columns; + std::map column_positions; std::vector local_filters; // fallback path if filters cannot be localized to file-local predicates. The expression can reference projected_file_columns and partition columns. std::vector> reader_expression_map; @@ -136,10 +137,10 @@ class FileReader { int64_t bloom_filter_read_time = 0; }; - FileReader(std::unique_ptr& system_properties, + FileReader(std::shared_ptr& system_properties, std::unique_ptr& file_description, std::shared_ptr io_ctx, RuntimeProfile* profile) - : _system_properties(std::move(system_properties)), + : _system_properties(system_properties), _file_description(std::move(file_description)), _io_ctx(io_ctx), _profile(profile) {} @@ -196,7 +197,7 @@ class FileReader { std::unique_ptr _request; bool _eof = true; ReaderStatistics _reader_statistics; - std::unique_ptr _system_properties; + std::shared_ptr _system_properties; std::unique_ptr _file_description; std::shared_ptr _io_ctx; RuntimeProfile* _profile = nullptr; diff --git a/be/src/format/reader/table_reader.cpp b/be/src/format/reader/table_reader.cpp index b89641c0bd2ee1..13f093228e6e70 100644 --- a/be/src/format/reader/table_reader.cpp +++ b/be/src/format/reader/table_reader.cpp @@ -17,16 +17,95 @@ #include "format/reader/table_reader.h" +#include +#include + #include #include "common/status.h" +#include "format/new_parquet/parquet_reader.h" #include "format/reader/column_mapper.h" #include "format/table/deletion_vector_reader.h" +#include "io/io_common.h" namespace doris::reader { +std::shared_ptr create_system_properties( + const TFileScanRangeParams* scan_params) { + auto system_properties = std::make_shared(); + if (scan_params == nullptr || 
!scan_params->__isset.file_type) { + system_properties->system_type = TFileType::FILE_LOCAL; + return system_properties; + } + system_properties->system_type = scan_params->file_type; + system_properties->properties = scan_params->properties; + system_properties->hdfs_params = scan_params->hdfs_params; + if (scan_params->__isset.broker_addresses) { + system_properties->broker_addresses.assign(scan_params->broker_addresses.begin(), + scan_params->broker_addresses.end()); + } + return system_properties; +} + +Status TableReader::init(TableReadOptions options) { + _scan_params = options.scan_params; + _format = options.format; + _io_ctx = options.io_ctx; + _runtime_state = options.runtime_state; + _scanner_profile = options.scanner_profile; + _projected_columns = std::move(options.projected_columns); + _system_properties = create_system_properties(_scan_params); + _profile = std::move(options.profile); + TableColumnMapperOptions mapper_options; + mapper_options.mode = TableColumnMappingMode::BY_FIELD_ID; + _data_reader.column_mapper = TableColumnMapper(mapper_options); + // TODO: + // _table_filters = build_table_filters_from_conjuncts(options.conjuncts); + return Status::OK(); +} + +Status TableReader::create_next_reader(bool* eos) { + DCHECK(_data_reader.reader == nullptr); + if (_current_task == nullptr) { + *eos = true; + return Status::OK(); + } + + switch (_format) { + case FileFormat::PARQUET: { + _data_reader.reader = std::make_unique( + _system_properties, _current_task->data_file, _io_ctx, _scanner_profile); + break; + } + case FileFormat::ORC: + case FileFormat::CSV: + return Status::NotSupported("TableReader does not support file format {}", + static_cast(_format)); + } + + RETURN_IF_ERROR(_data_reader.reader->init(_runtime_state)); + RETURN_IF_ERROR(open_reader()); + *eos = false; + return Status::OK(); +} + +std::unique_ptr create_file_description(const TFileRangeDesc& range) { + auto file_description = std::make_unique(); + file_description->path = 
range.path; + file_description->file_size = range.__isset.file_size ? range.file_size : -1; + if (range.__isset.fs_name) { + file_description->fs_name = range.fs_name; + } + if (range.__isset.file_cache_admission) { + file_description->file_cache_admission = range.file_cache_admission; + } + return file_description; +} + Status TableReader::prepare_split(const SplitReadOptions& options) { _partition_values = std::move(options.partition_values); + _current_task = std::make_unique(); + _current_task->data_file = create_file_description(options.current_range); return _parse_delete_predicates(options); } @@ -39,7 +118,7 @@ Status TableReader::_parse_delete_predicates(const SplitReadOptions& options) { auto* delete_rows = new DeleteRows; DeletionVectorReader dv_reader(_runtime_state, _scanner_profile, *_scan_params, desc, - _io_ctx); + _io_ctx.get()); create_status = dv_reader.open(); if (!create_status.ok()) [[unlikely]] { return nullptr; diff --git a/be/src/format/reader/table_reader.h b/be/src/format/reader/table_reader.h index 7572383b8ad213..53791747faf67f 100644 --- a/be/src/format/reader/table_reader.h +++ b/be/src/format/reader/table_reader.h @@ -19,10 +19,8 @@ #include -#include -#include +#include #include -#include #include #include #include @@ -34,8 +32,8 @@ #include "exprs/vexpr_fwd.h" #include "format/reader/column_mapper.h" #include "format/reader/expr/delete_predicate.h" -#include "format/reader/expr/literal.h" #include "format/reader/file_reader.h" +#include "runtime/descriptors.h" namespace doris { class Block; @@ -91,7 +89,7 @@ struct BaseDataFile { struct ScanTask { virtual ~ScanTask() = default; - std::unique_ptr data_file; + std::unique_ptr data_file; }; struct ReadProfile { @@ -106,11 +104,9 @@ struct TableReadOptions { const VExprContext conjuncts; const FileFormat format; TFileScanRangeParams* scan_params; - io::IOContext* io_ctx; + std::shared_ptr io_ctx; RuntimeState* runtime_state; RuntimeProfile* scanner_profile; - // Each task denotes a 
descriptor of a single file to read, along with file-level metadata such as stats and delete files. - std::vector> scan_tasks; std::unique_ptr profile; }; @@ -130,22 +126,7 @@ class TableReader { // 初始化 table reader 的通用运行参数。 // 子类可以在自己的 init(options) 中调用该方法;这里不接收具体表格式 schema/task。 - virtual Status init(TableReadOptions options) { - _scan_params = options.scan_params; - _format = options.format; - _io_ctx = options.io_ctx; - _runtime_state = options.runtime_state; - _scanner_profile = options.scanner_profile; - _scan_tasks = std::move(_options.scan_tasks); - _next_task_idx = 0; - _profile = std::move(options.profile); - TableColumnMapperOptions mapper_options; - mapper_options.mode = TableColumnMappingMode::BY_FIELD_ID; - _data_reader.column_mapper = TableColumnMapper(mapper_options); - // TODO: - // _table_filters = build_table_filters_from_conjuncts(options.conjuncts); - return Status::OK(); - } + virtual Status init(TableReadOptions options); // 读取当前 split/partition 之前初始化。 virtual Status prepare_split(const SplitReadOptions& options); @@ -166,7 +147,13 @@ class TableReader { // 基类负责 current reader 的打开、EOF 后切换和关闭;子类只实现 protected hook。 // table_block 的列必须已经是 table/global schema 语义。 Status get_block(Block* block, bool* eos) { - while (block->empty() && !*eos) { + DORIS_CHECK(block->columns() == _projected_columns.size()); + block->clear_column_data(_projected_columns.size()); + + while (true) { + if (*eos) { + return Status::OK(); + } if (!_data_reader.reader) { RETURN_IF_ERROR(create_next_reader(eos)); if (!_data_reader.reader) { @@ -176,23 +163,25 @@ class TableReader { } bool current_eof = false; - Block current_block; - for (const auto& field : _data_reader.block_schema) { - // TODO: reuse column's memory - current_block.insert({field.type->create_column(), field.type, field.name}); - } + _data_reader.block_template.clear_column_data(); size_t current_rows = 0; - RETURN_IF_ERROR( - _data_reader.reader->get_block(&current_block, &current_rows, &current_eof)); - if (current_rows == 
0 && !current_eof) { + RETURN_IF_ERROR(_data_reader.reader->get_block(&_data_reader.block_template, + &current_rows, &current_eof)); + if (current_rows == 0) { + if (current_eof) { + RETURN_IF_ERROR(close_current_reader()); + } continue; } + DCHECK_EQ(_data_reader.block_template.columns(), _data_reader.scan_schema.size()); + DORIS_CHECK(block->columns() == _data_reader.column_mapper.mappings().size()); size_t idx = 0; for (const auto& mapping : _data_reader.column_mapper.mappings()) { - int res_id; - RETURN_IF_ERROR(mapping.projection->execute(&current_block, &res_id)); - block->replace_by_position(idx, current_block.get_columns()[res_id]); + ColumnPtr column; + RETURN_IF_ERROR(_materialize_mapping_column( + mapping, &_data_reader.block_template, current_rows, &column)); + block->replace_by_position(idx, std::move(column)); idx++; } RETURN_IF_ERROR(finalize_chunk(block)); @@ -200,8 +189,8 @@ class TableReader { if (current_eof) { RETURN_IF_ERROR(close_current_reader()); } + return Status::OK(); } - return Status::OK(); } // 关闭 table reader 及当前正在读取的底层 reader。 @@ -219,20 +208,7 @@ class TableReader { } // 切换到下一个 reader 的通用流程。 // 该方法先关闭当前 reader,再打开下一个具体 reader;子类不应重复实现这个循环。 - Status create_next_reader(bool* eos) { - // 多文件切换的公共流程留在基类:关闭当前 reader,然后打开下一个 reader。 - DCHECK(_data_reader.reader == nullptr); - // TODO: 创建_data_reader - // _data_reader = std::make_unique(...); - if (!_data_reader.reader) { - if (eos != nullptr) { - *eos = true; - } - return Status::OK(); - } - RETURN_IF_ERROR(open_reader()); - return Status::OK(); - } + Status create_next_reader(bool* eos); // 打开当前具体 reader。 // 子类在这里基于当前 split/task 初始化底层 FileReader。 @@ -240,13 +216,29 @@ class TableReader { std::vector file_schema; RETURN_IF_ERROR(_data_reader.reader->get_schema(&file_schema)); _data_reader.block_schema = file_schema; - RETURN_IF_ERROR(_data_reader.column_mapper.create_mapping(_options.projected_columns, + RETURN_IF_ERROR(_data_reader.column_mapper.create_mapping(_projected_columns, _partition_values, 
file_schema)); + DORIS_CHECK(_data_reader.column_mapper.mappings().size() == _projected_columns.size()); auto file_request = std::make_unique(); RETURN_IF_ERROR(_data_reader.column_mapper.create_scan_request( - _table_filters, _options.projected_columns, file_request.get())); + _table_filters, _projected_columns, file_request.get())); + _data_reader.scan_schema.clear(); + _data_reader.block_template.clear(); + _data_reader.scan_schema.resize(file_request->column_positions.size()); + for (const auto& [file_column_id, block_position] : file_request->column_positions) { + DORIS_CHECK(block_position < _data_reader.scan_schema.size()); + const auto* field = _find_schema_field(_data_reader.block_schema, file_column_id); + DORIS_CHECK(field != nullptr); + _data_reader.scan_schema[block_position] = *field; + } + _data_reader.block_template.reserve(_data_reader.scan_schema.size()); + for (const auto& field : _data_reader.scan_schema) { + _data_reader.block_template.insert( + {field.type->create_column(), field.type, field.name}); + } RETURN_IF_ERROR(_data_reader.reader->open(file_request)); + RETURN_IF_ERROR(_open_mapping_exprs()); return Status::OK(); } @@ -257,6 +249,9 @@ class TableReader { _data_reader.reader.reset(); _data_reader.column_mapper.clear(); _data_reader.block_schema.clear(); + _data_reader.scan_schema.clear(); + _data_reader.block_template.clear(); + _current_task.reset(); return Status::OK(); } @@ -272,28 +267,73 @@ class TableReader { return Status::OK(); } + Status _materialize_mapping_column(const ColumnMapping& mapping, Block* current_block, + size_t current_rows, ColumnPtr* column) { + if (mapping.projection != nullptr) { + int res_id; + RETURN_IF_ERROR(mapping.projection->execute(current_block, &res_id)); + *column = current_block->get_columns()[res_id]; + return Status::OK(); + } + if (mapping.default_expr != nullptr) { + int res_id; + RETURN_IF_ERROR(mapping.default_expr->execute(current_block, &res_id)); + *column = 
current_block->get_columns()[res_id]; + return Status::OK(); + } + *column = mapping.table_type->create_column_const_with_default_value(current_rows); + return Status::OK(); + } + + Status _open_mapping_exprs() { + RowDescriptor row_desc; + for (const auto& mapping : _data_reader.column_mapper.mappings()) { + if (mapping.projection != nullptr) { + RETURN_IF_ERROR(mapping.projection->prepare(_runtime_state, row_desc)); + RETURN_IF_ERROR(mapping.projection->open(_runtime_state)); + } + if (mapping.default_expr != nullptr) { + RETURN_IF_ERROR(mapping.default_expr->prepare(_runtime_state, row_desc)); + RETURN_IF_ERROR(mapping.default_expr->open(_runtime_state)); + } + } + return Status::OK(); + } + struct DataReader { std::unique_ptr reader; TableColumnMapper column_mapper; std::vector block_schema; + std::vector scan_schema; + Block block_template; }; DataReader _data_reader; - TableReadOptions _options; - std::vector> _scan_tasks; + std::vector _projected_columns; + std::unique_ptr _current_task; + std::shared_ptr _system_properties; // partition key -> value std::map _partition_values; - size_t _next_task_idx = 0; std::map _table_filters; std::unique_ptr _profile; // Parsed from DELETION_VECTOR in Iceberg and Paimon DeleteRows* _delete_rows; TFileScanRangeParams* _scan_params; - io::IOContext* _io_ctx; + std::shared_ptr _io_ctx; RuntimeState* _runtime_state; RuntimeProfile* _scanner_profile; FileFormat _format; private: + static const SchemaField* _find_schema_field(const std::vector& schema, + ColumnId column_id) { + for (const auto& field : schema) { + if (field.id == column_id) { + return &field; + } + } + return nullptr; + } + Status _parse_delete_predicates(const SplitReadOptions& options); }; diff --git a/be/test/format/new_parquet/parquet_reader_test.cpp b/be/test/format/new_parquet/parquet_reader_test.cpp index 2086cd8cfb7dac..e805243ca22952 100644 --- a/be/test/format/new_parquet/parquet_reader_test.cpp +++ 
b/be/test/format/new_parquet/parquet_reader_test.cpp @@ -98,10 +98,10 @@ Block build_file_block(const std::vector& schema) { class TestFileReader final : public reader::FileReader { public: - TestFileReader(std::unique_ptr& system_properties, + TestFileReader(std::shared_ptr& system_properties, std::unique_ptr& file_description, std::shared_ptr io_ctx) - : reader::FileReader(system_properties, file_description, std::move(io_ctx), nullptr) {} + : reader::FileReader(system_properties, file_description, io_ctx, nullptr) {} Status get_schema(std::vector* file_schema) const override { file_schema->clear(); @@ -119,7 +119,7 @@ class TestFileReader final : public reader::FileReader { }; TEST(FileReaderTest, OpenStoresRequestAndCloseClearsState) { - auto system_properties = std::make_unique(); + auto system_properties = std::make_shared(); system_properties->system_type = TFileType::FILE_LOCAL; auto file_description = std::make_unique(); auto io_ctx = std::make_shared(); @@ -149,7 +149,7 @@ class NewParquetReaderTest : public testing::Test { void TearDown() override { std::filesystem::remove_all(_test_dir); } std::unique_ptr create_reader() const { - auto system_properties = std::make_unique(); + auto system_properties = std::make_shared(); system_properties->system_type = TFileType::FILE_LOCAL; auto file_description = std::make_unique(); file_description->path = _file_path; From e65a09e8b2a1246cbaa264bad5fd3a2966a700bb Mon Sep 17 00:00:00 2001 From: Gabriel Date: Wed, 27 May 2026 09:49:33 +0800 Subject: [PATCH 11/38] [test](be) Cover parquet conjunct local filter (#63705) Problem Summary: NewParquetReaderTest only populated FileLocalFilter::predicates for local predicate filtering. Parquet row group pruning still uses predicates, while row filtering now uses conjunct, so the tests need to populate both with matching semantics. 
--- be/src/format/new_parquet/parquet_reader.cpp | 56 +++++++++---------- be/src/format/new_parquet/parquet_reader.h | 6 +- .../new_parquet/parquet_reader_test.cpp | 44 ++++++++++++++- 3 files changed, 73 insertions(+), 33 deletions(-) diff --git a/be/src/format/new_parquet/parquet_reader.cpp b/be/src/format/new_parquet/parquet_reader.cpp index 7f442808523ef8..ff9d939b4d064e 100644 --- a/be/src/format/new_parquet/parquet_reader.cpp +++ b/be/src/format/new_parquet/parquet_reader.cpp @@ -31,6 +31,7 @@ #include "common/exception.h" #include "core/block/block.h" #include "core/data_type/data_type_nullable.h" +#include "exprs/vexpr_context.h" #include "format/new_parquet/column_reader.h" #include "format/new_parquet/parquet_column_schema.h" #include "format/new_parquet/parquet_statistics.h" @@ -193,13 +194,8 @@ void ParquetReader::_fill_schema_field(const ParquetColumnSchema& column_schema, } } -bool ParquetReader::_has_structured_filter(const reader::FileLocalFilter& local_filter) { - for (const auto& predicate : local_filter.predicates) { - if (predicate != nullptr) { - return true; - } - } - return false; +bool ParquetReader::_has_expression_filter(const reader::FileLocalFilter& local_filter) { + return local_filter.conjunct != nullptr; } Status ParquetReader::_read_filter_columns(int64_t batch_rows, Block* file_block, @@ -220,24 +216,27 @@ Status ParquetReader::_read_filter_columns(int64_t batch_rows, Block* file_block return Status::Corruption("Parquet filter column {} returned {} rows, expected {} rows", column_reader->name(), column_rows, batch_rows); } + file_block->replace_by_position(block_position, std::move(column)); for (const auto& local_filter : _request->local_filters) { if (local_filter.file_column_id != file_field_id || - !_has_structured_filter(local_filter)) { + !_has_expression_filter(local_filter)) { continue; } if (*selected_rows == 0) { break; } - for (const auto& predicate : local_filter.predicates) { - *selected_rows = 
predicate->evaluate(*column, selection->data(), *selected_rows); - if (*selected_rows == 0) { - break; - } - } + IColumn::Filter filter(static_cast(batch_rows), 1); + bool can_filter_all = false; + RETURN_IF_ERROR(local_filter.conjunct->execute_filter( + file_block, filter.data(), static_cast(batch_rows), false, + &can_filter_all)); + *selected_rows = + can_filter_all + ? 0 + : _apply_filter_to_selection(filter, selection, *selected_rows); break; } - file_block->replace_by_position(block_position, std::move(column)); if (*selected_rows == 0) { break; } @@ -245,18 +244,6 @@ Status ParquetReader::_read_filter_columns(int64_t batch_rows, Block* file_block return Status::OK(); } -Status ParquetReader::_validate_supported_local_filters( - const std::vector& local_filters) { - for (const auto& local_filter : local_filters) { - if (local_filter.conjunct != nullptr) { - return Status::NotSupported( - "Parquet expression filter fallback is not implemented for field {}", - local_filter.file_column_id); - } - } - return Status::OK(); -} - IColumn::Filter ParquetReader::_selection_to_filter(const SelectionVector& selection, uint16_t selected_rows, int64_t batch_rows) { IColumn::Filter filter(static_cast(batch_rows), 0); @@ -266,6 +253,19 @@ IColumn::Filter ParquetReader::_selection_to_filter(const SelectionVector& selec return filter; } +uint16_t ParquetReader::_apply_filter_to_selection(const IColumn::Filter& filter, + SelectionVector* selection, + uint16_t selected_rows) { + uint16_t new_selected_rows = 0; + for (uint16_t selection_idx = 0; selection_idx < selected_rows; ++selection_idx) { + const auto row_idx = selection->get_index(selection_idx); + if (filter[row_idx] != 0) { + selection->set_index(new_selected_rows++, static_cast(row_idx)); + } + } + return new_selected_rows; +} + Status ParquetReader::_open_next_row_group(bool* has_row_group) { *has_row_group = false; while (_state->next_row_group_idx < _state->selected_row_groups.size()) { @@ -456,8 +456,6 @@ Status 
ParquetReader::open(std::unique_ptr& request) { local_filter.file_column_id); } } - RETURN_IF_ERROR(_validate_supported_local_filters(_request->local_filters)); - RETURN_IF_ERROR(select_row_groups_by_statistics(*_state->metadata, _state->file_schema, *_request, &_state->selected_row_groups)); RETURN_IF_ERROR(_reset_reader_position()); diff --git a/be/src/format/new_parquet/parquet_reader.h b/be/src/format/new_parquet/parquet_reader.h index 6920f8c4d78a61..40213ebb0d68da 100644 --- a/be/src/format/new_parquet/parquet_reader.h +++ b/be/src/format/new_parquet/parquet_reader.h @@ -121,13 +121,13 @@ class ParquetReader : public reader::FileReader { void _reset_current_row_group(); void _fill_schema_field(const ParquetColumnSchema& column_schema, reader::SchemaField* field) const; - bool _has_structured_filter(const reader::FileLocalFilter& local_filter); + bool _has_expression_filter(const reader::FileLocalFilter& local_filter); Status _read_filter_columns(int64_t batch_rows, Block* file_block, SelectionVector* selection, uint16_t* selected_rows); - Status _validate_supported_local_filters( - const std::vector& local_filters); IColumn::Filter _selection_to_filter(const SelectionVector& selection, uint16_t selected_rows, int64_t batch_rows); + uint16_t _apply_filter_to_selection(const IColumn::Filter& filter, SelectionVector* selection, + uint16_t selected_rows); Status _open_next_row_group(bool* has_row_group); Status _read_current_row_group_batch(int64_t batch_rows, Block* file_block, size_t* rows); diff --git a/be/test/format/new_parquet/parquet_reader_test.cpp b/be/test/format/new_parquet/parquet_reader_test.cpp index e805243ca22952..7e28b7fce5b25a 100644 --- a/be/test/format/new_parquet/parquet_reader_test.cpp +++ b/be/test/format/new_parquet/parquet_reader_test.cpp @@ -33,6 +33,8 @@ #include "core/data_type/data_type_number.h" #include "core/data_type/primitive_type.h" #include "core/field.h" +#include "exprs/vexpr.h" +#include "exprs/vexpr_context.h" #include 
"format/new_parquet/parquet_reader.h" #include "format/reader/file_reader.h" #include "gen_cpp/Types_types.h" @@ -45,6 +47,43 @@ namespace { constexpr int64_t ROW_COUNT = 5; +class Int32GreaterThanExpr final : public VExpr { +public: + Int32GreaterThanExpr(int column_id, int32_t value) + : VExpr(std::make_shared(), false), + _column_id(column_id), + _value(value) {} + + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override { + const auto& input = + assert_cast(*block->get_by_position(_column_id).column); + auto result = ColumnUInt8::create(); + auto& result_data = result->get_data(); + result_data.resize(count); + for (size_t row = 0; row < count; ++row) { + const size_t input_row = selector == nullptr ? row : (*selector)[row]; + result_data[row] = input.get_element(input_row) > _value; + } + result_column = std::move(result); + return Status::OK(); + } + + const std::string& expr_name() const override { return _expr_name; } + +private: + const int _column_id; + const int32_t _value; + const std::string _expr_name = "Int32GreaterThanExpr"; +}; + +VExprContextSPtr create_int32_greater_than_conjunct(int column_id, int32_t value) { + auto ctx = VExprContext::create_shared(std::make_shared(column_id, value)); + ctx->_prepared = true; + ctx->_opened = true; + return ctx; +} + std::shared_ptr finish_array(arrow::ArrayBuilder* builder) { std::shared_ptr array; EXPECT_TRUE(builder->Finish(&array).ok()); @@ -265,6 +304,7 @@ TEST_F(NewParquetReaderTest, ReadPredicateAndNonPredicateColumnsWithSelection) { request->non_predicate_columns = {1}; reader::FileLocalFilter filter; filter.file_column_id = 0; + filter.conjunct = create_int32_greater_than_conjunct(0, 2); filter.predicates.push_back(create_comparison_predicate( 0, "id", schema[0].type, Field::create_field(2), false)); request->local_filters.push_back(std::move(filter)); @@ -306,9 +346,11 @@ TEST_F(NewParquetReaderTest, 
PredicateFiltersRowGroupsByStatistics) { std::vector schema; ASSERT_TRUE(reader->get_schema(&schema).ok()); auto request = std::make_unique(); - request->non_predicate_columns = {0, 1}; + request->predicate_columns = {0}; + request->non_predicate_columns = {1}; reader::FileLocalFilter filter; filter.file_column_id = 0; + filter.conjunct = create_int32_greater_than_conjunct(0, 2); filter.predicates.push_back(create_comparison_predicate( 0, "id", schema[0].type, Field::create_field(2), false)); request->local_filters.push_back(std::move(filter)); From 4fe7254976e50e96edd2514bbe371fd8cfbc8b6d Mon Sep 17 00:00:00 2001 From: Gabriel Date: Wed, 27 May 2026 10:33:29 +0800 Subject: [PATCH 12/38] Refactor 0527 (#63712) --- be/test/format/reader/table_reader_test.cpp | 303 ++++++++++++++++++++ 1 file changed, 303 insertions(+) create mode 100644 be/test/format/reader/table_reader_test.cpp diff --git a/be/test/format/reader/table_reader_test.cpp b/be/test/format/reader/table_reader_test.cpp new file mode 100644 index 00000000000000..84c5700fc4c1ac --- /dev/null +++ b/be/test/format/reader/table_reader_test.cpp @@ -0,0 +1,303 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "format/reader/table_reader.h" + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "core/assert_cast.h" +#include "core/block/block.h" +#include "core/column/column_string.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_number.h" +#include "core/data_type/data_type_string.h" +#include "gen_cpp/PlanNodes_types.h" +#include "runtime/runtime_state.h" + +namespace doris::reader { +namespace { + +std::shared_ptr finish_array(arrow::ArrayBuilder* builder) { + std::shared_ptr array; + EXPECT_TRUE(builder->Finish(&array).ok()); + return array; +} + +std::shared_ptr build_int32_array(const std::vector& values) { + arrow::Int32Builder builder; + for (const auto value : values) { + EXPECT_TRUE(builder.Append(value).ok()); + } + return finish_array(&builder); +} + +std::shared_ptr build_string_array(const std::vector& values) { + arrow::StringBuilder builder; + for (const auto& value : values) { + EXPECT_TRUE(builder.Append(value).ok()); + } + return finish_array(&builder); +} + +void write_parquet_file(const std::string& file_path, int32_t id, const std::string& value) { + auto schema = arrow::schema({ + arrow::field("id", arrow::int32(), false), + arrow::field("value", arrow::utf8(), false), + }); + auto table = + arrow::Table::Make(schema, {build_int32_array({id}), build_string_array({value})}); + + auto file_result = arrow::io::FileOutputStream::Open(file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable( + *table, arrow::default_memory_pool(), out, 1, builder.build())); +} + +Block build_table_block(const std::vector& columns) { + Block block; + for 
(const auto& column : columns) { + block.insert({column.type->create_column(), column.type, column.name}); + } + return block; +} + +SplitReadOptions build_split_options(const std::string& file_path) { + SplitReadOptions options; + options.current_range.__set_path(file_path); + options.current_range.__set_file_size( + static_cast(std::filesystem::file_size(file_path))); + return options; +} + +TEST(TableReaderTest, ReopenSplitAfterClose) { + const auto test_dir = std::filesystem::temp_directory_path() / "doris_table_reader_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const std::vector file_paths = { + (test_dir / "split_1.parquet").string(), + (test_dir / "split_2.parquet").string(), + (test_dir / "split_3.parquet").string(), + }; + write_parquet_file(file_paths[0], 1, "one"); + write_parquet_file(file_paths[1], 2, "two"); + write_parquet_file(file_paths[2], 3, "three"); + + std::vector projected_columns; + projected_columns.push_back({.id = 0, .name = "id", .type = std::make_shared()}); + projected_columns.push_back( + {.id = 1, .name = "value", .type = std::make_shared()}); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader + .init({ + .projected_columns = projected_columns, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + }) + .ok()); + + // Simulate the scanner lifecycle for three different splits: + // init() once, then repeat prepare_split() -> get_block() -> close(). + // This verifies TableReader::close() fully releases the previous low-level reader and task + // state, so a later prepare_split() can open and read a new split on the same TableReader. 
+ std::vector ids; + std::vector values; + for (const auto& file_path : file_paths) { + auto split_options = build_split_options(file_path); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& value_column = + assert_cast(*block.get_by_position(1).column); + ASSERT_EQ(id_column.size(), 1); + ASSERT_EQ(value_column.size(), 1); + ids.push_back(id_column.get_element(0)); + values.push_back(value_column.get_data_at(0).to_string()); + + ASSERT_TRUE(reader.close().ok()); + } + + EXPECT_EQ(ids, std::vector({1, 2, 3})); + EXPECT_EQ(values, std::vector({"one", "two", "three"})); + + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, ProjectedColumnsRejectParquetSchemaMismatch) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_schema_mismatch_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_parquet_file(file_path, 1, "one"); + + std::vector projected_columns; + projected_columns.push_back( + {.id = 99, .name = "missing_value", .type = std::make_shared()}); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader + .init({ + .projected_columns = projected_columns, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + }) + .ok()); + + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + // The table projection asks for field id 99, but the ParquetReader exposes only file-local + // fields 0 and 1. 
get_block() opens the split lazily, so this is where TableReader must reject + // the mismatch between TableReadOptions::projected_columns and the Parquet file schema. + Block block = build_table_block(projected_columns); + bool eos = false; + const auto status = reader.get_block(&block, &eos); + ASSERT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find("does not have a matching file column"), std::string::npos); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, ProjectedColumnsRejectSameNameDifferentIdParquetSchemaMismatch) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_same_name_diff_id_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_parquet_file(file_path, 1, "one"); + + std::vector projected_columns; + projected_columns.push_back( + {.id = 99, .name = "id", .type = std::make_shared()}); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader + .init({ + .projected_columns = projected_columns, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + }) + .ok()); + + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + // The table column has the same name as the Parquet field, but a different field id. + // TableReader configures ColumnMapper in BY_FIELD_ID mode, so the name match must not hide + // the id mismatch. 
+ Block block = build_table_block(projected_columns); + bool eos = false; + const auto status = reader.get_block(&block, &eos); + ASSERT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find("does not have a matching file column"), std::string::npos); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, ProjectedColumnsUseMapperExpressionsForParquetSchemaMismatch) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_mapper_expr_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_parquet_file(file_path, 7, "seven"); + + std::vector projected_columns; + projected_columns.push_back( + {.id = 0, .name = "table_id", .type = std::make_shared()}); + projected_columns.push_back( + {.id = 1, .name = "table_value", .type = std::make_shared()}); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader + .init({ + .projected_columns = projected_columns, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + }) + .ok()); + + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + // The table projection is intentionally different from the Parquet schema: + // field id 0 is requested as BIGINT instead of the file INT, so ColumnMapper should build a + // Cast expression; field id 1 has a different table name but the same type, so it should build + // a SlotRef projection. Both columns should still materialize in table schema order. 
+ Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + + ASSERT_EQ(block.get_by_position(0).name, "table_id"); + ASSERT_EQ(block.get_by_position(1).name, "table_value"); + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& value_column = assert_cast(*block.get_by_position(1).column); + ASSERT_EQ(id_column.size(), 1); + ASSERT_EQ(value_column.size(), 1); + EXPECT_EQ(id_column.get_element(0), 7); + EXPECT_EQ(value_column.get_data_at(0).to_string(), "seven"); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +} // namespace +} // namespace doris::reader From aabcbc3b51ff465646b453e38c81f242aef05077 Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 27 May 2026 14:38:12 +0800 Subject: [PATCH 13/38] [parquet] Clarify reader lifecycle comments --- be/src/format/new_parquet/parquet_reader.h | 4 ++-- be/src/format/reader/file_reader.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/be/src/format/new_parquet/parquet_reader.h b/be/src/format/new_parquet/parquet_reader.h index 40213ebb0d68da..d7a9dc19a5919c 100644 --- a/be/src/format/new_parquet/parquet_reader.h +++ b/be/src/format/new_parquet/parquet_reader.h @@ -54,8 +54,8 @@ class ParquetReader : public reader::FileReader { // init 成功后可以调用 get_schema() 获取 Parquet file-local schema。 Status init(RuntimeState* state) override; - // 解析 Parquet footer 并返回 Parquet 文件自身的 schema。 - // 该方法只能在 open() 成功后调用,不要求 init() 已经执行。 + // 返回 init() 阶段解析出的 Parquet 文件自身 schema。 + // 该方法只能在 init() 成功后调用,不要求 open() 已经执行。 // 这里不做 Iceberg schema evolution,也不把字段转换成 table/global schema。 Status get_schema(std::vector* file_schema) const override; diff --git a/be/src/format/reader/file_reader.h b/be/src/format/reader/file_reader.h index 0b0527535e5a95..cb2096fd80ad51 100644 --- a/be/src/format/reader/file_reader.h +++ b/be/src/format/reader/file_reader.h @@ -159,7 +159,7 @@ 
class FileReader { } // 读取下一批 file-local block。 - // 该方法只能在 init(FileScanRequest) 成功后调用。 + // 该方法只能在 open(FileScanRequest) 成功后调用。 // file_block 的列顺序和类型必须遵守 FileScanRequest,而不是 table/global schema。 // rows 返回当前批次输出行数;eof 表示当前文件 reader 是否读完;多文件切换由 // TableReader 负责。 From e1ed7f204d4658035da0378d37ea9ae64b0ea737 Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 27 May 2026 15:17:38 +0800 Subject: [PATCH 14/38] [parquet] Update new reader design docs --- ...ris-arrow-parquet-reader-implementation.md | 427 +++++++++--------- docs/doris-new-parquet-dictionary-pushdown.md | 359 +++++++++++++++ 2 files changed, 582 insertions(+), 204 deletions(-) create mode 100644 docs/doris-new-parquet-dictionary-pushdown.md diff --git a/docs/doris-arrow-parquet-reader-implementation.md b/docs/doris-arrow-parquet-reader-implementation.md index b3bfe5c69e04b0..d191229e44562c 100644 --- a/docs/doris-arrow-parquet-reader-implementation.md +++ b/docs/doris-arrow-parquet-reader-implementation.md @@ -1,291 +1,310 @@ -# Doris New Parquet Reader Design And Status +# Doris Arrow Parquet Reader 实现方案与当前状态 -This document describes the design and current implementation status of the new -Parquet reader under `be/src/format/new_parquet/`. +本文档描述 `be/src/format/new_parquet/` 下新 Parquet reader 的设计、当前实现状态和后续缺口。 -The goal of this PR is to build a file-local Parquet reader based on Arrow C++ -Parquet core APIs while keeping Doris-owned `Block` and `Column` as the scan -output. It does not replace the old `vparquet` path yet. 
+当前目标不是替换旧 `vparquet` 路径,而是在新 reader API 下先实现一个 file-local Parquet reader: -## Design Goals +- 底层复用 Arrow C++ Parquet core API 解析文件、row group 和 column chunk。 +- 输出仍然是 Doris 自己的 `Block` 和 `Column`。 +- 不使用 `parquet::arrow::FileReader`、`arrow::RecordBatch` 或 `arrow::Table` 作为 scan 输出路径。 +- `ParquetReader` 只理解 Parquet file-local schema,不理解 Iceberg/global schema。 +- schema change、filter localization、default/generated/partition column 等 table-level 语义放在 `TableReader` 和 `TableColumnMapper`。 -- Use Arrow C++ Parquet core APIs for Parquet file metadata, row group and column - decoding. -- Keep `doris::parquet::ParquetReader` as a file-local reader. -- Do not use `parquet::arrow::FileReader`, `arrow::RecordBatch`, - `arrow::Table` or `arrow::Array` as the scan output path. -- Do not put Iceberg table schema, schema evolution, default columns, generated - columns or partition columns into `ParquetReader`. -- Keep table schema mapping and filter localization in the reader/table layer, - especially `TableColumnMapper`. -- Let the new implementation live in `be/src/format/new_parquet/` so it can - evolve independently from the old `be/src/format/parquet/` implementation. +## 分层边界 -## Layering +当前分层如下: ```text -TableReader / IcebergTableReader +FileScanner / TableReader / IcebergTableReader -> TableColumnMapper -> reader::FileScanRequest -> doris::parquet::ParquetReader -> DorisRandomAccessFile -> parquet::ParquetFileReader -> parquet::RowGroupReader - -> parquet::ColumnReader / parquet::internal::RecordReader + -> parquet::internal::RecordReader -> Doris Block / Column ``` -`ParquetReader` only consumes file-local information: +关键边界: -- file-local schema fields; -- file-local projection columns; -- file-local predicate columns; -- file-local `ColumnPredicate` and `VExprContext` filters. 
+- `TableReader` 输出 table/global schema block。 +- `ParquetReader` 输出 file-local block。 +- `TableColumnMapper` 负责把 table projection/filter 转成 file-local projection/filter。 +- `ParquetReader` 不补 default column,不物化 partition column,不处理 generated column,不做 Iceberg schema evolution。 +- 所有 table-level cast/finalize/delete/virtual column 都不能塞回 `ParquetReader`。 -Any table-level cast, default value, generated column, partition value, Iceberg -field id mapping or schema evolution rule must be handled before or after the -file reader layer. +## FileReader 生命周期 -## Code Layout +`ParquetReader` 继承 `reader::FileReader`,当前生命周期是: + +```text +init(RuntimeState*) + -> get_schema(std::vector*) + -> open(std::unique_ptr&) + -> get_block(Block* file_block, size_t* rows, bool* eof) + -> close() +``` + +语义约束: + +- `init()` 打开物理文件并解析 Parquet footer metadata。 +- `get_schema()` 在 `init()` 成功后可调用,不要求 `open()`。 +- `open()` 接收已经 localize 的 `FileScanRequest`,并完成 row group pruning 和 reader 游标初始化。 +- `get_block()` 只能在 `open()` 成功后调用,输出 file-local block。 +- `rows` 表示本批 file-local block 输出行数,`eof` 表示当前物理文件是否读完。 + +## 代码布局 ```text be/src/format/new_parquet/parquet_reader.h be/src/format/new_parquet/parquet_reader.cpp be/src/format/new_parquet/column_reader.h be/src/format/new_parquet/column_reader.cpp +be/src/format/new_parquet/parquet_column_schema.h +be/src/format/new_parquet/parquet_column_schema.cpp +be/src/format/new_parquet/parquet_type.h +be/src/format/new_parquet/parquet_type.cpp be/src/format/new_parquet/parquet_statistics.h be/src/format/new_parquet/parquet_statistics.cpp +be/src/format/new_parquet/selection_vector.h ``` -`parquet_reader.*` owns file open, schema export, scan state, row group -scheduling, predicate-first reading and output block assembly. - -`column_reader.*` owns Doris column assembly for one projected Parquet field. It -wraps Arrow Parquet column-level APIs and converts decoded values into Doris -columns. +职责划分: -`parquet_statistics.*` owns row group statistics pruning. 
Future page index, -bloom filter and dictionary pruning should also live there rather than being -mixed into the main scan loop. +- `parquet_reader.*`:文件打开、schema 导出、scan state、row group 调度、谓词列优先读取、file-local block 组装。 +- `column_reader.*`:单个 Parquet 字段到 Doris column 的读取;封装 Arrow internal `RecordReader`。 +- `parquet_column_schema.*`:从 Parquet schema descriptor 构建 file-local schema tree。 +- `parquet_type.*`:解析 Parquet physical/logical/converted type,生成 Doris file-local type 和额外类型信息。 +- `parquet_statistics.*`:基于 row group metadata 做保守的统计信息裁剪。 +- `selection_vector.h`:表达 batch 内被选中的 row offset,用于延时物化。 -## Main Components +## 核心组件 ### DorisRandomAccessFile -`DorisRandomAccessFile` adapts Doris `io::FileReader` to -`arrow::io::RandomAccessFile`. +`DorisRandomAccessFile` 把 Doris `io::FileReader` 适配成 `arrow::io::RandomAccessFile`。 -It only handles random IO and file size lookup. It does not parse Parquet schema, -does not evaluate filters, and does not carry table-level semantics. +它只处理随机读和文件大小查询,不解析 Parquet schema,不携带 table schema,也不执行 filter。 ### ParquetReaderScanState -`ParquetReaderScanState` is an internal scan state stored in -`parquet_reader.cpp`. It tracks: +`ParquetReaderScanState` 是 `parquet_reader.cpp` 内部状态,记录: -- Arrow random access file; -- Arrow Parquet file reader and metadata; -- Parquet schema descriptor; -- selected row groups; -- current row group reader; -- current row group row offset; -- projected file columns; -- predicate columns and non-predicate output columns; -- current row group column readers. +- Arrow random access file; +- Arrow Parquet file reader; +- Parquet footer metadata; +- Parquet schema descriptor; +- file-local schema tree; +- 被 row group statistics 选中的 row group; +- 当前 row group reader; +- 当前 row group 内已读行数; +- predicate column readers; +- non-predicate column readers。 -This state is intentionally private to the Parquet reader implementation. 
+该状态不暴露给 table reader。 + +### ParquetColumnSchema 和 ParquetTypeDescriptor + +`ParquetColumnSchema` 描述 file-local schema tree,包括: + +- Parquet node name; +- Parquet field id; +- top-level field id; +- leaf column id; +- Doris file-local type; +- 子列 schema; +- primitive column 的 `ParquetTypeDescriptor`。 + +`ParquetTypeDescriptor` 负责保存 Parquet annotation 解析结果,包括: + +- physical type; +- logical type / converted type 推导后的 Doris type; +- decimal precision/scale; +- time/timestamp unit; +- 是否 string-like; +- 是否支持当前 RecordReader 读取路径。 + +类型解析已经从 `column_reader.cpp` 前移到 `parquet_type.*`,`ColumnReader` 热路径只消费解析结果。 ### ParquetColumnReader -`ParquetColumnReader` is Doris's file-local column reader abstraction. It is not -the same as Arrow's `parquet::ColumnReader`. +`ParquetColumnReader` 是 Doris 自己的 file-local column reader 抽象,不是 Arrow 的 `parquet::ColumnReader`。 -Current implementations: +当前接口收敛为: -- `PrimitiveColumnReader` -- `StructColumnReader` +```text +read(rows, column, rows_read) +skip(rows) +select(selection, selected_rows, batch_rows, column) +``` + +当前实现: -`PrimitiveColumnReader` supports both the existing Arrow -`parquet::TypedColumnReader` path and the new -`parquet::internal::RecordReader` path for selected primitive reads. +- `ScalarColumnReader`:基于 Arrow internal `RecordReader` 读取 flat primitive/string/decimal/time/timestamp。 +- `StructColumnReader`:递归读取 children,支持非常基础的 struct 组装。 -`StructColumnReader` currently supports basic required struct assembly by -recursively reading child readers. Complex nested selective materialization is -not complete. +`select()` 在基类中统一实现:把 `SelectionVector` 合并成连续 row ranges,然后交替调用 `skip()` 和 `read()`。当前不实现整批 read 后再 filter 的 fallback。 ### ParquetColumnReaderFactory -`ParquetColumnReaderFactory` creates Doris column readers from the current row -group's Arrow Parquet readers and the file-local `ParquetColumnSchema`. 
+`ParquetColumnReaderFactory` 根据当前 row group 和 `ParquetColumnSchema` 创建 column reader。 + +它集中封装 Arrow internal `RecordReader` 的创建和缓存,避免 Arrow internal API 泄露到 `ParquetReader` 主流程。 + +### DataTypeSerDe decoded value 读取接口 + +`ScalarColumnReader` 不直接把 Parquet value switch 到 Doris column,而是构造 `DecodedColumnView`,再调用: + +```text +DataTypeSerDe::read_column_from_decoded_values(...) +``` + +当前已接入的 SerDe 包括 number、string、decimal、date/time/datetime、nullable 等类型。这样可以把“Parquet 解码”和“Doris 类型写入”拆开,减少 `ColumnReader` 内部的 Doris 类型分发逻辑。 + +## Scan Request 语义 + +新 reader 消费 `reader::FileScanRequest`。 + +重要字段: + +- `predicate_columns`:需要先读取,用于计算 selection 的 file-local columns。 +- `non_predicate_columns`:selection 确定后再读取的 file-local columns。 +- `column_positions`:file column id 到 file-local output block position 的映射。 +- `local_filters`:已经 localize 到 file schema 的 filter。 +- `reader_expression_map`:table filter 无法安全转换成 file-local predicate 时的 fallback 表达式。 -The factory centralizes reader construction so later work can add reader -options, Dremel assemblers, selected-read policies and cache state without -passing those details through free functions. +输出 block 的列顺序和类型遵守 `column_positions`,不是 table/global schema。 -### ParquetStatisticsUtils +## 谓词下推 -`ParquetStatisticsUtils` compiles file-local predicates into Parquet column -predicate plans and evaluates row group metadata conservatively. +当前已实现: -It only understands Parquet file-local schema and Doris `ColumnPredicate`. It -does not know Iceberg schema, slot descriptors or table schema mapping. +- row group 级 min/max 统计信息裁剪; +- null count 驱动的 `IS NULL` / `IS NOT NULL` 裁剪; +- unsupported statistics、缺失 statistics、不安全比较时保守保留 row group。 -## Scan Request Semantics +当前未实现: -The new reader consumes `reader::FileScanRequest`. 
+- page index pruning; +- bloom filter pruning; +- dictionary pruning; +- batch 内直接执行结构化 `ColumnPredicate`; +- `reader_expression_map` fallback 表达式执行。 -Important fields: +注意:当前 `local_filters.predicates` 已经进入 row group statistics 路径,但在 batch 内过滤阶段,`ParquetReader::_read_filter_columns()` 主要处理 `local_filter.conjunct`。因此如果某个谓词只以 `ColumnPredicate` 形式存在,目前还缺少 batch 内二次过滤闭环。 -- `predicate_columns`: file-local columns that must be read first to evaluate - filters. -- `non_predicate_columns`: file-local projection columns that are only read - after selection is known. -- `projected_columns`: file-local columns that should appear in the output block. -- `local_filters`: file-local filters produced by the table layer. -- `reader_expression_map`: fallback expressions for filters that cannot be - represented as direct file-local predicates. +## 延时物化当前状态 -The output block is still file-local. It is not a table/global schema block. +当前 scan loop 是 predicate-first 模型: -## Predicate Pushdown +1. 读取 `predicate_columns`。 +2. 执行表达式 filter,生成 `SelectionVector`。 +3. 如果谓词列也在 output block 中,则复用已经解码的谓词列,并按 selection filter。 +4. 对 `non_predicate_columns` 调用 `ColumnReader::select()`,只读取被选中的行。 +5. 返回 file-local block。 -Doris new reader uses two existing filter representations: +已有能力: -- `ColumnPredicate`: structured single-column predicates, used for row group - statistics pruning and decoded value filtering. -- `VExprContext`: expression filters, used for fallback expression evaluation - and residual filters. 
+- flat primitive/string/decimal/time/timestamp 的基础 selected read; +- empty selection 时跳过整批 non-predicate columns; +- sparse selection 会被合并成多个连续 ranges; +- predicate column 同时是 projection 时,不会重新读取该列。 -Current implementation status: +主要缺口: -- row group min/max pruning is wired through `parquet_statistics.*`; -- supported stats types include boolean, int32, int64, float, double and - string/binary; -- unsupported stats, missing stats or unsafe cases keep the row group; -- `IS NULL` and `IS NOT NULL` pruning use null count when available; -- page index, bloom filter and dictionary pruning are not implemented yet. +- batch 内 `ColumnPredicate` 执行未接入 selection; +- `reader_expression_map` 仍是 TODO; +- selection index 当前是 `uint16_t`,需要显式约束 batch size; +- selected read 依赖 Arrow internal `RecordReader::SkipRecords` 和 `ReadRecords`,需要继续隔离在 `column_reader.*`; +- 没有 page-level row range selection; +- 复杂列延时物化尚未实现。 -Correctness rule: pruning must be conservative. If the reader cannot prove that -a row group cannot match, it must keep the row group. +## Schema Change 当前状态 -## Lazy Materialization +当前原则是:`ParquetReader` 不理解 schema change,schema change 由 `TableColumnMapper` 和 `TableReader` 处理。 -The scan loop follows a predicate-first model: +已有能力: -1. Read predicate columns. -2. Evaluate `ColumnPredicate` and build a selection vector. -3. If a predicate column is also projected, reuse the decoded predicate column. -4. Read non-predicate output columns using the selection. -5. Assemble the file-local output block in projected column order. 
+- `TableReader` 初始化时默认使用 `TableColumnMappingMode::BY_FIELD_ID`。 +- `TableColumnMapper` 可以根据 table column 和 file schema 建立 `ColumnMapping`。 +- 缺失 partition column 可以用 partition value 生成 constant mapping。 +- 缺失普通列可以使用 `default_expr`。 +- file type 与 table type 不同的时候,可以生成 finalize cast projection。 +- virtual column 有 `ROW_ID` 和 `LAST_UPDATED_SEQUENCE_NUMBER` 的 mapping 标记。 -The current selected-read implementation uses Arrow Parquet -`parquet::internal::RecordReader` for supported primitive columns. +主要缺口: -Why this is needed: +- 当前 `SchemaField::id` 同时承担 file-local column id 和 mapping id,边界还不够清晰。尤其 top-level primitive 目前会使用 leaf column id,Iceberg field id 映射还需要重新梳理。 +- `_is_same_type()` 只是 `DataTypePtr` 指针比较,不能可靠表达类型等价。 +- filter localization 仍是 stub,没有完整实现 trivial mapping、safe cast、reader expression fallback、finalize-only filter。 +- `reader_filter_expr` 没有真正生成或执行。 +- 复杂列 schema change 没有 child-level mapping。 +- `IcebergTableReader` 的 equality delete、position delete、virtual column、finalize 仍是框架 stub。 -- `parquet::TypedColumnReader::Skip` skips physical values, not SQL rows. -- For nullable columns, row count and physical value count differ. -- For repeated/nested columns, a row can contain multiple physical values. +## 复杂列当前状态 -`RecordReader::SkipRecords` and `RecordReader::ReadRecords` provide row-level -movement. Doris compresses the selection vector into row ranges and alternates -skip/read operations. +已有能力: -Current support: +- schema builder 能识别 `STRUCT`、`LIST`、`MAP`。 +- 可以把复杂 Parquet schema 组合成 Doris `DataTypeStruct`、`DataTypeArray`、`DataTypeMap`。 +- `StructColumnReader` 可以递归读取 children,支持非常基础的非 nullable struct。 -- selected read for primitive boolean, int32, int64, float and double when the - RecordReader path is available; -- fallback path reads the whole batch and filters it when selected read is not - supported; -- output columns are skipped when the selection is empty; -- predicate columns are reused when they are also projected. 
+主要缺口: -Limitations: +- nullable struct 未实现。 +- list reader 未实现。 +- map reader 未实现。 +- repeated / nested definition level assembler 未实现。 +- primitive reader 当前只支持 `max_repetition_level == 0 && max_definition_level <= 1` 的 RecordReader 路径。 +- 复杂列裁剪未实现。 +- 复杂列延时物化未实现。 +- 复杂列 schema evolution / child remap 未实现。 -- `parquet::internal::RecordReader` is an Arrow internal/experimental API, so it - must remain hidden behind Doris `ParquetColumnReader`; -- string, decimal and timestamp selected reads still need broader validation; -- nested selected materialization needs a dedicated Dremel assembler. +结论:当前复杂列“schema 可见”,但“读取能力不完整”。真正可用还需要实现 Dremel assembler 或等价的 nested column assembler。 -## Type Coverage +## 当前可用能力总结 -Currently implemented: +当前新 reader 已经具备: -- flat required and nullable boolean; -- flat required and nullable int32 / int64; -- flat required and nullable float / double; -- BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY string/binary with Doris-owned memory; -- decimal with precision up to 38 for int32, int64, byte array and fixed-length - byte array physical encodings; -- INT64 timestamp millis and micros into Doris `DateTimeV2`; -- basic required struct assembly. +- 打开 Parquet 文件并解析 footer; +- 导出 file-local schema; +- 基于 row group statistics 做保守裁剪; +- 读取 flat required/nullable primitive; +- 读取 string/binary; +- 读取 decimal precision <= 38 的常见物理编码; +- 读取 date/time/datetime 的部分编码; +- 通过 `DataTypeSerDe::read_column_from_decoded_values()` 写入 Doris column; +- 基础 predicate-first scan; +- flat column selected read; +- 非 nullable struct 的初步读取框架。 -Not implemented or incomplete: +当前还不具备完整生产能力,尤其缺少: -- INT96 timestamp; -- nanosecond timestamp; -- TIMESTAMPTZ semantics; -- DECIMAL256; -- nullable struct; -- list and map; -- complex column pruning; -- complex column lazy materialization. 
+- schema change 的完整 field id 语义; +- filter localization 的完整实现; +- batch 内 `ColumnPredicate` 执行; +- `reader_expression_map`; +- page index / bloom filter / dictionary pruning; +- list/map/nullable struct; +- nested column pruning; +- nested lazy materialization; +- 充分单测覆盖。 -## Current Implementation Status +## 下一步优先级 -Implemented in this PR: +建议按以下顺序推进: -- new `new_parquet` module; -- Arrow-backed Parquet file open and metadata read; -- file-local schema export; -- row group scheduling; -- projected leaf reader creation; -- primitive column decoding into Doris columns; -- string, decimal and INT64 timestamp decoding; -- basic struct reader; -- row group statistics pruning skeleton and initial implementation; -- predicate-first scan flow; -- primitive RecordReader-backed selected materialization; -- Debug BE build fixes. +1. 收敛 `SchemaField` 和 `ColumnMapping` 的 id 语义,区分 Iceberg field id、Parquet leaf column id 和 file-local output position。 +2. 补齐 batch 内 `ColumnPredicate` 执行,让 row group pruning 之后仍有正确 residual filter。 +3. 实现 `reader_expression_map`,支撑 schema change 下无法安全下推的 filter fallback。 +4. 补 flat primitive/string/decimal/timestamp 的 selected read 单测。 +5. 实现 nullable struct,再实现 list/map assembler。 +6. 在复杂列 assembler 稳定后,再做 nested pruning 和 nested lazy materialization。 +7. 后续再接 page index、bloom filter、dictionary pruning。 -Validated: - -- `git diff --check`; -- `BUILD_TYPE=DEBUG ./build.sh --be` on - `fedora:/home/socrates/code/doris`. - -## Future Work +## 核心规则 -Near term: - -- add unit tests for primitive required/nullable selected reads; -- validate selection edge cases: empty selection, full selection, sparse - selection and highly fragmented ranges; -- add a selection-rate policy so dense selections can fall back to whole-batch - read plus filter; -- stabilize string, decimal and timestamp selected reads; -- keep Arrow internal API usage isolated in `column_reader.*`. 
- -Mid term: +`ParquetReader` 必须保持 file-local reader。 -- implement page index pruning in `parquet_statistics.*`; -- implement bloom filter pruning for equality predicates; -- add dictionary-aware filtering where Arrow exposes enough metadata safely; -- expand complex type assembly for nullable struct, list and map; -- add tests for row group pruning correctness and unsupported-type fallback. - -Long term: - -- support nested column pruning; -- support nested lazy materialization; -- support page-level row range selection; -- integrate the new file-local reader with table readers after the API boundary - is stable; -- keep old `vparquet` compatibility until the new path is functionally complete. - -## Key Rule - -`ParquetReader` must remain a file-local reader. If a feature requires table -schema, Iceberg schema evolution, partition values, default/generated columns or -final table output semantics, it belongs in `TableColumnMapper` or -`TableReader`, not in `be/src/format/new_parquet/`. 
+只要某个功能需要 table schema、Iceberg schema evolution、partition value、default/generated column、delete file 或最终 table block 语义,就应该放在 `TableColumnMapper`、`TableReader` 或具体 table reader 中,而不是放进 `be/src/format/new_parquet/`。 diff --git a/docs/doris-new-parquet-dictionary-pushdown.md b/docs/doris-new-parquet-dictionary-pushdown.md new file mode 100644 index 00000000000000..7ce6b1a12c3ff6 --- /dev/null +++ b/docs/doris-new-parquet-dictionary-pushdown.md @@ -0,0 +1,359 @@ +# Doris New Parquet Reader Dictionary Predicate Pushdown 方案 + +## 背景 + +当前 new parquet reader 位于 `be/src/format/new_parquet/`,读取路径基于 Arrow +Parquet core API,并输出 Doris `Block` / `Column`。 + +当前已经实现的谓词相关能力主要有两类: + +- row group 级 min/max/null statistics 裁剪; +- 读取谓词列后,执行 file-local filter 表达式(`local_filter.conjunct`)生成 `SelectionVector`,再对非谓词列做延时物化。 + +但当前还没有实现 dictionary predicate pushdown。主要原因是 +`ParquetColumnReaderFactory` 创建 Arrow `RecordReader` 时使用: + +```cpp +_row_group->RecordReader(leaf_column_id, /*read_dictionary=*/false); +``` + +因此底层会把字典编码列直接解码成普通值。等 `ParquetReader` 在 batch 内 +执行 filter 表达式时,已经看不到 dictionary page,也看不到 dictionary id。 + +本文档描述后续在 new parquet reader 中实现字典列谓词下推的设计方案。 + +## 目标 + +字典谓词下推的目标不是替代现有 statistics pruning,而是补充一类更强的过滤能力: + +```sql +where c = 'abc' +where c in ('a', 'b', 'c') +where c != 'x' +``` + +如果 Parquet column chunk 是全字典编码,可以只检查 dictionary values 或 dictionary +ids,而不必先把整列解码成字符串列。 + +预期收益: + +- 在 row group 级提前跳过不可能命中的 row group; +- 在 batch 级避免谓词列 string materialization; +- 和现有 `SelectionVector` / 延时物化路径结合,减少非谓词列读取量。 + +## 当前实现状态 + +### 已具备 + +- `ParquetStatisticsUtils` 已经有 file-local `ParquetColumnPredicate` 计划结构。 +- `ParquetReader` 已经有谓词列优先读取流程。 +- `SelectionVector` 已经能表示 batch 内选中 row offset。 +- `ParquetColumnReader::select()` 已经能按 selection 对非谓词列做 selected read。 + +### 不具备 + +- 没有判断 column chunk 是否全字典编码。 +- 没有读取 dictionary page 并转换成 Doris Column 的接口。 +- 没有 dictionary id reader。 +- 没有 dictionary value 到 dict id 的谓词重写。 +- 没有把 dictionary id selection 接入当前 `SelectionVector`。 + 
+因此当前实现不能利用字典列谓词下推。 + +## 分层原则 + +字典谓词下推必须保持 file-local 语义: + +- `TableColumnMapper` 负责把 table filter 转换成 file-local `ColumnPredicate`。 +- `ParquetReader` 只消费 file-local `FileScanRequest`。 +- 字典页、encoding、dictionary id 都属于 Parquet 文件格式层,不能泄露到 + Iceberg/table schema 层。 + +建议放置位置: + +```text +be/src/format/new_parquet/parquet_statistics.* + row group 级 dictionary pruning + +be/src/format/new_parquet/column_reader.* + dictionary values / dictionary ids 读取能力 + +be/src/format/new_parquet/parquet_reader.cpp + 将 dictionary selection 接入现有 predicate-first scan loop +``` + +## 方案一:Row Group 级字典裁剪 + +### 思路 + +对于全字典编码的 column chunk,dictionary page 包含该 row group 中所有可能出现的非 +NULL 值。如果所有 dictionary values 都不能满足谓词,则整个 row group 可以跳过。 + +例子: + +```text +predicate: name = 'Bob' +dictionary values: ['Alice', 'Cindy'] + +=> dictionary 中没有任何值满足 name = 'Bob' +=> row group 可以跳过 +``` + +### 流程 + +```text +FileScanRequest.local_filters + -> Build ParquetColumnPredicate + -> 对每个 row group / column chunk: + 1. 判断 column chunk 是否全字典编码 + 2. 读取 dictionary page + 3. 将 dictionary values materialize 成 Doris Column + 4. 对 dictionary values 执行 ColumnPredicate + 5. 
如果没有任何 dictionary value 命中,则跳过 row group +``` + +### 全字典编码判断 + +Parquet 允许同一个 column chunk 先使用字典编码,后续 fallback 到 plain encoding。 +这种 mixed encoding 不能用于 row group 级字典裁剪,否则会漏读 plain page 中的值。 + +判断方式可以参考旧 `vparquet`: + +- 优先使用 `encoding_stats`: + - 所有 `DATA_PAGE` 必须是 `PLAIN_DICTIONARY` 或 `RLE_DICTIONARY`; + - 不能存在 count > 0 的非字典 data page。 +- 如果没有 `encoding_stats`,退化检查 `encodings`: + - 必须包含 dictionary encoding; + - 除 dictionary encoding、`RLE`、`BIT_PACKED` 外,不能包含其它 data encoding。 + +需要注意:`RLE` / `BIT_PACKED` 可能用于 definition/repetition levels,不代表 value +不是字典编码。 + +### 支持的谓词 + +第一阶段建议只支持结构化 `ColumnPredicate`: + +- `EQ` +- `IN` +- `NE` +- `NOT IN` +- `IS NULL` +- `IS NOT NULL` + +其中 null 语义需要谨慎: + +- dictionary page 不包含 NULL; +- `IS NULL` / `IS NOT NULL` 仍需要结合 column chunk null count; +- 不能仅靠 dictionary values 判断 NULL 谓词。 + +更复杂的表达式型 filter,例如 `lower(name) = 'abc'`,不在第一阶段支持。 + +### 正确性规则 + +row group 级裁剪必须保守: + +- 不能确认全字典编码时,保留 row group; +- 不能读取 dictionary page 时,保留 row group; +- 谓词类型不支持时,保留 row group; +- 类型转换不安全时,保留 row group; +- NULL 语义不能确认时,保留 row group。 + +## 方案二:Batch 级 Dict Id Selection + +### 思路 + +row group 不能整体跳过时,仍可以避免把谓词列完整解码成字符串列。 + +例子: + +```text +dictionary values: + id 0 -> 'Alice' + id 1 -> 'Bob' + id 2 -> 'Cindy' + +predicate: + name = 'Bob' + +matched dict ids: + {1} + +data page ids: + [0, 1, 1, 2, 0] + +selection: + [1, 2] +``` + +这时谓词列只需要扫描 dictionary ids,不需要 materialize 成 `ColumnString`。 +非谓词列继续复用当前 `SelectionVector` 做延时物化。 + +### 流程 + +```text +打开 row group + -> 对字典谓词列读取 dictionary values + -> 对 dictionary values 执行 ColumnPredicate + -> 得到 matched dict id set + +读取 batch + -> 读取该 batch 的 dictionary ids + -> 用 matched dict id set 生成 SelectionVector + -> 非谓词列按 SelectionVector selected read + -> 如果字典谓词列也在 projection 中,再按需转换成真实值列 +``` + +### Reader 抽象 + +建议在 `column_reader.*` 增加独立 reader 分支,而不是把逻辑塞进 +`PrimitiveColumnReader::read()`: + +```text +ParquetColumnReader + PrimitiveColumnReader + DictionaryColumnReader +``` + +或者先不新增类,通过内部 strategy 表达: + 
+```text +PrimitiveColumnReader + decoded reader path + dictionary reader path +``` + +需要暴露的能力: + +```text +read_dictionary_values(MutableColumnPtr* values) +read_dictionary_ids(int64_t rows, MutableColumnPtr* ids, int64_t* rows_read) +select_by_dictionary_ids(...) +materialize_dictionary_ids(...) +``` + +具体命名可以在实现时收敛,但边界应保持: + +- dictionary values / ids 读取属于 `column_reader.*`; +- 用谓词生成 matched dict ids 属于 `parquet_statistics.*` 或新的 filter helper; +- 将 selection 接入 scan loop 属于 `parquet_reader.cpp`。 + +### Arrow RecordReader 的限制 + +Arrow Parquet `RecordReader` 有 `read_dictionary` 参数和 `ReadDictionary()` API。 +但当前代码用的是 `read_dictionary=false`。 + +后续可以尝试: + +```cpp +_row_group->RecordReader(leaf_column_id, /*read_dictionary=*/true) +``` + +需要验证: + +- 只有全字典编码 column chunk 是否才会暴露 dictionary ids; +- mixed encoding 是否自动 fallback 为 decoded values; +- `RecordReader::read_dictionary()` 是否能可靠表示当前 reader 是否真的在读 ids; +- `BYTE_ARRAY` / `FIXED_LEN_BYTE_ARRAY` 之外的类型支持情况; +- nullable column 下 ids 和 def levels 的行对齐方式。 + +从 Arrow 头文件注释看,dictionary expose 主要是 experimental API,且对 fully +dictionary encoded byte array column chunk 更可靠。因此第一版实现应该只针对 string-like +列,并且必须有 fallback。 + +## 和旧 vparquet 的关系 + +旧 `vparquet` 已经实现了一套字典过滤思路: + +1. 判断 column chunk 是否全字典编码; +2. 读取 dictionary values 到临时 string column; +3. 执行原始谓词; +4. 将命中的 dictionary value 下标重写成 int dict code 谓词; +5. 读取 data page 时输出 dict id column; +6. 
最终需要输出该列时再把 dict id 转回 string。 + +new parquet reader 可以复用这个设计思想,但不建议直接复用旧实现代码: + +- 旧实现基于 Doris 自研 page decoder; +- new parquet reader 当前基于 Arrow Parquet core API; +- new reader 已有 `SelectionVector`,可以直接用 dict ids 生成 selection,而不一定要重写成 + `VExprContext`。 + +更适合 new reader 的方式是: + +```text +dictionary values -> ColumnPredicate -> matched dict id set -> SelectionVector +``` + +而不是: + +```text +dictionary values -> VExprContext -> rewrite predicate expression +``` + +## 推荐实施顺序 + +### 阶段一:Metadata 判断和 Row Group 级 Dictionary Pruning + +新增能力: + +- 判断 column chunk 是否全字典编码; +- 为 string-like primitive column 读取 dictionary values; +- 对 dictionary values 执行 `ColumnPredicate`; +- 在 `ParquetStatisticsUtils::SelectRowGroups()` 中额外执行 dictionary pruning。 + +约束: + +- 只支持 `BYTE_ARRAY` / `FIXED_LEN_BYTE_ARRAY` string-like 列; +- 只支持结构化 `ColumnPredicate`; +- 不处理 expression fallback; +- 不处理 mixed encoding; +- 不能确认时保守保留 row group。 + +### 阶段二:Batch 级 Dict Id Selection + +新增能力: + +- 构造 dictionary-aware predicate column reader; +- 读取 batch dictionary ids; +- 用 matched dict id set 生成 `SelectionVector`; +- 和现有延时物化路径合并。 + +约束: + +- 谓词列如果也在 projection 中,需要按 selection materialize 成真实 Doris column; +- dict id column 不应泄露到 `ParquetReader` 输出 block; +- fallback 到 decoded value path 必须保持正确。 + +### 阶段三:扩展类型和复杂谓词 + +后续再考虑: + +- numeric dictionary; +- decimal dictionary; +- timestamp/date dictionary; +- `LIKE` / prefix filter; +- expression fallback; +- page index + dictionary 组合裁剪。 + +## 当前实现是否可以直接做到 + +不能。 + +当前实现缺少以下关键点: + +- `RecordReader` 使用 `read_dictionary=false`; +- 没有 dictionary metadata 判断; +- 没有 dictionary page 读取接口; +- 没有 dict id column 或 dict id selection; +- 谓词过滤发生在已经 materialize 的 Doris Column 上。 + +因此,当前最多只能做 decoded value filter,不能做 dictionary predicate pushdown。 + +## 关键设计结论 + +- 字典优化应该放在 Parquet file-local 层,不进入 table schema / Iceberg 层。 +- 第一阶段优先做 row group 级 dictionary pruning,收益明确且风险低。 +- 第二阶段再做 batch 级 dict id selection,与现有 `SelectionVector` 和延时物化结合。 +- 基于 Arrow Parquet API 时,必须明确 
fallback 策略,不能假设所有字典编码列都能暴露 + dictionary ids。 +- 输出 block 必须始终是正常 Doris Column,不能把 dict id column 暴露给上层。 From f19f78aa940271a5125e0a0beebe0be530df3437 Mon Sep 17 00:00:00 2001 From: Gabriel Date: Wed, 27 May 2026 16:16:57 +0800 Subject: [PATCH 15/38] [feature](be) Build table filters from conjuncts (#63733) --- be/src/exprs/vslot_ref.h | 4 +- be/src/format/reader/column_mapper.cpp | 74 ++++++- be/src/format/reader/table_reader.cpp | 61 +++++- be/src/format/reader/table_reader.h | 8 + be/test/format/reader/table_reader_test.cpp | 214 ++++++++++++++++++-- 5 files changed, 331 insertions(+), 30 deletions(-) diff --git a/be/src/exprs/vslot_ref.h b/be/src/exprs/vslot_ref.h index 6e7197f4cf6876..8cb26f9bcfd296 100644 --- a/be/src/exprs/vslot_ref.h +++ b/be/src/exprs/vslot_ref.h @@ -75,7 +75,9 @@ class VSlotRef : public VExpr { protected: VSlotRef(int slot_id, int column_id, int column_uniq_id) - : _slot_id(slot_id), _column_id(column_id), _column_uniq_id(column_uniq_id) {} + : _slot_id(slot_id), _column_id(column_id), _column_uniq_id(column_uniq_id) { + set_node_type(TExprNodeType::SLOT_REF); + } private: int _slot_id; diff --git a/be/src/format/reader/column_mapper.cpp b/be/src/format/reader/column_mapper.cpp index b2453dbbfaf61c..5790517f7bb71f 100644 --- a/be/src/format/reader/column_mapper.cpp +++ b/be/src/format/reader/column_mapper.cpp @@ -21,6 +21,7 @@ #include #include "common/status.h" +#include "core/assert_cast.h" #include "format/reader/expr/cast.h" #include "format/reader/expr/slot_ref.h" #include "format/reader/file_reader.h" @@ -28,6 +29,35 @@ namespace doris::reader { +static VExprSPtr rewrite_table_expr_to_file_expr( + const VExprSPtr& expr, const std::map& table_column_to_file_position) { + if (expr == nullptr) { + return nullptr; + } + if (expr->is_slot_ref()) { + const auto* slot_ref = assert_cast(expr.get()); + const auto position_it = table_column_to_file_position.find(slot_ref->slot_id()); + if (position_it != table_column_to_file_position.end()) { 
+ return TableSlotRef::create_shared(slot_ref->slot_id(), + cast_set(position_it->second), -1, + slot_ref->data_type(), slot_ref->expr_name()); + } + return expr; + } + + // VExpr currently does not provide a generic deep-clone API for arbitrary expression types. + // Keep all slot-localization mutation inside ColumnMapper and rebuild it for every split + // before the localized expression is prepared/opened by TableReader. + VExprSPtrs rewritten_children; + rewritten_children.reserve(expr->children().size()); + for (const auto& child : expr->children()) { + rewritten_children.push_back( + rewrite_table_expr_to_file_expr(child, table_column_to_file_position)); + } + expr->set_children(std::move(rewritten_children)); + return expr; +} + static constexpr const char* ROW_LINEAGE_ROW_ID = "_row_id"; static constexpr const char* ROW_LINEAGE_LAST_UPDATED_SEQ_NUMBER = "_last_updated_sequence_number"; @@ -56,6 +86,21 @@ static void rebuild_projection(ColumnMapping* mapping, size_t block_position) { mapping->projection = VExprContext::create_shared(expr); } +static std::map build_file_position_map( + const std::vector& mappings, const FileScanRequest& file_request) { + std::map table_column_to_file_position; + for (const auto& mapping : mappings) { + if (!mapping.file_column_id.has_value()) { + continue; + } + const auto position_it = file_request.column_positions.find(*mapping.file_column_id); + if (position_it != file_request.column_positions.end()) { + table_column_to_file_position.emplace(mapping.table_column_id, position_it->second); + } + } + return table_column_to_file_position; +} + Status TableColumnMapper::create_mapping(const std::vector& projected_columns, const std::map& partition_values, const std::vector& file_schema) { @@ -102,7 +147,8 @@ Status TableColumnMapper::create_mapping(const std::vector& project Status TableColumnMapper::create_scan_request(const std::map& table_filters, const std::vector& projected_columns, FileScanRequest* file_request) { - // 
真实实现会把 table projection/filter 转换成 file-local projection/filter。 + // FileReader evaluates expressions against a file-local block. This mapper owns the + // table-column to file-column conversion, so it also owns the file-local block positions. file_request->predicate_columns.clear(); file_request->non_predicate_columns.clear(); file_request->column_positions.clear(); @@ -141,15 +187,29 @@ Status TableColumnMapper::localize_filters(const std::map& if (!it.second.can_be_localized()) { // TODO: Rewrite table filter to reader_expression_map // file_request->reader_expression_map.emplace_back(mapping->table_column_id, it.second.conjunct); - } else { - FileLocalFilter local_filter; - local_filter.file_column_id = *mapping->file_column_id; - local_filter.conjunct = it.second.conjunct; - local_filter.predicates = it.second.predicates; - file_request->local_filters.push_back(std::move(local_filter)); + continue; } add_scan_column(file_request, *mapping->file_column_id, &file_request->predicate_columns); } + + // Build the complete table-slot to file-block position map after all predicate columns have + // been assigned. This keeps expression localization independent from filter iteration order. 
+ const auto table_column_to_file_position = build_file_position_map(_mappings, *file_request); + for (const auto& it : table_filters) { + const auto* mapping = _find_mapping(it.first); + if (mapping == nullptr || !mapping->file_column_id.has_value() || + !it.second.can_be_localized()) { + continue; + } + FileLocalFilter local_filter; + local_filter.file_column_id = *mapping->file_column_id; + if (it.second.conjunct != nullptr) { + local_filter.conjunct = VExprContext::create_shared(rewrite_table_expr_to_file_expr( + it.second.conjunct->root(), table_column_to_file_position)); + } + local_filter.predicates = it.second.predicates; + file_request->local_filters.push_back(std::move(local_filter)); + } return Status::OK(); } diff --git a/be/src/format/reader/table_reader.cpp b/be/src/format/reader/table_reader.cpp index 13f093228e6e70..f6cfa21600ea61 100644 --- a/be/src/format/reader/table_reader.cpp +++ b/be/src/format/reader/table_reader.cpp @@ -20,15 +20,54 @@ #include #include +#include #include #include "common/status.h" +#include "core/assert_cast.h" +#include "exprs/vslot_ref.h" #include "format/new_parquet/parquet_reader.h" #include "format/reader/column_mapper.h" #include "format/table/deletion_vector_reader.h" #include "io/io_common.h" namespace doris::reader { +namespace { + +void collect_table_slot_ids(const VExprSPtr& expr, std::set* slot_ids) { + if (expr == nullptr) { + return; + } + if (expr->is_slot_ref()) { + const auto* slot_ref = assert_cast(expr.get()); + slot_ids->insert(slot_ref->slot_id()); + } + for (const auto& child : expr->children()) { + collect_table_slot_ids(child, slot_ids); + } +} + +void build_table_filters_from_conjunct(const VExprSPtr& conjunct, + std::map* table_filters) { + if (conjunct == nullptr) { + return; + } + std::set slot_ids; + collect_table_slot_ids(conjunct, &slot_ids); + if (slot_ids.size() == 1) { + (*table_filters)[*slot_ids.begin()].conjunct = VExprContext::create_shared(conjunct); + return; + } + if 
(conjunct->node_type() == TExprNodeType::COMPOUND_PRED && + conjunct->op() == TExprOpcode::COMPOUND_AND) { + for (const auto& child : conjunct->children()) { + build_table_filters_from_conjunct(child, table_filters); + } + return; + } +} + +} // namespace std::shared_ptr create_system_properties( const TFileScanRangeParams* scan_params) { @@ -58,9 +97,27 @@ Status TableReader::init(TableReadOptions options) { _profile = std::move(options.profile); TableColumnMapperOptions mapper_options; mapper_options.mode = TableColumnMappingMode::BY_FIELD_ID; + mapper_options.allow_missing_columns = options.allow_missing_columns; _data_reader.column_mapper = TableColumnMapper(mapper_options); - // TODO: - // _table_filters = build_table_filters_from_conjuncts(options.conjuncts); + _conjuncts = std::move(options.conjuncts); + return Status::OK(); +} + +Status TableReader::_build_table_filters_from_conjuncts() { + _table_filters.clear(); + build_table_filters_from_conjunct(_conjuncts.root(), &_table_filters); + return Status::OK(); +} + +Status TableReader::_open_local_filter_exprs(const FileScanRequest& file_request) { + RowDescriptor row_desc; + for (const auto& local_filter : file_request.local_filters) { + if (local_filter.conjunct == nullptr) { + continue; + } + RETURN_IF_ERROR(local_filter.conjunct->prepare(_runtime_state, row_desc)); + RETURN_IF_ERROR(local_filter.conjunct->open(_runtime_state)); + } return Status::OK(); } diff --git a/be/src/format/reader/table_reader.h b/be/src/format/reader/table_reader.h index 53791747faf67f..4f28c4e1aaa9f8 100644 --- a/be/src/format/reader/table_reader.h +++ b/be/src/format/reader/table_reader.h @@ -107,6 +107,7 @@ struct TableReadOptions { std::shared_ptr io_ctx; RuntimeState* runtime_state; RuntimeProfile* scanner_profile; + const bool allow_missing_columns = true; std::unique_ptr profile; }; @@ -219,10 +220,12 @@ class TableReader { RETURN_IF_ERROR(_data_reader.column_mapper.create_mapping(_projected_columns, _partition_values, 
file_schema)); DORIS_CHECK(_data_reader.column_mapper.mappings().size() == _projected_columns.size()); + RETURN_IF_ERROR(_build_table_filters_from_conjuncts()); auto file_request = std::make_unique(); RETURN_IF_ERROR(_data_reader.column_mapper.create_scan_request( _table_filters, _projected_columns, file_request.get())); + RETURN_IF_ERROR(_open_local_filter_exprs(*file_request)); _data_reader.scan_schema.clear(); _data_reader.block_template.clear(); _data_reader.scan_schema.resize(file_request->column_positions.size()); @@ -242,12 +245,16 @@ class TableReader { return Status::OK(); } + Status _build_table_filters_from_conjuncts(); + Status _open_local_filter_exprs(const FileScanRequest& file_request); + // 关闭当前具体 reader。 // 该 hook 会被 create_next_reader 和 close 调用;实现应保持幂等。 virtual Status close_current_reader() { RETURN_IF_ERROR(_data_reader.reader->close()); _data_reader.reader.reset(); _data_reader.column_mapper.clear(); + _table_filters.clear(); _data_reader.block_schema.clear(); _data_reader.scan_schema.clear(); _data_reader.block_template.clear(); @@ -314,6 +321,7 @@ class TableReader { // partition key -> value std::map _partition_values; std::map _table_filters; + VExprContext _conjuncts {nullptr}; std::unique_ptr _profile; // Parsed from DELETION_VECTOR in Iceberg and Paimon DeleteRows* _delete_rows; diff --git a/be/test/format/reader/table_reader_test.cpp b/be/test/format/reader/table_reader_test.cpp index 84c5700fc4c1ac..dc2e26f35ea222 100644 --- a/be/test/format/reader/table_reader_test.cpp +++ b/be/test/format/reader/table_reader_test.cpp @@ -33,12 +33,48 @@ #include "core/column/column_vector.h" #include "core/data_type/data_type_number.h" #include "core/data_type/data_type_string.h" +#include "exprs/vexpr.h" +#include "format/reader/expr/slot_ref.h" #include "gen_cpp/PlanNodes_types.h" #include "runtime/runtime_state.h" namespace doris::reader { namespace { +class TableInt32GreaterThanExpr final : public VExpr { +public: + TableInt32GreaterThanExpr(int 
slot_id, int column_id, int32_t value) + : VExpr(std::make_shared(), false), _value(value) { + add_child(TableSlotRef::create_shared(slot_id, column_id, -1, + std::make_shared(), "id")); + set_node_type(TExprNodeType::BINARY_PRED); + _opcode = TExprOpcode::GT; + } + + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override { + const auto* slot_ref = assert_cast(get_child(0).get()); + const auto& input = + assert_cast( + *block->get_by_position(slot_ref->column_id()).column); + auto result = ColumnUInt8::create(); + auto& result_data = result->get_data(); + result_data.resize(count); + for (size_t row = 0; row < count; ++row) { + const size_t input_row = selector == nullptr ? row : (*selector)[row]; + result_data[row] = input.get_element(input_row) > _value; + } + result_column = std::move(result); + return Status::OK(); + } + + const std::string& expr_name() const override { return _expr_name; } + +private: + const int32_t _value; + const std::string _expr_name = "TableInt32GreaterThanExpr"; +}; + std::shared_ptr finish_array(arrow::ArrayBuilder* builder) { std::shared_ptr array; EXPECT_TRUE(builder->Finish(&array).ok()); @@ -97,6 +133,14 @@ SplitReadOptions build_split_options(const std::string& file_path) { return options; } +TableColumn make_table_column(ColumnId id, const std::string& name, const DataTypePtr& type) { + TableColumn column; + column.id = id; + column.name = name; + column.type = type; + return column; +} + TEST(TableReaderTest, ReopenSplitAfterClose) { const auto test_dir = std::filesystem::temp_directory_path() / "doris_table_reader_test"; std::filesystem::remove_all(test_dir); @@ -112,21 +156,23 @@ TEST(TableReaderTest, ReopenSplitAfterClose) { write_parquet_file(file_paths[2], 3, "three"); std::vector projected_columns; - projected_columns.push_back({.id = 0, .name = "id", .type = std::make_shared()}); - projected_columns.push_back( - {.id = 1, 
.name = "value", .type = std::make_shared()}); + projected_columns.push_back(make_table_column(1, "value", std::make_shared())); + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); RuntimeState state {TQueryOptions(), TQueryGlobals()}; TableReader reader; ASSERT_TRUE(reader .init({ .projected_columns = projected_columns, - .conjuncts = VExprContext(nullptr), + .conjuncts = VExprContext( + std::make_shared(0, 0, 0)), .format = FileFormat::PARQUET, .scan_params = nullptr, .io_ctx = nullptr, .runtime_state = &state, .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, }) .ok()); @@ -134,6 +180,9 @@ TEST(TableReaderTest, ReopenSplitAfterClose) { // init() once, then repeat prepare_split() -> get_block() -> close(). // This verifies TableReader::close() fully releases the previous low-level reader and task // state, so a later prepare_split() can open and read a new split on the same TableReader. + // The table-level conjunct is also rebuilt for each split. The projection order puts value + // before id, so the pushed conjunct has to be rewritten to the ParquetReader file-local block + // position every time a new split is opened. 
std::vector ids; std::vector values; for (const auto& file_path : file_paths) { @@ -145,9 +194,9 @@ TEST(TableReaderTest, ReopenSplitAfterClose) { ASSERT_TRUE(reader.get_block(&block, &eos).ok()); ASSERT_FALSE(eos); - const auto& id_column = assert_cast(*block.get_by_position(0).column); const auto& value_column = - assert_cast(*block.get_by_position(1).column); + assert_cast(*block.get_by_position(0).column); + const auto& id_column = assert_cast(*block.get_by_position(1).column); ASSERT_EQ(id_column.size(), 1); ASSERT_EQ(value_column.size(), 1); ids.push_back(id_column.get_element(0)); @@ -162,7 +211,81 @@ TEST(TableReaderTest, ReopenSplitAfterClose) { std::filesystem::remove_all(test_dir); } -TEST(TableReaderTest, ProjectedColumnsRejectParquetSchemaMismatch) { +TEST(TableReaderTest, OpenReaderBuildsTableFiltersFromConjuncts) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_conjunct_filter_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_parquet_file(file_path, 3, "three"); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(1, "value", std::make_shared())); + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader + .init({ + .projected_columns = projected_columns, + .conjuncts = VExprContext(std::make_shared( + 0, 0, 2)), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + // open_reader() should convert the table-level conjunct on projected column id 0 into + // _table_filters before ColumnMapper creates the 
FileScanRequest. ColumnMapper then rewrites + // the conjunct's slot ref from table column id 0 to the file-local block position used by + // ParquetReader. The projection order intentionally puts value before id, so the id filter + // column is not at position 0 in the file block. + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + const auto& id_column = assert_cast(*block.get_by_position(1).column); + ASSERT_EQ(id_column.size(), 1); + EXPECT_EQ(id_column.get_element(0), 3); + + ASSERT_TRUE(reader.close().ok()); + + TableReader filtered_reader; + ASSERT_TRUE(filtered_reader + .init({ + .projected_columns = projected_columns, + .conjuncts = VExprContext(std::make_shared( + 0, 0, 4)), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + ASSERT_TRUE(filtered_reader.prepare_split(build_split_options(file_path)).ok()); + + block = build_table_block(projected_columns); + eos = false; + ASSERT_TRUE(filtered_reader.get_block(&block, &eos).ok()); + EXPECT_TRUE(eos); + EXPECT_EQ(block.get_by_position(1).column->size(), 0); + + ASSERT_TRUE(filtered_reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, ProjectedColumnsFillDefaultForParquetSchemaMismatch) { const auto test_dir = std::filesystem::temp_directory_path() / "doris_table_reader_schema_mismatch_test"; std::filesystem::remove_all(test_dir); @@ -173,7 +296,7 @@ TEST(TableReaderTest, ProjectedColumnsRejectParquetSchemaMismatch) { std::vector projected_columns; projected_columns.push_back( - {.id = 99, .name = "missing_value", .type = std::make_shared()}); + make_table_column(99, "missing_value", std::make_shared())); RuntimeState state {TQueryOptions(), TQueryGlobals()}; TableReader reader; @@ -186,14 +309,59 @@ TEST(TableReaderTest, 
ProjectedColumnsRejectParquetSchemaMismatch) { .io_ctx = nullptr, .runtime_state = &state, .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, }) .ok()); ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); // The table projection asks for field id 99, but the ParquetReader exposes only file-local - // fields 0 and 1. get_block() opens the split lazily, so this is where TableReader must reject - // the mismatch between TableReadOptions::projected_columns and the Parquet file schema. + // fields 0 and 1. Missing columns are allowed by the current mapper options, so TableReader + // should still use the Parquet row count and fill a default column in table schema. + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + EXPECT_EQ(block.get_by_position(0).column->size(), 1); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, ProjectedColumnsRejectParquetSchemaMismatchWhenMissingColumnsDisallowed) { + const auto test_dir = std::filesystem::temp_directory_path() / + "doris_table_reader_schema_mismatch_reject_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_parquet_file(file_path, 1, "one"); + + std::vector projected_columns; + projected_columns.push_back( + make_table_column(99, "missing_value", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader + .init({ + .projected_columns = projected_columns, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = false, + .profile = nullptr, + }) + .ok()); + + 
ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + // With allow_missing_columns disabled, the same missing projected column should fail while + // opening the split instead of being materialized as a default column. Block block = build_table_block(projected_columns); bool eos = false; const auto status = reader.get_block(&block, &eos); @@ -204,7 +372,7 @@ TEST(TableReaderTest, ProjectedColumnsRejectParquetSchemaMismatch) { std::filesystem::remove_all(test_dir); } -TEST(TableReaderTest, ProjectedColumnsRejectSameNameDifferentIdParquetSchemaMismatch) { +TEST(TableReaderTest, ProjectedColumnsUseMapperExpressionForSameNameDifferentIdParquetSchema) { const auto test_dir = std::filesystem::temp_directory_path() / "doris_table_reader_same_name_diff_id_test"; std::filesystem::remove_all(test_dir); @@ -214,8 +382,7 @@ TEST(TableReaderTest, ProjectedColumnsRejectSameNameDifferentIdParquetSchemaMism write_parquet_file(file_path, 1, "one"); std::vector projected_columns; - projected_columns.push_back( - {.id = 99, .name = "id", .type = std::make_shared()}); + projected_columns.push_back(make_table_column(99, "id", std::make_shared())); RuntimeState state {TQueryOptions(), TQueryGlobals()}; TableReader reader; @@ -228,19 +395,24 @@ TEST(TableReaderTest, ProjectedColumnsRejectSameNameDifferentIdParquetSchemaMism .io_ctx = nullptr, .runtime_state = &state, .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, }) .ok()); ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); // The table column has the same name as the Parquet field, but a different field id. - // TableReader configures ColumnMapper in BY_FIELD_ID mode, so the name match must not hide - // the id mismatch. + // ColumnMapper should still resolve it by name and build a SlotRef projection from the file + // column into the requested table column. 
Block block = build_table_block(projected_columns); bool eos = false; - const auto status = reader.get_block(&block, &eos); - ASSERT_FALSE(status.ok()); - EXPECT_NE(status.to_string().find("does not have a matching file column"), std::string::npos); + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + + const auto& id_column = assert_cast(*block.get_by_position(0).column); + ASSERT_EQ(id_column.size(), 1); + EXPECT_EQ(id_column.get_element(0), 1); ASSERT_TRUE(reader.close().ok()); std::filesystem::remove_all(test_dir); @@ -257,9 +429,9 @@ TEST(TableReaderTest, ProjectedColumnsUseMapperExpressionsForParquetSchemaMismat std::vector projected_columns; projected_columns.push_back( - {.id = 0, .name = "table_id", .type = std::make_shared()}); + make_table_column(0, "table_id", std::make_shared())); projected_columns.push_back( - {.id = 1, .name = "table_value", .type = std::make_shared()}); + make_table_column(1, "table_value", std::make_shared())); RuntimeState state {TQueryOptions(), TQueryGlobals()}; TableReader reader; @@ -272,6 +444,8 @@ TEST(TableReaderTest, ProjectedColumnsUseMapperExpressionsForParquetSchemaMismat .io_ctx = nullptr, .runtime_state = &state, .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, }) .ok()); From 8882d08a6a0fd2dad2b8aa9e6e543c521103db50 Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 27 May 2026 16:47:19 +0800 Subject: [PATCH 16/38] [feature](be) Support nested projection in new parquet reader Issue Number: None Related PR: None Problem Summary: Add file-local nested schema metadata and projection plumbing for the new Arrow Parquet reader. Struct child projection is now pushed into the Parquet column reader factory, table scan schema is rebuilt from projected complex types, and the mapper preserves path metadata for future complex schema change handling while explicitly rejecting unsupported child schema evolution for now. 
None - Test: Unit Test - Added BE unit coverage for struct projection, nested schema path metadata, and table mapper complex projection generation. - Ran clang-format 16 dry-run on modified C++ files. - Ran git diff --check. - Attempted ./run-be-ut.sh --run '--filter=ParquetColumnReaderTest.*:TableColumnMapperTest.*:NewParquetReaderTest.*:FileReaderTest.*', but local CMake compiler sanity check failed before Doris code compilation because ld could not find library 'c++'. - Behavior changed: No - Does this need documentation: Yes (included docs/doris-arrow-parquet-complex-types-implementation.md) --- be/src/format/new_parquet/column_reader.cpp | 48 +- be/src/format/new_parquet/column_reader.h | 15 +- .../new_parquet/parquet_column_schema.cpp | 103 +++- .../new_parquet/parquet_column_schema.h | 10 + be/src/format/new_parquet/parquet_reader.cpp | 142 ++++- be/src/format/new_parquet/parquet_reader.h | 6 + be/src/format/reader/column_mapper.cpp | 206 ++++++- be/src/format/reader/column_mapper.h | 21 +- be/src/format/reader/file_reader.h | 14 + be/src/format/reader/table_reader.h | 97 ++- .../parquet_column_reader_test.cpp | 297 ++++++---- .../new_parquet/parquet_reader_test.cpp | 71 ++- ...ow-parquet-complex-types-implementation.md | 559 ++++++++++++++++++ 13 files changed, 1436 insertions(+), 153 deletions(-) create mode 100644 docs/doris-arrow-parquet-complex-types-implementation.md diff --git a/be/src/format/new_parquet/column_reader.cpp b/be/src/format/new_parquet/column_reader.cpp index bb101b5c5fc910..f1674b767b09e0 100644 --- a/be/src/format/new_parquet/column_reader.cpp +++ b/be/src/format/new_parquet/column_reader.cpp @@ -33,8 +33,10 @@ #include "core/column/column.h" #include "core/column/column_struct.h" #include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_struct.h" #include "core/data_type_serde/decoded_column_view.h" #include "format/new_parquet/parquet_column_schema.h" +#include "format/reader/file_reader.h" namespace 
doris::parquet { namespace { @@ -77,10 +79,10 @@ class ScalarColumnReader final : public ParquetColumnReader { class StructColumnReader final : public ParquetColumnReader { public: - StructColumnReader(const ParquetColumnSchema& schema, + StructColumnReader(const ParquetColumnSchema& schema, DataTypePtr type, std::vector> children) - : _field_id(schema.field_id), - _type(schema.type), + : _field_id(schema.top_level_field_id), + _type(std::move(type)), _name(schema.name), _children(std::move(children)) {} @@ -364,6 +366,7 @@ Status StructColumnReader::read(int64_t rows, MutableColumnPtr& column, int64_t* size_t child_idx = 0; DCHECK_EQ(assert_cast(*column).get_columns().size(), _children.size()); for (auto& child_reader : _children) { + DORIS_CHECK(child_reader != nullptr); int64_t child_rows = 0; auto child_column = assert_cast(*column).get_column_ptr(child_idx)->assume_mutable(); @@ -517,7 +520,7 @@ Status ParquetColumnReaderFactory::get_record_reader( } Status ParquetColumnReaderFactory::create_struct_column_reader( - const ParquetColumnSchema& column_schema, + const ParquetColumnSchema& column_schema, const reader::FieldProjection* projection, std::unique_ptr* reader) const { if (reader == nullptr) { return Status::InvalidArgument("reader is null"); @@ -529,16 +532,45 @@ Status ParquetColumnReaderFactory::create_struct_column_reader( } std::vector> child_readers; child_readers.reserve(column_schema.children.size()); - for (const auto& child_schema : column_schema.children) { + DataTypes projected_child_types; + Strings projected_child_names; + for (size_t child_idx = 0; child_idx < column_schema.children.size(); ++child_idx) { + const auto& child_schema = column_schema.children[child_idx]; + const reader::FieldProjection* child_projection = nullptr; + if (projection != nullptr && !projection->project_all_children) { + auto it = std::find_if(projection->children.begin(), projection->children.end(), + [&](const reader::FieldProjection& child) { + return 
child.file_path == child_schema->file_path; + }); + if (it == projection->children.end()) { + continue; + } + child_projection = &*it; + } std::unique_ptr child_reader; - RETURN_IF_ERROR(create(*child_schema, &child_reader)); + RETURN_IF_ERROR(create(*child_schema, child_projection, &child_reader)); + projected_child_types.push_back(child_reader->type()); + projected_child_names.push_back(child_reader->name()); child_readers.push_back(std::move(child_reader)); } - *reader = std::make_unique(column_schema, std::move(child_readers)); + if (child_readers.empty() && !column_schema.children.empty()) { + return Status::NotSupported("Parquet STRUCT projection for column {} contains no children", + column_schema.name); + } + DataTypePtr type = column_schema.type; + if (projection != nullptr && !projection->project_all_children) { + type = std::make_shared(projected_child_types, projected_child_names); + if (column_schema.type != nullptr && column_schema.type->is_nullable()) { + type = make_nullable(type); + } + } + *reader = std::make_unique(column_schema, std::move(type), + std::move(child_readers)); return Status::OK(); } Status ParquetColumnReaderFactory::create(const ParquetColumnSchema& column_schema, + const reader::FieldProjection* projection, std::unique_ptr* reader) const { if (reader == nullptr) { return Status::InvalidArgument("reader is null"); @@ -547,7 +579,7 @@ Status ParquetColumnReaderFactory::create(const ParquetColumnSchema& column_sche case ParquetColumnSchemaKind::PRIMITIVE: return create_scalar_column_reader(column_schema, reader); case ParquetColumnSchemaKind::STRUCT: - return create_struct_column_reader(column_schema, reader); + return create_struct_column_reader(column_schema, projection, reader); case ParquetColumnSchemaKind::LIST: return Status::NotSupported("Parquet LIST reader is not implemented for column {}", column_schema.name); diff --git a/be/src/format/new_parquet/column_reader.h b/be/src/format/new_parquet/column_reader.h index 
cd59fc3960a7db..93881ac8c48077 100644 --- a/be/src/format/new_parquet/column_reader.h +++ b/be/src/format/new_parquet/column_reader.h @@ -39,6 +39,10 @@ class RecordReader; namespace doris { class IColumn; +namespace reader { +struct FieldProjection; +} // namespace reader + namespace parquet { struct ParquetColumnSchema; @@ -88,14 +92,21 @@ class ParquetColumnReaderFactory { // 根据 file-local schema tree 创建 column reader。复杂类型会在这里递归创建 // children。该入口只理解 Parquet file schema,不处理 table/global schema。 Status create(const ParquetColumnSchema& column_schema, + const reader::FieldProjection* projection, std::unique_ptr* reader) const; + Status create(const ParquetColumnSchema& column_schema, + std::unique_ptr* reader) const { + return create(column_schema, nullptr, reader); + } + private: Status create_scalar_column_reader(const ParquetColumnSchema& column_schema, - std::unique_ptr* reader) const; + std::unique_ptr* reader) const; Status create_struct_column_reader(const ParquetColumnSchema& column_schema, - std::unique_ptr* reader) const; + const reader::FieldProjection* projection, + std::unique_ptr* reader) const; Status get_record_reader(int leaf_column_id, const ::parquet::ColumnDescriptor* descriptor, const std::string& name, diff --git a/be/src/format/new_parquet/parquet_column_schema.cpp b/be/src/format/new_parquet/parquet_column_schema.cpp index 131bf9f22c0e18..3235ea38a0671e 100644 --- a/be/src/format/new_parquet/parquet_column_schema.cpp +++ b/be/src/format/new_parquet/parquet_column_schema.cpp @@ -19,6 +19,7 @@ #include +#include #include #include #include @@ -32,6 +33,19 @@ namespace doris::parquet { namespace { +struct SchemaBuildContext { + int32_t top_level_field_id = -1; + int32_t parent_schema_node_id = -1; + int16_t definition_level = 0; + int16_t repetition_level = 0; + int16_t nullable_definition_level = 0; + int16_t repeated_repetition_level = 0; + std::vector file_path; + std::vector field_id_path; + std::vector name_path; + int* next_schema_node_id 
= nullptr; +}; + bool is_list_node(const ::parquet::schema::Node& node) { const auto& logical_type = node.logical_type(); return node.converted_type() == ::parquet::ConvertedType::LIST || @@ -49,16 +63,63 @@ DataTypePtr nullable_if_needed(DataTypePtr type, const ::parquet::schema::Node& return node.is_optional() ? make_nullable(type) : type; } +void inherit_common_schema_state(const ::parquet::schema::Node& node, + const SchemaBuildContext& context, + ParquetColumnSchema* column_schema) { + DORIS_CHECK(column_schema != nullptr); + DORIS_CHECK(context.next_schema_node_id != nullptr); + column_schema->field_id = node.field_id(); + column_schema->top_level_field_id = context.top_level_field_id; + column_schema->schema_node_id = (*context.next_schema_node_id)++; + column_schema->parent_schema_node_id = context.parent_schema_node_id; + column_schema->file_path = context.file_path; + column_schema->field_id_path = context.field_id_path; + column_schema->name_path = context.name_path; + column_schema->name = node.name(); + column_schema->node = &node; + column_schema->max_definition_level = context.definition_level; + column_schema->max_repetition_level = context.repetition_level; + column_schema->nullable_definition_level = context.nullable_definition_level; + column_schema->repeated_repetition_level = context.repeated_repetition_level; +} + +SchemaBuildContext child_context(const SchemaBuildContext& parent, + const ::parquet::schema::Node& child_node, int32_t child_idx, + int32_t parent_schema_node_id) { + SchemaBuildContext result = parent; + result.parent_schema_node_id = parent_schema_node_id; + result.file_path.push_back(child_idx); + result.field_id_path.push_back(child_node.field_id()); + result.name_path.push_back(child_node.name()); + if (child_node.repetition() != ::parquet::Repetition::REQUIRED) { + result.definition_level++; + result.nullable_definition_level = result.definition_level; + } + if (child_node.is_repeated()) { + result.repetition_level++; + 
result.repeated_repetition_level = result.repetition_level; + } + return result; +} + +void propagate_child_levels(ParquetColumnSchema* column_schema) { + DORIS_CHECK(column_schema != nullptr); + for (const auto& child : column_schema->children) { + column_schema->max_definition_level = + std::max(column_schema->max_definition_level, child->max_definition_level); + column_schema->max_repetition_level = + std::max(column_schema->max_repetition_level, child->max_repetition_level); + } +} + Status build_node_schema(const ::parquet::SchemaDescriptor& schema, - const ::parquet::schema::Node& node, + const ::parquet::schema::Node& node, const SchemaBuildContext& context, std::unique_ptr* result) { if (result == nullptr) { return Status::InvalidArgument("result is null"); } auto column_schema = std::make_unique(); - column_schema->field_id = node.field_id(); - column_schema->name = node.name(); - column_schema->node = &node; + inherit_common_schema_state(node, context, column_schema.get()); if (node.is_primitive()) { const int leaf_column_id = schema.ColumnIndex(node); @@ -69,6 +130,10 @@ Status build_node_schema(const ::parquet::SchemaDescriptor& schema, column_schema->kind = ParquetColumnSchemaKind::PRIMITIVE; column_schema->leaf_column_id = leaf_column_id; column_schema->descriptor = schema.Column(leaf_column_id); + if (column_schema->descriptor != nullptr) { + column_schema->max_definition_level = column_schema->descriptor->max_definition_level(); + column_schema->max_repetition_level = column_schema->descriptor->max_repetition_level(); + } column_schema->type_descriptor = resolve_parquet_type(column_schema->descriptor); column_schema->type = column_schema->type_descriptor.doris_type; if (column_schema->type == nullptr) { @@ -87,10 +152,13 @@ Status build_node_schema(const ::parquet::SchemaDescriptor& schema, node.name()); } std::unique_ptr child; - RETURN_IF_ERROR(build_node_schema(schema, *group.field(0), &child)); + RETURN_IF_ERROR(build_node_schema( + schema, 
*group.field(0), + child_context(context, *group.field(0), 0, column_schema->schema_node_id), &child)); column_schema->type = nullable_if_needed(std::make_shared(child->type), node); column_schema->children.push_back(std::move(child)); + propagate_child_levels(column_schema.get()); *result = std::move(column_schema); return Status::OK(); } @@ -102,16 +170,25 @@ Status build_node_schema(const ::parquet::SchemaDescriptor& schema, node.name()); } std::unique_ptr key_value; - RETURN_IF_ERROR(build_node_schema(schema, *group.field(0), &key_value)); + RETURN_IF_ERROR(build_node_schema( + schema, *group.field(0), + child_context(context, *group.field(0), 0, column_schema->schema_node_id), + &key_value)); if (key_value->children.size() != 2) { return Status::NotSupported("Unsupported parquet MAP key_value layout for column {}", node.name()); } + if (key_value->children[0]->node == nullptr || + key_value->children[0]->node->repetition() != ::parquet::Repetition::REQUIRED) { + return Status::NotSupported("Unsupported nullable parquet MAP key for column {}", + node.name()); + } auto key_type = key_value->children[0]->type; auto value_type = key_value->children[1]->type; column_schema->type = nullable_if_needed(std::make_shared(key_type, value_type), node); column_schema->children.push_back(std::move(key_value)); + propagate_child_levels(column_schema.get()); *result = std::move(column_schema); return Status::OK(); } @@ -123,13 +200,17 @@ Status build_node_schema(const ::parquet::SchemaDescriptor& schema, child_names.reserve(group.field_count()); for (int child_idx = 0; child_idx < group.field_count(); ++child_idx) { std::unique_ptr child; - RETURN_IF_ERROR(build_node_schema(schema, *group.field(child_idx), &child)); + RETURN_IF_ERROR(build_node_schema(schema, *group.field(child_idx), + child_context(context, *group.field(child_idx), child_idx, + column_schema->schema_node_id), + &child)); child_types.push_back(child->type); child_names.push_back(child->name); 
column_schema->children.push_back(std::move(child)); } column_schema->type = nullable_if_needed(std::make_shared(child_types, child_names), node); + propagate_child_levels(column_schema.get()); *result = std::move(column_schema); return Status::OK(); } @@ -146,10 +227,16 @@ Status build_parquet_column_schema(const ::parquet::SchemaDescriptor& schema, if (root == nullptr) { return Status::InvalidArgument("Parquet schema root is null"); } + int next_schema_node_id = 0; fields->reserve(root->field_count()); for (int field_idx = 0; field_idx < root->field_count(); ++field_idx) { std::unique_ptr field; - RETURN_IF_ERROR(build_node_schema(schema, *root->field(field_idx), &field)); + SchemaBuildContext context; + context.top_level_field_id = field_idx; + context.next_schema_node_id = &next_schema_node_id; + RETURN_IF_ERROR(build_node_schema( + schema, *root->field(field_idx), + child_context(context, *root->field(field_idx), field_idx, -1), &field)); fields->push_back(std::move(field)); } return Status::OK(); diff --git a/be/src/format/new_parquet/parquet_column_schema.h b/be/src/format/new_parquet/parquet_column_schema.h index 0d089a0f9cdcbf..81f9536243e8ee 100644 --- a/be/src/format/new_parquet/parquet_column_schema.h +++ b/be/src/format/new_parquet/parquet_column_schema.h @@ -47,16 +47,26 @@ enum class ParquetColumnSchemaKind { // 它描述 Parquet 逻辑字段到 leaf column ordinal 的关系,不包含 table/global schema 语义。 struct ParquetColumnSchema { int field_id = -1; + int top_level_field_id = -1; // Parquet schema 中的 primitive leaf column ordinal。 // 该 id 用于访问 ColumnDescriptor、RowGroupReader::RecordReader、ColumnChunk // metadata 和 statistics。复杂类型节点本身没有单一 leaf column,因此为 -1。 int leaf_column_id = -1; + int schema_node_id = -1; + int parent_schema_node_id = -1; + std::vector file_path; + std::vector field_id_path; + std::vector name_path; std::string name; DataTypePtr type; ParquetTypeDescriptor type_descriptor; ParquetColumnSchemaKind kind = ParquetColumnSchemaKind::PRIMITIVE; const 
::parquet::schema::Node* node = nullptr; const ::parquet::ColumnDescriptor* descriptor = nullptr; + int16_t max_definition_level = 0; + int16_t max_repetition_level = 0; + int16_t nullable_definition_level = 0; + int16_t repeated_repetition_level = 0; std::vector> children; }; diff --git a/be/src/format/new_parquet/parquet_reader.cpp b/be/src/format/new_parquet/parquet_reader.cpp index ff9d939b4d064e..677a596debf733 100644 --- a/be/src/format/new_parquet/parquet_reader.cpp +++ b/be/src/format/new_parquet/parquet_reader.cpp @@ -23,14 +23,19 @@ #include #include +#include #include #include #include #include #include "common/exception.h" +#include "core/assert_cast.h" #include "core/block/block.h" +#include "core/data_type/data_type_array.h" +#include "core/data_type/data_type_map.h" #include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_struct.h" #include "exprs/vexpr_context.h" #include "format/new_parquet/column_reader.h" #include "format/new_parquet/parquet_column_schema.h" @@ -181,10 +186,12 @@ void ParquetReader::_reset_current_row_group() { void ParquetReader::_fill_schema_field(const ParquetColumnSchema& column_schema, reader::SchemaField* field) const { - field->id = column_schema.leaf_column_id >= 0 ? 
column_schema.leaf_column_id - : column_schema.field_id; + field->id = column_schema.top_level_field_id; field->name = column_schema.name; field->type = column_schema.type; + field->file_path = column_schema.file_path; + field->field_id_path = column_schema.field_id_path; + field->name_path = column_schema.name_path; field->children.clear(); field->children.reserve(column_schema.children.size()); for (const auto& child : column_schema.children) { @@ -194,6 +201,95 @@ void ParquetReader::_fill_schema_field(const ParquetColumnSchema& column_schema, } } +Status ParquetReader::_fill_projected_schema_field(const ParquetColumnSchema& column_schema, + const reader::FieldProjection* projection, + reader::SchemaField* field) const { + if (field == nullptr) { + return Status::InvalidArgument("projected schema field is null"); + } + _fill_schema_field(column_schema, field); + if (projection == nullptr || projection->project_all_children || + column_schema.children.empty()) { + return Status::OK(); + } + + field->children.clear(); + std::map child_projection_by_idx; + for (const auto& child_projection : projection->children) { + if (child_projection.file_path.empty()) { + return Status::InvalidArgument("Empty parquet projection path for column {}", + column_schema.name); + } + child_projection_by_idx.emplace(child_projection.file_path.back(), &child_projection); + } + + DataTypes child_types; + Strings child_names; + for (size_t child_idx = 0; child_idx < column_schema.children.size(); ++child_idx) { + auto it = child_projection_by_idx.find(static_cast(child_idx)); + if (it == child_projection_by_idx.end()) { + continue; + } + if (it->second->file_path != column_schema.children[child_idx]->file_path) { + return Status::InvalidArgument("Invalid parquet projection path for column {}", + column_schema.children[child_idx]->name); + } + reader::SchemaField child_field; + RETURN_IF_ERROR(_fill_projected_schema_field(*column_schema.children[child_idx], it->second, + &child_field)); + 
child_types.push_back(child_field.type); + child_names.push_back(child_field.name); + field->children.push_back(std::move(child_field)); + } + + if (field->children.empty()) { + return Status::NotSupported("Parquet projection for column {} contains no children", + column_schema.name); + } + + const auto primitive_type = remove_nullable(column_schema.type)->get_primitive_type(); + DataTypePtr projected_type; + switch (primitive_type) { + case TYPE_STRUCT: + projected_type = std::make_shared(child_types, child_names); + break; + case TYPE_ARRAY: + DORIS_CHECK(child_types.size() == 1); + projected_type = std::make_shared(child_types[0]); + break; + case TYPE_MAP: + DORIS_CHECK(child_types.size() == 1); + DORIS_CHECK(remove_nullable(child_types[0])->get_primitive_type() == TYPE_STRUCT); + { + const auto* entry_type = + assert_cast(remove_nullable(child_types[0]).get()); + DORIS_CHECK(entry_type->get_elements().size() == 2); + projected_type = std::make_shared(entry_type->get_element(0), + entry_type->get_element(1)); + } + break; + default: + return Status::InvalidArgument("Cannot project children from non-complex parquet column {}", + column_schema.name); + } + field->type = + column_schema.type->is_nullable() ? 
make_nullable(projected_type) : projected_type; + return Status::OK(); +} + +Status ParquetReader::_get_projected_schema_field(reader::ColumnId file_column_id, + const reader::FieldProjection* projection, + reader::SchemaField* field) const { + if (file_column_id < 0 || + file_column_id >= static_cast(_state->file_schema.size())) { + return Status::InvalidArgument("Invalid parquet field id {}", file_column_id); + } + RETURN_IF_ERROR( + _fill_projected_schema_field(*_state->file_schema[file_column_id], projection, field)); + field->id = file_column_id; + return Status::OK(); +} + bool ParquetReader::_has_expression_filter(const reader::FileLocalFilter& local_filter) { return local_filter.conjunct != nullptr; } @@ -228,13 +324,12 @@ Status ParquetReader::_read_filter_columns(int64_t batch_rows, Block* file_block } IColumn::Filter filter(static_cast(batch_rows), 1); bool can_filter_all = false; - RETURN_IF_ERROR(local_filter.conjunct->execute_filter( - file_block, filter.data(), static_cast(batch_rows), false, - &can_filter_all)); + RETURN_IF_ERROR(local_filter.conjunct->execute_filter(file_block, filter.data(), + static_cast(batch_rows), + false, &can_filter_all)); *selected_rows = - can_filter_all - ? 0 - : _apply_filter_to_selection(filter, selection, *selected_rows); + can_filter_all ? 0 + : _apply_filter_to_selection(filter, selection, *selected_rows); break; } if (*selected_rows == 0) { @@ -298,14 +393,24 @@ Status ParquetReader::_open_next_row_group(bool* has_row_group) { _state->schema->num_columns()); for (const auto file_field_id : _request->predicate_columns) { const auto& column_schema = _state->file_schema[file_field_id]; + const auto projection_it = _request->complex_projections.find(file_field_id); + const auto* projection = projection_it == _request->complex_projections.end() + ? 
nullptr + : &projection_it->second; std::unique_ptr column_reader; - RETURN_IF_ERROR(column_reader_factory.create(*column_schema, &column_reader)); + RETURN_IF_ERROR( + column_reader_factory.create(*column_schema, projection, &column_reader)); _state->current_predicate_columns.push_back(std::move(column_reader)); } for (const auto file_field_id : _request->non_predicate_columns) { const auto& column_schema = _state->file_schema[file_field_id]; + const auto projection_it = _request->complex_projections.find(file_field_id); + const auto* projection = projection_it == _request->complex_projections.end() + ? nullptr + : &projection_it->second; std::unique_ptr column_reader; - RETURN_IF_ERROR(column_reader_factory.create(*column_schema, &column_reader)); + RETURN_IF_ERROR( + column_reader_factory.create(*column_schema, projection, &column_reader)); _state->current_non_predicate_columns.push_back(std::move(column_reader)); } *has_row_group = true; @@ -456,6 +561,23 @@ Status ParquetReader::open(std::unique_ptr& request) { local_filter.file_column_id); } } + for (const auto& [file_column_id, projection] : _request->complex_projections) { + if (file_column_id < 0 || file_column_id >= num_fields) { + return Status::InvalidArgument("Invalid parquet projection top-level field id {}", + file_column_id); + } + if (projection.file_column_id != file_column_id) { + return Status::InvalidArgument( + "Parquet projection column id mismatch: key={}, value={}", file_column_id, + projection.file_column_id); + } + if (!projection.file_path.empty() && projection.file_path.front() != file_column_id) { + return Status::InvalidArgument("Invalid parquet projection root path for column {}", + file_column_id); + } + reader::SchemaField projected_field; + RETURN_IF_ERROR(_get_projected_schema_field(file_column_id, &projection, &projected_field)); + } RETURN_IF_ERROR(select_row_groups_by_statistics(*_state->metadata, _state->file_schema, *_request, &_state->selected_row_groups)); 
RETURN_IF_ERROR(_reset_reader_position()); diff --git a/be/src/format/new_parquet/parquet_reader.h b/be/src/format/new_parquet/parquet_reader.h index d7a9dc19a5919c..f6d47f4613404e 100644 --- a/be/src/format/new_parquet/parquet_reader.h +++ b/be/src/format/new_parquet/parquet_reader.h @@ -121,6 +121,12 @@ class ParquetReader : public reader::FileReader { void _reset_current_row_group(); void _fill_schema_field(const ParquetColumnSchema& column_schema, reader::SchemaField* field) const; + Status _fill_projected_schema_field(const ParquetColumnSchema& column_schema, + const reader::FieldProjection* projection, + reader::SchemaField* field) const; + Status _get_projected_schema_field(reader::ColumnId file_column_id, + const reader::FieldProjection* projection, + reader::SchemaField* field) const; bool _has_expression_filter(const reader::FileLocalFilter& local_filter); Status _read_filter_columns(int64_t batch_rows, Block* file_block, SelectionVector* selection, uint16_t* selected_rows); diff --git a/be/src/format/reader/column_mapper.cpp b/be/src/format/reader/column_mapper.cpp index 5790517f7bb71f..4d9afdeff3297a 100644 --- a/be/src/format/reader/column_mapper.cpp +++ b/be/src/format/reader/column_mapper.cpp @@ -18,10 +18,16 @@ #include "format/reader/column_mapper.h" #include +#include +#include #include #include "common/status.h" #include "core/assert_cast.h" +#include "core/data_type/data_type_array.h" +#include "core/data_type/data_type_map.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_struct.h" #include "format/reader/expr/cast.h" #include "format/reader/expr/slot_ref.h" #include "format/reader/file_reader.h" @@ -86,8 +92,8 @@ static void rebuild_projection(ColumnMapping* mapping, size_t block_position) { mapping->projection = VExprContext::create_shared(expr); } -static std::map build_file_position_map( - const std::vector& mappings, const FileScanRequest& file_request) { +static std::map build_file_position_map(const 
std::vector& mappings, + const FileScanRequest& file_request) { std::map table_column_to_file_position; for (const auto& mapping : mappings) { if (!mapping.file_column_id.has_value()) { @@ -101,6 +107,109 @@ static std::map build_file_position_map( return table_column_to_file_position; } +static bool is_complex_type(const DataTypePtr& type) { + DORIS_CHECK(type != nullptr); + const auto primitive_type = remove_nullable(type)->get_primitive_type(); + return primitive_type == TYPE_STRUCT || primitive_type == TYPE_ARRAY || + primitive_type == TYPE_MAP; +} + +static const SchemaField* find_file_child_by_table_column( + const TableColumn& table_column, const std::vector& file_children, + TableColumnMappingMode mode) { + for (const auto& field : file_children) { + if (mode == TableColumnMappingMode::BY_FIELD_ID && !field.field_id_path.empty() && + field.field_id_path.back() != -1 && field.field_id_path.back() == table_column.id) { + return &field; + } + if (field.name == table_column.name) { + return &field; + } + } + return nullptr; +} + +static bool complex_projection_has_pruned_children(const ColumnMapping& mapping) { + if (!is_complex_type(mapping.file_type)) { + return false; + } + if (mapping.child_mappings.empty()) { + return false; + } + DORIS_CHECK(mapping.file_type != nullptr); + DORIS_CHECK(mapping.table_type != nullptr); + if (remove_nullable(mapping.file_type)->get_primitive_type() != + remove_nullable(mapping.table_type)->get_primitive_type()) { + return true; + } + if (!mapping.table_type->equals(*mapping.file_type)) { + return true; + } + for (const auto& child_mapping : mapping.child_mappings) { + if (!child_mapping.file_column_id.has_value() || + complex_projection_has_pruned_children(child_mapping)) { + return true; + } + } + return false; +} + +static Status rebuild_projected_file_type(ColumnMapping* mapping) { + if (mapping == nullptr) { + return Status::InvalidArgument("mapping is null"); + } + DORIS_CHECK(is_complex_type(mapping->file_type)); + 
DataTypes child_types; + Strings child_names; + child_types.reserve(mapping->child_mappings.size()); + child_names.reserve(mapping->child_mappings.size()); + for (auto& child_mapping : mapping->child_mappings) { + if (!child_mapping.file_column_id.has_value()) { + continue; + } + if (complex_projection_has_pruned_children(child_mapping)) { + RETURN_IF_ERROR(rebuild_projected_file_type(&child_mapping)); + } + child_types.push_back(child_mapping.file_type); + child_names.push_back(child_mapping.file_column_name); + } + if (child_types.empty()) { + return Status::NotSupported("Projection for complex column {} contains no file children", + mapping->file_column_name); + } + DataTypePtr projected_type; + const auto primitive_type = remove_nullable(mapping->file_type)->get_primitive_type(); + switch (primitive_type) { + case TYPE_STRUCT: + projected_type = std::make_shared(child_types, child_names); + break; + case TYPE_ARRAY: + DORIS_CHECK(child_types.size() == 1); + projected_type = std::make_shared(child_types[0]); + break; + case TYPE_MAP: + DORIS_CHECK(child_types.size() == 1); + DORIS_CHECK(remove_nullable(child_types[0])->get_primitive_type() == TYPE_STRUCT); + { + const auto* entry_type = + assert_cast(remove_nullable(child_types[0]).get()); + DORIS_CHECK(entry_type->get_elements().size() == 2); + projected_type = std::make_shared(entry_type->get_element(0), + entry_type->get_element(1)); + } + break; + default: + return Status::InvalidArgument("Cannot project children from non-complex column {}", + mapping->file_column_name); + } + mapping->file_type = + mapping->file_type->is_nullable() ? 
make_nullable(projected_type) : projected_type; + mapping->is_trivial = + mapping->table_type != nullptr && mapping->table_type->equals(*mapping->file_type); + mapping->has_complex_projection = true; + return Status::OK(); +} + Status TableColumnMapper::create_mapping(const std::vector& projected_columns, const std::map& partition_values, const std::vector& file_schema) { @@ -110,10 +219,7 @@ Status TableColumnMapper::create_mapping(const std::vector& project mapping.table_column_id = table_column.id; mapping.table_type = table_column.type; if (const auto* file_field = _find_file_field(table_column, file_schema)) { - mapping.file_column_id = file_field->id; - mapping.file_column_name = file_field->name; - mapping.file_type = file_field->type; - mapping.is_trivial = _is_same_type(mapping.table_type, mapping.file_type); + RETURN_IF_ERROR(_create_direct_mapping(table_column, *file_field, &mapping)); } else if (table_column.is_partition_key && partition_values.count(table_column.name) > 0) { // 3. Partition column, use partition value as a constant mapping. Note that partition column may also have default expression, but partition value should take precedence if it exists. 
mapping.default_expr = VExprContext::create_shared(TableLiteral::create_shared( @@ -130,12 +236,12 @@ Status TableColumnMapper::create_mapping(const std::vector& project } else { if (table_column.is_partition_key) { return Status::InvalidArgument( - "Table column '%s' (id=%d) does not have a matching partition value", - table_column.name); + "Table column '{}' (id={}) does not have a matching partition value", + table_column.name, table_column.id); } if (!_options.allow_missing_columns) { return Status::InvalidArgument( - "Table column '%s' (id=%d) does not have a matching file column", + "Table column '{}' (id={}) does not have a matching file column", table_column.name, table_column.id); } } @@ -152,15 +258,26 @@ Status TableColumnMapper::create_scan_request(const std::mappredicate_columns.clear(); file_request->non_predicate_columns.clear(); file_request->column_positions.clear(); + file_request->complex_projections.clear(); file_request->local_filters.clear(); file_request->reader_expression_map.clear(); for (const auto& table_column : projected_columns) { - const auto* mapping = _find_mapping(table_column.id); + auto* mapping = _find_mapping(table_column.id); if (mapping != nullptr && mapping->file_column_id.has_value()) { if (table_filters.count(table_column.id) == 0) { add_scan_column(file_request, *mapping->file_column_id, &file_request->non_predicate_columns); } + if (mapping->has_complex_projection || + complex_projection_has_pruned_children(*mapping)) { + if (!mapping->has_complex_projection) { + RETURN_IF_ERROR(rebuild_projected_file_type(mapping)); + } + FieldProjection projection; + RETURN_IF_ERROR(_build_complex_projection(*mapping, &projection)); + file_request->complex_projections.emplace(*mapping->file_column_id, + std::move(projection)); + } } } RETURN_IF_ERROR(localize_filters(table_filters, file_request)); @@ -216,8 +333,15 @@ Status TableColumnMapper::localize_filters(const std::map& const SchemaField* TableColumnMapper::_find_file_field( 
const TableColumn& table_column, const std::vector& file_schema) const { for (const auto& field : file_schema) { - if (_options.mode == TableColumnMappingMode::BY_FIELD_ID && field.id == table_column.id) { - return &field; + if (_options.mode == TableColumnMappingMode::BY_FIELD_ID) { + if (!field.field_id_path.empty() && field.field_id_path.back() != -1 && + field.field_id_path.back() == table_column.id) { + return &field; + } + if ((field.field_id_path.empty() || field.field_id_path.back() == -1) && + field.id == table_column.id) { + return &field; + } } if (field.name == table_column.name) { return &field; @@ -226,4 +350,62 @@ const SchemaField* TableColumnMapper::_find_file_field( return nullptr; } +Status TableColumnMapper::_create_direct_mapping(const TableColumn& table_column, + const SchemaField& file_field, + ColumnMapping* mapping) const { + if (mapping == nullptr) { + return Status::InvalidArgument("mapping is null"); + } + mapping->file_column_id = file_field.id; + mapping->file_column_name = file_field.name; + mapping->file_path = file_field.file_path; + mapping->file_type = file_field.type; + mapping->is_trivial = _is_same_type(mapping->table_type, mapping->file_type); + mapping->child_mappings.clear(); + + if (!table_column.children.empty() && is_complex_type(file_field.type)) { + for (const auto& table_child : table_column.children) { + const auto* file_child = find_file_child_by_table_column( + table_child, file_field.children, _options.mode); + if (file_child == nullptr) { + return Status::NotSupported( + "Complex schema change is not implemented: table child column '{}' " + "(id={}) does not have a matching file child under column '{}'", + table_child.name, table_child.id, table_column.name); + } + ColumnMapping child_mapping; + child_mapping.table_column_id = table_child.id; + child_mapping.table_type = table_child.type; + RETURN_IF_ERROR(_create_direct_mapping(table_child, *file_child, &child_mapping)); + 
mapping->child_mappings.push_back(std::move(child_mapping)); + } + } + return Status::OK(); +} + +Status TableColumnMapper::_build_complex_projection(const ColumnMapping& mapping, + FieldProjection* projection) const { + if (projection == nullptr) { + return Status::InvalidArgument("projection is null"); + } + DORIS_CHECK(mapping.file_column_id.has_value()); + projection->file_column_id = *mapping.file_column_id; + projection->file_path = mapping.file_path; + projection->project_all_children = mapping.child_mappings.empty(); + projection->children.clear(); + for (const auto& child_mapping : mapping.child_mappings) { + if (!child_mapping.file_column_id.has_value()) { + continue; + } + FieldProjection child_projection; + RETURN_IF_ERROR(_build_complex_projection(child_mapping, &child_projection)); + projection->children.push_back(std::move(child_projection)); + } + if (!projection->project_all_children && projection->children.empty()) { + return Status::NotSupported("Projection for complex column {} contains no file children", + mapping.file_column_name); + } + return Status::OK(); +} + } // namespace doris::reader diff --git a/be/src/format/reader/column_mapper.h b/be/src/format/reader/column_mapper.h index 4360b23e7de147..0c6ac9c8e6c5f6 100644 --- a/be/src/format/reader/column_mapper.h +++ b/be/src/format/reader/column_mapper.h @@ -19,7 +19,9 @@ #include #include +#include #include +#include #include #include #include @@ -28,12 +30,14 @@ #include "core/data_type/data_type.h" #include "exprs/vexpr_fwd.h" #include "format/reader/expr/literal.h" + namespace doris::reader { struct TableColumn; struct TableFilter; struct SchemaField; struct FileScanRequest; +struct FieldProjection; enum class TableColumnMappingMode { BY_FIELD_ID, @@ -52,6 +56,7 @@ struct ColumnMapping { int32_t table_column_id = -1; std::optional file_column_id; std::string file_column_name; + std::vector file_path; DataTypePtr file_type; DataTypePtr table_type; @@ -66,6 +71,7 @@ struct ColumnMapping { 
std::vector child_mappings; bool is_trivial = false; bool is_constant = false; + bool has_complex_projection = false; TableVirtualColumnType virtual_column_type = TableVirtualColumnType::INVALID; VExprContextSPtr default_expr; }; @@ -110,8 +116,21 @@ class TableColumnMapper { private: const SchemaField* _find_file_field(const TableColumn& table_column, const std::vector& file_schema) const; + Status _create_direct_mapping(const TableColumn& table_column, const SchemaField& file_field, + ColumnMapping* mapping) const; + Status _build_complex_projection(const ColumnMapping& mapping, + FieldProjection* projection) const; + + ColumnMapping* _find_mapping(int32_t table_column_id) { + for (auto& mapping : _mappings) { + if (mapping.table_column_id == table_column_id) { + return &mapping; + } + } + return nullptr; + } - const ColumnMapping* _find_mapping(ColumnId table_column_id) const { + const ColumnMapping* _find_mapping(int32_t table_column_id) const { for (const auto& mapping : _mappings) { if (mapping.table_column_id == table_column_id) { return &mapping; diff --git a/be/src/format/reader/file_reader.h b/be/src/format/reader/file_reader.h index cb2096fd80ad51..918e2b4bd351d2 100644 --- a/be/src/format/reader/file_reader.h +++ b/be/src/format/reader/file_reader.h @@ -59,9 +59,22 @@ struct SchemaField { std::string name; DataTypePtr type; std::vector children; + std::vector file_path; + std::vector field_id_path; + std::vector name_path; ColumnType column_type = ColumnType::DATA_COLUMN; }; +// File-local nested projection. The top-level scan column is still represented +// by FileScanRequest::predicate_columns/non_predicate_columns; this tree only +// describes which child paths are needed inside a complex top-level field. 
+struct FieldProjection { + ColumnId file_column_id = -1; + std::vector file_path; + bool project_all_children = true; + std::vector children; +}; + // 已经 localize 到文件 schema 的过滤条件。 // TableColumnMapper 负责把 table-level filter 转成这个结构;FileReader 只消费 // file-local column id、表达式和结构化谓词。 @@ -96,6 +109,7 @@ struct FileScanRequest { std::vector predicate_columns; std::vector non_predicate_columns; std::map column_positions; + std::map complex_projections; std::vector local_filters; // fallback path if filters cannot be localized to file-local predicates. The expression can reference projected_file_columns and partition columns. std::vector> reader_expression_map; diff --git a/be/src/format/reader/table_reader.h b/be/src/format/reader/table_reader.h index 4f28c4e1aaa9f8..c9589af8017dc2 100644 --- a/be/src/format/reader/table_reader.h +++ b/be/src/format/reader/table_reader.h @@ -26,8 +26,13 @@ #include #include "common/status.h" +#include "core/assert_cast.h" #include "core/block/block.h" #include "core/data_type/data_type.h" +#include "core/data_type/data_type_array.h" +#include "core/data_type/data_type_map.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_struct.h" #include "exprs/vexpr_context.h" #include "exprs/vexpr_fwd.h" #include "format/reader/column_mapper.h" @@ -180,8 +185,8 @@ class TableReader { size_t idx = 0; for (const auto& mapping : _data_reader.column_mapper.mappings()) { ColumnPtr column; - RETURN_IF_ERROR(_materialize_mapping_column( - mapping, &_data_reader.block_template, current_rows, &column)); + RETURN_IF_ERROR(_materialize_mapping_column(mapping, &_data_reader.block_template, + current_rows, &column)); block->replace_by_position(idx, std::move(column)); idx++; } @@ -233,7 +238,13 @@ class TableReader { DORIS_CHECK(block_position < _data_reader.scan_schema.size()); const auto* field = _find_schema_field(_data_reader.block_schema, file_column_id); DORIS_CHECK(field != nullptr); - 
_data_reader.scan_schema[block_position] = *field; + auto projection_it = file_request->complex_projections.find(file_column_id); + if (projection_it == file_request->complex_projections.end()) { + _data_reader.scan_schema[block_position] = *field; + } else { + RETURN_IF_ERROR(_project_schema_field(*field, projection_it->second, + &_data_reader.scan_schema[block_position])); + } } _data_reader.block_template.reserve(_data_reader.scan_schema.size()); for (const auto& field : _data_reader.scan_schema) { @@ -342,6 +353,86 @@ class TableReader { return nullptr; } + static Status _project_schema_field(const SchemaField& field, const FieldProjection& projection, + SchemaField* projected_field) { + if (projected_field == nullptr) { + return Status::InvalidArgument("projected_field is null"); + } + *projected_field = field; + if (projection.project_all_children || projection.children.empty()) { + return Status::OK(); + } + projected_field->children.clear(); + for (const auto& child_projection : projection.children) { + if (child_projection.file_path.empty()) { + return Status::InvalidArgument("Empty projection path for field {}", field.name); + } + const int32_t child_idx = child_projection.file_path.back(); + if (child_idx < 0 || child_idx >= static_cast(field.children.size())) { + return Status::InvalidArgument("Invalid projection child index {} for field {}", + child_idx, field.name); + } + if (child_projection.file_path != field.children[child_idx].file_path) { + return Status::InvalidArgument("Invalid projection path for field {}", + field.children[child_idx].name); + } + SchemaField projected_child; + RETURN_IF_ERROR(_project_schema_field(field.children[child_idx], child_projection, + &projected_child)); + projected_field->children.push_back(std::move(projected_child)); + } + if (projected_field->children.empty()) { + return Status::NotSupported("Projection for field {} contains no children", field.name); + } + RETURN_IF_ERROR(_rebuild_projected_type(field.type, 
projected_field)); + return Status::OK(); + } + + static Status _rebuild_projected_type(const DataTypePtr& original_type, + SchemaField* projected_field) { + if (original_type == nullptr) { + return Status::InvalidArgument("Cannot rebuild projected type for field {}", + projected_field->name); + } + DataTypes child_types; + Strings child_names; + child_types.reserve(projected_field->children.size()); + child_names.reserve(projected_field->children.size()); + for (const auto& child : projected_field->children) { + child_types.push_back(child.type); + child_names.push_back(child.name); + } + + const auto primitive_type = remove_nullable(original_type)->get_primitive_type(); + DataTypePtr projected_type; + switch (primitive_type) { + case TYPE_STRUCT: + projected_type = std::make_shared(child_types, child_names); + break; + case TYPE_ARRAY: + DORIS_CHECK(child_types.size() == 1); + projected_type = std::make_shared(child_types[0]); + break; + case TYPE_MAP: + DORIS_CHECK(child_types.size() == 1); + DORIS_CHECK(remove_nullable(child_types[0])->get_primitive_type() == TYPE_STRUCT); + { + const auto* entry_type = + assert_cast(remove_nullable(child_types[0]).get()); + DORIS_CHECK(entry_type->get_elements().size() == 2); + projected_type = std::make_shared(entry_type->get_element(0), + entry_type->get_element(1)); + } + break; + default: + return Status::InvalidArgument("Cannot project children from non-complex field {}", + projected_field->name); + } + projected_field->type = + original_type->is_nullable() ? 
make_nullable(projected_type) : projected_type; + return Status::OK(); + } + Status _parse_delete_predicates(const SplitReadOptions& options); }; diff --git a/be/test/format/new_parquet/parquet_column_reader_test.cpp b/be/test/format/new_parquet/parquet_column_reader_test.cpp index e4a0841f5af168..97773a5bada910 100644 --- a/be/test/format/new_parquet/parquet_column_reader_test.cpp +++ b/be/test/format/new_parquet/parquet_column_reader_test.cpp @@ -25,19 +25,23 @@ #include #include #include +#include #include #include "core/assert_cast.h" #include "core/column/column_decimal.h" #include "core/column/column_nullable.h" #include "core/column/column_string.h" +#include "core/column/column_struct.h" #include "core/column/column_vector.h" #include "core/data_type/data_type.h" #include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_struct.h" #include "core/types.h" #include "format/new_parquet/column_reader.h" #include "format/new_parquet/parquet_column_schema.h" #include "format/new_parquet/selection_vector.h" +#include "format/reader/file_reader.h" namespace doris::parquet { namespace { @@ -100,8 +104,7 @@ class ParquetColumnReaderTest : public testing::Test { } std::shared_ptr build_fixed_binary_array( - const std::shared_ptr& type, - const std::vector& values) { + const std::shared_ptr& type, const std::vector& values) { arrow::FixedSizeBinaryBuilder builder(type, arrow::default_memory_pool()); for (const auto& value : values) { EXPECT_TRUE(builder.Append(reinterpret_cast(value.data())).ok()); @@ -119,6 +122,28 @@ class ParquetColumnReaderTest : public testing::Test { return finish_array(&builder); } + std::shared_ptr build_required_struct_array() { + auto struct_type = arrow::struct_({arrow::field("a", arrow::int32(), false), + arrow::field("b", arrow::utf8(), false)}); + std::vector> field_builders; + auto a_array_builder = std::make_unique(); + field_builders.push_back(std::shared_ptr(std::move(a_array_builder))); + auto b_array_builder 
= std::make_unique(); + field_builders.push_back(std::shared_ptr(std::move(b_array_builder))); + arrow::StructBuilder builder(struct_type, arrow::default_memory_pool(), + std::move(field_builders)); + auto* a_builder = assert_cast(builder.field_builder(0)); + auto* b_builder = assert_cast(builder.field_builder(1)); + const std::vector a_values = {101, 102, 103, 104, 105}; + const std::vector b_values = {"sa", "sb", "sc", "sd", "se"}; + for (size_t row = 0; row < a_values.size(); ++row) { + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(a_builder->Append(a_values[row]).ok()); + EXPECT_TRUE(b_builder->Append(b_values[row]).ok()); + } + return finish_array(&builder); + } + std::shared_ptr build_time32_array(const std::shared_ptr& type, const std::vector& values) { arrow::Time32Builder builder(type, arrow::default_memory_pool()); @@ -138,8 +163,7 @@ class ParquetColumnReaderTest : public testing::Test { } std::shared_ptr build_timestamp_array( - const std::shared_ptr& type, - const std::vector& values) { + const std::shared_ptr& type, const std::vector& values) { arrow::TimestampBuilder builder(type, arrow::default_memory_pool()); for (const auto value : values) { EXPECT_TRUE(builder.Append(value).ok()); @@ -147,9 +171,8 @@ class ParquetColumnReaderTest : public testing::Test { return finish_array(&builder); } - std::shared_ptr build_decimal_array( - const std::shared_ptr& type, - const std::vector& values) { + std::shared_ptr build_decimal_array(const std::shared_ptr& type, + const std::vector& values) { arrow::Decimal128Builder builder(type, arrow::default_memory_pool()); for (const auto value : values) { EXPECT_TRUE(builder.Append(arrow::Decimal128(value)).ok()); @@ -165,16 +188,16 @@ class ParquetColumnReaderTest : public testing::Test { } void write_parquet_file() { - add_field(arrow::field("bool_col", arrow::boolean(), false), - build_required_array( - {true, false, true, false, true}), - [](const ParquetColumnSchema& schema, const IColumn& column) { - 
EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::BOOLEAN); - const auto& values = assert_cast(column); - EXPECT_EQ(values.get_element(0), 1); - EXPECT_EQ(values.get_element(1), 0); - EXPECT_EQ(values.get_element(4), 1); - }); + add_field( + arrow::field("bool_col", arrow::boolean(), false), + build_required_array({true, false, true, false, true}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::BOOLEAN); + const auto& values = assert_cast(column); + EXPECT_EQ(values.get_element(0), 1); + EXPECT_EQ(values.get_element(1), 0); + EXPECT_EQ(values.get_element(4), 1); + }); add_field(arrow::field("int32_col", arrow::int32(), false), build_required_array({10, 20, 30, 40, 50}), [](const ParquetColumnSchema& schema, const IColumn& column) { @@ -192,18 +215,17 @@ class ParquetColumnReaderTest : public testing::Test { EXPECT_EQ(values.get_element(0), 10000000000L); EXPECT_EQ(values.get_element(1), -9L); }); - add_field(arrow::field("float_col", arrow::float32(), false), - build_required_array( - {1.5F, -2.25F, 3.0F, 4.5F, 5.75F}), - [](const ParquetColumnSchema& schema, const IColumn& column) { - EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::FLOAT); - const auto& values = assert_cast(column); - EXPECT_FLOAT_EQ(values.get_element(0), 1.5F); - EXPECT_FLOAT_EQ(values.get_element(1), -2.25F); - }); + add_field( + arrow::field("float_col", arrow::float32(), false), + build_required_array({1.5F, -2.25F, 3.0F, 4.5F, 5.75F}), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::FLOAT); + const auto& values = assert_cast(column); + EXPECT_FLOAT_EQ(values.get_element(0), 1.5F); + EXPECT_FLOAT_EQ(values.get_element(1), -2.25F); + }); add_field(arrow::field("double_col", arrow::float64(), false), - build_required_array( - {3.5, -4.75, 6.0, 7.25, 8.5}), + build_required_array({3.5, -4.75, 6.0, 
7.25, 8.5}), [](const ParquetColumnSchema& schema, const IColumn& column) { EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::DOUBLE); const auto& values = assert_cast(column); @@ -263,31 +285,27 @@ class ParquetColumnReaderTest : public testing::Test { EXPECT_EQ(schema.type->to_string(column, 1), "00:00:01.000000"); EXPECT_EQ(schema.type->to_string(column, 2), "01:02:03.004567"); }); - add_field(arrow::field("timestamp_millis_col", - arrow::timestamp(arrow::TimeUnit::MILLI), false), + add_field(arrow::field("timestamp_millis_col", arrow::timestamp(arrow::TimeUnit::MILLI), + false), build_timestamp_array(arrow::timestamp(arrow::TimeUnit::MILLI), {0, 1234, 1609459200000, 1609459201000, -1}), [](const ParquetColumnSchema& schema, const IColumn& column) { EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::INT64); EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), TYPE_DATETIMEV2); - EXPECT_EQ(schema.type->to_string(column, 1), - "1970-01-01 00:00:01.234"); - EXPECT_EQ(schema.type->to_string(column, 4), - "1969-12-31 23:59:59.999"); + EXPECT_EQ(schema.type->to_string(column, 1), "1970-01-01 00:00:01.234"); + EXPECT_EQ(schema.type->to_string(column, 4), "1969-12-31 23:59:59.999"); }); - add_field(arrow::field("timestamp_micros_col", - arrow::timestamp(arrow::TimeUnit::MICRO), false), + add_field(arrow::field("timestamp_micros_col", arrow::timestamp(arrow::TimeUnit::MICRO), + false), build_timestamp_array(arrow::timestamp(arrow::TimeUnit::MICRO), {0, 1234567, 1609459200000000, 1609459201000000, -1}), [](const ParquetColumnSchema& schema, const IColumn& column) { EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::INT64); EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), TYPE_DATETIMEV2); - EXPECT_EQ(schema.type->to_string(column, 1), - "1970-01-01 00:00:01.234567"); - EXPECT_EQ(schema.type->to_string(column, 4), - "1969-12-31 23:59:59.999999"); + EXPECT_EQ(schema.type->to_string(column, 1), "1970-01-01 
00:00:01.234567"); + EXPECT_EQ(schema.type->to_string(column, 4), "1969-12-31 23:59:59.999999"); }); add_field(arrow::field("decimal_fixed_binary_9_2_col", arrow::decimal128(9, 2), false), build_decimal_array(arrow::decimal128(9, 2), {12345, -67, 0, 987, 1000}), @@ -295,8 +313,7 @@ class ParquetColumnReaderTest : public testing::Test { EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::FIXED_LEN_BYTE_ARRAY); EXPECT_TRUE(schema.type_descriptor.is_decimal); - EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), - TYPE_DECIMAL32); + EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), TYPE_DECIMAL32); const auto& values = assert_cast(column); EXPECT_EQ(values.get_element(0), Decimal32(12345)); EXPECT_EQ(schema.type->to_string(column, 0), "123.45"); @@ -308,8 +325,7 @@ class ParquetColumnReaderTest : public testing::Test { EXPECT_EQ(schema.type_descriptor.physical_type, ::parquet::Type::FIXED_LEN_BYTE_ARRAY); EXPECT_TRUE(schema.type_descriptor.is_decimal); - EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), - TYPE_DECIMAL64); + EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), TYPE_DECIMAL64); const auto& values = assert_cast(column); EXPECT_EQ(values.get_element(0), Decimal64(1234567)); EXPECT_EQ(schema.type->to_string(column, 0), "1.234567"); @@ -329,6 +345,26 @@ class ParquetColumnReaderTest : public testing::Test { EXPECT_EQ(nested_column.get_element(0), 1); EXPECT_EQ(nested_column.get_element(2), 3); }); + add_field(arrow::field("struct_col", + arrow::struct_({ + arrow::field("a", arrow::int32(), false), + arrow::field("b", arrow::utf8(), false), + }), + false), + build_required_struct_array(), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), TYPE_STRUCT); + const auto& struct_column = assert_cast(column); + ASSERT_EQ(struct_column.get_columns().size(), 2); + const auto& a_values = + assert_cast(struct_column.get_column(0)); + 
const auto& b_values = + assert_cast(struct_column.get_column(1)); + EXPECT_EQ(a_values.get_element(0), 101); + EXPECT_EQ(a_values.get_element(4), 105); + EXPECT_EQ(b_values.get_data_at(1).to_string(), "sb"); + EXPECT_EQ(b_values.get_data_at(4).to_string(), "se"); + }); auto schema = arrow::schema(_arrow_fields); auto table = arrow::Table::Make(schema, _arrays); @@ -341,9 +377,8 @@ class ParquetColumnReaderTest : public testing::Test { builder.version(::parquet::ParquetVersion::PARQUET_2_6); builder.data_page_version(::parquet::ParquetDataPageVersion::V2); builder.compression(::parquet::Compression::UNCOMPRESSED); - PARQUET_THROW_NOT_OK( - ::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, ROW_COUNT, - builder.build())); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, + ROW_COUNT, builder.build())); } std::unique_ptr create_reader(size_t field_idx) const { @@ -378,6 +413,9 @@ class ParquetColumnReaderTest : public testing::Test { TEST_F(ParquetColumnReaderTest, ReadAllSupportedPhysicalAndLogicalTypes) { for (size_t field_idx = 0; field_idx < _fields.size(); ++field_idx) { SCOPED_TRACE(_fields[field_idx]->name); + if (_fields[field_idx]->kind != ParquetColumnSchemaKind::PRIMITIVE) { + continue; + } ASSERT_TRUE(supports_record_reader(_fields[field_idx]->type_descriptor)); read_and_validate(field_idx); } @@ -418,10 +456,64 @@ TEST_F(ParquetColumnReaderTest, SelectReadsOnlySelectedRanges) { EXPECT_EQ(int_values.get_element(2), 50); } +TEST_F(ParquetColumnReaderTest, ReadProjectedStructChildren) { + const auto field_idx = _fields.size() - 1; + const auto& struct_schema = *_fields[field_idx]; + ASSERT_EQ(struct_schema.name, "struct_col"); + ASSERT_EQ(struct_schema.children.size(), 2); + + reader::FieldProjection projection; + projection.file_column_id = struct_schema.top_level_field_id; + projection.file_path = struct_schema.file_path; + projection.project_all_children = false; + reader::FieldProjection 
child_projection; + child_projection.file_column_id = struct_schema.top_level_field_id; + child_projection.file_path = struct_schema.children[1]->file_path; + projection.children.push_back(std::move(child_projection)); + + ParquetColumnReaderFactory factory(_row_group, _file_reader->metadata()->num_columns()); + std::unique_ptr reader; + auto st = factory.create(struct_schema, &projection, &reader); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(remove_nullable(reader->type())->get_primitive_type(), TYPE_STRUCT); + const auto* projected_type = + assert_cast(remove_nullable(reader->type()).get()); + ASSERT_EQ(projected_type->get_elements().size(), 1); + EXPECT_EQ(projected_type->get_element_name(0), "b"); + + MutableColumnPtr column = reader->type()->create_column(); + int64_t rows_read = 0; + st = reader->read(ROW_COUNT, column, &rows_read); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(rows_read, ROW_COUNT); + const auto& struct_column = assert_cast(*column); + ASSERT_EQ(struct_column.get_columns().size(), 1); + const auto& values = assert_cast(struct_column.get_column(0)); + EXPECT_EQ(values.get_data_at(0).to_string(), "sa"); + EXPECT_EQ(values.get_data_at(4).to_string(), "se"); +} + +TEST_F(ParquetColumnReaderTest, BuildComplexSchemaPathMetadata) { + const auto field_idx = _fields.size() - 1; + const auto& struct_schema = *_fields[field_idx]; + ASSERT_EQ(struct_schema.name, "struct_col"); + ASSERT_EQ(struct_schema.children.size(), 2); + EXPECT_EQ(struct_schema.file_path, std::vector({static_cast(field_idx)})); + EXPECT_EQ(struct_schema.name_path, std::vector({"struct_col"})); + EXPECT_EQ(struct_schema.children[0]->file_path, + std::vector({static_cast(field_idx), 0})); + EXPECT_EQ(struct_schema.children[1]->file_path, + std::vector({static_cast(field_idx), 1})); + EXPECT_EQ(struct_schema.children[0]->name_path, std::vector({"struct_col", "a"})); + EXPECT_EQ(struct_schema.children[1]->name_path, std::vector({"struct_col", "b"})); + 
EXPECT_EQ(struct_schema.max_definition_level, 0); + EXPECT_EQ(struct_schema.max_repetition_level, 0); +} + TEST_F(ParquetColumnReaderTest, ResolveSupportedPhysicalAndLogicalSchemas) { std::vector<::parquet::schema::NodePtr> nodes = { - ::parquet::schema::PrimitiveNode::Make( - "required_bool", ::parquet::Repetition::REQUIRED, ::parquet::Type::BOOLEAN), + ::parquet::schema::PrimitiveNode::Make("required_bool", ::parquet::Repetition::REQUIRED, + ::parquet::Type::BOOLEAN), ::parquet::schema::PrimitiveNode::Make( "required_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32), ::parquet::schema::PrimitiveNode::Make( @@ -430,41 +522,42 @@ TEST_F(ParquetColumnReaderTest, ResolveSupportedPhysicalAndLogicalSchemas) { "required_float", ::parquet::Repetition::REQUIRED, ::parquet::Type::FLOAT), ::parquet::schema::PrimitiveNode::Make( "required_double", ::parquet::Repetition::REQUIRED, ::parquet::Type::DOUBLE), - ::parquet::schema::PrimitiveNode::Make( - "required_binary", ::parquet::Repetition::REQUIRED, ::parquet::Type::BYTE_ARRAY), + ::parquet::schema::PrimitiveNode::Make("required_binary", + ::parquet::Repetition::REQUIRED, + ::parquet::Type::BYTE_ARRAY), ::parquet::schema::PrimitiveNode::Make( "required_fixed_binary", ::parquet::Repetition::REQUIRED, ::parquet::Type::FIXED_LEN_BYTE_ARRAY, ::parquet::ConvertedType::NONE, 4), ::parquet::schema::PrimitiveNode::Make( "optional_int32", ::parquet::Repetition::OPTIONAL, ::parquet::Type::INT32), - ::parquet::schema::PrimitiveNode::Make( - "utf8_binary", ::parquet::Repetition::REQUIRED, ::parquet::Type::BYTE_ARRAY, - ::parquet::ConvertedType::UTF8), - ::parquet::schema::PrimitiveNode::Make( - "enum_binary", ::parquet::Repetition::REQUIRED, ::parquet::Type::BYTE_ARRAY, - ::parquet::ConvertedType::ENUM), - ::parquet::schema::PrimitiveNode::Make( - "json_binary", ::parquet::Repetition::REQUIRED, ::parquet::Type::BYTE_ARRAY, - ::parquet::ConvertedType::JSON), - ::parquet::schema::PrimitiveNode::Make( - "bson_binary", 
::parquet::Repetition::REQUIRED, ::parquet::Type::BYTE_ARRAY, - ::parquet::ConvertedType::BSON), - ::parquet::schema::PrimitiveNode::Make( - "decimal_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32, - ::parquet::ConvertedType::DECIMAL, -1, 9, 2), - ::parquet::schema::PrimitiveNode::Make( - "decimal_int64", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT64, - ::parquet::ConvertedType::DECIMAL, -1, 18, 6), + ::parquet::schema::PrimitiveNode::Make("utf8_binary", ::parquet::Repetition::REQUIRED, + ::parquet::Type::BYTE_ARRAY, + ::parquet::ConvertedType::UTF8), + ::parquet::schema::PrimitiveNode::Make("enum_binary", ::parquet::Repetition::REQUIRED, + ::parquet::Type::BYTE_ARRAY, + ::parquet::ConvertedType::ENUM), + ::parquet::schema::PrimitiveNode::Make("json_binary", ::parquet::Repetition::REQUIRED, + ::parquet::Type::BYTE_ARRAY, + ::parquet::ConvertedType::JSON), + ::parquet::schema::PrimitiveNode::Make("bson_binary", ::parquet::Repetition::REQUIRED, + ::parquet::Type::BYTE_ARRAY, + ::parquet::ConvertedType::BSON), + ::parquet::schema::PrimitiveNode::Make("decimal_int32", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT32, + ::parquet::ConvertedType::DECIMAL, -1, 9, 2), + ::parquet::schema::PrimitiveNode::Make("decimal_int64", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT64, + ::parquet::ConvertedType::DECIMAL, -1, 18, 6), ::parquet::schema::PrimitiveNode::Make( "decimal_binary", ::parquet::Repetition::REQUIRED, ::parquet::Type::BYTE_ARRAY, ::parquet::ConvertedType::DECIMAL, -1, 18, 6), - ::parquet::schema::PrimitiveNode::Make( - "decimal_fixed_binary", ::parquet::Repetition::REQUIRED, - ::parquet::Type::FIXED_LEN_BYTE_ARRAY, ::parquet::ConvertedType::DECIMAL, 8, - 18, 6), - ::parquet::schema::PrimitiveNode::Make( - "date_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32, - ::parquet::ConvertedType::DATE), + ::parquet::schema::PrimitiveNode::Make("decimal_fixed_binary", + ::parquet::Repetition::REQUIRED, + 
::parquet::Type::FIXED_LEN_BYTE_ARRAY, + ::parquet::ConvertedType::DECIMAL, 8, 18, 6), + ::parquet::schema::PrimitiveNode::Make("date_int32", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT32, + ::parquet::ConvertedType::DATE), ::parquet::schema::PrimitiveNode::Make( "time_millis_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32, ::parquet::ConvertedType::TIME_MILLIS), @@ -477,27 +570,27 @@ TEST_F(ParquetColumnReaderTest, ResolveSupportedPhysicalAndLogicalSchemas) { ::parquet::schema::PrimitiveNode::Make( "timestamp_micros_int64", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT64, ::parquet::ConvertedType::TIMESTAMP_MICROS), - ::parquet::schema::PrimitiveNode::Make( - "int8_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32, - ::parquet::ConvertedType::INT_8), - ::parquet::schema::PrimitiveNode::Make( - "uint8_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32, - ::parquet::ConvertedType::UINT_8), - ::parquet::schema::PrimitiveNode::Make( - "int16_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32, - ::parquet::ConvertedType::INT_16), - ::parquet::schema::PrimitiveNode::Make( - "uint16_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32, - ::parquet::ConvertedType::UINT_16), - ::parquet::schema::PrimitiveNode::Make( - "int32_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32, - ::parquet::ConvertedType::INT_32), - ::parquet::schema::PrimitiveNode::Make( - "uint32_int32", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT32, - ::parquet::ConvertedType::UINT_32), - ::parquet::schema::PrimitiveNode::Make( - "int64_int64", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT64, - ::parquet::ConvertedType::INT_64), + ::parquet::schema::PrimitiveNode::Make("int8_int32", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT32, + ::parquet::ConvertedType::INT_8), + ::parquet::schema::PrimitiveNode::Make("uint8_int32", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT32, 
+ ::parquet::ConvertedType::UINT_8), + ::parquet::schema::PrimitiveNode::Make("int16_int32", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT32, + ::parquet::ConvertedType::INT_16), + ::parquet::schema::PrimitiveNode::Make("uint16_int32", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT32, + ::parquet::ConvertedType::UINT_16), + ::parquet::schema::PrimitiveNode::Make("int32_int32", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT32, + ::parquet::ConvertedType::INT_32), + ::parquet::schema::PrimitiveNode::Make("uint32_int32", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT32, + ::parquet::ConvertedType::UINT_32), + ::parquet::schema::PrimitiveNode::Make("int64_int64", ::parquet::Repetition::REQUIRED, + ::parquet::Type::INT64, + ::parquet::ConvertedType::INT_64), }; auto schema = @@ -523,13 +616,13 @@ TEST_F(ParquetColumnReaderTest, RejectUnsupportedPhysicalAndLogicalTypes) { { ::parquet::schema::PrimitiveNode::Make( "int96_col", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT96), - ::parquet::schema::PrimitiveNode::Make( - "repeated_int32_col", ::parquet::Repetition::REPEATED, - ::parquet::Type::INT32), + ::parquet::schema::PrimitiveNode::Make("repeated_int32_col", + ::parquet::Repetition::REPEATED, + ::parquet::Type::INT32), ::parquet::schema::PrimitiveNode::Make( "decimal256_fixed_col", ::parquet::Repetition::REQUIRED, - ::parquet::Type::FIXED_LEN_BYTE_ARRAY, ::parquet::ConvertedType::DECIMAL, - 20, 39, 6), + ::parquet::Type::FIXED_LEN_BYTE_ARRAY, + ::parquet::ConvertedType::DECIMAL, 20, 39, 6), ::parquet::schema::PrimitiveNode::Make( "uint64_col", ::parquet::Repetition::REQUIRED, ::parquet::Type::INT64, ::parquet::ConvertedType::UINT_64), diff --git a/be/test/format/new_parquet/parquet_reader_test.cpp b/be/test/format/new_parquet/parquet_reader_test.cpp index 7e28b7fce5b25a..5341b4060b5a8f 100644 --- a/be/test/format/new_parquet/parquet_reader_test.cpp +++ b/be/test/format/new_parquet/parquet_reader_test.cpp @@ -15,6 +15,8 @@ // 
specific language governing permissions and limitations // under the License. +#include "format/new_parquet/parquet_reader.h" + #include #include #include @@ -24,6 +26,7 @@ #include #include #include +#include #include #include "core/assert_cast.h" @@ -31,11 +34,13 @@ #include "core/column/column_string.h" #include "core/column/column_vector.h" #include "core/data_type/data_type_number.h" +#include "core/data_type/data_type_string.h" +#include "core/data_type/data_type_struct.h" #include "core/data_type/primitive_type.h" #include "core/field.h" #include "exprs/vexpr.h" #include "exprs/vexpr_context.h" -#include "format/new_parquet/parquet_reader.h" +#include "format/reader/column_mapper.h" #include "format/reader/file_reader.h" #include "gen_cpp/Types_types.h" #include "io/io_common.h" @@ -78,7 +83,8 @@ class Int32GreaterThanExpr final : public VExpr { }; VExprContextSPtr create_int32_greater_than_conjunct(int column_id, int32_t value) { - auto ctx = VExprContext::create_shared(std::make_shared(column_id, value)); + auto ctx = + VExprContext::create_shared(std::make_shared(column_id, value)); ctx->_prepared = true; ctx->_opened = true; return ctx; @@ -111,9 +117,9 @@ void write_parquet_file(const std::string& file_path, int64_t row_group_size = R arrow::field("id", arrow::int32(), false), arrow::field("value", arrow::utf8(), false), }); - auto table = arrow::Table::Make( - schema, {build_int32_array({1, 2, 3, 4, 5}), - build_string_array({"one", "two", "three", "four", "five"})}); + auto table = arrow::Table::Make(schema, + {build_int32_array({1, 2, 3, 4, 5}), + build_string_array({"one", "two", "three", "four", "five"})}); auto file_result = arrow::io::FileOutputStream::Open(file_path); ASSERT_TRUE(file_result.ok()) << file_result.status(); @@ -123,8 +129,8 @@ void write_parquet_file(const std::string& file_path, int64_t row_group_size = R builder.version(::parquet::ParquetVersion::PARQUET_2_6); builder.data_page_version(::parquet::ParquetDataPageVersion::V2); 
builder.compression(::parquet::Compression::UNCOMPRESSED); - PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable( - *table, arrow::default_memory_pool(), out, row_group_size, builder.build())); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, + row_group_size, builder.build())); } Block build_file_block(const std::vector& schema) { @@ -175,6 +181,57 @@ TEST(FileReaderTest, OpenStoresRequestAndCloseClearsState) { EXPECT_TRUE(reader.eof()); } +TEST(TableColumnMapperTest, CreatesComplexProjectionForStructChildren) { + reader::SchemaField struct_field; + struct_field.id = 0; + struct_field.name = "s"; + struct_field.file_path = {0}; + reader::SchemaField a_field; + a_field.id = 0; + a_field.name = "a"; + a_field.type = std::make_shared(); + a_field.file_path = {0, 0}; + reader::SchemaField b_field; + b_field.id = 0; + b_field.name = "b"; + b_field.type = std::make_shared(); + b_field.file_path = {0, 1}; + struct_field.children = {a_field, b_field}; + struct_field.type = std::make_shared(DataTypes {a_field.type, b_field.type}, + Strings {"a", "b"}); + + reader::TableColumn table_child; + table_child.id = 101; + table_child.name = "b"; + table_child.type = b_field.type; + reader::TableColumn table_column; + table_column.id = 100; + table_column.name = "s"; + table_column.type = std::make_shared(DataTypes {b_field.type}, Strings {"b"}); + table_column.children = {table_child}; + + reader::TableColumnMapperOptions options; + options.mode = reader::TableColumnMappingMode::BY_NAME; + reader::TableColumnMapper mapper(options); + ASSERT_TRUE(mapper.create_mapping({table_column}, {}, {struct_field}).ok()); + + auto request = std::make_unique(); + ASSERT_TRUE(mapper.create_scan_request({}, {table_column}, request.get()).ok()); + ASSERT_EQ(request->non_predicate_columns, std::vector({0})); + ASSERT_EQ(request->complex_projections.size(), 1); + const auto& projection = request->complex_projections.at(0); + EXPECT_EQ(projection.file_path, 
std::vector({0})); + ASSERT_FALSE(projection.project_all_children); + ASSERT_EQ(projection.children.size(), 1); + EXPECT_EQ(projection.children[0].file_path, std::vector({0, 1})); + + ASSERT_EQ(mapper.mappings().size(), 1); + const auto* projected_type = + assert_cast(mapper.mappings()[0].file_type.get()); + ASSERT_EQ(projected_type->get_elements().size(), 1); + EXPECT_EQ(projected_type->get_element_name(0), "b"); +} + class NewParquetReaderTest : public testing::Test { protected: void SetUp() override { diff --git a/docs/doris-arrow-parquet-complex-types-implementation.md b/docs/doris-arrow-parquet-complex-types-implementation.md new file mode 100644 index 00000000000000..1ee0dabc944fe7 --- /dev/null +++ b/docs/doris-arrow-parquet-complex-types-implementation.md @@ -0,0 +1,559 @@ +# Doris Arrow Parquet Reader 复杂类型完整支持方案 + +本文档描述 `be/src/format/new_parquet/` 新 Parquet reader 对 `STRUCT`、`LIST`、`MAP` 复杂类型的完整支持方案。 + +目标是在现有 file-local reader 边界内补齐复杂类型读取能力: + +- 继续复用 Arrow C++ Parquet core API 解析文件、row group、column chunk 和 leaf value。 +- 输出仍然是 Doris `Block` / `Column`,不引入 `parquet::arrow::FileReader`、`arrow::RecordBatch` 或 `arrow::Table` 作为 scan 输出路径。 +- `ParquetReader` 仍只理解 Parquet file-local schema,不处理 Iceberg/global schema evolution。 +- schema change、default/generated/partition column、delete、virtual column 仍由 `TableReader` / `TableColumnMapper` 负责。 +- 复杂类型读取必须以 Parquet definition level / repetition level 为准,不能依赖简单 row count 拼接。 +- 复杂类型列裁剪是本轮实现目标:读取 top-level complex column 时,只读取被请求的 child subtree。 +- 复杂类型 schema change 不在本轮实现,但本轮设计必须保留 field id、path、level 和 projection tree 边界,保证后续可以在 `TableColumnMapper` 中补齐 child-level mapping。 + +## 参考实现:DuckDB Parquet Reader + +参考目录: + +```text +/Users/xiaogangsu/code/duckdb/extension/parquet/ +``` + +重点参考文件: + +```text +extension/parquet/include/parquet_column_schema.hpp +extension/parquet/include/column_reader.hpp +extension/parquet/parquet_reader.cpp +extension/parquet/column_reader.cpp 
+extension/parquet/reader/struct_column_reader.cpp +extension/parquet/reader/list_column_reader.cpp +``` + +DuckDB 中值得借鉴的核心结构: + +- `ParquetColumnSchema` 保存 `max_define`、`max_repeat`、`schema_index`、`column_index`,schema tree 本身携带 Dremel level 信息。 +- `ParseSchemaRecursive()` 在解析 schema 时递增 definition/repetition level,并把 legacy repeated field、3-level LIST、MAP/MAP_KEY_VALUE 统一成 reader 可消费的 schema tree。 +- primitive reader 读取 leaf value 的同时输出 definition/repetition level。 +- struct reader 递归读取 children,并用 child 输出的 definition level 设置 struct null。 +- list/map reader 不直接按 row 数读取 child;它读取 child leaf stream,根据当前 list/map 层的 repetition level 折叠出 parent rows、offsets 和 null map。 +- skip/select 是 reader 级语义,不是 column filter fallback;复杂类型 skip 也必须消费对应的 level stream,保证所有 child reader 游标一致。 + +Doris 不需要照搬 DuckDB 的 thrift/page decoder;当前方案仍优先封装 Arrow internal `RecordReader`。但 DuckDB 的 reader 分层和 level 组装模型应作为 Doris 复杂类型支持的主参考。 + +## 当前 Doris 状态 + +现有文件: + +```text +be/src/format/new_parquet/parquet_reader.* +be/src/format/new_parquet/column_reader.* +be/src/format/new_parquet/parquet_column_schema.* +be/src/format/new_parquet/parquet_type.* +be/src/format/new_parquet/selection_vector.h +``` + +已有能力: + +- schema builder 可以识别 `STRUCT`、`LIST`、`MAP`,并生成 `DataTypeStruct`、`DataTypeArray`、`DataTypeMap`。 +- `ScalarColumnReader` 支持 flat primitive/string/decimal/date/time/timestamp。 +- `StructColumnReader` 递归读取 children,支持非常基础的非 nullable struct。 +- `ColumnReader::select()` 已经定义为 `skip + read` 的 selected read,不退化为整批读取后过滤。 + +主要缺口: + +- `ParquetColumnSchema` 没有保存完整 `max_definition_level` / `max_repetition_level` 和各复杂节点的 level 边界。 +- `ScalarColumnReader` 当前只支持 `max_repetition_level == 0 && max_definition_level <= 1`。 +- primitive reader 没有向 parent reader 暴露 leaf definition/repetition level stream。 +- nullable struct、list、map 没有 assembler。 +- repeated primitive、legacy repeated group、嵌套 list/map/struct 没有统一 schema 规约。 +- `skip(rows)` 对复杂类型还不是 parent-row 语义。 + +## 总体设计 + +复杂类型读取分为两层: + +```text 
+ParquetReader + -> ParquetColumnReader public API + read(parent_rows, output_column, rows_read) + skip(parent_rows) + select(selection, selected_rows, batch_rows, output_column) + -> Nested read API + read_nested(parent_rows, level_state, output_column, rows_read) + skip_nested(parent_rows, level_state) + -> Leaf RecordReader adapter + read leaf values + definition levels + repetition levels + -> Dremel assembler + Struct / List / Map build Doris columns +``` + +对 `ParquetReader` 来说,接口仍然是 top-level file-local row batch;复杂类型细节只存在于 `column_reader.*` 内部。 + +### 关键原则 + +- public `read(rows)` 和 `skip(rows)` 的 `rows` 始终表示当前 reader 对外暴露的 parent rows。 +- leaf reader 内部可以读取更多 physical records,但不能把 physical value count 泄露给 `ParquetReader`。 +- list/map 的 offsets 只能由 repetition level 生成,不能用 child column size 推断。 +- nullable 信息只能由 definition level 生成,不能通过 value 缺失猜测。 +- 所有复杂类型 reader 必须保持 child reader 游标严格同步;遇到不一致 level stream 应返回 `Corruption`。 +- 复杂类型 reader 不处理 table/global schema change;child-level schema evolution 后续在 `TableColumnMapper` 处理。 +- 复杂类型 reader 必须支持 file-local child projection。未投影 child 不创建 leaf reader,不读取对应 column chunk,不参与 value materialization。 +- 即使 child 被裁剪,也必须保留足够的 schema/path/level 元数据,使后续 schema change 可以把 table child 映射到 file child、default child 或 cast projection。 + +## Schema 扩展 + +扩展 `ParquetColumnSchema`: + +```text +struct ParquetColumnSchema { + int field_id; + int top_level_field_id; + int leaf_column_id; + int schema_node_id; + int parent_schema_node_id; + std::vector file_path; + std::vector field_id_path; + std::vector name_path; + std::string name; + DataTypePtr type; + ParquetColumnSchemaKind kind; + const parquet::schema::Node* node; + const parquet::ColumnDescriptor* descriptor; + ParquetTypeDescriptor type_descriptor; + int16_t max_definition_level; + int16_t max_repetition_level; + int16_t nullable_definition_level; + int16_t repeated_repetition_level; + std::vector> children; +}; +``` + +字段含义: + +- `schema_node_id`:Parquet schema tree 
中的 node ordinal,用于 debug、error message、field id tracing。 +- `top_level_field_id`:FileScanRequest 使用的 file-local top-level id。 +- `leaf_column_id`:Parquet physical leaf column ordinal。复杂节点为 `-1`。 +- `file_path`:从 top-level field 到当前节点的 file-local child ordinal path,例如 `profile.address.city` 可以表示为 `[3, 0, 1]`。 +- `field_id_path`:从 top-level field 到当前节点的 Parquet field id path。缺失 field id 时使用 `-1` 占位,不在 file reader 层解释 Iceberg 语义。 +- `name_path`:从 top-level field 到当前节点的 Parquet node name path,用于 by-name fallback、error message 和后续 schema change。 +- `max_definition_level` / `max_repetition_level`:该节点下 leaf stream 的最大 level。复杂节点取其 subtree leaf 的约束值。 +- `nullable_definition_level`:该节点自身从 null 变成 defined 所需的 definition level。required 节点为 parent level,不额外增加。 +- `repeated_repetition_level`:该 repeated/list/map 层对应的 repetition level。非 repeated 节点为 parent level。 + +Schema builder 改造: + +- 从 root 递归解析,每进入 optional 节点 `definition_level + 1`。 +- 每进入 repeated 节点 `definition_level + 1` 且 `repetition_level + 1`。 +- 识别标准 3-level LIST: + +```text +optional group a (LIST) { + repeated group list { + optional element; + } +} +``` + +- 识别 legacy repeated primitive/group: + +```text +repeated int32 a; +repeated group a { ... 
} +``` + +并规约为 Doris `Array(element_type)`。 + +- 识别 MAP/MAP_KEY_VALUE: + +```text +optional group m (MAP) { + repeated group key_value { + required key_type key; + optional value_type value; + } +} +``` + +并规约为 Doris `Map(key_type, value_type)`。 + +- MAP key 按 Parquet 规范应为 required。若文件声明 nullable key,应在 schema 阶段返回 `NotSupported` 或 `Corruption`,不生成可继续执行的 reader。 + +## 复杂类型列裁剪 + +复杂类型列裁剪应在 file-local 层实现,语义是“只读取投影需要的 child subtree”,不是 table schema evolution。 + +建议扩展 `reader::FileScanRequest`,增加嵌套 projection tree: + +```text +struct FieldProjection { + ColumnId file_column_id; + std::vector file_path; + bool project_all_children; + std::vector<FieldProjection> children; +}; + +struct FileScanRequest { + std::vector<ColumnId> predicate_columns; + std::vector<ColumnId> non_predicate_columns; + std::map column_positions; + std::map<ColumnId, FieldProjection> complex_projections; + ... +}; +``` + +约束: + +- `predicate_columns` / `non_predicate_columns` 仍表示 top-level file-local fields。 +- `complex_projections` 只描述 top-level complex field 内部需要读取哪些 child。 +- 没有出现在 `complex_projections` 的 top-level complex field 默认 `project_all_children = true`,保持兼容。 +- 对 `STRUCT`,允许只投影部分 children,输出 `DataTypeStruct` 只包含被投影 children,child 顺序保持 file schema 顺序。 +- 对 `LIST`,允许裁剪 element subtree。例如 `Array(Struct<a, b, c>)` 投影 `a,c` 时,输出 `Array(Struct<a, c>)`。 +- 对 `MAP`,key 永远需要读取并输出;value subtree 可以裁剪。例如 `Map<K, Struct<a, b>>` 投影 value.a 时,输出 `Map<K, Struct<a>>`。 +- 对 nullable parent,parent null map 和 offsets 必须完整生成;裁剪只影响 child value materialization,不能影响 parent row shape。 +- 对所有 children 都被裁剪的 `STRUCT`,仍要能够根据某个保留的 level-driving child 生成 parent row/null 形态。第一版可以要求至少保留一个 child;如果上层真的只需要 parent 存在性,后续补充 `NullShapeColumnReader`。 + +`ParquetColumnReaderFactory` 应接收 projection tree: + +```text +Status create(const ParquetColumnSchema& column_schema, + const FieldProjection* projection, + std::unique_ptr* reader) const; +``` + +实现要求: + +- factory 只为投影中的 leaf 创建 `ScalarColumnReader`。 +- struct/list/map reader 保存 child reader slot;未投影 child 用 `nullptr` 表示,参考 DuckDB `StructColumnReader` 的 child reader 布局。 +- 
`TotalCompressedSize`、prefetch、statistics 等后续能力只能统计已投影 leaf。 +- 对 top-level output block,`TableReader` 需要使用 projection 后的 `SchemaField` / `DataTypePtr` 构建 block template,而不是原始完整 file schema。 + +列裁剪与延时物化的关系: + +- predicate complex child projection 和 output complex child projection 需要合并,避免同一 leaf 重复读取。 +- 如果 predicate 只依赖 complex child,FileScanRequest 应能表达该 child path 是 predicate projection。 +- 本轮可以先支持 output child pruning;predicate child pruning 可在 batch 内 complex predicate 接入时补齐,但 projection tree 的结构必须现在预留。 + +## Schema Change 兼容边界 + +复杂类型 schema change 不在本轮实现,原因是它涉及 table/global schema、Iceberg field id、default value、cast、generated column 和 filter fallback,属于 `TableColumnMapper` / `TableReader` 范围。 + +但本轮实现必须保证后续可扩展: + +- file schema 中每个 node 都必须导出 `file_path`、`field_id_path`、`name_path`、file-local type 和 child schema。 +- reader 内部不得把 `SchemaField::id` 同时当作 Iceberg field id 和 file-local column id。top-level scan id 只表示 file-local top-level ordinal。 +- `TableColumnMapper` 后续可以根据 table child field id/name path 生成 `FieldProjection`,也可以为缺失 child 生成 default/constant/finalize projection。 +- file reader 输出的 pruned complex type 是 file-local projected type;table reader 负责把它 finalize 成 table/global type。 +- filter localization 后续可以定位到 complex child path。无法安全定位或需要 cast 的 filter 进入 `reader_expression_map` 或 table-level finalize filter。 +- 不在 `ParquetReader` 中补缺失 child,不在 `ParquetReader` 中做 child cast,不在 `ParquetReader` 中解释 Iceberg field id。 + +后续 schema change 的目标形态: + +```text +table projection/filter + -> TableColumnMapper child-level mapping + -> FieldProjection(file-local child paths) + -> ParquetReader reads projected file-local complex block + -> TableReader fills default/generated/partition children + -> TableReader applies child cast/finalize/delete/virtual semantics +``` + +因此,本轮列裁剪实现时不能把 output type 和 original file type 强绑定。所有 `ColumnReader` 创建和 block template 构造都应基于 projected schema view。 + +## Level 读取抽象 + +新增内部结构,位置建议: + +```text +be/src/format/new_parquet/level.h 
+be/src/format/new_parquet/level.cpp +``` + +核心结构: + +```text +struct LevelBatch { + int64_t record_count; + int64_t value_count; + std::vector<int16_t> definition_levels; + std::vector<int16_t> repetition_levels; +}; + +struct NestedReadResult { + int64_t parent_rows; + int64_t physical_records; +}; +``` + +`ScalarColumnReader` 内部新增 leaf read 路径: + +```text +read_leaf_records(max_records, decoded_values, level_batch) +skip_leaf_records(max_records, level_batch) +``` + +要求: + +- Arrow internal `RecordReader` 的创建和调用继续封装在 `column_reader.*`,不能泄露到 `ParquetReader`。 +- flat primitive 保持当前 `read()` 快路径。 +- nested primitive 必须允许 `max_repetition_level > 0` 或 `max_definition_level > 1`,并输出 definition/repetition levels。 +- `DecodedColumnView::row_count` 对 nested leaf 应表示 value slots 数量,null slot 由 definition level 决定。 + +如果 Arrow internal `RecordReader` 无法稳定提供 Doris 需要的 level/value 对齐语义,则新增 Doris 自己的 leaf page decoder,范围仍限制在 `format/new_parquet/`,不要把 page decoder 细节扩散到 `ParquetReader` 主流程。 + +## Reader 分层 + +建议拆分 `column_reader.cpp`,避免复杂类型 assembler 混在 scalar 读值热路径: + +```text +be/src/format/new_parquet/column_reader.h +be/src/format/new_parquet/column_reader.cpp +be/src/format/new_parquet/scalar_column_reader.cpp +be/src/format/new_parquet/struct_column_reader.cpp +be/src/format/new_parquet/list_column_reader.cpp +be/src/format/new_parquet/map_column_reader.cpp +be/src/format/new_parquet/level.h +be/src/format/new_parquet/level.cpp +``` + +### ScalarColumnReader + +职责: + +- 读取 primitive leaf values。 +- 生成 leaf-level definition/repetition level。 +- 对 flat column 直接写 Doris scalar/nullable column。 +- 对 nested leaf 只作为 child reader 被复杂类型 assembler 调用。 + +flat path: + +```text +read(rows) + -> RecordReader::ReadRecords(rows) + -> DecodedColumnView + -> DataTypeSerDe::read_column_from_decoded_values +``` + +nested path: + +```text +read_nested(parent_rows, level_state) + -> read leaf records until parent_rows complete + -> append valid leaf values into child column + -> expose level_batch to parent 
assembler +``` + +### StructColumnReader + +输出: + +- non-nullable struct:`ColumnStruct`。 +- nullable struct:`ColumnNullable(ColumnStruct, null_map)`。 + +算法: + +1. 对每个 child reader 读取同样的 parent row count。 +2. child reader 返回的 parent rows 必须一致。 +3. struct 自身 nullable 时,根据 definition level 判断 struct row 是否 null。 +4. 对 null struct row,每个 child column 仍必须补一个 default/null slot,保证 `ColumnStruct` 所有 child size 等于 struct row count。 +5. child 本身的 null 由 child reader 自己根据更深层 definition level 处理。 + +注意: + +- 当前实现仅递归读取 children,没有处理 nullable struct;应改为显式处理 struct-level null map。 +- 对未投影 children 不创建 reader、不写入 output `ColumnStruct`。 +- 对所有 children 都未投影的 struct,第一版可以返回 `NotSupported`,后续用 shape-only reader 支持 parent 存在性读取。 + +### ListColumnReader + +输出: + +- non-nullable array:`ColumnArray(element_column, offsets)`。 +- nullable array:`ColumnNullable(ColumnArray, null_map)`。 + +核心算法参考 DuckDB list reader: + +1. 从 child reader 读取 leaf stream,获得 child values、definition levels、repetition levels。 +2. 根据当前 list 层的 `repeated_repetition_level` 判断一个 child record 是否属于当前 list: + - `rep == list_repetition_level`:当前 list 的后续 element。 + - `rep < list_repetition_level`:新的 parent row 开始。 +3. 根据 definition level 判断 parent row 状态: + - `def < list_defined_level`:null list。 + - `def == empty_list_level`:empty list。 + - `def >= element_defined_level`:有 element。 +4. 对每个 parent row 写一个 offset。 +5. 
只有 element defined 时向 element column append value;empty/null list 不 append element。 + +需要维护 overflow: + +- child reader 一次读取可能跨过本次 `parent_rows` 的边界。 +- list reader 必须缓存未消费的 child values 和 levels,下一次 `read()` 继续使用。 +- 该缓存是 reader 游标状态的一部分,`skip()` 和 `read()` 都必须共享。 + +### MapColumnReader + +输出: + +- non-nullable map:`ColumnMap(key_column, value_column, offsets)`。 +- nullable map:`ColumnNullable(ColumnMap, null_map)`。 + +实现方式: + +- 按 Parquet schema 将 map 规约为 `LIST<STRUCT<key, value>>` 的 level stream。 +- 复用 list assembler 的 parent row 边界判断。 +- 对每个 entry: + - key 必须 defined;key 缺失是文件格式错误。 + - value 可 nullable;由 value child definition level 生成 value null map。 +- append entry 时分别写 key column 和 value column。 +- offsets 表示每个 map row 的 entry 数。 + +不要把 `MAP` 先 materialize 成 `Array(Struct(key,value))` 再转换为 `ColumnMap`,否则会产生额外内存和拷贝。可以在内部复用 list 的边界识别逻辑,但直接写 `ColumnMap` 的 keys/values/offsets。 + +## Skip 和 Select + +public 语义保持不变: + +```text +skip(parent_rows) +select(selection, selected_rows, batch_rows, column) +``` + +复杂类型要求: + +- `skip()` 必须消费 parent rows 对应的所有 child physical records 和 level stream。 +- `select()` 继续使用现有 range 合并策略,即按 selected row ranges 调用 `skip()` + `read()`。 +- list/map 的 `skip()` 不能只跳过 child value count;必须按 repetition level 找到 parent row 边界。 +- empty selection 时必须跳过整个 batch 的 parent rows,保证 reader 游标推进。 + +第一阶段不实现 page-level row range selection;只保证 `skip + read` 的 selected read 正确。 + +## 与 ParquetReader Scan Loop 的关系 + +`ParquetReader::_read_current_row_group_batch()` 不需要理解复杂类型: + +- predicate columns 仍先读。 +- non-predicate columns 仍根据 selection 调用 `read()` 或 `select()`。 +- column reader 自己负责 complex column 的 parent-row 语义。 + +限制: + +- 初期不支持复杂类型直接作为 filter column 执行 batch predicate。 +- row group statistics 仍只对 primitive leaf 做保守裁剪。 +- complex child-level projection 是本轮 reader 实现目标;但 complex child predicate 执行和 schema change finalize 不在本轮完成。 + +## 错误处理 + +遇到明确违反 Parquet spec 或 reader invariant 的情况,应返回错误或触发检查,不能静默修复: + +- MAP key nullable 或 key definition level 缺失。 +- 同一 
struct 的 children parent row count 不一致。 +- list/map repetition level 非法回退或超过当前 schema 最大值。 +- leaf reader 返回的 value count、definition/repetition level 数量不一致。 +- child reader overflow 状态与下一次 read/skip 请求冲突。 + +对合法但暂未支持的编码形态返回 `NotSupported`,例如后续若发现 Arrow internal `RecordReader` 无法支持某类 nested level 输出。 + +## 测试计划 + +新增或扩展 BE UT: + +```text +be/test/format/new_parquet/parquet_complex_reader_test.cpp +``` + +优先用 Arrow writer 生成小 Parquet 文件,覆盖: + +- required struct。 +- optional struct。 +- struct child nullable。 +- array of primitive:null array、empty array、array with null element。 +- array of struct。 +- nested array:`Array(Array(String))`。 +- map:empty map、null map、nullable value。 +- struct containing array/map。 +- multiple row groups。 +- child projection:struct child 裁剪、array element struct child 裁剪、map value struct child 裁剪。 +- selected read:复杂列作为 non-predicate column,predicate column 过滤出稀疏 selection。 +- skip then read:直接验证复杂列 reader 游标。 + +后续回归测试: + +```text +regression-test/suites/external_table_p0/parquet_complex_types.groovy +``` + +要求: + +- 结果排序稳定,使用 `order_qt` 或显式 `order by`。 +- 错误场景使用 `test { sql; exception }`。 +- 测试前 drop table,不在测试末尾 drop,便于失败后排查。 + +## 分阶段落地 + +### 阶段 1:Schema level 信息补齐 + +- 扩展 `ParquetColumnSchema`,保存 definition/repetition level。 +- 增加 `file_path`、`field_id_path`、`name_path`,并明确 top-level file-local id 与 table field id 的边界。 +- 重写 `build_parquet_column_schema()` 的复杂类型规约逻辑。 +- 增加 schema-only UT,覆盖 LIST/MAP legacy 和 standard encodings。 + +### 阶段 1.5:Projection tree 和 projected schema view + +- 扩展 `FileScanRequest`,表达 top-level complex field 的 child projection tree。 +- 增加 projected `SchemaField` / `DataTypePtr` 构造逻辑。 +- `ParquetColumnReaderFactory` 接收 projection tree,只创建被投影 child reader。 +- 增加 child pruning UT,验证未投影 leaf 不创建 reader、不读取 column chunk。 + +### 阶段 2:Leaf level reader + +- 为 `ScalarColumnReader` 增加 nested leaf read API。 +- 去掉 `max_repetition_level == 0 && max_definition_level <= 1` 的硬限制,改成 flat path 和 nested path 分支。 +- 验证 nullable 
primitive 在 nested path 下的 value/null 对齐。 + +### 阶段 3:Struct reader 完整化 + +- 实现 nullable struct。 +- 保证 null struct row 对所有 children 插入 default/null slot。 +- 增加 required/optional struct UT。 + +### 阶段 4:List reader + +- 实现 list assembler、offset 写入、null/empty list 区分。 +- 实现 overflow child buffer。 +- 实现 list `skip()`。 +- 增加 array、nested array、array of struct UT。 + +### 阶段 5:Map reader + +- 实现 map schema 规约到 key/value children。 +- 直接写 `ColumnMap` keys、values、offsets。 +- 校验 required key。 +- 增加 map UT。 + +### 阶段 6:Selected read 和集成测试 + +- 验证 complex non-predicate column 在 lazy materialization 下正确。 +- 验证 complex projected child 在 lazy materialization 下正确。 +- 增加 sparse selection、empty selection、multi-row-group 测试。 +- 将复杂类型 reader 接入 `ParquetReader` 现有 scan loop,不改 table/global schema 边界。 + +### 阶段 7:优化和扩展 + +- complex child predicate execution。 +- complex column statistics 和 page index 支持。 +- complex predicate fallback。 +- 复杂列 schema change child-level mapping。 + +## 验收标准 + +完成“复杂类型完整支持”至少需要满足: + +- `STRUCT`、nullable `STRUCT`、`LIST`、nested `LIST`、`MAP` 可以正确读入 Doris complex columns。 +- 复杂类型 child projection 可以裁剪未请求 leaf,并输出 projected complex type。 +- null、empty、missing element/value 的语义与 Parquet definition/repetition level 一致。 +- `read()`、`skip()`、`select()` 在复杂类型上均保持 parent-row 语义。 +- flat primitive 现有测试不退化。 +- 新增 BE UT 覆盖复杂类型基础、嵌套、selected read 和 multi-row-group。 +- `ParquetReader` 不引入 table/global schema 语义。 +- schema/path/level 元数据足够后续 `TableColumnMapper` 实现 child-level schema change,不需要重写复杂类型 reader 主体。 From ee4a33299c1dedfcef9cd868cab6121f003e190c Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 27 May 2026 17:14:56 +0800 Subject: [PATCH 17/38] [fix](be) Fix VSlotRef constructor build ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Fix the non-BE_TEST build failure caused by calling the test-only set_node_type helper from the VSlotRef protected constructor. 
### Release note None ### Check List (For Author) - Test: Manual test - Ran clang-format dry-run and git diff --check for the modified header. Fedora DEBUG BE build was run and exposed the fixed compile failure; full build will be rerun after syncing this commit. - Behavior changed: No - Does this need documentation: No --- be/src/exprs/vslot_ref.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/src/exprs/vslot_ref.h b/be/src/exprs/vslot_ref.h index 8cb26f9bcfd296..a3b849a87138bd 100644 --- a/be/src/exprs/vslot_ref.h +++ b/be/src/exprs/vslot_ref.h @@ -76,7 +76,7 @@ class VSlotRef : public VExpr { protected: VSlotRef(int slot_id, int column_id, int column_uniq_id) : _slot_id(slot_id), _column_id(column_id), _column_uniq_id(column_uniq_id) { - set_node_type(TExprNodeType::SLOT_REF); + _node_type = TExprNodeType::SLOT_REF; } private: From 472c1cca43c0336a353d999f98ba55d8ada4f9bd Mon Sep 17 00:00:00 2001 From: Socrates Date: Wed, 27 May 2026 17:41:20 +0800 Subject: [PATCH 18/38] [fix](be) Fix parquet batch row cast ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Fix a DEBUG build failure in the new parquet reader by asserting the read batch size before converting it to the selection vector row count type. ### Release note None ### Check List (For Author) - Test: Manual test - Ran clang-format dry-run and git diff --check for the modified parquet reader file. Fedora DEBUG BE build was run and exposed the fixed compile failure; full build will be rerun after syncing this commit. 
- Behavior changed: No - Does this need documentation: No --- be/src/format/new_parquet/parquet_reader.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/be/src/format/new_parquet/parquet_reader.cpp b/be/src/format/new_parquet/parquet_reader.cpp index 677a596debf733..190aa87f251b46 100644 --- a/be/src/format/new_parquet/parquet_reader.cpp +++ b/be/src/format/new_parquet/parquet_reader.cpp @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -428,7 +429,8 @@ Status ParquetReader::_read_current_row_group_batch(int64_t batch_rows, Block* f return Status::OK(); } SelectionVector selection; - uint16_t selected_rows = batch_rows; + DORIS_CHECK(batch_rows <= std::numeric_limits::max()); + uint16_t selected_rows = static_cast(batch_rows); // 1. Read all predicate columns and evaluate selection vector. RETURN_IF_ERROR(_read_filter_columns(batch_rows, file_block, &selection, &selected_rows)); From 4073912ea6705a84e4bb6a3add2a37294af400d8 Mon Sep 17 00:00:00 2001 From: Gabriel Date: Wed, 27 May 2026 18:23:58 +0800 Subject: [PATCH 19/38] [feature](be) Support expression filters on file reader (#63748) --- be/src/format/new_parquet/parquet_reader.cpp | 45 ++- be/src/format/new_parquet/parquet_reader.h | 3 +- .../format/new_parquet/parquet_statistics.cpp | 26 +- .../format/new_parquet/parquet_statistics.h | 7 +- be/src/format/reader/column_mapper.cpp | 98 ++++-- be/src/format/reader/column_mapper.h | 16 +- be/src/format/reader/file_reader.h | 23 +- be/src/format/reader/table_reader.cpp | 26 +- be/src/format/reader/table_reader.h | 16 +- .../new_parquet/parquet_reader_test.cpp | 127 ++++++- be/test/format/reader/table_reader_test.cpp | 312 +++++++++++++++++- 11 files changed, 589 insertions(+), 110 deletions(-) diff --git a/be/src/format/new_parquet/parquet_reader.cpp b/be/src/format/new_parquet/parquet_reader.cpp index 190aa87f251b46..6d0ef3eb742840 100644 --- a/be/src/format/new_parquet/parquet_reader.cpp +++ 
b/be/src/format/new_parquet/parquet_reader.cpp @@ -291,10 +291,6 @@ Status ParquetReader::_get_projected_schema_field(reader::ColumnId file_column_i return Status::OK(); } -bool ParquetReader::_has_expression_filter(const reader::FileLocalFilter& local_filter) { - return local_filter.conjunct != nullptr; -} - Status ParquetReader::_read_filter_columns(int64_t batch_rows, Block* file_block, SelectionVector* selection, uint16_t* selected_rows) { selection->resize(static_cast(batch_rows)); @@ -314,28 +310,29 @@ Status ParquetReader::_read_filter_columns(int64_t batch_rows, Block* file_block column_reader->name(), column_rows, batch_rows); } file_block->replace_by_position(block_position, std::move(column)); + } + return _execute_filter_conjuncts(batch_rows, file_block, selection, selected_rows); +} - for (const auto& local_filter : _request->local_filters) { - if (local_filter.file_column_id != file_field_id || - !_has_expression_filter(local_filter)) { - continue; - } - if (*selected_rows == 0) { - break; - } - IColumn::Filter filter(static_cast(batch_rows), 1); - bool can_filter_all = false; - RETURN_IF_ERROR(local_filter.conjunct->execute_filter(file_block, filter.data(), - static_cast(batch_rows), - false, &can_filter_all)); - *selected_rows = - can_filter_all ? 0 - : _apply_filter_to_selection(filter, selection, *selected_rows); - break; +Status ParquetReader::_execute_filter_conjuncts(int64_t batch_rows, Block* file_block, + SelectionVector* selection, + uint16_t* selected_rows) { + // Expression filters may reference several predicate columns. Execute them only after all + // predicate columns in the file-local block have been materialized. 
+ for (const auto& expression_filter : _request->expression_filters) { + if (expression_filter.conjunct == nullptr) { + continue; } if (*selected_rows == 0) { break; } + IColumn::Filter filter(static_cast(batch_rows), 1); + bool can_filter_all = false; + RETURN_IF_ERROR(expression_filter.conjunct->execute_filter( + file_block, filter.data(), static_cast(batch_rows), false, + &can_filter_all)); + *selected_rows = + can_filter_all ? 0 : _apply_filter_to_selection(filter, selection, *selected_rows); } return Status::OK(); } @@ -557,10 +554,10 @@ Status ParquetReader::open(std::unique_ptr& request) { DORIS_CHECK(_request->column_positions.count(file_column_id) > 0); DORIS_CHECK(file_column_id >= 0 && file_column_id < num_fields); } - for (const auto& local_filter : _request->local_filters) { - if (local_filter.file_column_id < 0 || local_filter.file_column_id >= num_fields) { + for (const auto& column_filter : _request->column_predicate_filters) { + if (column_filter.file_column_id < 0 || column_filter.file_column_id >= num_fields) { return Status::InvalidArgument("Invalid parquet filter top-level field id {}", - local_filter.file_column_id); + column_filter.file_column_id); } } for (const auto& [file_column_id, projection] : _request->complex_projections) { diff --git a/be/src/format/new_parquet/parquet_reader.h b/be/src/format/new_parquet/parquet_reader.h index f6d47f4613404e..aa5cbfb5fcd450 100644 --- a/be/src/format/new_parquet/parquet_reader.h +++ b/be/src/format/new_parquet/parquet_reader.h @@ -127,9 +127,10 @@ class ParquetReader : public reader::FileReader { Status _get_projected_schema_field(reader::ColumnId file_column_id, const reader::FieldProjection* projection, reader::SchemaField* field) const; - bool _has_expression_filter(const reader::FileLocalFilter& local_filter); Status _read_filter_columns(int64_t batch_rows, Block* file_block, SelectionVector* selection, uint16_t* selected_rows); + Status _execute_filter_conjuncts(int64_t batch_rows, Block* 
file_block, + SelectionVector* selection, uint16_t* selected_rows); IColumn::Filter _selection_to_filter(const SelectionVector& selection, uint16_t selected_rows, int64_t batch_rows); uint16_t _apply_filter_to_selection(const IColumn::Filter& filter, SelectionVector* selection, diff --git a/be/src/format/new_parquet/parquet_statistics.cpp b/be/src/format/new_parquet/parquet_statistics.cpp index aebc6d4e04d9fb..a28ccb8ae25cd0 100644 --- a/be/src/format/new_parquet/parquet_statistics.cpp +++ b/be/src/format/new_parquet/parquet_statistics.cpp @@ -159,14 +159,13 @@ ParquetColumnStatistics ParquetStatisticsUtils::TransformColumnStatistics( } } -bool ParquetStatisticsUtils::CheckStatistics(const reader::FileLocalFilter& local_filter, +bool ParquetStatisticsUtils::CheckStatistics(const reader::FileColumnPredicateFilter& column_filter, const ParquetColumnStatistics& statistics) { if (!statistics.has_any_statistics()) { return false; } - // TODO: replace local_filter.predicates by local_filter.conjuncts - for (const auto& column_predicate : local_filter.predicates) { + for (const auto& column_predicate : column_filter.predicates) { if (is_null_only_predicate(*column_predicate)) { if (!statistics.has_null_count) { continue; @@ -184,16 +183,19 @@ bool ParquetStatisticsUtils::CheckStatistics(const reader::FileLocalFilter& loca bool ParquetStatisticsUtils::RowGroupExcludes( const ::parquet::RowGroupMetaData& row_group, const std::vector>& schema, - const reader::FileLocalFilter& local_filter) { - DCHECK(local_filter.file_column_id >= 0 && - local_filter.file_column_id < row_group.num_columns()); - DCHECK_LT(local_filter.file_column_id, schema.size()); - auto column_chunk = row_group.ColumnChunk(local_filter.file_column_id); + const reader::FileColumnPredicateFilter& column_filter) { + if (column_filter.predicates.empty()) { + return false; + } + DCHECK(column_filter.file_column_id >= 0 && + column_filter.file_column_id < row_group.num_columns()); + 
DCHECK_LT(column_filter.file_column_id, schema.size()); + auto column_chunk = row_group.ColumnChunk(column_filter.file_column_id); if (column_chunk == nullptr) { return false; } - return CheckStatistics(local_filter, - TransformColumnStatistics(*schema[local_filter.file_column_id], + return CheckStatistics(column_filter, + TransformColumnStatistics(*schema[column_filter.file_column_id], column_chunk->statistics())); } @@ -215,8 +217,8 @@ Status ParquetStatisticsUtils::SelectRowGroups( continue; } bool drop = false; - for (const auto& local_filter : request.local_filters) { - if (RowGroupExcludes(*row_group, file_schema, local_filter)) { + for (const auto& column_filter : request.column_predicate_filters) { + if (RowGroupExcludes(*row_group, file_schema, column_filter)) { drop = true; break; } diff --git a/be/src/format/new_parquet/parquet_statistics.h b/be/src/format/new_parquet/parquet_statistics.h index 0def08d4b084df..4f43ae245b57bf 100644 --- a/be/src/format/new_parquet/parquet_statistics.h +++ b/be/src/format/new_parquet/parquet_statistics.h @@ -60,13 +60,14 @@ struct ParquetStatisticsUtils { const ParquetColumnSchema& column_schema, const std::shared_ptr<::parquet::Statistics>& statistics); - // Return true if the statistics indicate that the row group can be safely skipped according to the local filter. - static bool CheckStatistics(const reader::FileLocalFilter& local_filter, + // Return true if the statistics indicate that the row group can be safely skipped according to + // the local single-column predicate filter. 
+ static bool CheckStatistics(const reader::FileColumnPredicateFilter& column_filter, const ParquetColumnStatistics& statistics); static bool RowGroupExcludes(const ::parquet::RowGroupMetaData& row_group, const std::vector>& schema, - const reader::FileLocalFilter& local_filter); + const reader::FileColumnPredicateFilter& column_filter); static Status SelectRowGroups( const ::parquet::FileMetaData& metadata, diff --git a/be/src/format/reader/column_mapper.cpp b/be/src/format/reader/column_mapper.cpp index 4d9afdeff3297a..1a33781b9651f1 100644 --- a/be/src/format/reader/column_mapper.cpp +++ b/be/src/format/reader/column_mapper.cpp @@ -17,6 +17,7 @@ #include "format/reader/column_mapper.h" +#include #include #include #include @@ -69,6 +70,8 @@ static constexpr const char* ROW_LINEAGE_LAST_UPDATED_SEQ_NUMBER = "_last_update static void add_scan_column(FileScanRequest* file_request, ColumnId file_column_id, std::vector* scan_columns) { + // column_positions is the global read-column index for this scan request, so it also + // deduplicates predicate_columns and non_predicate_columns across all filter/projection paths. 
if (file_request->column_positions.count(file_column_id) == 0) { file_request->column_positions.emplace(file_column_id, file_request->column_positions.size()); @@ -210,6 +213,13 @@ static Status rebuild_projected_file_type(ColumnMapping* mapping) { return Status::OK(); } +static std::vector filter_slot_ids(const TableFilter& table_filter) { + if (!table_filter.slot_ids.empty()) { + return table_filter.slot_ids; + } + return {}; +} + Status TableColumnMapper::create_mapping(const std::vector& projected_columns, const std::map& partition_values, const std::vector& file_schema) { @@ -250,7 +260,8 @@ Status TableColumnMapper::create_mapping(const std::vector& project return Status::OK(); } -Status TableColumnMapper::create_scan_request(const std::map& table_filters, +Status TableColumnMapper::create_scan_request(const std::vector& table_filters, + const TableColumnPredicates& table_column_predicates, const std::vector& projected_columns, FileScanRequest* file_request) { // FileReader evaluates expressions against a file-local block. This mapper owns the @@ -259,12 +270,27 @@ Status TableColumnMapper::create_scan_request(const std::mapnon_predicate_columns.clear(); file_request->column_positions.clear(); file_request->complex_projections.clear(); - file_request->local_filters.clear(); + file_request->expression_filters.clear(); + file_request->column_predicate_filters.clear(); file_request->reader_expression_map.clear(); + // 1. Build referenced non-predicate columns for (const auto& table_column : projected_columns) { auto* mapping = _find_mapping(table_column.id); if (mapping != nullptr && mapping->file_column_id.has_value()) { - if (table_filters.count(table_column.id) == 0) { + // A file column can be read lazily as a non-predicate column only when it is not used + // by either expression filters or single-column predicate pruning. 
+ bool used_by_filter = table_column_predicates.count(table_column.id) > 0; + if (!used_by_filter) { + for (const auto& table_filter : table_filters) { + const auto slot_ids = filter_slot_ids(table_filter); + if (std::find(slot_ids.begin(), slot_ids.end(), table_column.id) != + slot_ids.end()) { + used_by_filter = true; + break; + } + } + } + if (!used_by_filter) { add_scan_column(file_request, *mapping->file_column_id, &file_request->non_predicate_columns); } @@ -280,7 +306,9 @@ Status TableColumnMapper::create_scan_request(const std::map& table_filters, +Status TableColumnMapper::localize_filters(const std::vector& table_filters, + const TableColumnPredicates& table_column_predicates, FileScanRequest* file_request) const { // 真实实现会处理 trivial mapping、safe cast、reader expression fallback 和 // finalize-only filter。stub 只复制能够直接定位到 file column 的谓词。 - for (const auto& it : table_filters) { - const auto* mapping = _find_mapping(it.first); - if (mapping == nullptr || !mapping->file_column_id.has_value()) { + for (const auto& table_filter : table_filters) { + if (!table_filter.can_be_localized()) { + // TODO: Rewrite table filter to reader_expression_map + // file_request->reader_expression_map.emplace_back(..., table_filter.conjunct); continue; } - if (!it.second.can_be_localized()) { - // TODO: Rewrite table filter to reader_expression_map - // file_request->reader_expression_map.emplace_back(mapping->table_column_id, it.second.conjunct); + for (const auto table_column_id : filter_slot_ids(table_filter)) { + const auto* mapping = _find_mapping(table_column_id); + if (mapping == nullptr || !mapping->file_column_id.has_value()) { + continue; + } + add_scan_column(file_request, *mapping->file_column_id, + &file_request->predicate_columns); + } + } + for (const auto& [table_column_id, _] : table_column_predicates) { + const auto* mapping = _find_mapping(table_column_id); + if (mapping == nullptr || !mapping->file_column_id.has_value()) { continue; } 
add_scan_column(file_request, *mapping->file_column_id, &file_request->predicate_columns); @@ -312,20 +351,35 @@ Status TableColumnMapper::localize_filters(const std::map& // Build the complete table-slot to file-block position map after all predicate columns have // been assigned. This keeps expression localization independent from filter iteration order. const auto table_column_to_file_position = build_file_position_map(_mappings, *file_request); - for (const auto& it : table_filters) { - const auto* mapping = _find_mapping(it.first); - if (mapping == nullptr || !mapping->file_column_id.has_value() || - !it.second.can_be_localized()) { + for (const auto& table_filter : table_filters) { + if (!table_filter.can_be_localized()) { continue; } - FileLocalFilter local_filter; - local_filter.file_column_id = *mapping->file_column_id; - if (it.second.conjunct != nullptr) { - local_filter.conjunct = VExprContext::create_shared(rewrite_table_expr_to_file_expr( - it.second.conjunct->root(), table_column_to_file_position)); + if (table_filter.conjunct != nullptr) { + FileExpressionFilter expression_filter; + expression_filter.conjunct = + VExprContext::create_shared(rewrite_table_expr_to_file_expr( + table_filter.conjunct->root(), table_column_to_file_position)); + expression_filter.file_column_ids.reserve(table_filter.slot_ids.size()); + for (const auto table_column_id : table_filter.slot_ids) { + const auto* mapping = _find_mapping(table_column_id); + if (mapping == nullptr || !mapping->file_column_id.has_value()) { + continue; + } + expression_filter.file_column_ids.push_back(*mapping->file_column_id); + } + file_request->expression_filters.push_back(std::move(expression_filter)); + } + } + for (const auto& [table_column_id, predicates] : table_column_predicates) { + const auto* mapping = _find_mapping(table_column_id); + if (mapping == nullptr || !mapping->file_column_id.has_value() || predicates.empty()) { + continue; } - local_filter.predicates = it.second.predicates; 
- file_request->local_filters.push_back(std::move(local_filter)); + FileColumnPredicateFilter column_predicate_filter; + column_predicate_filter.file_column_id = *mapping->file_column_id; + column_predicate_filter.predicates = predicates; + file_request->column_predicate_filters.push_back(std::move(column_predicate_filter)); } return Status::OK(); } diff --git a/be/src/format/reader/column_mapper.h b/be/src/format/reader/column_mapper.h index 0c6ac9c8e6c5f6..bcfe71522088dd 100644 --- a/be/src/format/reader/column_mapper.h +++ b/be/src/format/reader/column_mapper.h @@ -31,6 +31,10 @@ #include "exprs/vexpr_fwd.h" #include "format/reader/expr/literal.h" +namespace doris { +class ColumnPredicate; +} // namespace doris + namespace doris::reader { struct TableColumn; @@ -39,6 +43,9 @@ struct SchemaField; struct FileScanRequest; struct FieldProjection; +using TableColumnPredicates = + std::map>>; + enum class TableColumnMappingMode { BY_FIELD_ID, BY_NAME, @@ -100,15 +107,18 @@ class TableColumnMapper { // 把 table-level scan 请求转换成 file-local scan 请求。 // table_request 使用 table/global schema;file_request 只包含 FileReader 能理解的 - // projected_file_columns、local_filters 和 reader_expression_map。 - virtual Status create_scan_request(const std::map& table_filters, + // projected_file_columns、expression_filters、column_predicate_filters 和 + // reader_expression_map。 + virtual Status create_scan_request(const std::vector& table_filters, + const TableColumnPredicates& table_column_predicates, const std::vector& projected_columns, FileScanRequest* file_request); // 将 table-level filter 定位到文件 schema。 // trivial mapping 可以直接复制结构化谓词;类型变化时可以尝试安全 cast;无法安全 // 下推的表达式应通过 reader_expression_map 或 table-level finalize/filter fallback 处理。 - virtual Status localize_filters(const std::map& table_filters, + virtual Status localize_filters(const std::vector& table_filters, + const TableColumnPredicates& table_column_predicates, FileScanRequest* file_request) const; void clear() { _mappings.clear(); } 
const std::vector& mappings() const { return _mappings; } diff --git a/be/src/format/reader/file_reader.h b/be/src/format/reader/file_reader.h index 918e2b4bd351d2..69720bc8f9a2b7 100644 --- a/be/src/format/reader/file_reader.h +++ b/be/src/format/reader/file_reader.h @@ -75,21 +75,19 @@ struct FieldProjection { std::vector children; }; -// 已经 localize 到文件 schema 的过滤条件。 -// TableColumnMapper 负责把 table-level filter 转成这个结构;FileReader 只消费 -// file-local column id、表达式和结构化谓词。 -struct FileLocalFilter { - ColumnId file_column_id = -1; - - // 表达式过滤。适合 cast、复杂表达式或 reader_expression_map 生成的临时列过滤。 - // 它通常不能直接驱动 row group stats、page index、dictionary、bloom filter。 +// File-local expression filter. It may reference multiple predicate_columns, so FileReader should +// evaluate it after all referenced predicate columns have been materialized in the file-local block. +struct FileExpressionFilter { VExprContextSPtr conjunct; // DeletePredicate VExprContextSPtr delete_conjunct; + std::vector file_column_ids; +}; - // 结构化列谓词。适合文件层 pruning,例如 min/max、page index、dictionary、 - // bloom filter 等只理解单列谓词的优化。 - // TODO: conjunct 支持表达所有 filter 语义之后删除。 +// File-local single-column predicates for file-layer pruning, such as min/max, page index, +// dictionary and bloom filter. Predicates must all belong to file_column_id. +struct FileColumnPredicateFilter { + ColumnId file_column_id = -1; std::vector> predicates; }; @@ -110,7 +108,8 @@ struct FileScanRequest { std::vector non_predicate_columns; std::map column_positions; std::map complex_projections; - std::vector local_filters; + std::vector expression_filters; + std::vector column_predicate_filters; // fallback path if filters cannot be localized to file-local predicates. The expression can reference projected_file_columns and partition columns. 
std::vector> reader_expression_map; }; diff --git a/be/src/format/reader/table_reader.cpp b/be/src/format/reader/table_reader.cpp index f6cfa21600ea61..58de83785892fd 100644 --- a/be/src/format/reader/table_reader.cpp +++ b/be/src/format/reader/table_reader.cpp @@ -48,16 +48,10 @@ void collect_table_slot_ids(const VExprSPtr& expr, std::set* slot_ids) { } void build_table_filters_from_conjunct(const VExprSPtr& conjunct, - std::map* table_filters) { + std::vector* table_filters) { if (conjunct == nullptr) { return; } - std::set slot_ids; - collect_table_slot_ids(conjunct, &slot_ids); - if (slot_ids.size() == 1) { - (*table_filters)[*slot_ids.begin()].conjunct = VExprContext::create_shared(conjunct); - return; - } if (conjunct->node_type() == TExprNodeType::COMPOUND_PRED && conjunct->op() == TExprOpcode::COMPOUND_AND) { for (const auto& child : conjunct->children()) { @@ -65,6 +59,15 @@ void build_table_filters_from_conjunct(const VExprSPtr& conjunct, } return; } + std::set slot_ids; + collect_table_slot_ids(conjunct, &slot_ids); + if (!slot_ids.empty()) { + TableFilter table_filter; + table_filter.conjunct = VExprContext::create_shared(conjunct); + table_filter.slot_ids.assign(slot_ids.begin(), slot_ids.end()); + table_filters->push_back(std::move(table_filter)); + return; + } } } // namespace @@ -100,6 +103,7 @@ Status TableReader::init(TableReadOptions options) { mapper_options.allow_missing_columns = options.allow_missing_columns; _data_reader.column_mapper = TableColumnMapper(mapper_options); _conjuncts = std::move(options.conjuncts); + _table_column_predicates = std::move(options.column_predicates); return Status::OK(); } @@ -111,12 +115,12 @@ Status TableReader::_build_table_filters_from_conjuncts() { Status TableReader::_open_local_filter_exprs(const FileScanRequest& file_request) { RowDescriptor row_desc; - for (const auto& local_filter : file_request.local_filters) { - if (local_filter.conjunct == nullptr) { + for (const auto& expression_filter : 
file_request.expression_filters) { + if (expression_filter.conjunct == nullptr) { continue; } - RETURN_IF_ERROR(local_filter.conjunct->prepare(_runtime_state, row_desc)); - RETURN_IF_ERROR(local_filter.conjunct->open(_runtime_state)); + RETURN_IF_ERROR(expression_filter.conjunct->prepare(_runtime_state, row_desc)); + RETURN_IF_ERROR(expression_filter.conjunct->open(_runtime_state)); } return Status::OK(); } diff --git a/be/src/format/reader/table_reader.h b/be/src/format/reader/table_reader.h index c9589af8017dc2..2cf5eb30468b8a 100644 --- a/be/src/format/reader/table_reader.h +++ b/be/src/format/reader/table_reader.h @@ -62,15 +62,14 @@ struct TableColumn { }; // table-level filter。 -// TableColumnMapper 负责把它转换成 FileLocalFilter 或 reader_expression_map。 +// TableColumnMapper 负责把它转换成 FileExpressionFilter 或 reader_expression_map。 struct TableFilter { // 表达式过滤,适合表达 cast、复杂表达式、复杂列提取等语义。 VExprContextSPtr conjunct; - // 结构化列谓词,适合下推到文件层做 row group stats、page index、dictionary、 - // bloom filter 等优化。 - // TODO: conjunct 支持表达所有 filter 语义之后删除。 - std::vector> predicates; + // Table slot ids referenced by conjunct. A single expression filter may depend on multiple + // columns, while ColumnPredicate pruning still belongs to one concrete column. 
+ std::vector slot_ids; bool can_be_localized() const { return true; } }; @@ -105,6 +104,7 @@ struct ReadProfile { struct TableReadOptions { const std::vector projected_columns; + const TableColumnPredicates column_predicates; // All conjuncts from scan operator const VExprContext conjuncts; const FileFormat format; @@ -229,7 +229,7 @@ class TableReader { auto file_request = std::make_unique(); RETURN_IF_ERROR(_data_reader.column_mapper.create_scan_request( - _table_filters, _projected_columns, file_request.get())); + _table_filters, _table_column_predicates, _projected_columns, file_request.get())); RETURN_IF_ERROR(_open_local_filter_exprs(*file_request)); _data_reader.scan_schema.clear(); _data_reader.block_template.clear(); @@ -266,6 +266,7 @@ class TableReader { _data_reader.reader.reset(); _data_reader.column_mapper.clear(); _table_filters.clear(); + _table_column_predicates.clear(); _data_reader.block_schema.clear(); _data_reader.scan_schema.clear(); _data_reader.block_template.clear(); @@ -331,7 +332,8 @@ class TableReader { std::shared_ptr _system_properties; // partition key -> value std::map _partition_values; - std::map _table_filters; + std::vector _table_filters; + TableColumnPredicates _table_column_predicates; VExprContext _conjuncts {nullptr}; std::unique_ptr _profile; // Parsed from DELETION_VECTOR in Iceberg and Paimon diff --git a/be/test/format/new_parquet/parquet_reader_test.cpp b/be/test/format/new_parquet/parquet_reader_test.cpp index 5341b4060b5a8f..43ec9cc0ab1c03 100644 --- a/be/test/format/new_parquet/parquet_reader_test.cpp +++ b/be/test/format/new_parquet/parquet_reader_test.cpp @@ -42,6 +42,7 @@ #include "exprs/vexpr_context.h" #include "format/reader/column_mapper.h" #include "format/reader/file_reader.h" +#include "format/reader/table_reader.h" #include "gen_cpp/Types_types.h" #include "io/io_common.h" #include "runtime/runtime_state.h" @@ -82,6 +83,41 @@ class Int32GreaterThanExpr final : public VExpr { const std::string _expr_name = 
"Int32GreaterThanExpr"; }; +class Int32SumGreaterThanExpr final : public VExpr { +public: + Int32SumGreaterThanExpr(int left_column_id, int right_column_id, int32_t value) + : VExpr(std::make_shared(), false), + _left_column_id(left_column_id), + _right_column_id(right_column_id), + _value(value) {} + + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override { + const auto& left_input = + assert_cast(*block->get_by_position(_left_column_id).column); + const auto& right_input = + assert_cast(*block->get_by_position(_right_column_id).column); + auto result = ColumnUInt8::create(); + auto& result_data = result->get_data(); + result_data.resize(count); + for (size_t row = 0; row < count; ++row) { + const size_t input_row = selector == nullptr ? row : (*selector)[row]; + result_data[row] = + left_input.get_element(input_row) + right_input.get_element(input_row) > _value; + } + result_column = std::move(result); + return Status::OK(); + } + + const std::string& expr_name() const override { return _expr_name; } + +private: + const int _left_column_id; + const int _right_column_id; + const int32_t _value; + const std::string _expr_name = "Int32SumGreaterThanExpr"; +}; + VExprContextSPtr create_int32_greater_than_conjunct(int column_id, int32_t value) { auto ctx = VExprContext::create_shared(std::make_shared(column_id, value)); @@ -90,6 +126,15 @@ VExprContextSPtr create_int32_greater_than_conjunct(int column_id, int32_t value return ctx; } +VExprContextSPtr create_int32_sum_greater_than_conjunct(int left_column_id, int right_column_id, + int32_t value) { + auto ctx = VExprContext::create_shared( + std::make_shared(left_column_id, right_column_id, value)); + ctx->_prepared = true; + ctx->_opened = true; + return ctx; +} + std::shared_ptr finish_array(arrow::ArrayBuilder* builder) { std::shared_ptr array; EXPECT_TRUE(builder->Finish(&array).ok()); @@ -133,6 +178,28 @@ void 
write_parquet_file(const std::string& file_path, int64_t row_group_size = R row_group_size, builder.build())); } +void write_int_pair_parquet_file(const std::string& file_path, int64_t row_group_size = ROW_COUNT) { + auto schema = arrow::schema({ + arrow::field("id", arrow::int32(), false), + arrow::field("score", arrow::int32(), false), + arrow::field("value", arrow::utf8(), false), + }); + auto table = arrow::Table::Make( + schema, {build_int32_array({1, 2, 3, 4, 5}), build_int32_array({1, 2, 3, 4, 5}), + build_string_array({"one", "two", "three", "four", "five"})}); + + auto file_result = arrow::io::FileOutputStream::Open(file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable( + *table, arrow::default_memory_pool(), out, row_group_size, builder.build())); +} + Block build_file_block(const std::vector& schema) { Block block; for (const auto& field : schema) { @@ -216,7 +283,7 @@ TEST(TableColumnMapperTest, CreatesComplexProjectionForStructChildren) { ASSERT_TRUE(mapper.create_mapping({table_column}, {}, {struct_field}).ok()); auto request = std::make_unique(); - ASSERT_TRUE(mapper.create_scan_request({}, {table_column}, request.get()).ok()); + ASSERT_TRUE(mapper.create_scan_request({}, {}, {table_column}, request.get()).ok()); ASSERT_EQ(request->non_predicate_columns, std::vector({0})); ASSERT_EQ(request->complex_projections.size(), 1); const auto& projection = request->complex_projections.at(0); @@ -359,12 +426,14 @@ TEST_F(NewParquetReaderTest, ReadPredicateAndNonPredicateColumnsWithSelection) { auto request = std::make_unique(); request->predicate_columns = {0}; request->non_predicate_columns = {1}; - reader::FileLocalFilter 
filter; - filter.file_column_id = 0; - filter.conjunct = create_int32_greater_than_conjunct(0, 2); - filter.predicates.push_back(create_comparison_predicate( + reader::FileExpressionFilter expression_filter; + expression_filter.conjunct = create_int32_greater_than_conjunct(0, 2); + request->expression_filters.push_back(std::move(expression_filter)); + reader::FileColumnPredicateFilter column_filter; + column_filter.file_column_id = 0; + column_filter.predicates.push_back(create_comparison_predicate( 0, "id", schema[0].type, Field::create_field(2), false)); - request->local_filters.push_back(std::move(filter)); + request->column_predicate_filters.push_back(std::move(column_filter)); ASSERT_TRUE(reader->open(request).ok()); size_t rows = 0; @@ -391,6 +460,40 @@ TEST_F(NewParquetReaderTest, ReadPredicateAndNonPredicateColumnsWithSelection) { EXPECT_EQ(rows, 0); } +TEST_F(NewParquetReaderTest, ReadMultiPredicateColumnsBeforeExpressionFilter) { + write_int_pair_parquet_file(_file_path); + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + Block block = build_file_block(schema); + + auto request = std::make_unique(); + request->predicate_columns = {0, 1}; + request->non_predicate_columns = {}; + reader::FileExpressionFilter expression_filter; + expression_filter.conjunct = create_int32_sum_greater_than_conjunct(0, 1, 7); + request->expression_filters.push_back(std::move(expression_filter)); + ASSERT_TRUE(reader->open(request).ok()); + + size_t rows = 0; + bool eof = false; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + EXPECT_FALSE(eof); + ASSERT_EQ(rows, 2); + + const auto& ids = assert_cast(*block.get_by_position(0).column); + const auto& scores = assert_cast(*block.get_by_position(1).column); + ASSERT_EQ(ids.size(), 2); + ASSERT_EQ(scores.size(), 2); + EXPECT_EQ(ids.get_element(0), 4); + 
EXPECT_EQ(ids.get_element(1), 5); + EXPECT_EQ(scores.get_element(0), 4); + EXPECT_EQ(scores.get_element(1), 5); +} + TEST_F(NewParquetReaderTest, PredicateFiltersRowGroupsByStatistics) { write_parquet_file(_file_path, 2); auto parquet_file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); @@ -405,12 +508,14 @@ TEST_F(NewParquetReaderTest, PredicateFiltersRowGroupsByStatistics) { auto request = std::make_unique(); request->predicate_columns = {0}; request->non_predicate_columns = {1}; - reader::FileLocalFilter filter; - filter.file_column_id = 0; - filter.conjunct = create_int32_greater_than_conjunct(0, 2); - filter.predicates.push_back(create_comparison_predicate( + reader::FileExpressionFilter expression_filter; + expression_filter.conjunct = create_int32_greater_than_conjunct(0, 2); + request->expression_filters.push_back(std::move(expression_filter)); + reader::FileColumnPredicateFilter column_filter; + column_filter.file_column_id = 0; + column_filter.predicates.push_back(create_comparison_predicate( 0, "id", schema[0].type, Field::create_field(2), false)); - request->local_filters.push_back(std::move(filter)); + request->column_predicate_filters.push_back(std::move(column_filter)); ASSERT_TRUE(reader->open(request).ok()); std::vector ids; diff --git a/be/test/format/reader/table_reader_test.cpp b/be/test/format/reader/table_reader_test.cpp index dc2e26f35ea222..3d132244122ff7 100644 --- a/be/test/format/reader/table_reader_test.cpp +++ b/be/test/format/reader/table_reader_test.cpp @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -37,6 +38,7 @@ #include "format/reader/expr/slot_ref.h" #include "gen_cpp/PlanNodes_types.h" #include "runtime/runtime_state.h" +#include "storage/predicate/predicate_creator.h" namespace doris::reader { namespace { @@ -75,6 +77,86 @@ class TableInt32GreaterThanExpr final : public VExpr { const std::string _expr_name = "TableInt32GreaterThanExpr"; }; +class TableInt32SumGreaterThanExpr final 
: public VExpr { +public: + TableInt32SumGreaterThanExpr(int left_slot_id, int left_column_id, int right_slot_id, + int right_column_id, int32_t value) + : VExpr(std::make_shared(), false), _value(value) { + add_child(TableSlotRef::create_shared(left_slot_id, left_column_id, -1, + std::make_shared(), "id")); + add_child(TableSlotRef::create_shared(right_slot_id, right_column_id, -1, + std::make_shared(), "score")); + set_node_type(TExprNodeType::BINARY_PRED); + _opcode = TExprOpcode::GT; + } + + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override { + const auto* left_slot_ref = assert_cast(get_child(0).get()); + const auto* right_slot_ref = assert_cast(get_child(1).get()); + const auto& left_input = assert_cast( + *block->get_by_position(left_slot_ref->column_id()).column); + const auto& right_input = assert_cast( + *block->get_by_position(right_slot_ref->column_id()).column); + auto result = ColumnUInt8::create(); + auto& result_data = result->get_data(); + result_data.resize(count); + for (size_t row = 0; row < count; ++row) { + const size_t input_row = selector == nullptr ? 
row : (*selector)[row]; + result_data[row] = + left_input.get_element(input_row) + right_input.get_element(input_row) > _value; + } + result_column = std::move(result); + return Status::OK(); + } + + const std::string& expr_name() const override { return _expr_name; } + +private: + const int32_t _value; + const std::string _expr_name = "TableInt32SumGreaterThanExpr"; +}; + +class TableInt32SumLessThanExpr final : public VExpr { +public: + TableInt32SumLessThanExpr(int left_slot_id, int left_column_id, int right_slot_id, + int right_column_id, int32_t value) + : VExpr(std::make_shared(), false), _value(value) { + add_child(TableSlotRef::create_shared(left_slot_id, left_column_id, -1, + std::make_shared(), "id")); + add_child(TableSlotRef::create_shared(right_slot_id, right_column_id, -1, + std::make_shared(), "score")); + set_node_type(TExprNodeType::BINARY_PRED); + _opcode = TExprOpcode::LT; + } + + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override { + const auto* left_slot_ref = assert_cast(get_child(0).get()); + const auto* right_slot_ref = assert_cast(get_child(1).get()); + const auto& left_input = assert_cast( + *block->get_by_position(left_slot_ref->column_id()).column); + const auto& right_input = assert_cast( + *block->get_by_position(right_slot_ref->column_id()).column); + auto result = ColumnUInt8::create(); + auto& result_data = result->get_data(); + result_data.resize(count); + for (size_t row = 0; row < count; ++row) { + const size_t input_row = selector == nullptr ? 
row : (*selector)[row]; + result_data[row] = + left_input.get_element(input_row) + right_input.get_element(input_row) < _value; + } + result_column = std::move(result); + return Status::OK(); + } + + const std::string& expr_name() const override { return _expr_name; } + +private: + const int32_t _value; + const std::string _expr_name = "TableInt32SumLessThanExpr"; +}; + std::shared_ptr finish_array(arrow::ArrayBuilder* builder) { std::shared_ptr array; EXPECT_TRUE(builder->Finish(&array).ok()); @@ -117,6 +199,32 @@ void write_parquet_file(const std::string& file_path, int32_t id, const std::str *table, arrow::default_memory_pool(), out, 1, builder.build())); } +void write_int_pair_parquet_file(const std::string& file_path, const std::vector& ids, + const std::vector& scores, + const std::vector& values, + int64_t row_group_size = -1) { + auto schema = arrow::schema({ + arrow::field("id", arrow::int32(), false), + arrow::field("score", arrow::int32(), false), + arrow::field("value", arrow::utf8(), false), + }); + auto table = arrow::Table::Make(schema, {build_int32_array(ids), build_int32_array(scores), + build_string_array(values)}); + + auto file_result = arrow::io::FileOutputStream::Open(file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + const auto write_row_group_size = + row_group_size > 0 ? 
row_group_size : static_cast(ids.size()); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable( + *table, arrow::default_memory_pool(), out, write_row_group_size, builder.build())); +} + Block build_table_block(const std::vector& columns) { Block block; for (const auto& column : columns) { @@ -164,6 +272,7 @@ TEST(TableReaderTest, ReopenSplitAfterClose) { ASSERT_TRUE(reader .init({ .projected_columns = projected_columns, + .column_predicates = {}, .conjuncts = VExprContext( std::make_shared(0, 0, 0)), .format = FileFormat::PARQUET, @@ -229,8 +338,9 @@ TEST(TableReaderTest, OpenReaderBuildsTableFiltersFromConjuncts) { ASSERT_TRUE(reader .init({ .projected_columns = projected_columns, - .conjuncts = VExprContext(std::make_shared( - 0, 0, 2)), + .column_predicates = {}, + .conjuncts = VExprContext( + std::make_shared(0, 0, 2)), .format = FileFormat::PARQUET, .scan_params = nullptr, .io_ctx = nullptr, @@ -262,8 +372,9 @@ TEST(TableReaderTest, OpenReaderBuildsTableFiltersFromConjuncts) { ASSERT_TRUE(filtered_reader .init({ .projected_columns = projected_columns, - .conjuncts = VExprContext(std::make_shared( - 0, 0, 4)), + .column_predicates = {}, + .conjuncts = VExprContext( + std::make_shared(0, 0, 4)), .format = FileFormat::PARQUET, .scan_params = nullptr, .io_ctx = nullptr, @@ -285,6 +396,195 @@ TEST(TableReaderTest, OpenReaderBuildsTableFiltersFromConjuncts) { std::filesystem::remove_all(test_dir); } +TEST(TableReaderTest, OpenReaderBuildsColumnPredicateFilters) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_column_predicate_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + // ColumnPredicate is only used for row-group/statistics pruning. Keep one row per row + // group so the predicate can prune the first two row groups and leave only id = 3. 
+ write_int_pair_parquet_file(file_path, {1, 2, 3}, {1, 5, 8}, {"one", "two", "three"}, 1); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(2, "value", std::make_shared())); + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + TableColumnPredicates column_predicates; + column_predicates[0].push_back(create_comparison_predicate( + 0, "id", std::make_shared(), Field::create_field(2), false)); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader + .init({ + .projected_columns = projected_columns, + .column_predicates = std::move(column_predicates), + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + + const auto& value_column = assert_cast(*block.get_by_position(0).column); + const auto& id_column = assert_cast(*block.get_by_position(1).column); + ASSERT_EQ(id_column.size(), 1); + ASSERT_EQ(value_column.size(), 1); + EXPECT_EQ(id_column.get_element(0), 3); + EXPECT_EQ(value_column.get_data_at(0).to_string(), "three"); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, CreateScanRequestDeduplicatesSharedPredicateColumns) { + const auto int_type = std::make_shared(); + const std::vector projected_columns = { + make_table_column(0, "a", int_type), + make_table_column(1, "b", int_type), + make_table_column(2, "c", int_type), + make_table_column(3, "value", std::make_shared()), + }; + const std::vector file_schema = { + {.id = 0, + .name = "a", + .type = int_type, + .children = {}, + .file_path 
= {0}, + .field_id_path = {0}, + .name_path = {"a"}, + .column_type = DATA_COLUMN}, + {.id = 1, + .name = "b", + .type = int_type, + .children = {}, + .file_path = {1}, + .field_id_path = {1}, + .name_path = {"b"}, + .column_type = DATA_COLUMN}, + {.id = 2, + .name = "c", + .type = int_type, + .children = {}, + .file_path = {2}, + .field_id_path = {2}, + .name_path = {"c"}, + .column_type = DATA_COLUMN}, + {.id = 3, + .name = "value", + .type = std::make_shared(), + .children = {}, + .file_path = {3}, + .field_id_path = {3}, + .name_path = {"value"}, + .column_type = DATA_COLUMN}, + }; + + TableColumnMapper mapper; + ASSERT_TRUE(mapper.create_mapping(projected_columns, {}, file_schema).ok()); + + std::vector table_filters; + table_filters.push_back({ + .conjunct = VExprContext::create_shared( + std::make_shared(0, 0, 1, 1, 1)), + .slot_ids = {0, 1}, + }); + table_filters.push_back({ + .conjunct = VExprContext::create_shared( + std::make_shared(0, 0, 2, 2, 3)), + .slot_ids = {0, 2}, + }); + + FileScanRequest file_request; + ASSERT_TRUE(mapper.create_scan_request(table_filters, {}, projected_columns, &file_request) + .ok()); + + // Both filters reference column a. It must still be read once as a predicate column, and a + // predicate column must not be repeated as a non-predicate column. 
+ EXPECT_EQ(file_request.predicate_columns, std::vector({0, 1, 2})); + EXPECT_EQ(file_request.non_predicate_columns, std::vector({3})); + ASSERT_EQ(file_request.column_positions.size(), 4); + EXPECT_EQ(file_request.column_positions.at(3), 0); + EXPECT_EQ(file_request.column_positions.at(0), 1); + EXPECT_EQ(file_request.column_positions.at(1), 2); + EXPECT_EQ(file_request.column_positions.at(2), 3); + for (const auto predicate_column : file_request.predicate_columns) { + EXPECT_TRUE(std::find(file_request.non_predicate_columns.begin(), + file_request.non_predicate_columns.end(), + predicate_column) == file_request.non_predicate_columns.end()); + } +} + +TEST(TableReaderTest, OpenReaderPushesMultiColumnConjunctToParquetReader) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_multi_conjunct_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {1, 5, 8}, {"one", "two", "three"}); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(2, "value", std::make_shared())); + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + projected_columns.push_back(make_table_column(1, "score", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader + .init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext( + std::make_shared( + 0, 0, 1, 1, 8)), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + // The conjunct references both id and score, so ColumnMapper must put both file columns into 
+ // predicate_columns and rewrite both slot refs to ParquetReader's file-local block positions. + // ParquetReader then evaluates the expression after all predicate columns have been read. + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + + const auto& value_column = assert_cast(*block.get_by_position(0).column); + const auto& id_column = assert_cast(*block.get_by_position(1).column); + const auto& score_column = assert_cast(*block.get_by_position(2).column); + ASSERT_EQ(id_column.size(), 1); + ASSERT_EQ(score_column.size(), 1); + ASSERT_EQ(value_column.size(), 1); + EXPECT_EQ(id_column.get_element(0), 3); + EXPECT_EQ(score_column.get_element(0), 8); + EXPECT_EQ(value_column.get_data_at(0).to_string(), "three"); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + TEST(TableReaderTest, ProjectedColumnsFillDefaultForParquetSchemaMismatch) { const auto test_dir = std::filesystem::temp_directory_path() / "doris_table_reader_schema_mismatch_test"; @@ -303,6 +603,7 @@ TEST(TableReaderTest, ProjectedColumnsFillDefaultForParquetSchemaMismatch) { ASSERT_TRUE(reader .init({ .projected_columns = projected_columns, + .column_predicates = {}, .conjuncts = VExprContext(nullptr), .format = FileFormat::PARQUET, .scan_params = nullptr, @@ -347,6 +648,7 @@ TEST(TableReaderTest, ProjectedColumnsRejectParquetSchemaMismatchWhenMissingColu ASSERT_TRUE(reader .init({ .projected_columns = projected_columns, + .column_predicates = {}, .conjuncts = VExprContext(nullptr), .format = FileFormat::PARQUET, .scan_params = nullptr, @@ -389,6 +691,7 @@ TEST(TableReaderTest, ProjectedColumnsUseMapperExpressionForSameNameDifferentIdP ASSERT_TRUE(reader .init({ .projected_columns = projected_columns, + .column_predicates = {}, .conjuncts = VExprContext(nullptr), .format = FileFormat::PARQUET, .scan_params = nullptr, @@ -438,6 +741,7 @@ TEST(TableReaderTest, 
ProjectedColumnsUseMapperExpressionsForParquetSchemaMismat ASSERT_TRUE(reader .init({ .projected_columns = projected_columns, + .column_predicates = {}, .conjuncts = VExprContext(nullptr), .format = FileFormat::PARQUET, .scan_params = nullptr, From 5ad0921fe021d5c939824b95f44b882fe678470d Mon Sep 17 00:00:00 2001 From: Gabriel Date: Wed, 27 May 2026 19:12:12 +0800 Subject: [PATCH 20/38] [fix](be) Cast localized filter slots for file schema types (#63754) --- be/src/format/reader/column_mapper.cpp | 53 +++++++++----- be/src/format/reader/file_reader.h | 2 +- be/test/format/reader/expr/cast_test.cpp | 89 ++++++++++++++++++++++++ 3 files changed, 127 insertions(+), 17 deletions(-) diff --git a/be/src/format/reader/column_mapper.cpp b/be/src/format/reader/column_mapper.cpp index 1a33781b9651f1..80a81f6c76d57c 100644 --- a/be/src/format/reader/column_mapper.cpp +++ b/be/src/format/reader/column_mapper.cpp @@ -36,18 +36,33 @@ namespace doris::reader { +struct FileSlotRewriteInfo { + size_t block_position = 0; + DataTypePtr file_type; + DataTypePtr table_type; + std::string file_column_name; +}; + static VExprSPtr rewrite_table_expr_to_file_expr( - const VExprSPtr& expr, const std::map& table_column_to_file_position) { + const VExprSPtr& expr, + const std::map& table_column_to_file_slot) { if (expr == nullptr) { return nullptr; } if (expr->is_slot_ref()) { const auto* slot_ref = assert_cast(expr.get()); - const auto position_it = table_column_to_file_position.find(slot_ref->slot_id()); - if (position_it != table_column_to_file_position.end()) { - return TableSlotRef::create_shared(slot_ref->slot_id(), - cast_set(position_it->second), -1, - slot_ref->data_type(), slot_ref->expr_name()); + const auto rewrite_it = table_column_to_file_slot.find(slot_ref->slot_id()); + if (rewrite_it != table_column_to_file_slot.end()) { + const auto& rewrite_info = rewrite_it->second; + auto file_slot = TableSlotRef::create_shared( + slot_ref->slot_id(), 
cast_set(rewrite_info.block_position), -1, + rewrite_info.file_type, rewrite_info.file_column_name); + if (rewrite_info.file_type->equals(*rewrite_info.table_type)) { + return file_slot; + } + auto cast_expr = Cast::create_shared(rewrite_info.table_type); + cast_expr->add_child(std::move(file_slot)); + return cast_expr; } return expr; } @@ -59,7 +74,7 @@ static VExprSPtr rewrite_table_expr_to_file_expr( rewritten_children.reserve(expr->children().size()); for (const auto& child : expr->children()) { rewritten_children.push_back( - rewrite_table_expr_to_file_expr(child, table_column_to_file_position)); + rewrite_table_expr_to_file_expr(child, table_column_to_file_slot)); } expr->set_children(std::move(rewritten_children)); return expr; @@ -95,19 +110,25 @@ static void rebuild_projection(ColumnMapping* mapping, size_t block_position) { mapping->projection = VExprContext::create_shared(expr); } -static std::map build_file_position_map(const std::vector& mappings, - const FileScanRequest& file_request) { - std::map table_column_to_file_position; +// Build a map from table column id to file slot rewrite info for all columns in the given mappings that have a file column id and are present in the file request. 
+static std::map build_file_slot_rewrite_map( + const std::vector& mappings, const FileScanRequest& file_request) { + std::map table_column_to_file_slot; for (const auto& mapping : mappings) { if (!mapping.file_column_id.has_value()) { continue; } const auto position_it = file_request.column_positions.find(*mapping.file_column_id); if (position_it != file_request.column_positions.end()) { - table_column_to_file_position.emplace(mapping.table_column_id, position_it->second); + table_column_to_file_slot.emplace( + mapping.table_column_id, + FileSlotRewriteInfo {.block_position = position_it->second, + .file_type = mapping.file_type, + .table_type = mapping.table_type, + .file_column_name = mapping.file_column_name}); } } - return table_column_to_file_position; + return table_column_to_file_slot; } static bool is_complex_type(const DataTypePtr& type) { @@ -348,9 +369,9 @@ Status TableColumnMapper::localize_filters(const std::vector& table add_scan_column(file_request, *mapping->file_column_id, &file_request->predicate_columns); } - // Build the complete table-slot to file-block position map after all predicate columns have - // been assigned. This keeps expression localization independent from filter iteration order. - const auto table_column_to_file_position = build_file_position_map(_mappings, *file_request); + // Build the complete table-slot rewrite map after all predicate columns have been assigned. + // This keeps expression localization independent from filter iteration order. 
+ const auto table_column_to_file_slot = build_file_slot_rewrite_map(_mappings, *file_request); for (const auto& table_filter : table_filters) { if (!table_filter.can_be_localized()) { continue; @@ -359,7 +380,7 @@ Status TableColumnMapper::localize_filters(const std::vector& table FileExpressionFilter expression_filter; expression_filter.conjunct = VExprContext::create_shared(rewrite_table_expr_to_file_expr( - table_filter.conjunct->root(), table_column_to_file_position)); + table_filter.conjunct->root(), table_column_to_file_slot)); expression_filter.file_column_ids.reserve(table_filter.slot_ids.size()); for (const auto table_column_id : table_filter.slot_ids) { const auto* mapping = _find_mapping(table_column_id); diff --git a/be/src/format/reader/file_reader.h b/be/src/format/reader/file_reader.h index 69720bc8f9a2b7..28de8f068b0f6c 100644 --- a/be/src/format/reader/file_reader.h +++ b/be/src/format/reader/file_reader.h @@ -106,7 +106,7 @@ struct FileScanRequest { std::vector predicate_columns; std::vector non_predicate_columns; - std::map column_positions; + std::map column_positions; // file_column_id -> file-local block position std::map complex_projections; std::vector expression_filters; std::vector column_predicate_filters; diff --git a/be/test/format/reader/expr/cast_test.cpp b/be/test/format/reader/expr/cast_test.cpp index 4f2154189532e9..cab4e6c5b0db20 100644 --- a/be/test/format/reader/expr/cast_test.cpp +++ b/be/test/format/reader/expr/cast_test.cpp @@ -64,6 +64,33 @@ class CastTest : public testing::Test { MockRuntimeState state; }; +class Int64ChildGreaterThanExpr final : public VExpr { +public: + explicit Int64ChildGreaterThanExpr(int64_t value) + : VExpr(std::make_shared(), false), _value(value) {} + + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override { + ColumnPtr child_column; + RETURN_IF_ERROR(get_child(0)->execute_column(context, block, 
selector, count, child_column)); + const auto& input = assert_cast(*child_column); + auto result = ColumnUInt8::create(); + auto& result_data = result->get_data(); + result_data.resize(count); + for (size_t row = 0; row < count; ++row) { + result_data[row] = input.get_element(row) > _value; + } + result_column = std::move(result); + return Status::OK(); + } + + const std::string& expr_name() const override { return _expr_name; } + +private: + const int64_t _value; + const std::string _expr_name = "Int64ChildGreaterThanExpr"; +}; + TEST_F(CastTest, CastIntSlotToBigInt) { auto source_type = std::make_shared(); auto return_type = std::make_shared(); @@ -189,6 +216,9 @@ TEST_F(CastTest, ColumnMapperBuildsCastProjectionForTypeMismatch) { auto status = mapper.create_mapping(projected_columns, {}, file_schema); ASSERT_TRUE(status.ok()) << status; ASSERT_EQ(mapper.mappings().size(), 1); + reader::FileScanRequest file_request; + status = mapper.create_scan_request({}, {}, projected_columns, &file_request); + ASSERT_TRUE(status.ok()) << status; const auto& mapping = mapper.mappings()[0]; EXPECT_FALSE(mapping.is_trivial); ASSERT_NE(mapping.projection, nullptr); @@ -207,4 +237,63 @@ TEST_F(CastTest, ColumnMapperBuildsCastProjectionForTypeMismatch) { mapping.projection->close(); } +TEST_F(CastTest, ColumnMapperBuildsCastFilterForTypeMismatch) { + reader::TableColumnMapper mapper; + reader::TableColumn table_column; + table_column.id = 7; + table_column.name = "value"; + table_column.type = std::make_shared(); + std::vector projected_columns {table_column}; + + reader::SchemaField file_field; + file_field.id = 0; + file_field.name = "value"; + file_field.type = std::make_shared(); + std::vector file_schema {file_field}; + + auto status = mapper.create_mapping(projected_columns, {}, file_schema); + ASSERT_TRUE(status.ok()) << status; + + auto predicate = std::make_shared(15); + predicate->add_child(TableSlotRef::create_shared(7, 7, -1, table_column.type, "value")); + 
reader::TableFilter table_filter; + table_filter.conjunct = VExprContext::create_shared(predicate); + table_filter.slot_ids = {7}; + + reader::FileScanRequest file_request; + ASSERT_TRUE(mapper.create_scan_request({table_filter}, {}, projected_columns, &file_request) + .ok()); + ASSERT_EQ(file_request.expression_filters.size(), 1); + ASSERT_EQ(file_request.predicate_columns, std::vector({0})); + const auto& localized_expr = file_request.expression_filters[0].conjunct->root(); + ASSERT_EQ(localized_expr->get_num_children(), 1); + const auto& localized_child = localized_expr->children()[0]; + ASSERT_NE(dynamic_cast(localized_child.get()), nullptr); + ASSERT_EQ(localized_child->get_num_children(), 1); + const auto* localized_slot = + assert_cast(localized_child->children()[0].get()); + EXPECT_EQ(localized_slot->column_id(), 0); + EXPECT_TRUE(localized_slot->data_type()->equals(*file_field.type)); + EXPECT_TRUE(localized_child->data_type()->equals(*table_column.type)); + + Block block; + block.insert(ColumnHelper::create_column_with_name({11, 22})); + auto* conjunct = file_request.expression_filters[0].conjunct.get(); + status = conjunct->prepare(&state, RowDescriptor()); + ASSERT_TRUE(status.ok()) << status; + status = conjunct->open(&state); + ASSERT_TRUE(status.ok()) << status; + IColumn::Filter filter(block.rows(), 1); + bool can_filter_all = false; + status = conjunct->execute_filter(&block, filter.data(), block.rows(), false, + &can_filter_all); + ASSERT_TRUE(status.ok()) << status; + EXPECT_FALSE(can_filter_all); + ASSERT_EQ(filter.size(), 2); + EXPECT_EQ(filter[0], 0); + EXPECT_EQ(filter[1], 1); + + file_request.expression_filters[0].conjunct->close(); +} + } // namespace doris From 6f4dac325011df9b4c4df11f5087c0aa5e5db1c3 Mon Sep 17 00:00:00 2001 From: Socrates Date: Thu, 28 May 2026 11:02:39 +0800 Subject: [PATCH 21/38] [feature](be) Add basic parquet list reader ### What problem does this PR solve? 
Issue Number: close #xxx Related PR: #xxx Problem Summary: Add the next step of complex type support in the new parquet reader by normalizing standard LIST schema to Array(element), allowing nested leaf RecordReader usage, and reading non-empty LIST columns from repetition levels. ### Release note None ### Check List (For Author) - Test: Manual test - Ran clang-format dry-run and git diff --check for modified files. - Ran BUILD_TYPE=DEBUG ./build.sh --be on Fedora successfully with the patch applied. - Attempted ParquetColumnReaderTest on Fedora, but stopped the ASAN_UT build because it triggered a fresh full UT build; no test binary execution result was produced. - Behavior changed: Yes. The new parquet reader can now read a limited non-empty LIST shape and reports NotSupported for unsupported list shapes instead of rejecting all LIST columns. - Does this need documentation: No --- be/src/format/new_parquet/column_reader.cpp | 235 +++++++++++++++--- be/src/format/new_parquet/column_reader.h | 7 + .../new_parquet/parquet_column_schema.cpp | 24 +- be/src/format/new_parquet/parquet_type.cpp | 4 - .../parquet_column_reader_test.cpp | 39 +++ 5 files changed, 274 insertions(+), 35 deletions(-) diff --git a/be/src/format/new_parquet/column_reader.cpp b/be/src/format/new_parquet/column_reader.cpp index f1674b767b09e0..9952016832c788 100644 --- a/be/src/format/new_parquet/column_reader.cpp +++ b/be/src/format/new_parquet/column_reader.cpp @@ -31,7 +31,9 @@ #include #include "core/column/column.h" +#include "core/column/column_array.h" #include "core/column/column_struct.h" +#include "core/data_type/data_type_array.h" #include "core/data_type/data_type_nullable.h" #include "core/data_type/data_type_struct.h" #include "core/data_type_serde/decoded_column_view.h" @@ -66,6 +68,7 @@ class ScalarColumnReader final : public ParquetColumnReader { const std::shared_ptr<::parquet::internal::RecordReader>& record_reader() const { return _record_reader; } + const 
ParquetTypeDescriptor& type_descriptor() const { return _type_descriptor; } private: int _file_column_id = -1; @@ -101,6 +104,32 @@ class StructColumnReader final : public ParquetColumnReader { std::vector> _children; }; +class ListColumnReader final : public ParquetColumnReader { +public: + ListColumnReader(const ParquetColumnSchema& schema, DataTypePtr type, + std::unique_ptr element_reader) + : _field_id(schema.top_level_field_id), + _repeated_repetition_level(schema.repeated_repetition_level), + _type(std::move(type)), + _name(schema.name), + _element_reader(std::move(element_reader)) {} + + int file_column_id() const override { return _field_id; } + int parquet_leaf_column_id() const override { return -1; } + const DataTypePtr& type() const override { return _type; } + const std::string& name() const override { return _name; } + + Status read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) override; + Status skip(int64_t rows) override; + +private: + int _field_id = -1; + int16_t _repeated_repetition_level = 0; + DataTypePtr _type; + std::string _name; + std::unique_ptr _element_reader; +}; + Status read_records(ScalarColumnReader& column_reader, int64_t batch_rows, ::parquet::internal::RecordReader** record_reader, int64_t* rows_read) { auto reader = column_reader.record_reader(); @@ -275,6 +304,34 @@ Status build_binary_values(const ScalarColumnReader& column_reader, return Status::OK(); } +Status append_scalar_values(const ScalarColumnReader& column_reader, + ::parquet::internal::RecordReader& record_reader, int64_t row_count, + const NullMap* null_map, MutableColumnPtr& column) { + std::vector binary_values; + std::vector> binary_chunks; + DecodedColumnView view; + view.value_kind = decoded_value_kind(column_reader.type_descriptor()); + view.time_unit = decoded_time_unit(column_reader.type_descriptor().time_unit); + view.row_count = row_count; + view.decimal_precision = column_reader.type_descriptor().decimal_precision; + view.decimal_scale = 
column_reader.type_descriptor().decimal_scale; + view.fixed_length = column_reader.type_descriptor().fixed_length; + view.null_map = null_map == nullptr || null_map->empty() ? nullptr : null_map->data(); + if (view.value_kind == DecodedValueKind::BINARY || + view.value_kind == DecodedValueKind::FIXED_BINARY) { + RETURN_IF_ERROR(get_binary_chunks(column_reader, record_reader, &binary_chunks)); + RETURN_IF_ERROR( + build_binary_values(column_reader, binary_chunks, row_count, &binary_values)); + view.binary_values = &binary_values; + } else { + view.values = record_reader.values(); + } + + RETURN_IF_ERROR( + column_reader.type()->get_serde()->read_column_from_decoded_values(*column, view)); + return Status::OK(); +} + } // namespace Status ScalarColumnReader::read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) { @@ -297,26 +354,7 @@ Status ScalarColumnReader::read(int64_t rows, MutableColumnPtr& column, int64_t* NullMap null_map; RETURN_IF_ERROR(build_null_map(*this, *record_reader, *rows_read, &null_map)); - std::vector binary_values; - std::vector> binary_chunks; - DecodedColumnView view; - view.value_kind = decoded_value_kind(_type_descriptor); - view.time_unit = decoded_time_unit(_type_descriptor.time_unit); - view.row_count = *rows_read; - view.decimal_precision = _type_descriptor.decimal_precision; - view.decimal_scale = _type_descriptor.decimal_scale; - view.fixed_length = _type_descriptor.fixed_length; - view.null_map = null_map.empty() ? 
nullptr : null_map.data(); - if (view.value_kind == DecodedValueKind::BINARY || - view.value_kind == DecodedValueKind::FIXED_BINARY) { - RETURN_IF_ERROR(get_binary_chunks(*this, *record_reader, &binary_chunks)); - RETURN_IF_ERROR(build_binary_values(*this, binary_chunks, *rows_read, &binary_values)); - view.binary_values = &binary_values; - } else { - view.values = record_reader->values(); - } - - RETURN_IF_ERROR(_type->get_serde()->read_column_from_decoded_values(*column, view)); + RETURN_IF_ERROR(append_scalar_values(*this, *record_reader, *rows_read, &null_map, column)); return Status::OK(); } @@ -395,6 +433,92 @@ Status StructColumnReader::skip(int64_t rows) { return Status::OK(); } +Status ListColumnReader::read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) { + if (column.get() == nullptr || rows_read == nullptr) { + return Status::InvalidArgument("Invalid parquet list read result pointer for column {}", + _name); + } + if (_element_reader == nullptr) { + return Status::InternalError("Parquet list element reader is not initialized for column {}", + _name); + } + auto* element_reader = dynamic_cast(_element_reader.get()); + if (element_reader == nullptr) { + return Status::NotSupported( + "Current parquet LIST reader only supports scalar elements for column {}", _name); + } + if (element_reader->descriptor()->max_definition_level() != 1) { + return Status::NotSupported( + "Current parquet LIST reader only supports required elements for column {}", _name); + } + + ::parquet::internal::RecordReader* record_reader = nullptr; + int64_t records_read = 0; + RETURN_IF_ERROR(read_records(*element_reader, rows, &record_reader, &records_read)); + const int64_t levels_written = record_reader->levels_written(); + if (records_read != rows || levels_written < records_read) { + return Status::Corruption( + "Invalid parquet LIST read result for column {}: rows={}, levels={}", _name, + records_read, levels_written); + } + if (record_reader->values_written() != 
levels_written) { + return Status::NotSupported( + "Current parquet LIST reader only supports non-empty lists with required " + "elements for column {}", + _name); + } + const int16_t max_definition_level = element_reader->descriptor()->max_definition_level(); + if (auto* def_levels = record_reader->def_levels(); def_levels != nullptr) { + for (int64_t level_idx = 0; level_idx < levels_written; ++level_idx) { + if (def_levels[level_idx] != max_definition_level) { + return Status::NotSupported( + "Current parquet LIST reader only supports non-empty lists with required " + "elements for column {}", + _name); + } + } + } + + auto& array_column = assert_cast(*column); + auto nested_column = array_column.get_data_ptr()->assume_mutable(); + RETURN_IF_ERROR(append_scalar_values(*element_reader, *record_reader, levels_written, nullptr, + nested_column)); + array_column.get_data_ptr() = std::move(nested_column); + + auto* rep_levels = record_reader->rep_levels(); + if (rep_levels == nullptr && levels_written > 0) { + return Status::Corruption( + "Parquet LIST reader returned null repetition levels for column {}", _name); + } + auto& offsets = array_column.get_offsets(); + offsets.reserve(offsets.size() + static_cast(records_read)); + size_t current_offset = offsets.empty() ? 
0 : offsets.back(); + int64_t current_record = 0; + for (int64_t level_idx = 0; level_idx < levels_written; ++level_idx) { + if (level_idx == 0 || rep_levels[level_idx] < _repeated_repetition_level) { + if (level_idx != 0) { + offsets.push_back(current_offset); + current_record++; + } + } + current_offset++; + } + while (current_record < records_read) { + offsets.push_back(current_offset); + current_record++; + } + *rows_read = records_read; + return Status::OK(); +} + +Status ListColumnReader::skip(int64_t rows) { + if (rows <= 0) { + return Status::OK(); + } + DORIS_CHECK(_element_reader != nullptr); + return _element_reader->skip(rows); +} + Status ParquetColumnReader::skip(int64_t rows) { return Status::NotSupported("Parquet column skip is not implemented, rows={}", rows); } @@ -469,6 +593,42 @@ Status ParquetColumnReaderFactory::create_scalar_column_reader( "column {} is not supported", column_schema.name); } + if (column_schema.descriptor == nullptr || + column_schema.descriptor->max_repetition_level() != 0 || + column_schema.descriptor->max_definition_level() > 1) { + return Status::NotSupported( + "Current parquet scalar reader only supports flat primitive columns; column {} is " + "not supported", + column_schema.name); + } + std::shared_ptr<::parquet::internal::RecordReader> record_reader; + RETURN_IF_ERROR(get_record_reader(column_schema.leaf_column_id, column_schema.descriptor, + column_schema.name, &record_reader)); + return create_scalar_reader(column_schema.leaf_column_id, column_schema.type_descriptor, + column_schema.descriptor, column_schema.type, column_schema.name, + std::move(record_reader), reader); +} + +Status ParquetColumnReaderFactory::create_nested_scalar_column_reader( + const ParquetColumnSchema& column_schema, + std::unique_ptr* reader) const { + if (reader == nullptr) { + return Status::InvalidArgument("reader is null"); + } + if (column_schema.kind != ParquetColumnSchemaKind::PRIMITIVE) { + return Status::InvalidArgument("Parquet 
nested scalar reader requires primitive column {}", + column_schema.name); + } + if (column_schema.leaf_column_id < 0 || + column_schema.leaf_column_id >= static_cast(_record_readers.size())) { + return Status::InvalidArgument("Invalid parquet leaf column id {} for column {}", + column_schema.leaf_column_id, column_schema.name); + } + if (!supports_record_reader(column_schema.type_descriptor)) { + return Status::NotSupported( + "Current parquet nested scalar reader does not support column {}", + column_schema.name); + } std::shared_ptr<::parquet::internal::RecordReader> record_reader; RETURN_IF_ERROR(get_record_reader(column_schema.leaf_column_id, column_schema.descriptor, column_schema.name, &record_reader)); @@ -494,12 +654,6 @@ Status ParquetColumnReaderFactory::get_record_reader( if (descriptor == nullptr) { return Status::InvalidArgument("Parquet column descriptor is null for column {}", name); } - if (descriptor->max_repetition_level() != 0 || descriptor->max_definition_level() > 1) { - return Status::NotSupported( - "Current parquet reader only supports RecordReader-backed columns; column {} is " - "not supported", - name); - } if (_record_readers[leaf_column_id] == nullptr) { try { _record_readers[leaf_column_id] = @@ -569,6 +723,32 @@ Status ParquetColumnReaderFactory::create_struct_column_reader( return Status::OK(); } +Status ParquetColumnReaderFactory::create_list_column_reader( + const ParquetColumnSchema& column_schema, const reader::FieldProjection* projection, + std::unique_ptr* reader) const { + if (reader == nullptr) { + return Status::InvalidArgument("reader is null"); + } + if (projection != nullptr && !projection->project_all_children) { + return Status::NotSupported("Parquet LIST projection is not implemented for column {}", + column_schema.name); + } + if (column_schema.type != nullptr && column_schema.type->is_nullable()) { + return Status::NotSupported("Nullable parquet LIST reader is not implemented for column {}", + column_schema.name); + 
} + if (column_schema.children.size() != 1) { + return Status::NotSupported("Unsupported parquet LIST layout for column {}", + column_schema.name); + } + std::unique_ptr element_reader; + RETURN_IF_ERROR( + create_nested_scalar_column_reader(*column_schema.children[0], &element_reader)); + *reader = std::make_unique(column_schema, column_schema.type, + std::move(element_reader)); + return Status::OK(); +} + Status ParquetColumnReaderFactory::create(const ParquetColumnSchema& column_schema, const reader::FieldProjection* projection, std::unique_ptr* reader) const { @@ -581,8 +761,7 @@ Status ParquetColumnReaderFactory::create(const ParquetColumnSchema& column_sche case ParquetColumnSchemaKind::STRUCT: return create_struct_column_reader(column_schema, projection, reader); case ParquetColumnSchemaKind::LIST: - return Status::NotSupported("Parquet LIST reader is not implemented for column {}", - column_schema.name); + return create_list_column_reader(column_schema, projection, reader); case ParquetColumnSchemaKind::MAP: return Status::NotSupported("Parquet MAP reader is not implemented for column {}", column_schema.name); diff --git a/be/src/format/new_parquet/column_reader.h b/be/src/format/new_parquet/column_reader.h index 93881ac8c48077..ec691a9743e4ed 100644 --- a/be/src/format/new_parquet/column_reader.h +++ b/be/src/format/new_parquet/column_reader.h @@ -104,10 +104,17 @@ class ParquetColumnReaderFactory { Status create_scalar_column_reader(const ParquetColumnSchema& column_schema, std::unique_ptr* reader) const; + Status create_nested_scalar_column_reader(const ParquetColumnSchema& column_schema, + std::unique_ptr* reader) const; + Status create_struct_column_reader(const ParquetColumnSchema& column_schema, const reader::FieldProjection* projection, std::unique_ptr* reader) const; + Status create_list_column_reader(const ParquetColumnSchema& column_schema, + const reader::FieldProjection* projection, + std::unique_ptr* reader) const; + Status 
get_record_reader(int leaf_column_id, const ::parquet::ColumnDescriptor* descriptor, const std::string& name, std::shared_ptr<::parquet::internal::RecordReader>* reader) const; diff --git a/be/src/format/new_parquet/parquet_column_schema.cpp b/be/src/format/new_parquet/parquet_column_schema.cpp index 3235ea38a0671e..8541769c1d2840 100644 --- a/be/src/format/new_parquet/parquet_column_schema.cpp +++ b/be/src/format/new_parquet/parquet_column_schema.cpp @@ -140,6 +140,9 @@ Status build_node_schema(const ::parquet::SchemaDescriptor& schema, return Status::NotSupported("Unsupported parquet column type for column {}", node.name()); } + column_schema->type = node.is_optional() + ? make_nullable(remove_nullable(column_schema->type)) + : remove_nullable(column_schema->type); *result = std::move(column_schema); return Status::OK(); } @@ -151,10 +154,25 @@ Status build_node_schema(const ::parquet::SchemaDescriptor& schema, return Status::NotSupported("Unsupported parquet LIST encoding for column {}", node.name()); } + const auto& repeated_node = *group.field(0); + if (!repeated_node.is_repeated() || repeated_node.is_primitive()) { + return Status::NotSupported("Unsupported parquet LIST encoding for column {}", + node.name()); + } + const auto& repeated_group = + static_cast(repeated_node); + if (repeated_group.field_count() != 1) { + return Status::NotSupported("Unsupported parquet LIST element layout for column {}", + node.name()); + } + auto repeated_context = + child_context(context, repeated_node, 0, column_schema->schema_node_id); + column_schema->repeated_repetition_level = repeated_context.repeated_repetition_level; std::unique_ptr child; - RETURN_IF_ERROR(build_node_schema( - schema, *group.field(0), - child_context(context, *group.field(0), 0, column_schema->schema_node_id), &child)); + RETURN_IF_ERROR(build_node_schema(schema, *repeated_group.field(0), + child_context(repeated_context, *repeated_group.field(0), + 0, column_schema->schema_node_id), + &child)); 
column_schema->type = nullable_if_needed(std::make_shared(child->type), node); column_schema->children.push_back(std::move(child)); diff --git a/be/src/format/new_parquet/parquet_type.cpp b/be/src/format/new_parquet/parquet_type.cpp index 53c7b4f2ed93ce..4079c989f7d232 100644 --- a/be/src/format/new_parquet/parquet_type.cpp +++ b/be/src/format/new_parquet/parquet_type.cpp @@ -323,10 +323,6 @@ ParquetTypeDescriptor resolve_parquet_type(const ::parquet::ColumnDescriptor* co !result.is_decimal && (result.physical_type == ::parquet::Type::BYTE_ARRAY || result.physical_type == ::parquet::Type::FIXED_LEN_BYTE_ARRAY); - if (column->max_repetition_level() != 0 || column->max_definition_level() > 1) { - result.supports_record_reader = false; - return result; - } if (!record_reader_physical_type_supported(result.physical_type)) { result.supports_record_reader = false; return result; diff --git a/be/test/format/new_parquet/parquet_column_reader_test.cpp b/be/test/format/new_parquet/parquet_column_reader_test.cpp index 97773a5bada910..b85bbb80a6aa21 100644 --- a/be/test/format/new_parquet/parquet_column_reader_test.cpp +++ b/be/test/format/new_parquet/parquet_column_reader_test.cpp @@ -29,12 +29,14 @@ #include #include "core/assert_cast.h" +#include "core/column/column_array.h" #include "core/column/column_decimal.h" #include "core/column/column_nullable.h" #include "core/column/column_string.h" #include "core/column/column_struct.h" #include "core/column/column_vector.h" #include "core/data_type/data_type.h" +#include "core/data_type/data_type_array.h" #include "core/data_type/data_type_nullable.h" #include "core/data_type/data_type_struct.h" #include "core/types.h" @@ -144,6 +146,21 @@ class ParquetColumnReaderTest : public testing::Test { return finish_array(&builder); } + std::shared_ptr build_required_int_list_array() { + auto value_builder = std::make_shared(); + arrow::ListBuilder builder(arrow::default_memory_pool(), value_builder); + const std::vector> values = { + 
{1, 2}, {3}, {4, 5, 6}, {7}, {8, 9}, + }; + for (const auto& row : values) { + EXPECT_TRUE(builder.Append().ok()); + for (const auto value : row) { + EXPECT_TRUE(value_builder->Append(value).ok()); + } + } + return finish_array(&builder); + } + std::shared_ptr build_time32_array(const std::shared_ptr& type, const std::vector& values) { arrow::Time32Builder builder(type, arrow::default_memory_pool()); @@ -365,6 +382,28 @@ class ParquetColumnReaderTest : public testing::Test { EXPECT_EQ(b_values.get_data_at(1).to_string(), "sb"); EXPECT_EQ(b_values.get_data_at(4).to_string(), "se"); }); + add_field(arrow::field("list_int_col", + arrow::list(arrow::field("element", arrow::int32(), false)), false), + build_required_int_list_array(), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), TYPE_ARRAY); + const auto* array_type = + assert_cast(remove_nullable(schema.type).get()); + EXPECT_EQ( + remove_nullable(array_type->get_nested_type())->get_primitive_type(), + TYPE_INT); + const auto& array_column = assert_cast(column); + ASSERT_EQ(array_column.size(), ROW_COUNT); + EXPECT_EQ(array_column.size_at(0), 2); + EXPECT_EQ(array_column.size_at(1), 1); + EXPECT_EQ(array_column.size_at(2), 3); + EXPECT_EQ(array_column.size_at(4), 2); + const auto& values = assert_cast(array_column.get_data()); + ASSERT_EQ(values.size(), 9); + EXPECT_EQ(values.get_element(0), 1); + EXPECT_EQ(values.get_element(5), 6); + EXPECT_EQ(values.get_element(8), 9); + }); auto schema = arrow::schema(_arrow_fields); auto table = arrow::Table::Make(schema, _arrays); From ac24a827685562d3adb877ccd2082ceb719e642e Mon Sep 17 00:00:00 2001 From: Gabriel Date: Thu, 28 May 2026 11:11:00 +0800 Subject: [PATCH 22/38] [fix](be) Fill partition columns in TableReader (#63773) Problem Summary: TableReader could map partition columns to physical file columns before checking split partition values, and constant/default expression materialization 
used the file-local block row count. For scans where partition values should be filled from split metadata, especially when the file-local block row count differs from the batch row count, this could produce incorrect materialized columns. --- .../data_type_datetimev2_serde.cpp | 3 +- .../data_type_datev2_serde.cpp | 7 +- .../data_type_decimal_serde.cpp | 14 +- .../data_type_nullable_serde.cpp | 6 +- .../data_type_number_serde.cpp | 10 +- .../data_type_string_serde.cpp | 2 +- .../data_type_serde/data_type_time_serde.cpp | 20 +- be/src/format/new_parquet/parquet_reader.cpp | 6 +- be/src/format/reader/column_mapper.cpp | 14 +- be/src/format/reader/column_mapper.h | 3 +- be/src/format/reader/table_reader.h | 17 +- .../data_type_serde_decoded_values_test.cpp | 3 +- .../new_parquet/parquet_reader_test.cpp | 4 +- be/test/format/reader/expr/cast_test.cpp | 10 +- be/test/format/reader/table_reader_test.cpp | 278 ++++++++++-------- 15 files changed, 225 insertions(+), 172 deletions(-) diff --git a/be/src/core/data_type_serde/data_type_datetimev2_serde.cpp b/be/src/core/data_type_serde/data_type_datetimev2_serde.cpp index fc2c14d1829049..ce0599080c6b2a 100644 --- a/be/src/core/data_type_serde/data_type_datetimev2_serde.cpp +++ b/be/src/core/data_type_serde/data_type_datetimev2_serde.cpp @@ -22,12 +22,13 @@ #include // IWYU pragma: keep #include + #include "common/status.h" #include "core/column/column_const.h" -#include "core/data_type_serde/decoded_column_view.h" #include "core/data_type/data_type_decimal.h" #include "core/data_type/data_type_number.h" #include "core/data_type/primitive_type.h" +#include "core/data_type_serde/decoded_column_view.h" #include "core/types.h" #include "core/value/vdatetime_value.h" #include "exprs/function/cast/cast_to_datetimev2_impl.hpp" diff --git a/be/src/core/data_type_serde/data_type_datev2_serde.cpp b/be/src/core/data_type_serde/data_type_datev2_serde.cpp index 9410df86eaa237..94b86312d61d3a 100644 --- 
a/be/src/core/data_type_serde/data_type_datev2_serde.cpp +++ b/be/src/core/data_type_serde/data_type_datev2_serde.cpp @@ -22,11 +22,12 @@ #include #include + #include "core/column/column_const.h" -#include "core/data_type_serde/decoded_column_view.h" #include "core/data_type/data_type_decimal.h" #include "core/data_type/data_type_number.h" #include "core/data_type/define_primitive_type.h" +#include "core/data_type_serde/decoded_column_view.h" #include "core/types.h" #include "core/value/vdatetime_value.h" #include "exprs/function/cast/cast_to_datev2_impl.hpp" @@ -125,8 +126,8 @@ Status DataTypeDateV2SerDe::read_column_from_arrow(IColumn& column, const arrow: return Status::OK(); } -Status DataTypeDateV2SerDe::read_column_from_decoded_values( - IColumn& column, const DecodedColumnView& view) const { +Status DataTypeDateV2SerDe::read_column_from_decoded_values(IColumn& column, + const DecodedColumnView& view) const { if (view.value_kind != DecodedValueKind::INT32) { return Status::NotSupported("DATEV2 decoded reader expects INT32 source"); } diff --git a/be/src/core/data_type_serde/data_type_decimal_serde.cpp b/be/src/core/data_type_serde/data_type_decimal_serde.cpp index c744cdb8a2f63d..9fa2e0c6ebd9cc 100644 --- a/be/src/core/data_type_serde/data_type_decimal_serde.cpp +++ b/be/src/core/data_type_serde/data_type_decimal_serde.cpp @@ -61,8 +61,8 @@ NativeType decode_big_endian_signed_integer(const uint8_t* data, int length) { } template -typename PrimitiveTypeTraits::CppType read_decimal_decoded_value( - const DecodedColumnView& view, int64_t row) { +typename PrimitiveTypeTraits::CppType read_decimal_decoded_value(const DecodedColumnView& view, + int64_t row) { using FieldType = typename PrimitiveTypeTraits::CppType; if (view.value_kind == DecodedValueKind::INT32) { const auto* values = reinterpret_cast(view.values); @@ -76,9 +76,9 @@ typename PrimitiveTypeTraits::CppType read_decimal_decoded_value( const auto length = view.value_kind == 
DecodedValueKind::FIXED_BINARY ? view.fixed_length : cast_set(value.size); - return FieldType {static_cast( - decode_big_endian_signed_integer(reinterpret_cast(value.data), - length))}; + return FieldType { + static_cast(decode_big_endian_signed_integer( + reinterpret_cast(value.data), length))}; } template @@ -441,8 +441,8 @@ Status DataTypeDecimalSerDe::read_column_from_decoded_values( return read_decimal_decoded_values(column, view); } } - return Status::NotSupported("Unsupported decoded values for {} from source kind {}", - get_name(), static_cast(view.value_kind)); + return Status::NotSupported("Unsupported decoded values for {} from source kind {}", get_name(), + static_cast(view.value_kind)); } template diff --git a/be/src/core/data_type_serde/data_type_nullable_serde.cpp b/be/src/core/data_type_serde/data_type_nullable_serde.cpp index 6b15b29c63ad67..b02c8606332b92 100644 --- a/be/src/core/data_type_serde/data_type_nullable_serde.cpp +++ b/be/src/core/data_type_serde/data_type_nullable_serde.cpp @@ -29,9 +29,9 @@ #include "core/column/column_const.h" #include "core/column/column_nullable.h" #include "core/column/column_vector.h" -#include "core/data_type_serde/decoded_column_view.h" #include "core/data_type_serde/data_type_serde.h" #include "core/data_type_serde/data_type_string_serde.h" +#include "core/data_type_serde/decoded_column_view.h" #include "exprs/function/cast/cast_base.h" #include "format/transformer/vcsv_transformer.h" #include "util/jsonb_document.h" @@ -351,8 +351,8 @@ Status DataTypeNullableSerDe::read_column_from_arrow(IColumn& column, ctz); } -Status DataTypeNullableSerDe::read_column_from_decoded_values( - IColumn& column, const DecodedColumnView& view) const { +Status DataTypeNullableSerDe::read_column_from_decoded_values(IColumn& column, + const DecodedColumnView& view) const { auto& nullable_column = assert_cast(column); auto& null_map = nullable_column.get_null_map_data(); const auto old_size = null_map.size(); diff --git 
a/be/src/core/data_type_serde/data_type_number_serde.cpp b/be/src/core/data_type_serde/data_type_number_serde.cpp index 131e6d059417f7..6cd30449083f23 100644 --- a/be/src/core/data_type_serde/data_type_number_serde.cpp +++ b/be/src/core/data_type_serde/data_type_number_serde.cpp @@ -26,8 +26,8 @@ #include "core/column/column_nullable.h" #include "core/data_type/define_primitive_type.h" #include "core/data_type/primitive_type.h" -#include "core/data_type_serde/decoded_column_view.h" #include "core/data_type_serde/data_type_serde.h" +#include "core/data_type_serde/decoded_column_view.h" #include "core/packed_int128.h" #include "core/types.h" #include "core/value/timestamptz_value.h" @@ -55,8 +55,8 @@ Status read_number_decoded_values(IColumn& column, const DecodedColumnView& view if (view.values == nullptr && view.row_count > 0) { return Status::Corruption("Decoded value buffer is null for {}", column.get_name()); } - auto& data = assert_cast::ColumnType&>(column) - .get_data(); + auto& data = + assert_cast::ColumnType&>(column).get_data(); const auto* values = decoded_values_as(view); for (int64_t row = 0; row < view.row_count; ++row) { using DorisCppType = typename PrimitiveTypeTraits::CppType; @@ -204,8 +204,8 @@ Status DataTypeNumberSerDe::read_column_from_decoded_values( return read_number_decoded_values(column, view); } } - return Status::NotSupported("Unsupported decoded values for {} from source kind {}", - get_name(), static_cast(view.value_kind)); + return Status::NotSupported("Unsupported decoded values for {} from source kind {}", get_name(), + static_cast(view.value_kind)); } template diff --git a/be/src/core/data_type_serde/data_type_string_serde.cpp b/be/src/core/data_type_serde/data_type_string_serde.cpp index 478cdf3b5e6f1a..0a9a5cd7dabc04 100644 --- a/be/src/core/data_type_serde/data_type_string_serde.cpp +++ b/be/src/core/data_type_serde/data_type_string_serde.cpp @@ -18,8 +18,8 @@ #include "core/data_type_serde/data_type_string_serde.h" #include 
"core/column/column_string.h" -#include "core/data_type_serde/decoded_column_view.h" #include "core/data_type/define_primitive_type.h" +#include "core/data_type_serde/decoded_column_view.h" #include "util/jsonb_document_cast.h" #include "util/jsonb_utils.h" #include "util/jsonb_writer.h" diff --git a/be/src/core/data_type_serde/data_type_time_serde.cpp b/be/src/core/data_type_serde/data_type_time_serde.cpp index 65e1afa577d0ed..a40a8d217c9bd4 100644 --- a/be/src/core/data_type_serde/data_type_time_serde.cpp +++ b/be/src/core/data_type_serde/data_type_time_serde.cpp @@ -17,10 +17,10 @@ #include "core/data_type_serde/data_type_time_serde.h" -#include "core/data_type_serde/decoded_column_view.h" #include "core/data_type/data_type_decimal.h" #include "core/data_type/data_type_number.h" #include "core/data_type/primitive_type.h" +#include "core/data_type_serde/decoded_column_view.h" #include "core/value/time_value.h" #include "exprs/function/cast/cast_base.h" #include "exprs/function/cast/cast_to_time_impl.hpp" @@ -44,12 +44,11 @@ TimeValue::TimeType read_time_decoded_value(const DecodedColumnView& view, int64 } const bool negative = micros < 0; const int64_t abs_micros = std::abs(micros); - return TimeValue::make_time(abs_micros / TimeValue::ONE_HOUR_MICROSECONDS, - (abs_micros % TimeValue::ONE_HOUR_MICROSECONDS) / - TimeValue::ONE_MINUTE_MICROSECONDS, - (abs_micros % TimeValue::ONE_MINUTE_MICROSECONDS) / - TimeValue::ONE_SECOND_MICROSECONDS, - abs_micros % TimeValue::ONE_SECOND_MICROSECONDS, negative); + return TimeValue::make_time( + abs_micros / TimeValue::ONE_HOUR_MICROSECONDS, + (abs_micros % TimeValue::ONE_HOUR_MICROSECONDS) / TimeValue::ONE_MINUTE_MICROSECONDS, + (abs_micros % TimeValue::ONE_MINUTE_MICROSECONDS) / TimeValue::ONE_SECOND_MICROSECONDS, + abs_micros % TimeValue::ONE_SECOND_MICROSECONDS, negative); } } // namespace @@ -173,10 +172,9 @@ Status DataTypeTimeV2SerDe::from_string_strict_mode(StringRef& str, IColumn& col return Status::OK(); } -Status 
DataTypeTimeV2SerDe::read_column_from_decoded_values( - IColumn& column, const DecodedColumnView& view) const { - if (view.value_kind != DecodedValueKind::INT32 && - view.value_kind != DecodedValueKind::INT64) { +Status DataTypeTimeV2SerDe::read_column_from_decoded_values(IColumn& column, + const DecodedColumnView& view) const { + if (view.value_kind != DecodedValueKind::INT32 && view.value_kind != DecodedValueKind::INT64) { return Status::NotSupported("TIMEV2 decoded reader expects INT32 or INT64 source"); } if (view.values == nullptr && view.row_count > 0) { diff --git a/be/src/format/new_parquet/parquet_reader.cpp b/be/src/format/new_parquet/parquet_reader.cpp index 6d0ef3eb742840..70902d936ee400 100644 --- a/be/src/format/new_parquet/parquet_reader.cpp +++ b/be/src/format/new_parquet/parquet_reader.cpp @@ -328,9 +328,9 @@ Status ParquetReader::_execute_filter_conjuncts(int64_t batch_rows, Block* file_ } IColumn::Filter filter(static_cast(batch_rows), 1); bool can_filter_all = false; - RETURN_IF_ERROR(expression_filter.conjunct->execute_filter( - file_block, filter.data(), static_cast(batch_rows), false, - &can_filter_all)); + RETURN_IF_ERROR(expression_filter.conjunct->execute_filter(file_block, filter.data(), + static_cast(batch_rows), + false, &can_filter_all)); *selected_rows = can_filter_all ? 
0 : _apply_filter_to_selection(filter, selection, *selected_rows); } diff --git a/be/src/format/reader/column_mapper.cpp b/be/src/format/reader/column_mapper.cpp index 80a81f6c76d57c..e8e7442a8d798e 100644 --- a/be/src/format/reader/column_mapper.cpp +++ b/be/src/format/reader/column_mapper.cpp @@ -249,18 +249,20 @@ Status TableColumnMapper::create_mapping(const std::vector& project ColumnMapping mapping; mapping.table_column_id = table_column.id; mapping.table_type = table_column.type; - if (const auto* file_field = _find_file_field(table_column, file_schema)) { - RETURN_IF_ERROR(_create_direct_mapping(table_column, *file_field, &mapping)); - } else if (table_column.is_partition_key && partition_values.count(table_column.name) > 0) { - // 3. Partition column, use partition value as a constant mapping. Note that partition column may also have default expression, but partition value should take precedence if it exists. + if (table_column.is_partition_key && partition_values.count(table_column.name) > 0) { + // 1. Partition column, use partition value as a constant mapping. Note that partition column may also have default expression, but partition value should take precedence if it exists. + mapping.is_constant = true; mapping.default_expr = VExprContext::create_shared(TableLiteral::create_shared( mapping.table_type, partition_values.at(table_column.name))); + } else if (const auto* file_field = _find_file_field(table_column, file_schema)) { + // 2. Table column has a matching file column, use it as a direct mapping. + RETURN_IF_ERROR(_create_direct_mapping(table_column, *file_field, &mapping)); } else if (table_column.default_expr != nullptr) { - // 4. Table column does not exist in file (column adding by schema evolution), which has a default expression, use it as a constant mapping. + // 3. Table column does not exist in file (column adding by schema evolution), which has a default expression, use it as a constant mapping. 
mapping.is_constant = true; mapping.default_expr = table_column.default_expr; } else if (table_column.name == ROW_LINEAGE_ROW_ID) { - // 5. Virtual column, use special mapping to indicate it should be materialized by table reader instead of read from file or evaluated from expression. + // 4. Virtual column, use special mapping to indicate it should be materialized by table reader instead of read from file or evaluated from expression. mapping.virtual_column_type = TableVirtualColumnType::ROW_ID; } else if (table_column.name == ROW_LINEAGE_LAST_UPDATED_SEQ_NUMBER) { mapping.virtual_column_type = TableVirtualColumnType::LAST_UPDATED_SEQUENCE_NUMBER; diff --git a/be/src/format/reader/column_mapper.h b/be/src/format/reader/column_mapper.h index bcfe71522088dd..75b53f68d2d09e 100644 --- a/be/src/format/reader/column_mapper.h +++ b/be/src/format/reader/column_mapper.h @@ -43,8 +43,7 @@ struct SchemaField; struct FileScanRequest; struct FieldProjection; -using TableColumnPredicates = - std::map>>; +using TableColumnPredicates = std::map>>; enum class TableColumnMappingMode { BY_FIELD_ID, diff --git a/be/src/format/reader/table_reader.h b/be/src/format/reader/table_reader.h index 2cf5eb30468b8a..ee252817d40a98 100644 --- a/be/src/format/reader/table_reader.h +++ b/be/src/format/reader/table_reader.h @@ -295,9 +295,20 @@ class TableReader { return Status::OK(); } if (mapping.default_expr != nullptr) { - int res_id; - RETURN_IF_ERROR(mapping.default_expr->execute(current_block, &res_id)); - *column = current_block->get_columns()[res_id]; + if (current_block->rows() == current_rows) { + int res_id; + RETURN_IF_ERROR(mapping.default_expr->execute(current_block, &res_id)); + *column = current_block->get_columns()[res_id]; + } else { + DORIS_CHECK(mapping.is_constant); + Block eval_block; + eval_block.insert( + {mapping.table_type->create_column_const_with_default_value(current_rows), + mapping.table_type, "__table_reader_const_rows"}); + int res_id; + 
RETURN_IF_ERROR(mapping.default_expr->execute(&eval_block, &res_id)); + *column = eval_block.get_columns()[res_id]; + } return Status::OK(); } *column = mapping.table_type->create_column_const_with_default_value(current_rows); diff --git a/be/test/core/data_type_serde/data_type_serde_decoded_values_test.cpp b/be/test/core/data_type_serde/data_type_serde_decoded_values_test.cpp index 10f15bb28b1c10..1622775b6a871a 100644 --- a/be/test/core/data_type_serde/data_type_serde_decoded_values_test.cpp +++ b/be/test/core/data_type_serde/data_type_serde_decoded_values_test.cpp @@ -237,7 +237,8 @@ TEST(DataTypeSerDeDecodedValuesTest, ReadNullableInt32Values) { ASSERT_TRUE(st.ok()) << st; const auto& nullable_column = assert_cast(*column); - const auto& nested_column = assert_cast(nullable_column.get_nested_column()); + const auto& nested_column = + assert_cast(nullable_column.get_nested_column()); ASSERT_EQ(nullable_column.size(), 4); EXPECT_FALSE(nullable_column.is_null_at(0)); EXPECT_TRUE(nullable_column.is_null_at(1)); diff --git a/be/test/format/new_parquet/parquet_reader_test.cpp b/be/test/format/new_parquet/parquet_reader_test.cpp index 43ec9cc0ab1c03..f393da6822c0eb 100644 --- a/be/test/format/new_parquet/parquet_reader_test.cpp +++ b/be/test/format/new_parquet/parquet_reader_test.cpp @@ -196,8 +196,8 @@ void write_int_pair_parquet_file(const std::string& file_path, int64_t row_group builder.version(::parquet::ParquetVersion::PARQUET_2_6); builder.data_page_version(::parquet::ParquetDataPageVersion::V2); builder.compression(::parquet::Compression::UNCOMPRESSED); - PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable( - *table, arrow::default_memory_pool(), out, row_group_size, builder.build())); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, + row_group_size, builder.build())); } Block build_file_block(const std::vector& schema) { diff --git a/be/test/format/reader/expr/cast_test.cpp 
b/be/test/format/reader/expr/cast_test.cpp index cab4e6c5b0db20..a236d327a1f2c4 100644 --- a/be/test/format/reader/expr/cast_test.cpp +++ b/be/test/format/reader/expr/cast_test.cpp @@ -72,7 +72,8 @@ class Int64ChildGreaterThanExpr final : public VExpr { Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, size_t count, ColumnPtr& result_column) const override { ColumnPtr child_column; - RETURN_IF_ERROR(get_child(0)->execute_column(context, block, selector, count, child_column)); + RETURN_IF_ERROR( + get_child(0)->execute_column(context, block, selector, count, child_column)); const auto& input = assert_cast(*child_column); auto result = ColumnUInt8::create(); auto& result_data = result->get_data(); @@ -261,8 +262,8 @@ TEST_F(CastTest, ColumnMapperBuildsCastFilterForTypeMismatch) { table_filter.slot_ids = {7}; reader::FileScanRequest file_request; - ASSERT_TRUE(mapper.create_scan_request({table_filter}, {}, projected_columns, &file_request) - .ok()); + ASSERT_TRUE( + mapper.create_scan_request({table_filter}, {}, projected_columns, &file_request).ok()); ASSERT_EQ(file_request.expression_filters.size(), 1); ASSERT_EQ(file_request.predicate_columns, std::vector({0})); const auto& localized_expr = file_request.expression_filters[0].conjunct->root(); @@ -285,8 +286,7 @@ TEST_F(CastTest, ColumnMapperBuildsCastFilterForTypeMismatch) { ASSERT_TRUE(status.ok()) << status; IColumn::Filter filter(block.rows(), 1); bool can_filter_all = false; - status = conjunct->execute_filter(&block, filter.data(), block.rows(), false, - &can_filter_all); + status = conjunct->execute_filter(&block, filter.data(), block.rows(), false, &can_filter_all); ASSERT_TRUE(status.ok()) << status; EXPECT_FALSE(can_filter_all); ASSERT_EQ(filter.size(), 2); diff --git a/be/test/format/reader/table_reader_test.cpp b/be/test/format/reader/table_reader_test.cpp index 3d132244122ff7..f770fddb7238b3 100644 --- a/be/test/format/reader/table_reader_test.cpp +++ 
b/be/test/format/reader/table_reader_test.cpp @@ -56,9 +56,8 @@ class TableInt32GreaterThanExpr final : public VExpr { Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, size_t count, ColumnPtr& result_column) const override { const auto* slot_ref = assert_cast(get_child(0).get()); - const auto& input = - assert_cast( - *block->get_by_position(slot_ref->column_id()).column); + const auto& input = assert_cast( + *block->get_by_position(slot_ref->column_id()).column); auto result = ColumnUInt8::create(); auto& result_data = result->get_data(); result_data.resize(count); @@ -184,8 +183,7 @@ void write_parquet_file(const std::string& file_path, int32_t id, const std::str arrow::field("id", arrow::int32(), false), arrow::field("value", arrow::utf8(), false), }); - auto table = - arrow::Table::Make(schema, {build_int32_array({id}), build_string_array({value})}); + auto table = arrow::Table::Make(schema, {build_int32_array({id}), build_string_array({value})}); auto file_result = arrow::io::FileOutputStream::Open(file_path); ASSERT_TRUE(file_result.ok()) << file_result.status(); @@ -195,8 +193,8 @@ void write_parquet_file(const std::string& file_path, int32_t id, const std::str builder.version(::parquet::ParquetVersion::PARQUET_2_6); builder.data_page_version(::parquet::ParquetDataPageVersion::V2); builder.compression(::parquet::Compression::UNCOMPRESSED); - PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable( - *table, arrow::default_memory_pool(), out, 1, builder.build())); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, 1, + builder.build())); } void write_int_pair_parquet_file(const std::string& file_path, const std::vector& ids, @@ -221,8 +219,8 @@ void write_int_pair_parquet_file(const std::string& file_path, const std::vector builder.compression(::parquet::Compression::UNCOMPRESSED); const auto write_row_group_size = row_group_size > 0 ? 
row_group_size : static_cast(ids.size()); - PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable( - *table, arrow::default_memory_pool(), out, write_row_group_size, builder.build())); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, + write_row_group_size, builder.build())); } Block build_table_block(const std::vector& columns) { @@ -269,20 +267,19 @@ TEST(TableReaderTest, ReopenSplitAfterClose) { RuntimeState state {TQueryOptions(), TQueryGlobals()}; TableReader reader; - ASSERT_TRUE(reader - .init({ - .projected_columns = projected_columns, - .column_predicates = {}, - .conjuncts = VExprContext( - std::make_shared(0, 0, 0)), - .format = FileFormat::PARQUET, - .scan_params = nullptr, - .io_ctx = nullptr, - .runtime_state = &state, - .scanner_profile = nullptr, - .allow_missing_columns = true, - .profile = nullptr, - }) + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext( + std::make_shared(0, 0, 0)), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) .ok()); // Simulate the scanner lifecycle for three different splits: @@ -335,20 +332,19 @@ TEST(TableReaderTest, OpenReaderBuildsTableFiltersFromConjuncts) { RuntimeState state {TQueryOptions(), TQueryGlobals()}; TableReader reader; - ASSERT_TRUE(reader - .init({ - .projected_columns = projected_columns, - .column_predicates = {}, - .conjuncts = VExprContext( - std::make_shared(0, 0, 2)), - .format = FileFormat::PARQUET, - .scan_params = nullptr, - .io_ctx = nullptr, - .runtime_state = &state, - .scanner_profile = nullptr, - .allow_missing_columns = true, - .profile = nullptr, - }) + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext( + std::make_shared(0, 0, 2)), + .format = 
FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) .ok()); ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); @@ -417,19 +413,18 @@ TEST(TableReaderTest, OpenReaderBuildsColumnPredicateFilters) { RuntimeState state {TQueryOptions(), TQueryGlobals()}; TableReader reader; - ASSERT_TRUE(reader - .init({ - .projected_columns = projected_columns, - .column_predicates = std::move(column_predicates), - .conjuncts = VExprContext(nullptr), - .format = FileFormat::PARQUET, - .scan_params = nullptr, - .io_ctx = nullptr, - .runtime_state = &state, - .scanner_profile = nullptr, - .allow_missing_columns = true, - .profile = nullptr, - }) + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = std::move(column_predicates), + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) .ok()); ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); @@ -509,8 +504,8 @@ TEST(TableReaderTest, CreateScanRequestDeduplicatesSharedPredicateColumns) { }); FileScanRequest file_request; - ASSERT_TRUE(mapper.create_scan_request(table_filters, {}, projected_columns, &file_request) - .ok()); + ASSERT_TRUE( + mapper.create_scan_request(table_filters, {}, projected_columns, &file_request).ok()); // Both filters reference column a. It must still be read once as a predicate column, and a // predicate column must not be repeated as a non-predicate column. 
@@ -544,21 +539,20 @@ TEST(TableReaderTest, OpenReaderPushesMultiColumnConjunctToParquetReader) { RuntimeState state {TQueryOptions(), TQueryGlobals()}; TableReader reader; - ASSERT_TRUE(reader - .init({ - .projected_columns = projected_columns, - .column_predicates = {}, - .conjuncts = VExprContext( - std::make_shared( - 0, 0, 1, 1, 8)), - .format = FileFormat::PARQUET, - .scan_params = nullptr, - .io_ctx = nullptr, - .runtime_state = &state, - .scanner_profile = nullptr, - .allow_missing_columns = true, - .profile = nullptr, - }) + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext( + std::make_shared(0, 0, 1, + 1, 8)), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) .ok()); ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); @@ -600,19 +594,18 @@ TEST(TableReaderTest, ProjectedColumnsFillDefaultForParquetSchemaMismatch) { RuntimeState state {TQueryOptions(), TQueryGlobals()}; TableReader reader; - ASSERT_TRUE(reader - .init({ - .projected_columns = projected_columns, - .column_predicates = {}, - .conjuncts = VExprContext(nullptr), - .format = FileFormat::PARQUET, - .scan_params = nullptr, - .io_ctx = nullptr, - .runtime_state = &state, - .scanner_profile = nullptr, - .allow_missing_columns = true, - .profile = nullptr, - }) + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) .ok()); ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); @@ -645,19 +638,18 @@ TEST(TableReaderTest, 
ProjectedColumnsRejectParquetSchemaMismatchWhenMissingColu RuntimeState state {TQueryOptions(), TQueryGlobals()}; TableReader reader; - ASSERT_TRUE(reader - .init({ - .projected_columns = projected_columns, - .column_predicates = {}, - .conjuncts = VExprContext(nullptr), - .format = FileFormat::PARQUET, - .scan_params = nullptr, - .io_ctx = nullptr, - .runtime_state = &state, - .scanner_profile = nullptr, - .allow_missing_columns = false, - .profile = nullptr, - }) + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = false, + .profile = nullptr, + }) .ok()); ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); @@ -674,6 +666,56 @@ TEST(TableReaderTest, ProjectedColumnsRejectParquetSchemaMismatchWhenMissingColu std::filesystem::remove_all(test_dir); } +TEST(TableReaderTest, ProjectedPartitionColumnUsesSplitPartitionValue) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_partition_value_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_parquet_file(file_path, 1, "one"); + + std::vector projected_columns; + auto partition_column = make_table_column(1, "value", std::make_shared()); + partition_column.is_partition_key = true; + projected_columns.push_back(std::move(partition_column)); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + 
.allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.partition_values.emplace("value", Field::create_field("p1")); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + // The file has a physical column with the same id/name. The split partition value should still + // take precedence and be materialized by TableReader. + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + + const auto& partition_value = + assert_cast(*block.get_by_position(0).column); + ASSERT_EQ(partition_value.size(), 1); + EXPECT_EQ(partition_value.get_data_at(0).to_string(), "p1"); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + TEST(TableReaderTest, ProjectedColumnsUseMapperExpressionForSameNameDifferentIdParquetSchema) { const auto test_dir = std::filesystem::temp_directory_path() / "doris_table_reader_same_name_diff_id_test"; @@ -688,19 +730,18 @@ TEST(TableReaderTest, ProjectedColumnsUseMapperExpressionForSameNameDifferentIdP RuntimeState state {TQueryOptions(), TQueryGlobals()}; TableReader reader; - ASSERT_TRUE(reader - .init({ - .projected_columns = projected_columns, - .column_predicates = {}, - .conjuncts = VExprContext(nullptr), - .format = FileFormat::PARQUET, - .scan_params = nullptr, - .io_ctx = nullptr, - .runtime_state = &state, - .scanner_profile = nullptr, - .allow_missing_columns = true, - .profile = nullptr, - }) + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) .ok()); ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); @@ -738,19 +779,18 @@ 
TEST(TableReaderTest, ProjectedColumnsUseMapperExpressionsForParquetSchemaMismat RuntimeState state {TQueryOptions(), TQueryGlobals()}; TableReader reader; - ASSERT_TRUE(reader - .init({ - .projected_columns = projected_columns, - .column_predicates = {}, - .conjuncts = VExprContext(nullptr), - .format = FileFormat::PARQUET, - .scan_params = nullptr, - .io_ctx = nullptr, - .runtime_state = &state, - .scanner_profile = nullptr, - .allow_missing_columns = true, - .profile = nullptr, - }) + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) .ok()); ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); From 321134d931e1a9e00bfa535a51e34c8b76fb7d7d Mon Sep 17 00:00:00 2001 From: Gabriel Date: Thu, 28 May 2026 13:45:11 +0800 Subject: [PATCH 23/38] Materialize Iceberg row lineage virtual columns (#63787) 1. ParquetReader reads a range of a parquet file 2. ParquetReader supports virtual column reader (RowPosition) 3. 
IcebergReader supports virtual columns --- be/src/format/new_parquet/column_reader.cpp | 62 ++++ be/src/format/new_parquet/column_reader.h | 9 + be/src/format/new_parquet/parquet_reader.cpp | 98 ++++++- be/src/format/new_parquet/parquet_reader.h | 6 + be/src/format/reader/table_reader.cpp | 2 + be/src/format/reader/table_reader.h | 5 + be/src/format/table/iceberg_reader_v2.h | 138 +++++++-- be/src/io/file_factory.h | 2 + .../new_parquet/parquet_reader_test.cpp | 166 ++++++++++- be/test/format/reader/table_reader_test.cpp | 271 ++++++++++++++++++ 10 files changed, 730 insertions(+), 29 deletions(-) diff --git a/be/src/format/new_parquet/column_reader.cpp b/be/src/format/new_parquet/column_reader.cpp index 9952016832c788..c427b38a97041f 100644 --- a/be/src/format/new_parquet/column_reader.cpp +++ b/be/src/format/new_parquet/column_reader.cpp @@ -33,8 +33,10 @@ #include "core/column/column.h" #include "core/column/column_array.h" #include "core/column/column_struct.h" +#include "core/column/column_vector.h" #include "core/data_type/data_type_array.h" #include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_number.h" #include "core/data_type/data_type_struct.h" #include "core/data_type_serde/decoded_column_view.h" #include "format/new_parquet/parquet_column_schema.h" @@ -130,6 +132,52 @@ class ListColumnReader final : public ParquetColumnReader { std::unique_ptr _element_reader; }; +class RowPositionColumnReader final : public ParquetColumnReader { +public: + explicit RowPositionColumnReader(int64_t row_group_first_row) + : _row_group_first_row(row_group_first_row) {} + + int file_column_id() const override { + return ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID; + } + int parquet_leaf_column_id() const override { return -1; } + const DataTypePtr& type() const override { return _type; } + const std::string& name() const override { return _name; } + + Status read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) override { + if 
(column.get() == nullptr || rows_read == nullptr) { + return Status::InvalidArgument("Invalid parquet row position read result pointer"); + } + if (rows < 0) { + return Status::InvalidArgument("Invalid parquet row position read rows {}", rows); + } + auto* vector_column = assert_cast(column.get()); + auto& data = vector_column->get_data(); + const auto old_size = data.size(); + data.resize(old_size + rows); + for (int64_t row = 0; row < rows; ++row) { + data[old_size + row] = _row_group_first_row + _next_row_position + row; + } + _next_row_position += rows; + *rows_read = rows; + return Status::OK(); + } + + Status skip(int64_t rows) override { + if (rows <= 0) { + return Status::OK(); + } + _next_row_position += rows; + return Status::OK(); + } + +private: + int64_t _row_group_first_row = 0; + int64_t _next_row_position = 0; + DataTypePtr _type = std::make_shared(); + std::string _name = ParquetColumnReaderFactory::ROW_POSITION_COLUMN_NAME; +}; + Status read_records(ScalarColumnReader& column_reader, int64_t batch_rows, ::parquet::internal::RecordReader** record_reader, int64_t* rows_read) { auto reader = column_reader.record_reader(); @@ -558,6 +606,20 @@ ParquetColumnReaderFactory::ParquetColumnReaderFactory( : _row_group(std::move(row_group)), _record_readers(static_cast(num_leaf_columns)) {} +reader::SchemaField ParquetColumnReaderFactory::row_position_schema_field() { + reader::SchemaField field; + field.id = ROW_POSITION_COLUMN_ID; + field.name = ROW_POSITION_COLUMN_NAME; + field.type = std::make_shared(); + field.column_type = reader::ColumnType::ROW_NUMBER; + return field; +} + +std::unique_ptr ParquetColumnReaderFactory::create_row_position_column_reader( + int64_t row_group_first_row) const { + return std::make_unique(row_group_first_row); +} + Status ParquetColumnReaderFactory::create_scalar_reader( int parquet_leaf_column_id, const ParquetTypeDescriptor& type_descriptor, const ::parquet::ColumnDescriptor* descriptor, DataTypePtr type, std::string name, 
diff --git a/be/src/format/new_parquet/column_reader.h b/be/src/format/new_parquet/column_reader.h index ec691a9743e4ed..80f7060fa31605 100644 --- a/be/src/format/new_parquet/column_reader.h +++ b/be/src/format/new_parquet/column_reader.h @@ -41,6 +41,7 @@ class IColumn; namespace reader { struct FieldProjection; +struct SchemaField; } // namespace reader namespace parquet { @@ -89,6 +90,11 @@ class ParquetColumnReaderFactory { ParquetColumnReaderFactory(std::shared_ptr<::parquet::RowGroupReader> row_group, int num_leaf_columns); + static constexpr int ROW_POSITION_COLUMN_ID = -10001; + static constexpr const char* ROW_POSITION_COLUMN_NAME = "__parquet_row_position"; + + static reader::SchemaField row_position_schema_field(); + // 根据 file-local schema tree 创建 column reader。复杂类型会在这里递归创建 // children。该入口只理解 Parquet file schema,不处理 table/global schema。 Status create(const ParquetColumnSchema& column_schema, @@ -100,6 +106,9 @@ class ParquetColumnReaderFactory { return create(column_schema, nullptr, reader); } + std::unique_ptr create_row_position_column_reader( + int64_t row_group_first_row) const; + private: Status create_scalar_column_reader(const ParquetColumnSchema& column_schema, std::unique_ptr* reader) const; diff --git a/be/src/format/new_parquet/parquet_reader.cpp b/be/src/format/new_parquet/parquet_reader.cpp index 70902d936ee400..043f155dd8588f 100644 --- a/be/src/format/new_parquet/parquet_reader.cpp +++ b/be/src/format/new_parquet/parquet_reader.cpp @@ -159,12 +159,16 @@ struct ParquetReaderScanState { std::vector predicate_fields; std::vector non_predicate_fields; std::vector selected_row_groups; + // We need this to quickly determine the first row of each row group, which is needed for position delete and page index. 
+ // TODO: this may be parsed by multiple ParquetReader with the same file but different scan ranges, so we should cache it + std::vector row_group_first_rows; size_t next_row_group_idx = 0; std::shared_ptr<::parquet::RowGroupReader> current_row_group; std::vector> current_predicate_columns; std::vector> current_non_predicate_columns; int64_t current_row_group_rows = 0; int64_t current_row_group_rows_read = 0; + int64_t current_row_group_first_row = 0; }; Status ParquetReader::_reset_reader_position() { @@ -174,6 +178,7 @@ Status ParquetReader::_reset_reader_position() { _state->current_non_predicate_columns.clear(); _state->current_row_group_rows = 0; _state->current_row_group_rows_read = 0; + _state->current_row_group_first_row = 0; return Status::OK(); } @@ -183,6 +188,7 @@ void ParquetReader::_reset_current_row_group() { _state->current_non_predicate_columns.clear(); _state->current_row_group_rows = 0; _state->current_row_group_rows_read = 0; + _state->current_row_group_first_row = 0; } void ParquetReader::_fill_schema_field(const ParquetColumnSchema& column_schema, @@ -383,15 +389,24 @@ Status ParquetReader::_open_next_row_group(bool* has_row_group) { _reset_current_row_group(); continue; } + DORIS_CHECK(row_group_idx >= 0 && + row_group_idx < static_cast(_state->row_group_first_rows.size())); + _state->current_row_group_first_row = _state->row_group_first_rows[row_group_idx]; _state->current_row_group_rows_read = 0; _state->current_predicate_columns.clear(); _state->current_non_predicate_columns.clear(); ParquetColumnReaderFactory column_reader_factory(_state->current_row_group, _state->schema->num_columns()); - for (const auto file_field_id : _request->predicate_columns) { - const auto& column_schema = _state->file_schema[file_field_id]; - const auto projection_it = _request->complex_projections.find(file_field_id); + for (const auto file_column_id : _request->predicate_columns) { + if (file_column_id == ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID) { 
+ _state->current_predicate_columns.push_back( + column_reader_factory.create_row_position_column_reader( + _state->current_row_group_first_row)); + continue; + } + const auto& column_schema = _state->file_schema[file_column_id]; + const auto projection_it = _request->complex_projections.find(file_column_id); const auto* projection = projection_it == _request->complex_projections.end() ? nullptr : &projection_it->second; @@ -400,9 +415,15 @@ Status ParquetReader::_open_next_row_group(bool* has_row_group) { column_reader_factory.create(*column_schema, projection, &column_reader)); _state->current_predicate_columns.push_back(std::move(column_reader)); } - for (const auto file_field_id : _request->non_predicate_columns) { - const auto& column_schema = _state->file_schema[file_field_id]; - const auto projection_it = _request->complex_projections.find(file_field_id); + for (const auto file_column_id : _request->non_predicate_columns) { + if (file_column_id == ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID) { + _state->current_non_predicate_columns.push_back( + column_reader_factory.create_row_position_column_reader( + _state->current_row_group_first_row)); + continue; + } + const auto& column_schema = _state->file_schema[file_column_id]; + const auto projection_it = _request->complex_projections.find(file_column_id); const auto* projection = projection_it == _request->complex_projections.end() ? nullptr : &projection_it->second; @@ -479,6 +500,48 @@ Status ParquetReader::_read_current_row_group_batch(int64_t batch_rows, Block* f return Status::OK(); } +int64_t ParquetReader::_column_start_offset( + const ::parquet::ColumnChunkMetaData& column_metadata) const { + return column_metadata.has_dictionary_page() + ? 
cast_set(column_metadata.dictionary_page_offset()) + : cast_set(column_metadata.data_page_offset()); +} + +bool ParquetReader::_is_row_group_outside_range(int row_group_idx) const { + DORIS_CHECK(_file_description != nullptr); + // This parquet file is not split + if (_file_description->range_size < 0) { + return false; + } + const int64_t range_start_offset = _file_description->range_start_offset; + const int64_t range_end_offset = range_start_offset + _file_description->range_size; + DORIS_CHECK(range_start_offset >= 0); + DORIS_CHECK(range_end_offset >= range_start_offset); + // read whole parquet file if the range covers the whole file, which is a common case when parquet files are not splittable. + if (range_start_offset == 0 && + (_file_description->file_size < 0 || range_end_offset >= _file_description->file_size)) { + return false; + } + + auto row_group_metadata = _state->metadata->RowGroup(row_group_idx); + DORIS_CHECK(row_group_metadata != nullptr); + DORIS_CHECK(row_group_metadata->num_columns() > 0); + const auto first_column = row_group_metadata->ColumnChunk(0); + const auto last_column = row_group_metadata->ColumnChunk(row_group_metadata->num_columns() - 1); + DORIS_CHECK(first_column != nullptr); + DORIS_CHECK(last_column != nullptr); + const int64_t row_group_start_offset = _column_start_offset(*first_column); + const int64_t row_group_end_offset = + _column_start_offset(*last_column) + last_column->total_compressed_size(); + // A scan range is a byte split, while Parquet is read by row group. If a row group crosses + // split boundaries, using overlap would let adjacent ranges read the same row group. Keep the + // same ownership rule as the legacy vparquet reader: the range containing the row group's + // midpoint owns the whole row group. 
+ const int64_t row_group_mid_offset = + row_group_start_offset + (row_group_end_offset - row_group_start_offset) / 2; + return row_group_mid_offset < range_start_offset || row_group_mid_offset >= range_end_offset; +} + ParquetReader::ParquetReader(std::shared_ptr& system_properties, std::unique_ptr& file_description, std::shared_ptr io_ctx, RuntimeProfile* profile) @@ -548,10 +611,16 @@ Status ParquetReader::open(std::unique_ptr& request) { const int num_fields = static_cast(_state->file_schema.size()); for (const auto file_column_id : _request->predicate_columns) { DORIS_CHECK(_request->column_positions.count(file_column_id) > 0); + if (file_column_id == ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID) { + continue; + } DORIS_CHECK(file_column_id >= 0 && file_column_id < num_fields); } for (const auto file_column_id : _request->non_predicate_columns) { DORIS_CHECK(_request->column_positions.count(file_column_id) > 0); + if (file_column_id == ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID) { + continue; + } DORIS_CHECK(file_column_id >= 0 && file_column_id < num_fields); } for (const auto& column_filter : _request->column_predicate_filters) { @@ -579,6 +648,23 @@ Status ParquetReader::open(std::unique_ptr& request) { } RETURN_IF_ERROR(select_row_groups_by_statistics(*_state->metadata, _state->file_schema, *_request, &_state->selected_row_groups)); + std::vector range_selected_row_groups; + range_selected_row_groups.reserve(_state->selected_row_groups.size()); + for (const auto row_group_idx : _state->selected_row_groups) { + if (!_is_row_group_outside_range(row_group_idx)) { + range_selected_row_groups.push_back(row_group_idx); + } + } + _state->selected_row_groups = std::move(range_selected_row_groups); + _state->row_group_first_rows.resize(_state->metadata->num_row_groups()); + int64_t next_row_group_first_row = 0; + for (int row_group_idx = 0; row_group_idx < _state->metadata->num_row_groups(); + ++row_group_idx) { + 
_state->row_group_first_rows[row_group_idx] = next_row_group_first_row; + auto row_group_metadata = _state->metadata->RowGroup(row_group_idx); + DORIS_CHECK(row_group_metadata != nullptr); + next_row_group_first_row += row_group_metadata->num_rows(); + } RETURN_IF_ERROR(_reset_reader_position()); _eof = _state->selected_row_groups.empty(); return Status::OK(); diff --git a/be/src/format/new_parquet/parquet_reader.h b/be/src/format/new_parquet/parquet_reader.h index aa5cbfb5fcd450..14a891c75e1dcf 100644 --- a/be/src/format/new_parquet/parquet_reader.h +++ b/be/src/format/new_parquet/parquet_reader.h @@ -31,6 +31,10 @@ struct IOContext; } // namespace io } // namespace doris +namespace parquet { +class ColumnChunkMetaData; +} // namespace parquet + namespace doris::parquet { struct ParquetReaderScanState; @@ -137,6 +141,8 @@ class ParquetReader : public reader::FileReader { uint16_t selected_rows); Status _open_next_row_group(bool* has_row_group); Status _read_current_row_group_batch(int64_t batch_rows, Block* file_block, size_t* rows); + bool _is_row_group_outside_range(int row_group_idx) const; + int64_t _column_start_offset(const ::parquet::ColumnChunkMetaData& column_metadata) const; std::unique_ptr _state; ParquetProfile _parquet_profile; diff --git a/be/src/format/reader/table_reader.cpp b/be/src/format/reader/table_reader.cpp index 58de83785892fd..86868b97b0bba3 100644 --- a/be/src/format/reader/table_reader.cpp +++ b/be/src/format/reader/table_reader.cpp @@ -154,6 +154,8 @@ std::unique_ptr create_file_description(const TFileRangeDes auto file_description = std::make_unique(); file_description->path = range.path; file_description->file_size = range.__isset.file_size ? range.file_size : -1; + file_description->range_start_offset = range.__isset.start_offset ? range.start_offset : 0; + file_description->range_size = range.__isset.size ? 
range.size : -1; if (range.__isset.fs_name) { file_description->fs_name = range.fs_name; } diff --git a/be/src/format/reader/table_reader.h b/be/src/format/reader/table_reader.h index ee252817d40a98..5441995e18c35e 100644 --- a/be/src/format/reader/table_reader.h +++ b/be/src/format/reader/table_reader.h @@ -230,6 +230,7 @@ class TableReader { auto file_request = std::make_unique(); RETURN_IF_ERROR(_data_reader.column_mapper.create_scan_request( _table_filters, _table_column_predicates, _projected_columns, file_request.get())); + RETURN_IF_ERROR(customize_file_scan_request(file_request.get())); RETURN_IF_ERROR(_open_local_filter_exprs(*file_request)); _data_reader.scan_schema.clear(); _data_reader.block_template.clear(); @@ -259,6 +260,10 @@ class TableReader { Status _build_table_filters_from_conjuncts(); Status _open_local_filter_exprs(const FileScanRequest& file_request); + virtual Status customize_file_scan_request(FileScanRequest* file_request) { + return Status::OK(); + } + // 关闭当前具体 reader。 // 该 hook 会被 create_next_reader 和 close 调用;实现应保持幂等。 virtual Status close_current_reader() { diff --git a/be/src/format/table/iceberg_reader_v2.h b/be/src/format/table/iceberg_reader_v2.h index fc957eda12448e..6c6f4416717eb5 100644 --- a/be/src/format/table/iceberg_reader_v2.h +++ b/be/src/format/table/iceberg_reader_v2.h @@ -25,8 +25,17 @@ #include #include "common/status.h" +#include "core/assert_cast.h" +#include "core/block/block.h" +#include "core/column/column_const.h" +#include "core/column/column_nullable.h" +#include "core/column/column_vector.h" +#include "core/data_type/define_primitive_type.h" +#include "core/field.h" +#include "format/new_parquet/column_reader.h" #include "format/reader/file_reader.h" #include "format/reader/table_reader.h" +#include "gen_cpp/PlanNodes_types.h" namespace doris { class Block; @@ -34,27 +43,6 @@ class Block; namespace doris::iceberg { -// Iceberg data file 摘要。它描述当前要读取的物理 data file,不承载列映射逻辑。 -struct IcebergDataFile final : public 
reader::BaseDataFile { - int64_t sequence_number = 0; - int64_t first_row_id = -1; -}; - -// Iceberg delete file 摘要。position/equality/deletion vector 的具体读取在 -// IcebergTableReader 实现阶段补齐。 -struct IcebergDeleteFile final : public reader::BaseDataFile { - int64_t sequence_number = 0; - std::vector equality_field_ids; -}; - -// 单个 Iceberg data file 的 scan 输入。 -// 该结构只进入 IcebergTableReader,不直接传给 ParquetReader。 -struct IcebergScanTask final : public reader::ScanTask { - std::vector positional_deletes; - std::vector equality_deletes; - std::vector deletion_vectors; -}; - // Iceberg table-level reader。 // 该层继承 TableReader,复用多文件编排和动态分区裁剪等通用能力;同时组合 // FileReader 完成 data file 物理读取,不继承具体文件格式 reader。 @@ -62,6 +50,22 @@ class IcebergTableReader : public reader::TableReader { public: ~IcebergTableReader() override = default; + Status prepare_split(const reader::SplitReadOptions& options) override { + _row_lineage_columns = {}; + if (options.current_range.__isset.table_format_params && + options.current_range.table_format_params.__isset.iceberg_params) { + const auto& iceberg_params = options.current_range.table_format_params.iceberg_params; + if (iceberg_params.__isset.first_row_id) { + _row_lineage_columns.first_row_id = iceberg_params.first_row_id; + } + if (iceberg_params.__isset.last_updated_sequence_number) { + _row_lineage_columns.last_updated_sequence_number = + iceberg_params.last_updated_sequence_number; + } + } + return TableReader::prepare_split(options); + } + protected: // 将 file-local block 转换为 table/global schema block。 // 这里执行 ColumnMapping 中的 finalize_expr、缺失列填充、partition/generated 列 @@ -76,7 +80,21 @@ class IcebergTableReader : public reader::TableReader { // 物化 Iceberg 虚拟列。 // 例如 _row_id、_last_updated_sequence_number 等,它们不来自 Parquet 文件物理列。 Status materialize_virtual_columns(Block* table_block) override { - // 真实实现会物化 _row_id、_last_updated_sequence_number 等 Iceberg 虚拟列。 + for (size_t column_idx = 0; column_idx < _data_reader.column_mapper.mappings().size(); + 
++column_idx) { + const auto& mapping = _data_reader.column_mapper.mappings()[column_idx]; + switch (mapping.virtual_column_type) { + case reader::TableVirtualColumnType::ROW_ID: + RETURN_IF_ERROR(_materialize_row_lineage_row_id(table_block, column_idx)); + break; + case reader::TableVirtualColumnType::LAST_UPDATED_SEQUENCE_NUMBER: + RETURN_IF_ERROR(_materialize_row_lineage_last_updated_sequence_number(table_block, + column_idx)); + break; + case reader::TableVirtualColumnType::INVALID: + break; + } + } return Status::OK(); } @@ -88,12 +106,88 @@ class IcebergTableReader : public reader::TableReader { return Status::OK(); } + Status customize_file_scan_request(reader::FileScanRequest* file_request) override { + if (_row_lineage_columns.first_row_id < 0 || !_need_row_lineage_row_id()) { + return Status::OK(); + } + DORIS_CHECK(file_request != nullptr); + const auto row_position_column_id = + doris::parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID; + if (file_request->column_positions.count(row_position_column_id) > 0) { + return Status::OK(); + } + _row_position_block_position = file_request->column_positions.size(); + file_request->non_predicate_columns.push_back(row_position_column_id); + file_request->column_positions.emplace(row_position_column_id, + _row_position_block_position); + _data_reader.block_schema.push_back( + doris::parquet::ParquetColumnReaderFactory::row_position_schema_field()); + return Status::OK(); + } + // 在 table block 上应用 equality delete。 // equality delete 依赖 table-level 列语义,因此不能下沉到 ParquetReader。 Status apply_equality_deletes(Block* block) { // 真实实现会在 table block 上应用 equality delete。 return Status::OK(); } + +private: + struct RowLineageColumns { + int64_t first_row_id = -1; + int64_t last_updated_sequence_number = -1; + }; + + Status _materialize_row_lineage_row_id(Block* table_block, size_t column_idx) { + if (_row_lineage_columns.first_row_id < 0) { + return Status::OK(); + } + DORIS_CHECK(_row_position_block_position < 
_data_reader.block_template.columns()); + const auto& row_position_column = assert_cast( + *_data_reader.block_template.get_by_position(_row_position_block_position).column); + DORIS_CHECK(row_position_column.size() == table_block->rows()); + auto column = table_block->get_by_position(column_idx) + .column->convert_to_full_column_if_const() + ->assume_mutable(); + auto* nullable_column = assert_cast(column.get()); + auto& null_map = nullable_column->get_null_map_data(); + auto& data = + assert_cast(*nullable_column->get_nested_column_ptr()).get_data(); + null_map.resize(row_position_column.size()); + std::fill(null_map.begin(), null_map.end(), 0); + data.resize(row_position_column.size()); + for (size_t row = 0; row < row_position_column.size(); ++row) { + data[row] = _row_lineage_columns.first_row_id + row_position_column.get_element(row); + } + table_block->replace_by_position(column_idx, std::move(column)); + return Status::OK(); + } + + Status _materialize_row_lineage_last_updated_sequence_number(Block* table_block, + size_t column_idx) { + if (_row_lineage_columns.last_updated_sequence_number < 0) { + return Status::OK(); + } + const auto rows = table_block->rows(); + auto data_column = table_block->get_by_position(column_idx).type->create_column(); + data_column->insert(Field::create_field( + _row_lineage_columns.last_updated_sequence_number)); + auto column = ColumnConst::create(std::move(data_column), rows); + table_block->replace_by_position(column_idx, std::move(column)); + return Status::OK(); + } + + RowLineageColumns _row_lineage_columns; + size_t _row_position_block_position = 0; + + bool _need_row_lineage_row_id() const { + for (const auto& mapping : _data_reader.column_mapper.mappings()) { + if (mapping.virtual_column_type == reader::TableVirtualColumnType::ROW_ID) { + return true; + } + } + return false; + } }; } // namespace doris::iceberg diff --git a/be/src/io/file_factory.h b/be/src/io/file_factory.h index a32c8077c48e03..33595313b921b1 100644 
--- a/be/src/io/file_factory.h +++ b/be/src/io/file_factory.h @@ -65,6 +65,8 @@ struct FileDescription { // -1 means unset. // If the file length is not set, the file length will be fetched from the file system. int64_t file_size = -1; + int64_t range_start_offset = 0; + int64_t range_size = -1; // modification time of this file. // 0 means unset. int64_t mtime = 0; diff --git a/be/test/format/new_parquet/parquet_reader_test.cpp b/be/test/format/new_parquet/parquet_reader_test.cpp index f393da6822c0eb..00938482d6c3c0 100644 --- a/be/test/format/new_parquet/parquet_reader_test.cpp +++ b/be/test/format/new_parquet/parquet_reader_test.cpp @@ -40,6 +40,7 @@ #include "core/field.h" #include "exprs/vexpr.h" #include "exprs/vexpr_context.h" +#include "format/new_parquet/column_reader.h" #include "format/reader/column_mapper.h" #include "format/reader/file_reader.h" #include "format/reader/table_reader.h" @@ -208,6 +209,35 @@ Block build_file_block(const std::vector& schema) { return block; } +Block build_file_block_with_row_position(const std::vector& schema) { + auto block = build_file_block(schema); + const auto row_position_field = + parquet::ParquetColumnReaderFactory::row_position_schema_field(); + block.insert({row_position_field.type->create_column(), row_position_field.type, + row_position_field.name}); + return block; +} + +int64_t parquet_column_start_offset(const ::parquet::ColumnChunkMetaData& column_metadata) { + return column_metadata.has_dictionary_page() + ? 
static_cast(column_metadata.dictionary_page_offset()) + : static_cast(column_metadata.data_page_offset()); +} + +std::pair row_group_mid_range(const std::string& file_path, int row_group_idx) { + auto reader = ::parquet::ParquetFileReader::OpenFile(file_path, false); + auto metadata = reader->metadata(); + auto row_group_metadata = metadata->RowGroup(row_group_idx); + auto first_column = row_group_metadata->ColumnChunk(0); + auto last_column = row_group_metadata->ColumnChunk(row_group_metadata->num_columns() - 1); + const int64_t row_group_start_offset = parquet_column_start_offset(*first_column); + const int64_t row_group_end_offset = + parquet_column_start_offset(*last_column) + last_column->total_compressed_size(); + const int64_t row_group_mid_offset = + row_group_start_offset + (row_group_end_offset - row_group_start_offset) / 2; + return {row_group_mid_offset, 1}; +} + class TestFileReader final : public reader::FileReader { public: TestFileReader(std::shared_ptr& system_properties, @@ -311,12 +341,15 @@ class NewParquetReaderTest : public testing::Test { void TearDown() override { std::filesystem::remove_all(_test_dir); } - std::unique_ptr create_reader() const { + std::unique_ptr create_reader(int64_t range_start_offset = 0, + int64_t range_size = -1) const { auto system_properties = std::make_shared(); system_properties->system_type = TFileType::FILE_LOCAL; auto file_description = std::make_unique(); file_description->path = _file_path; file_description->file_size = static_cast(std::filesystem::file_size(_file_path)); + file_description->range_start_offset = range_start_offset; + file_description->range_size = range_size; return std::make_unique(system_properties, file_description, nullptr, nullptr); } @@ -541,5 +574,136 @@ TEST_F(NewParquetReaderTest, PredicateFiltersRowGroupsByStatistics) { EXPECT_EQ(values, std::vector({"three", "four", "five"})); } +TEST_F(NewParquetReaderTest, RowPositionReaderReturnsFileLocalPositions) { + 
write_parquet_file(_file_path, 2); + auto parquet_file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); + ASSERT_EQ(parquet_file_reader->metadata()->num_row_groups(), 3); + + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + auto request = std::make_unique(); + request->non_predicate_columns = {parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID, + 0}; + request->column_positions = { + {0, 0}, + {parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID, 2}, + }; + ASSERT_TRUE(reader->open(request).ok()); + + std::vector row_positions; + std::vector ids; + bool eof = false; + while (!eof) { + Block block = build_file_block_with_row_position(schema); + size_t rows = 0; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + if (rows == 0) { + continue; + } + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& row_position_column = + assert_cast(*block.get_by_position(2).column); + for (size_t row = 0; row < rows; ++row) { + ids.push_back(id_column.get_element(row)); + row_positions.push_back(row_position_column.get_element(row)); + } + } + + EXPECT_EQ(ids, std::vector({1, 2, 3, 4, 5})); + EXPECT_EQ(row_positions, std::vector({0, 1, 2, 3, 4})); +} + +TEST_F(NewParquetReaderTest, RowPositionReaderKeepsPositionsAfterSelection) { + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + Block block = build_file_block_with_row_position(schema); + + auto request = std::make_unique(); + request->predicate_columns = {0}; + request->non_predicate_columns = {parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID}; + request->column_positions = { + {0, 0}, + 
{parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID, 2}, + }; + reader::FileExpressionFilter expression_filter; + expression_filter.conjunct = create_int32_greater_than_conjunct(0, 2); + request->expression_filters.push_back(std::move(expression_filter)); + ASSERT_TRUE(reader->open(request).ok()); + + size_t rows = 0; + bool eof = false; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + EXPECT_FALSE(eof); + ASSERT_EQ(rows, 3); + + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& row_position_column = + assert_cast(*block.get_by_position(2).column); + EXPECT_EQ(id_column.get_element(0), 3); + EXPECT_EQ(id_column.get_element(1), 4); + EXPECT_EQ(id_column.get_element(2), 5); + EXPECT_EQ(row_position_column.get_element(0), 2); + EXPECT_EQ(row_position_column.get_element(1), 3); + EXPECT_EQ(row_position_column.get_element(2), 4); +} + +TEST_F(NewParquetReaderTest, RowPositionReaderUsesFileLocalPositionsForScanRange) { + write_parquet_file(_file_path, 2); + auto parquet_file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); + ASSERT_EQ(parquet_file_reader->metadata()->num_row_groups(), 3); + + const std::vector> expected_ids = {{1, 2}, {3, 4}, {5}}; + const std::vector> expected_row_positions = {{0, 1}, {2, 3}, {4}}; + for (int row_group_idx = 0; row_group_idx < 3; ++row_group_idx) { + const auto [range_start_offset, range_size] = + row_group_mid_range(_file_path, row_group_idx); + auto reader = create_reader(range_start_offset, range_size); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + auto request = std::make_unique(); + request->non_predicate_columns = { + parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID, 0}; + request->column_positions = { + {0, 0}, + {parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID, 2}, + }; + 
ASSERT_TRUE(reader->open(request).ok()); + + std::vector ids; + std::vector row_positions; + bool eof = false; + while (!eof) { + Block block = build_file_block_with_row_position(schema); + size_t rows = 0; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + if (rows == 0) { + continue; + } + const auto& id_column = + assert_cast(*block.get_by_position(0).column); + const auto& row_position_column = + assert_cast(*block.get_by_position(2).column); + for (size_t row = 0; row < rows; ++row) { + ids.push_back(id_column.get_element(row)); + row_positions.push_back(row_position_column.get_element(row)); + } + } + + EXPECT_EQ(ids, expected_ids[row_group_idx]); + EXPECT_EQ(row_positions, expected_row_positions[row_group_idx]); + } +} + } // namespace } // namespace doris diff --git a/be/test/format/reader/table_reader_test.cpp b/be/test/format/reader/table_reader_test.cpp index f770fddb7238b3..dc050976836b93 100644 --- a/be/test/format/reader/table_reader_test.cpp +++ b/be/test/format/reader/table_reader_test.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -30,12 +31,15 @@ #include "core/assert_cast.h" #include "core/block/block.h" +#include "core/column/column_nullable.h" #include "core/column/column_string.h" #include "core/column/column_vector.h" +#include "core/data_type/data_type_nullable.h" #include "core/data_type/data_type_number.h" #include "core/data_type/data_type_string.h" #include "exprs/vexpr.h" #include "format/reader/expr/slot_ref.h" +#include "format/table/iceberg_reader_v2.h" #include "gen_cpp/PlanNodes_types.h" #include "runtime/runtime_state.h" #include "storage/predicate/predicate_creator.h" @@ -231,6 +235,19 @@ Block build_table_block(const std::vector& columns) { return block; } +void expect_nullable_int64_column_values(const IColumn& column, + const std::vector& expected_values) { + const auto full_column = column.convert_to_full_column_if_const(); + const auto& nullable_column = assert_cast(*full_column); 
+ const auto& values = + assert_cast(nullable_column.get_nested_column()).get_data(); + ASSERT_EQ(nullable_column.size(), expected_values.size()); + for (size_t row = 0; row < expected_values.size(); ++row) { + EXPECT_EQ(nullable_column.get_null_map_data()[row], 0); + EXPECT_EQ(values[row], expected_values[row]); + } +} + SplitReadOptions build_split_options(const std::string& file_path) { SplitReadOptions options; options.current_range.__set_path(file_path); @@ -239,6 +256,40 @@ SplitReadOptions build_split_options(const std::string& file_path) { return options; } +void set_iceberg_row_lineage_params(SplitReadOptions* split_options, int64_t first_row_id, + int64_t last_updated_sequence_number) { + TTableFormatFileDesc table_format_params; + TIcebergFileDesc iceberg_params; + iceberg_params.__set_first_row_id(first_row_id); + iceberg_params.__set_last_updated_sequence_number(last_updated_sequence_number); + table_format_params.__set_iceberg_params(iceberg_params); + split_options->current_range.__set_table_format_params(table_format_params); +} + +int64_t parquet_column_start_offset(const ::parquet::ColumnChunkMetaData& column_metadata) { + return column_metadata.has_dictionary_page() + ? 
static_cast(column_metadata.dictionary_page_offset()) + : static_cast(column_metadata.data_page_offset()); +} + +SplitReadOptions build_split_options_for_row_group_mid(const std::string& file_path, + int row_group_idx) { + auto options = build_split_options(file_path); + auto reader = ::parquet::ParquetFileReader::OpenFile(file_path, false); + auto metadata = reader->metadata(); + auto row_group_metadata = metadata->RowGroup(row_group_idx); + auto first_column = row_group_metadata->ColumnChunk(0); + auto last_column = row_group_metadata->ColumnChunk(row_group_metadata->num_columns() - 1); + const int64_t row_group_start_offset = parquet_column_start_offset(*first_column); + const int64_t row_group_end_offset = + parquet_column_start_offset(*last_column) + last_column->total_compressed_size(); + const int64_t row_group_mid_offset = + row_group_start_offset + (row_group_end_offset - row_group_start_offset) / 2; + options.current_range.__set_start_offset(row_group_mid_offset); + options.current_range.__set_size(1); + return options; +} + TableColumn make_table_column(ColumnId id, const std::string& name, const DataTypePtr& type) { TableColumn column; column.id = id; @@ -716,6 +767,226 @@ TEST(TableReaderTest, ProjectedPartitionColumnUsesSplitPartitionValue) { std::filesystem::remove_all(test_dir); } +TEST(TableReaderTest, IcebergVirtualColumnsUseRowLineageMetadata) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_virtual_columns_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}); + + std::vector projected_columns; + projected_columns.push_back( + make_table_column(100, "_row_id", make_nullable(std::make_shared()))); + projected_columns.push_back( + make_table_column(101, "_last_updated_sequence_number", + 
make_nullable(std::make_shared()))); + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext( + std::make_shared(0, 0, 1)), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + auto split_options = build_split_options(file_path); + set_iceberg_row_lineage_params(&split_options, 1000, 77); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + + const auto& id_column = assert_cast(*block.get_by_position(2).column); + + ASSERT_EQ(block.rows(), 2); + EXPECT_EQ(id_column.get_element(0), 2); + EXPECT_EQ(id_column.get_element(1), 3); + expect_nullable_int64_column_values(*block.get_by_position(0).column, {1001, 1002}); + expect_nullable_int64_column_values(*block.get_by_position(1).column, {77, 77}); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergVirtualColumnsKeepRowLineageAfterConjunctFiltering) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_virtual_columns_conjunct_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}); + + std::vector projected_columns; + projected_columns.push_back( + make_table_column(100, "_row_id", make_nullable(std::make_shared()))); + projected_columns.push_back( + make_table_column(101, 
"_last_updated_sequence_number", + make_nullable(std::make_shared()))); + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext( + std::make_shared(0, 0, 1)), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + auto split_options = build_split_options(file_path); + set_iceberg_row_lineage_params(&split_options, 3000, 88); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + + const auto& id_column = assert_cast(*block.get_by_position(2).column); + + ASSERT_EQ(block.rows(), 2); + EXPECT_EQ(id_column.get_element(0), 2); + EXPECT_EQ(id_column.get_element(1), 3); + expect_nullable_int64_column_values(*block.get_by_position(0).column, {3001, 3002}); + expect_nullable_int64_column_values(*block.get_by_position(1).column, {88, 88}); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergVirtualColumnsKeepRowLineageAfterRowGroupPredicatePruning) { + const auto test_dir = std::filesystem::temp_directory_path() / + "doris_iceberg_virtual_columns_row_group_predicate_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + // ColumnPredicate is used for row-group/statistics pruning. Keep one row per row group so + // id > 2 prunes the first two row groups and leaves only the third file-local row. 
+ write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}, 1); + + std::vector projected_columns; + projected_columns.push_back( + make_table_column(100, "_row_id", make_nullable(std::make_shared()))); + projected_columns.push_back( + make_table_column(101, "_last_updated_sequence_number", + make_nullable(std::make_shared()))); + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + TableColumnPredicates column_predicates; + column_predicates[0].push_back(create_comparison_predicate( + 0, "id", std::make_shared(), Field::create_field(2), false)); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = std::move(column_predicates), + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + auto split_options = build_split_options(file_path); + set_iceberg_row_lineage_params(&split_options, 4000, 99); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + + const auto& id_column = assert_cast(*block.get_by_position(2).column); + + ASSERT_EQ(block.rows(), 1); + EXPECT_EQ(id_column.get_element(0), 3); + expect_nullable_int64_column_values(*block.get_by_position(0).column, {4002}); + expect_nullable_int64_column_values(*block.get_by_position(1).column, {99}); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, ParquetReaderReadsOnlyRowGroupsInFileRange) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_file_range_test"; + 
std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, + {"range_group_one", "range_group_two", "range_group_three"}, 1); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + projected_columns.push_back(make_table_column(2, "value", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + ASSERT_TRUE(reader.prepare_split(build_split_options_for_row_group_mid(file_path, 1)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& value_column = assert_cast(*block.get_by_position(1).column); + ASSERT_EQ(block.rows(), 1); + EXPECT_EQ(id_column.get_element(0), 2); + EXPECT_EQ(value_column.get_data_at(0).to_string(), "range_group_two"); + + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + EXPECT_TRUE(eos); + EXPECT_EQ(block.rows(), 0); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + TEST(TableReaderTest, ProjectedColumnsUseMapperExpressionForSameNameDifferentIdParquetSchema) { const auto test_dir = std::filesystem::temp_directory_path() / "doris_table_reader_same_name_diff_id_test"; From 6cb5719cbd67bc64d6986ceb4ac9eaa4e15b2243 Mon Sep 17 00:00:00 2001 From: Socrates Date: Thu, 28 May 2026 14:54:27 +0800 Subject: [PATCH 24/38] 
[feature](be) Add basic parquet map reader Issue Number: close #xxx Related PR: #xxx Problem Summary: Add initial new Parquet MAP reader support for required scalar key/value entries and normalize MAP key_value schema metadata for future complex projection work. None - Test: Unit Test / Manual test - Added parquet column reader unit test coverage for required Map(Int32, String). - Ran git diff --check. - BE build will be verified on Fedora after push. - Behavior changed: No - Does this need documentation: No --- be/src/format/new_parquet/column_reader.cpp | 191 +++++++++++++++++- be/src/format/new_parquet/column_reader.h | 4 + .../new_parquet/parquet_column_schema.cpp | 43 +++- .../parquet_column_reader_test.cpp | 85 +++++++- 4 files changed, 310 insertions(+), 13 deletions(-) diff --git a/be/src/format/new_parquet/column_reader.cpp b/be/src/format/new_parquet/column_reader.cpp index c427b38a97041f..4d4545f5d80dcd 100644 --- a/be/src/format/new_parquet/column_reader.cpp +++ b/be/src/format/new_parquet/column_reader.cpp @@ -32,6 +32,7 @@ #include "core/column/column.h" #include "core/column/column_array.h" +#include "core/column/column_map.h" #include "core/column/column_struct.h" #include "core/column/column_vector.h" #include "core/data_type/data_type_array.h" @@ -178,6 +179,35 @@ class RowPositionColumnReader final : public ParquetColumnReader { std::string _name = ParquetColumnReaderFactory::ROW_POSITION_COLUMN_NAME; }; +class MapColumnReader final : public ParquetColumnReader { +public: + MapColumnReader(const ParquetColumnSchema& schema, DataTypePtr type, + std::unique_ptr key_reader, + std::unique_ptr value_reader) + : _field_id(schema.top_level_field_id), + _repeated_repetition_level(schema.repeated_repetition_level), + _type(std::move(type)), + _name(schema.name), + _key_reader(std::move(key_reader)), + _value_reader(std::move(value_reader)) {} + + int file_column_id() const override { return _field_id; } + int parquet_leaf_column_id() const override { 
return -1; } + const DataTypePtr& type() const override { return _type; } + const std::string& name() const override { return _name; } + + Status read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) override; + Status skip(int64_t rows) override; + +private: + int _field_id = -1; + int16_t _repeated_repetition_level = 0; + DataTypePtr _type; + std::string _name; + std::unique_ptr _key_reader; + std::unique_ptr _value_reader; +}; + Status read_records(ScalarColumnReader& column_reader, int64_t batch_rows, ::parquet::internal::RecordReader** record_reader, int64_t* rows_read) { auto reader = column_reader.record_reader(); @@ -567,6 +597,135 @@ Status ListColumnReader::skip(int64_t rows) { return _element_reader->skip(rows); } +Status MapColumnReader::read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) { + if (column.get() == nullptr || rows_read == nullptr) { + return Status::InvalidArgument("Invalid parquet map read result pointer for column {}", + _name); + } + if (_key_reader == nullptr || _value_reader == nullptr) { + return Status::InternalError("Parquet map child reader is not initialized for column {}", + _name); + } + auto* key_reader = dynamic_cast(_key_reader.get()); + auto* value_reader = dynamic_cast(_value_reader.get()); + if (key_reader == nullptr || value_reader == nullptr) { + return Status::NotSupported( + "Current parquet MAP reader only supports scalar key/value for column {}", _name); + } + if (key_reader->descriptor()->max_definition_level() != 1 || + value_reader->descriptor()->max_definition_level() != 1) { + return Status::NotSupported( + "Current parquet MAP reader only supports required key/value entries for column {}", + _name); + } + + ::parquet::internal::RecordReader* key_record_reader = nullptr; + int64_t records_read = 0; + RETURN_IF_ERROR(read_records(*key_reader, rows, &key_record_reader, &records_read)); + const int64_t levels_written = key_record_reader->levels_written(); + if (records_read != rows || 
levels_written < records_read) { + return Status::Corruption( + "Invalid parquet MAP key read result for column {}: rows={}, levels={}", _name, + records_read, levels_written); + } + if (key_record_reader->values_written() != levels_written) { + return Status::NotSupported( + "Current parquet MAP reader only supports non-empty maps with required entries " + "for column {}", + _name); + } + + const int16_t max_definition_level = key_reader->descriptor()->max_definition_level(); + if (auto* def_levels = key_record_reader->def_levels(); def_levels != nullptr) { + for (int64_t level_idx = 0; level_idx < levels_written; ++level_idx) { + if (def_levels[level_idx] != max_definition_level) { + return Status::NotSupported( + "Current parquet MAP reader only supports non-empty maps with required " + "entries for column {}", + _name); + } + } + } + + ::parquet::internal::RecordReader* value_record_reader = nullptr; + int64_t value_records_read = 0; + RETURN_IF_ERROR( + read_records(*value_reader, records_read, &value_record_reader, &value_records_read)); + if (value_records_read != records_read || + value_record_reader->levels_written() != levels_written || + value_record_reader->values_written() != levels_written) { + return Status::Corruption( + "Invalid parquet MAP value read result for column {}: rows={}, levels={}, " + "values={}, expected={}", + _name, value_records_read, value_record_reader->levels_written(), + value_record_reader->values_written(), levels_written); + } + if (auto* def_levels = value_record_reader->def_levels(); def_levels != nullptr) { + for (int64_t level_idx = 0; level_idx < levels_written; ++level_idx) { + if (def_levels[level_idx] != max_definition_level) { + return Status::NotSupported( + "Current parquet MAP reader only supports non-empty maps with required " + "entries for column {}", + _name); + } + } + } + + const auto* key_rep_levels = key_record_reader->rep_levels(); + const auto* value_rep_levels = value_record_reader->rep_levels(); + if 
((key_rep_levels == nullptr || value_rep_levels == nullptr) && levels_written > 0) { + return Status::Corruption( + "Parquet MAP reader returned null repetition levels for column {}", _name); + } + for (int64_t level_idx = 0; level_idx < levels_written; ++level_idx) { + if (key_rep_levels[level_idx] != value_rep_levels[level_idx]) { + return Status::Corruption( + "Parquet MAP key/value repetition levels are not aligned for column {}", _name); + } + } + + auto& map_column = assert_cast(*column); + auto key_column = map_column.get_keys_ptr()->assume_mutable(); + RETURN_IF_ERROR(append_scalar_values(*key_reader, *key_record_reader, levels_written, nullptr, + key_column)); + map_column.get_keys_ptr() = std::move(key_column); + + auto value_column = map_column.get_values_ptr()->assume_mutable(); + RETURN_IF_ERROR(append_scalar_values(*value_reader, *value_record_reader, levels_written, + nullptr, value_column)); + map_column.get_values_ptr() = std::move(value_column); + + auto& offsets = map_column.get_offsets(); + offsets.reserve(offsets.size() + static_cast(records_read)); + size_t current_offset = offsets.empty() ? 
0 : offsets.back(); + int64_t current_record = 0; + for (int64_t level_idx = 0; level_idx < levels_written; ++level_idx) { + if (level_idx == 0 || key_rep_levels[level_idx] < _repeated_repetition_level) { + if (level_idx != 0) { + offsets.push_back(current_offset); + current_record++; + } + } + current_offset++; + } + while (current_record < records_read) { + offsets.push_back(current_offset); + current_record++; + } + *rows_read = records_read; + return Status::OK(); +} + +Status MapColumnReader::skip(int64_t rows) { + if (rows <= 0) { + return Status::OK(); + } + DORIS_CHECK(_key_reader != nullptr); + DORIS_CHECK(_value_reader != nullptr); + RETURN_IF_ERROR(_key_reader->skip(rows)); + return _value_reader->skip(rows); +} + Status ParquetColumnReader::skip(int64_t rows) { return Status::NotSupported("Parquet column skip is not implemented, rows={}", rows); } @@ -811,6 +970,35 @@ Status ParquetColumnReaderFactory::create_list_column_reader( return Status::OK(); } +Status ParquetColumnReaderFactory::create_map_column_reader( + const ParquetColumnSchema& column_schema, const reader::FieldProjection* projection, + std::unique_ptr* reader) const { + if (reader == nullptr) { + return Status::InvalidArgument("reader is null"); + } + if (projection != nullptr && !projection->project_all_children) { + return Status::NotSupported("Parquet MAP projection is not implemented for column {}", + column_schema.name); + } + if (column_schema.type != nullptr && column_schema.type->is_nullable()) { + return Status::NotSupported("Nullable parquet MAP reader is not implemented for column {}", + column_schema.name); + } + if (column_schema.children.size() != 1 || column_schema.children[0]->children.size() != 2) { + return Status::NotSupported("Unsupported parquet MAP layout for column {}", + column_schema.name); + } + const auto& key_value_schema = *column_schema.children[0]; + std::unique_ptr key_reader; + 
RETURN_IF_ERROR(create_nested_scalar_column_reader(*key_value_schema.children[0], &key_reader)); + std::unique_ptr value_reader; + RETURN_IF_ERROR( + create_nested_scalar_column_reader(*key_value_schema.children[1], &value_reader)); + *reader = std::make_unique(column_schema, column_schema.type, + std::move(key_reader), std::move(value_reader)); + return Status::OK(); +} + Status ParquetColumnReaderFactory::create(const ParquetColumnSchema& column_schema, const reader::FieldProjection* projection, std::unique_ptr* reader) const { @@ -825,8 +1013,7 @@ Status ParquetColumnReaderFactory::create(const ParquetColumnSchema& column_sche case ParquetColumnSchemaKind::LIST: return create_list_column_reader(column_schema, projection, reader); case ParquetColumnSchemaKind::MAP: - return Status::NotSupported("Parquet MAP reader is not implemented for column {}", - column_schema.name); + return create_map_column_reader(column_schema, projection, reader); } return Status::NotSupported("Unsupported parquet column schema kind for column {}", column_schema.name); diff --git a/be/src/format/new_parquet/column_reader.h b/be/src/format/new_parquet/column_reader.h index 80f7060fa31605..62400d739cad8b 100644 --- a/be/src/format/new_parquet/column_reader.h +++ b/be/src/format/new_parquet/column_reader.h @@ -124,6 +124,10 @@ class ParquetColumnReaderFactory { const reader::FieldProjection* projection, std::unique_ptr* reader) const; + Status create_map_column_reader(const ParquetColumnSchema& column_schema, + const reader::FieldProjection* projection, + std::unique_ptr* reader) const; + Status get_record_reader(int leaf_column_id, const ::parquet::ColumnDescriptor* descriptor, const std::string& name, std::shared_ptr<::parquet::internal::RecordReader>* reader) const; diff --git a/be/src/format/new_parquet/parquet_column_schema.cpp b/be/src/format/new_parquet/parquet_column_schema.cpp index 8541769c1d2840..cbca53c7f72fda 100644 --- a/be/src/format/new_parquet/parquet_column_schema.cpp +++ 
b/be/src/format/new_parquet/parquet_column_schema.cpp @@ -187,11 +187,44 @@ Status build_node_schema(const ::parquet::SchemaDescriptor& schema, return Status::NotSupported("Unsupported parquet MAP encoding for column {}", node.name()); } - std::unique_ptr key_value; - RETURN_IF_ERROR(build_node_schema( - schema, *group.field(0), - child_context(context, *group.field(0), 0, column_schema->schema_node_id), - &key_value)); + const auto& key_value_node = *group.field(0); + if (!key_value_node.is_repeated()) { + return Status::NotSupported("Unsupported parquet MAP encoding for column {}", + node.name()); + } + auto key_value_context = + child_context(context, key_value_node, 0, column_schema->schema_node_id); + column_schema->repeated_repetition_level = key_value_context.repeated_repetition_level; + if (key_value_node.is_primitive()) { + return Status::NotSupported("Unsupported parquet MAP key_value layout for column {}", + node.name()); + } + const auto& key_value_group = + static_cast(key_value_node); + if (key_value_group.field_count() != 2) { + return Status::NotSupported("Unsupported parquet MAP key_value layout for column {}", + node.name()); + } + auto key_value = std::make_unique(); + inherit_common_schema_state(key_value_node, key_value_context, key_value.get()); + key_value->kind = ParquetColumnSchemaKind::STRUCT; + DataTypes child_types; + Strings child_names; + child_types.reserve(key_value_group.field_count()); + child_names.reserve(key_value_group.field_count()); + for (int child_idx = 0; child_idx < key_value_group.field_count(); ++child_idx) { + std::unique_ptr child; + RETURN_IF_ERROR(build_node_schema( + schema, *key_value_group.field(child_idx), + child_context(key_value_context, *key_value_group.field(child_idx), child_idx, + key_value->schema_node_id), + &child)); + child_types.push_back(child->type); + child_names.push_back(child->name); + key_value->children.push_back(std::move(child)); + } + key_value->type = std::make_shared(child_types, 
child_names); + propagate_child_levels(key_value.get()); if (key_value->children.size() != 2) { return Status::NotSupported("Unsupported parquet MAP key_value layout for column {}", node.name()); diff --git a/be/test/format/new_parquet/parquet_column_reader_test.cpp b/be/test/format/new_parquet/parquet_column_reader_test.cpp index b85bbb80a6aa21..4a187f4f8e0d18 100644 --- a/be/test/format/new_parquet/parquet_column_reader_test.cpp +++ b/be/test/format/new_parquet/parquet_column_reader_test.cpp @@ -31,12 +31,14 @@ #include "core/assert_cast.h" #include "core/column/column_array.h" #include "core/column/column_decimal.h" +#include "core/column/column_map.h" #include "core/column/column_nullable.h" #include "core/column/column_string.h" #include "core/column/column_struct.h" #include "core/column/column_vector.h" #include "core/data_type/data_type.h" #include "core/data_type/data_type_array.h" +#include "core/data_type/data_type_map.h" #include "core/data_type/data_type_nullable.h" #include "core/data_type/data_type_struct.h" #include "core/types.h" @@ -161,6 +163,26 @@ class ParquetColumnReaderTest : public testing::Test { return finish_array(&builder); } + std::shared_ptr build_required_int_string_map_array() { + auto key_builder = std::make_shared(); + auto value_builder = std::make_shared(); + auto map_type = arrow::map(arrow::int32(), arrow::field("value", arrow::utf8(), false)); + arrow::MapBuilder builder(arrow::default_memory_pool(), key_builder, value_builder, + map_type); + const std::vector>> values = { + {{1, "a"}, {2, "b"}}, {{3, "c"}}, {{4, "d"}, {5, "e"}, {6, "f"}}, + {{7, "g"}}, {{8, "h"}, {9, "i"}}, + }; + for (const auto& row : values) { + EXPECT_TRUE(builder.Append().ok()); + for (const auto& [key, value] : row) { + EXPECT_TRUE(key_builder->Append(key).ok()); + EXPECT_TRUE(value_builder->Append(value).ok()); + } + } + return finish_array(&builder); + } + std::shared_ptr build_time32_array(const std::shared_ptr& type, const std::vector& values) { 
arrow::Time32Builder builder(type, arrow::default_memory_pool()); @@ -394,16 +416,55 @@ class ParquetColumnReaderTest : public testing::Test { TYPE_INT); const auto& array_column = assert_cast(column); ASSERT_EQ(array_column.size(), ROW_COUNT); - EXPECT_EQ(array_column.size_at(0), 2); - EXPECT_EQ(array_column.size_at(1), 1); - EXPECT_EQ(array_column.size_at(2), 3); - EXPECT_EQ(array_column.size_at(4), 2); + const auto array_size_at = [&array_column](size_t row_idx) { + return array_column.get_offsets()[row_idx] - + (row_idx == 0 ? 0 : array_column.get_offsets()[row_idx - 1]); + }; + EXPECT_EQ(array_size_at(0), 2); + EXPECT_EQ(array_size_at(1), 1); + EXPECT_EQ(array_size_at(2), 3); + EXPECT_EQ(array_size_at(4), 2); const auto& values = assert_cast(array_column.get_data()); ASSERT_EQ(values.size(), 9); EXPECT_EQ(values.get_element(0), 1); EXPECT_EQ(values.get_element(5), 6); EXPECT_EQ(values.get_element(8), 9); }); + add_field(arrow::field( + "map_int_string_col", + arrow::map(arrow::int32(), arrow::field("value", arrow::utf8(), false)), + false), + build_required_int_string_map_array(), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_EQ(remove_nullable(schema.type)->get_primitive_type(), TYPE_MAP); + const auto* map_type = + assert_cast(remove_nullable(schema.type).get()); + EXPECT_EQ(remove_nullable(map_type->get_key_type())->get_primitive_type(), + TYPE_INT); + EXPECT_EQ(remove_nullable(map_type->get_value_type())->get_primitive_type(), + TYPE_STRING); + const auto& map_column = assert_cast(column); + ASSERT_EQ(map_column.size(), ROW_COUNT); + const auto map_size_at = [&map_column](size_t row_idx) { + return map_column.get_offsets()[row_idx] - + (row_idx == 0 ? 
0 : map_column.get_offsets()[row_idx - 1]); + }; + EXPECT_EQ(map_size_at(0), 2); + EXPECT_EQ(map_size_at(1), 1); + EXPECT_EQ(map_size_at(2), 3); + EXPECT_EQ(map_size_at(4), 2); + const auto& keys = assert_cast(map_column.get_keys()); + const auto& values = + assert_cast(map_column.get_values()); + ASSERT_EQ(keys.size(), 9); + ASSERT_EQ(values.size(), 9); + EXPECT_EQ(keys.get_element(0), 1); + EXPECT_EQ(keys.get_element(5), 6); + EXPECT_EQ(keys.get_element(8), 9); + EXPECT_EQ(values.get_data_at(0).to_string(), "a"); + EXPECT_EQ(values.get_data_at(5).to_string(), "f"); + EXPECT_EQ(values.get_data_at(8).to_string(), "i"); + }); auto schema = arrow::schema(_arrow_fields); auto table = arrow::Table::Make(schema, _arrays); @@ -439,6 +500,16 @@ class ParquetColumnReaderTest : public testing::Test { _expected_by_field[field_idx](*_fields[field_idx], *column); } + size_t find_field_idx(const std::string& name) const { + for (size_t field_idx = 0; field_idx < _fields.size(); ++field_idx) { + if (_fields[field_idx]->name == name) { + return field_idx; + } + } + ADD_FAILURE() << "Cannot find parquet test field " << name; + return _fields.size(); + } + std::filesystem::path _test_dir; std::string _file_path; std::unique_ptr<::parquet::ParquetFileReader> _file_reader; @@ -496,7 +567,8 @@ TEST_F(ParquetColumnReaderTest, SelectReadsOnlySelectedRanges) { } TEST_F(ParquetColumnReaderTest, ReadProjectedStructChildren) { - const auto field_idx = _fields.size() - 1; + const auto field_idx = find_field_idx("struct_col"); + ASSERT_LT(field_idx, _fields.size()); const auto& struct_schema = *_fields[field_idx]; ASSERT_EQ(struct_schema.name, "struct_col"); ASSERT_EQ(struct_schema.children.size(), 2); @@ -533,7 +605,8 @@ TEST_F(ParquetColumnReaderTest, ReadProjectedStructChildren) { } TEST_F(ParquetColumnReaderTest, BuildComplexSchemaPathMetadata) { - const auto field_idx = _fields.size() - 1; + const auto field_idx = find_field_idx("struct_col"); + ASSERT_LT(field_idx, _fields.size()); 
const auto& struct_schema = *_fields[field_idx]; ASSERT_EQ(struct_schema.name, "struct_col"); ASSERT_EQ(struct_schema.children.size(), 2); From b8a40bd22a1bc56ddce95b11795f05c81234e077 Mon Sep 17 00:00:00 2001 From: Socrates Date: Thu, 28 May 2026 15:48:59 +0800 Subject: [PATCH 25/38] [feature](be) Support parquet repeated level assembly ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Add shared LIST/MAP level assembly for scalar nested Parquet children, including null parent rows, empty collections, nullable element/value slots, overflow buffering, and parent-row skip/select semantics. ### Release note None ### Check List (For Author) - Test: Manual test - Ran git diff --check. BE DEBUG build will be run on Fedora after push. - Behavior changed: Yes. New Parquet reader can read LIST/MAP scalar children with null/empty/nullable-child cases. - Does this need documentation: No --- be/src/format/new_parquet/column_reader.cpp | 689 ++++++++++++++---- .../parquet_column_reader_test.cpp | 322 ++++++++ 2 files changed, 854 insertions(+), 157 deletions(-) diff --git a/be/src/format/new_parquet/column_reader.cpp b/be/src/format/new_parquet/column_reader.cpp index 4d4545f5d80dcd..b4cc00caa98767 100644 --- a/be/src/format/new_parquet/column_reader.cpp +++ b/be/src/format/new_parquet/column_reader.cpp @@ -33,6 +33,7 @@ #include "core/column/column.h" #include "core/column/column_array.h" #include "core/column/column_map.h" +#include "core/column/column_nullable.h" #include "core/column/column_struct.h" #include "core/column/column_vector.h" #include "core/data_type/data_type_array.h" @@ -46,6 +47,27 @@ namespace doris::parquet { namespace { +constexpr int64_t NESTED_READ_BATCH_ROWS = 4096; + +struct NestedScalarBatch { + int64_t records_read = 0; + int64_t levels_written = 0; + int64_t values_written = 0; + std::vector def_levels; + std::vector rep_levels; + std::vector value_indices; + MutableColumnPtr values_column; + + bool 
empty() const { return levels_written == 0; } +}; + +struct NestedScalarOverflow { + NestedScalarBatch batch; + + bool empty() const { return batch.empty(); } + void clear() { batch = NestedScalarBatch(); } +}; + class ScalarColumnReader final : public ParquetColumnReader { public: ScalarColumnReader(int parquet_leaf_column_id, const ::parquet::ColumnDescriptor* descriptor, @@ -112,6 +134,7 @@ class ListColumnReader final : public ParquetColumnReader { ListColumnReader(const ParquetColumnSchema& schema, DataTypePtr type, std::unique_ptr element_reader) : _field_id(schema.top_level_field_id), + _nullable_definition_level(schema.nullable_definition_level), _repeated_repetition_level(schema.repeated_repetition_level), _type(std::move(type)), _name(schema.name), @@ -127,10 +150,12 @@ class ListColumnReader final : public ParquetColumnReader { private: int _field_id = -1; + int16_t _nullable_definition_level = 0; int16_t _repeated_repetition_level = 0; DataTypePtr _type; std::string _name; std::unique_ptr _element_reader; + NestedScalarOverflow _element_overflow; }; class RowPositionColumnReader final : public ParquetColumnReader { @@ -185,6 +210,7 @@ class MapColumnReader final : public ParquetColumnReader { std::unique_ptr key_reader, std::unique_ptr value_reader) : _field_id(schema.top_level_field_id), + _nullable_definition_level(schema.nullable_definition_level), _repeated_repetition_level(schema.repeated_repetition_level), _type(std::move(type)), _name(schema.name), @@ -201,11 +227,14 @@ class MapColumnReader final : public ParquetColumnReader { private: int _field_id = -1; + int16_t _nullable_definition_level = 0; int16_t _repeated_repetition_level = 0; DataTypePtr _type; std::string _name; std::unique_ptr _key_reader; std::unique_ptr _value_reader; + NestedScalarOverflow _key_overflow; + NestedScalarOverflow _value_overflow; }; Status read_records(ScalarColumnReader& column_reader, int64_t batch_rows, @@ -410,6 +439,231 @@ Status append_scalar_values(const 
ScalarColumnReader& column_reader, return Status::OK(); } +Status read_nested_scalar_batch(ScalarColumnReader& column_reader, int64_t batch_rows, + int16_t value_slot_definition_level, NestedScalarBatch* batch) { + if (batch == nullptr) { + return Status::InvalidArgument("Nested scalar batch is null for column {}", + column_reader.name()); + } + *batch = NestedScalarBatch(); + + ::parquet::internal::RecordReader* record_reader = nullptr; + RETURN_IF_ERROR(read_records(column_reader, batch_rows, &record_reader, &batch->records_read)); + if (column_reader.type()->is_nullable() && record_reader->read_dense_for_nullable()) { + return Status::NotSupported( + "Dense nullable parquet nested reader is not supported for column {}", + column_reader.name()); + } + batch->levels_written = record_reader->levels_written(); + batch->values_written = record_reader->values_written(); + if (batch->levels_written < batch->records_read || batch->values_written < 0 || + batch->values_written > batch->levels_written) { + return Status::Corruption( + "Invalid nested parquet read result for column {}: rows={}, levels={}, values={}", + column_reader.name(), batch->records_read, batch->levels_written, + batch->values_written); + } + if (batch->levels_written == 0) { + return Status::OK(); + } + + auto* def_levels = record_reader->def_levels(); + if (def_levels == nullptr && column_reader.descriptor()->max_definition_level() > 0) { + return Status::Corruption( + "Nested parquet reader returned null definition levels for column {}", + column_reader.name()); + } + batch->def_levels.resize(static_cast(batch->levels_written)); + if (def_levels == nullptr) { + std::fill(batch->def_levels.begin(), batch->def_levels.end(), + column_reader.descriptor()->max_definition_level()); + } else { + std::copy(def_levels, def_levels + batch->levels_written, batch->def_levels.begin()); + } + + auto* rep_levels = record_reader->rep_levels(); + if (rep_levels == nullptr && 
column_reader.descriptor()->max_repetition_level() > 0) { + return Status::Corruption( + "Nested parquet reader returned null repetition levels for column {}", + column_reader.name()); + } + batch->rep_levels.resize(static_cast(batch->levels_written)); + if (rep_levels == nullptr) { + std::fill(batch->rep_levels.begin(), batch->rep_levels.end(), 0); + } else { + std::copy(rep_levels, rep_levels + batch->levels_written, batch->rep_levels.begin()); + } + + batch->value_indices.resize(static_cast(batch->levels_written), -1); + int64_t value_idx = 0; + const int16_t max_definition_level = column_reader.descriptor()->max_definition_level(); + NullMap value_null_map; + for (int64_t level_idx = 0; level_idx < batch->levels_written; ++level_idx) { + if (batch->def_levels[level_idx] >= value_slot_definition_level) { + if (value_idx >= batch->values_written) { + return Status::Corruption( + "Nested parquet reader returned fewer values than definition levels for " + "column {}", + column_reader.name()); + } + batch->value_indices[level_idx] = value_idx++; + if (column_reader.type()->is_nullable()) { + value_null_map.push_back(batch->def_levels[level_idx] != max_definition_level); + } + } + } + if (value_idx != batch->values_written) { + return Status::Corruption( + "Nested parquet reader returned extra values for column {}: consumed={}, values={}", + column_reader.name(), value_idx, batch->values_written); + } + if (column_reader.type()->is_nullable() && + value_null_map.size() != static_cast(batch->values_written)) { + return Status::Corruption("Invalid nested parquet null map for column {}", + column_reader.name()); + } + + batch->values_column = column_reader.type()->create_column(); + if (batch->values_written > 0) { + const NullMap* null_map = value_null_map.empty() ? 
nullptr : &value_null_map; + RETURN_IF_ERROR(append_scalar_values(column_reader, *record_reader, batch->values_written, + null_map, batch->values_column)); + } + return Status::OK(); +} + +void move_nested_scalar_tail(const NestedScalarBatch& src, int64_t start_level, + NestedScalarOverflow* overflow) { + DORIS_CHECK(overflow != nullptr); + if (start_level >= src.levels_written) { + overflow->clear(); + return; + } + + NestedScalarBatch dst; + dst.records_read = 0; + dst.levels_written = src.levels_written - start_level; + dst.def_levels.assign(src.def_levels.begin() + start_level, src.def_levels.end()); + dst.rep_levels.assign(src.rep_levels.begin() + start_level, src.rep_levels.end()); + dst.value_indices.resize(static_cast(dst.levels_written), -1); + dst.values_column = src.values_column->clone_empty(); + + for (int64_t level_idx = start_level; level_idx < src.levels_written; ++level_idx) { + const int64_t value_idx = src.value_indices[level_idx]; + if (value_idx < 0) { + continue; + } + dst.value_indices[static_cast(level_idx - start_level)] = dst.values_written; + dst.values_column->insert_from(*src.values_column, static_cast(value_idx)); + dst.values_written++; + } + overflow->batch = std::move(dst); +} + +Status append_scalar_batch_value(const ScalarColumnReader& column_reader, + const NestedScalarBatch& batch, int64_t level_idx, + MutableColumnPtr& column) { + const int64_t value_idx = batch.value_indices[level_idx]; + if (value_idx < 0) { + return Status::Corruption("Nested parquet value is absent for column {}", + column_reader.name()); + } + column->insert_from(*batch.values_column, static_cast(value_idx)); + return Status::OK(); +} + +ColumnArray* array_column_from_output(MutableColumnPtr& column) { + if (auto* nullable_column = check_and_get_column(*column)) { + return assert_cast(&nullable_column->get_nested_column()); + } + return assert_cast(column.get()); +} + +ColumnMap* map_column_from_output(MutableColumnPtr& column) { + if (auto* 
nullable_column = check_and_get_column(*column)) { + return assert_cast(&nullable_column->get_nested_column()); + } + return assert_cast(column.get()); +} + +NullMap* null_map_from_nullable_output(MutableColumnPtr& column) { + if (auto* nullable_column = check_and_get_column(*column)) { + return &nullable_column->get_null_map_data(); + } + return nullptr; +} + +void append_offsets(ColumnArray::Offsets64& offsets, const std::vector& entry_counts) { + offsets.reserve(offsets.size() + entry_counts.size()); + uint64_t current_offset = offsets.empty() ? 0 : offsets.back(); + for (const auto entry_count : entry_counts) { + current_offset += entry_count; + offsets.push_back(current_offset); + } +} + +void append_parent_nulls(NullMap* dst, const NullMap& src) { + if (dst == nullptr) { + return; + } + dst->insert(src.begin(), src.end()); +} + +template +Status assemble_repeated_levels(ScalarColumnReader& driver_reader, int16_t repeated_level, + int16_t value_slot_definition_level, int64_t rows, + NestedScalarOverflow* overflow, Sink& sink, int64_t* rows_read) { + if (overflow == nullptr || rows_read == nullptr) { + return Status::InvalidArgument("Invalid repeated level assembler arguments for column {}", + driver_reader.name()); + } + *rows_read = 0; + while (*rows_read < rows) { + NestedScalarBatch read_batch; + NestedScalarBatch* batch = nullptr; + bool from_overflow = false; + if (!overflow->empty()) { + batch = &overflow->batch; + from_overflow = true; + } else { + const int64_t batch_rows = std::max(rows - *rows_read, NESTED_READ_BATCH_ROWS); + RETURN_IF_ERROR(read_nested_scalar_batch(driver_reader, batch_rows, + value_slot_definition_level, &read_batch)); + if (read_batch.empty()) { + break; + } + batch = &read_batch; + } + RETURN_IF_ERROR(sink.start_batch(*batch)); + + int64_t level_idx = 0; + while (level_idx < batch->levels_written) { + const bool starts_parent = batch->rep_levels[level_idx] < repeated_level; + if (starts_parent && *rows_read >= rows) { + 
move_nested_scalar_tail(*batch, level_idx, overflow); + return Status::OK(); + } + if (starts_parent) { + RETURN_IF_ERROR(sink.start_parent(*batch, level_idx)); + ++*rows_read; + } else { + if (*rows_read == 0) { + return Status::Corruption( + "Repeated parquet stream starts with repeated level for column {}", + driver_reader.name()); + } + RETURN_IF_ERROR(sink.append_repeated(*batch, level_idx)); + } + ++level_idx; + } + + if (from_overflow) { + overflow->clear(); + } + } + return Status::OK(); +} + } // namespace Status ScalarColumnReader::read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) { @@ -525,67 +779,80 @@ Status ListColumnReader::read(int64_t rows, MutableColumnPtr& column, int64_t* r return Status::NotSupported( "Current parquet LIST reader only supports scalar elements for column {}", _name); } - if (element_reader->descriptor()->max_definition_level() != 1) { - return Status::NotSupported( - "Current parquet LIST reader only supports required elements for column {}", _name); - } - - ::parquet::internal::RecordReader* record_reader = nullptr; - int64_t records_read = 0; - RETURN_IF_ERROR(read_records(*element_reader, rows, &record_reader, &records_read)); - const int64_t levels_written = record_reader->levels_written(); - if (records_read != rows || levels_written < records_read) { - return Status::Corruption( - "Invalid parquet LIST read result for column {}: rows={}, levels={}", _name, - records_read, levels_written); - } - if (record_reader->values_written() != levels_written) { - return Status::NotSupported( - "Current parquet LIST reader only supports non-empty lists with required " - "elements for column {}", - _name); - } - const int16_t max_definition_level = element_reader->descriptor()->max_definition_level(); - if (auto* def_levels = record_reader->def_levels(); def_levels != nullptr) { - for (int64_t level_idx = 0; level_idx < levels_written; ++level_idx) { - if (def_levels[level_idx] != max_definition_level) { - return 
Status::NotSupported( - "Current parquet LIST reader only supports non-empty lists with required " - "elements for column {}", - _name); + auto* array_column = array_column_from_output(column); + DORIS_CHECK(array_column != nullptr); + auto* parent_null_map = null_map_from_nullable_output(column); + auto nested_column = array_column->get_data_ptr()->assume_mutable(); + std::vector entry_counts; + NullMap parent_nulls; + const int16_t element_slot_definition_level = _nullable_definition_level + 1; + const int16_t element_max_definition_level = + element_reader->descriptor()->max_definition_level(); + + struct ListSink { + ListColumnReader* self = nullptr; + ScalarColumnReader* element_reader = nullptr; + MutableColumnPtr* nested_column = nullptr; + std::vector* entry_counts = nullptr; + NullMap* parent_nulls = nullptr; + int16_t element_max_definition_level = 0; + + Status start_batch(const NestedScalarBatch&) { return Status::OK(); } + + Status start_parent(const NestedScalarBatch& batch, int64_t level_idx) { + const int16_t def_level = batch.def_levels[level_idx]; + if (def_level < self->_nullable_definition_level) { + if (!self->_type->is_nullable()) { + return Status::Corruption( + "Parquet LIST column {} contains null for non-nullable list", + self->_name); + } + entry_counts->push_back(0); + parent_nulls->push_back(1); + return Status::OK(); + } + entry_counts->push_back(0); + parent_nulls->push_back(0); + if (def_level == self->_nullable_definition_level) { + return Status::OK(); } + return append_element(batch, level_idx); } - } - auto& array_column = assert_cast(*column); - auto nested_column = array_column.get_data_ptr()->assume_mutable(); - RETURN_IF_ERROR(append_scalar_values(*element_reader, *record_reader, levels_written, nullptr, - nested_column)); - array_column.get_data_ptr() = std::move(nested_column); + Status append_repeated(const NestedScalarBatch& batch, int64_t level_idx) { + if (entry_counts->empty()) { + return Status::Corruption("Invalid 
repeated LIST level for column {}", self->_name); + } + return append_element(batch, level_idx); + } - auto* rep_levels = record_reader->rep_levels(); - if (rep_levels == nullptr && levels_written > 0) { - return Status::Corruption( - "Parquet LIST reader returned null repetition levels for column {}", _name); - } - auto& offsets = array_column.get_offsets(); - offsets.reserve(offsets.size() + static_cast(records_read)); - size_t current_offset = offsets.empty() ? 0 : offsets.back(); - int64_t current_record = 0; - for (int64_t level_idx = 0; level_idx < levels_written; ++level_idx) { - if (level_idx == 0 || rep_levels[level_idx] < _repeated_repetition_level) { - if (level_idx != 0) { - offsets.push_back(current_offset); - current_record++; + Status append_element(const NestedScalarBatch& batch, int64_t level_idx) { + const int16_t def_level = batch.def_levels[level_idx]; + if (def_level == element_max_definition_level) { + RETURN_IF_ERROR(append_scalar_batch_value(*element_reader, batch, level_idx, + *nested_column)); + } else { + if (!element_reader->type()->is_nullable()) { + return Status::Corruption( + "Parquet LIST column {} contains null for non-nullable element", + self->_name); + } + (*nested_column)->insert_default(); } + ++entry_counts->back(); + return Status::OK(); } - current_offset++; - } - while (current_record < records_read) { - offsets.push_back(current_offset); - current_record++; - } - *rows_read = records_read; + }; + + ListSink sink {this, element_reader, &nested_column, + &entry_counts, &parent_nulls, element_max_definition_level}; + RETURN_IF_ERROR(assemble_repeated_levels(*element_reader, _repeated_repetition_level, + element_slot_definition_level, rows, + &_element_overflow, sink, rows_read)); + + array_column->get_data_ptr() = std::move(nested_column); + append_offsets(array_column->get_offsets(), entry_counts); + append_parent_nulls(parent_null_map, parent_nulls); return Status::OK(); } @@ -593,8 +860,26 @@ Status 
ListColumnReader::skip(int64_t rows) { if (rows <= 0) { return Status::OK(); } - DORIS_CHECK(_element_reader != nullptr); - return _element_reader->skip(rows); + auto* element_reader = dynamic_cast(_element_reader.get()); + if (element_reader == nullptr) { + return Status::NotSupported( + "Current parquet LIST reader only supports scalar elements for column {}", _name); + } + struct SkipSink { + Status start_batch(const NestedScalarBatch&) { return Status::OK(); } + Status start_parent(const NestedScalarBatch&, int64_t) { return Status::OK(); } + Status append_repeated(const NestedScalarBatch&, int64_t) { return Status::OK(); } + }; + SkipSink sink; + int64_t rows_read = 0; + RETURN_IF_ERROR(assemble_repeated_levels(*element_reader, _repeated_repetition_level, + _nullable_definition_level + 1, rows, + &_element_overflow, sink, &rows_read)); + if (rows_read != rows) { + return Status::Corruption("Failed to skip parquet LIST column {}: skipped {} of {} rows", + _name, rows_read, rows); + } + return Status::OK(); } Status MapColumnReader::read(int64_t rows, MutableColumnPtr& column, int64_t* rows_read) { @@ -612,107 +897,140 @@ Status MapColumnReader::read(int64_t rows, MutableColumnPtr& column, int64_t* ro return Status::NotSupported( "Current parquet MAP reader only supports scalar key/value for column {}", _name); } - if (key_reader->descriptor()->max_definition_level() != 1 || - value_reader->descriptor()->max_definition_level() != 1) { - return Status::NotSupported( - "Current parquet MAP reader only supports required key/value entries for column {}", - _name); - } - ::parquet::internal::RecordReader* key_record_reader = nullptr; - int64_t records_read = 0; - RETURN_IF_ERROR(read_records(*key_reader, rows, &key_record_reader, &records_read)); - const int64_t levels_written = key_record_reader->levels_written(); - if (records_read != rows || levels_written < records_read) { - return Status::Corruption( - "Invalid parquet MAP key read result for column {}: rows={}, 
levels={}", _name, - records_read, levels_written); - } - if (key_record_reader->values_written() != levels_written) { - return Status::NotSupported( - "Current parquet MAP reader only supports non-empty maps with required entries " - "for column {}", - _name); - } - - const int16_t max_definition_level = key_reader->descriptor()->max_definition_level(); - if (auto* def_levels = key_record_reader->def_levels(); def_levels != nullptr) { - for (int64_t level_idx = 0; level_idx < levels_written; ++level_idx) { - if (def_levels[level_idx] != max_definition_level) { - return Status::NotSupported( - "Current parquet MAP reader only supports non-empty maps with required " - "entries for column {}", - _name); + auto* map_column = map_column_from_output(column); + DORIS_CHECK(map_column != nullptr); + auto* parent_null_map = null_map_from_nullable_output(column); + auto key_column = map_column->get_keys_ptr()->assume_mutable(); + auto value_column = map_column->get_values_ptr()->assume_mutable(); + std::vector entry_counts; + NullMap parent_nulls; + const int16_t entry_definition_level = _nullable_definition_level + 1; + const int16_t key_max_definition_level = key_reader->descriptor()->max_definition_level(); + const int16_t value_max_definition_level = value_reader->descriptor()->max_definition_level(); + + struct MapSink { + MapColumnReader* self = nullptr; + ScalarColumnReader* key_reader = nullptr; + ScalarColumnReader* value_reader = nullptr; + MutableColumnPtr* key_column = nullptr; + MutableColumnPtr* value_column = nullptr; + std::vector* entry_counts = nullptr; + NullMap* parent_nulls = nullptr; + int16_t key_max_definition_level = 0; + int16_t value_max_definition_level = 0; + + Status read_value_batch(int64_t batch_rows, NestedScalarBatch* value_batch) { + if (!self->_value_overflow.empty()) { + *value_batch = std::move(self->_value_overflow.batch); + self->_value_overflow.clear(); + return Status::OK(); } + return read_nested_scalar_batch(*value_reader, 
batch_rows, + self->_nullable_definition_level + 1, value_batch); } - } - ::parquet::internal::RecordReader* value_record_reader = nullptr; - int64_t value_records_read = 0; - RETURN_IF_ERROR( - read_records(*value_reader, records_read, &value_record_reader, &value_records_read)); - if (value_records_read != records_read || - value_record_reader->levels_written() != levels_written || - value_record_reader->values_written() != levels_written) { - return Status::Corruption( - "Invalid parquet MAP value read result for column {}: rows={}, levels={}, " - "values={}, expected={}", - _name, value_records_read, value_record_reader->levels_written(), - value_record_reader->values_written(), levels_written); - } - if (auto* def_levels = value_record_reader->def_levels(); def_levels != nullptr) { - for (int64_t level_idx = 0; level_idx < levels_written; ++level_idx) { - if (def_levels[level_idx] != max_definition_level) { - return Status::NotSupported( - "Current parquet MAP reader only supports non-empty maps with required " - "entries for column {}", - _name); + Status validate_value_alignment(const NestedScalarBatch& key_batch, + const NestedScalarBatch& value_batch) { + if (value_batch.records_read != key_batch.records_read || + value_batch.levels_written != key_batch.levels_written) { + return Status::Corruption( + "Parquet MAP key/value levels are not aligned for column {}: key rows={}, " + "key levels={}, value rows={}, value levels={}", + self->_name, key_batch.records_read, key_batch.levels_written, + value_batch.records_read, value_batch.levels_written); + } + for (int64_t level_idx = 0; level_idx < key_batch.levels_written; ++level_idx) { + if (value_batch.rep_levels[level_idx] != key_batch.rep_levels[level_idx]) { + return Status::Corruption( + "Parquet MAP key/value repetition levels are not aligned for column {}", + self->_name); + } } + return Status::OK(); } - } - const auto* key_rep_levels = key_record_reader->rep_levels(); - const auto* value_rep_levels = 
value_record_reader->rep_levels(); - if ((key_rep_levels == nullptr || value_rep_levels == nullptr) && levels_written > 0) { - return Status::Corruption( - "Parquet MAP reader returned null repetition levels for column {}", _name); - } - for (int64_t level_idx = 0; level_idx < levels_written; ++level_idx) { - if (key_rep_levels[level_idx] != value_rep_levels[level_idx]) { - return Status::Corruption( - "Parquet MAP key/value repetition levels are not aligned for column {}", _name); + Status start_batch(const NestedScalarBatch& key_batch) { + RETURN_IF_ERROR(read_value_batch(key_batch.records_read, &value_batch)); + RETURN_IF_ERROR(validate_value_alignment(key_batch, value_batch)); + return Status::OK(); } - } - auto& map_column = assert_cast(*column); - auto key_column = map_column.get_keys_ptr()->assume_mutable(); - RETURN_IF_ERROR(append_scalar_values(*key_reader, *key_record_reader, levels_written, nullptr, - key_column)); - map_column.get_keys_ptr() = std::move(key_column); - - auto value_column = map_column.get_values_ptr()->assume_mutable(); - RETURN_IF_ERROR(append_scalar_values(*value_reader, *value_record_reader, levels_written, - nullptr, value_column)); - map_column.get_values_ptr() = std::move(value_column); - - auto& offsets = map_column.get_offsets(); - offsets.reserve(offsets.size() + static_cast(records_read)); - size_t current_offset = offsets.empty() ? 
0 : offsets.back(); - int64_t current_record = 0; - for (int64_t level_idx = 0; level_idx < levels_written; ++level_idx) { - if (level_idx == 0 || key_rep_levels[level_idx] < _repeated_repetition_level) { - if (level_idx != 0) { - offsets.push_back(current_offset); - current_record++; + Status start_parent(const NestedScalarBatch& key_batch, int64_t level_idx) { + const int16_t def_level = key_batch.def_levels[level_idx]; + if (def_level < self->_nullable_definition_level) { + if (!self->_type->is_nullable()) { + return Status::Corruption( + "Parquet MAP column {} contains null for non-nullable map", + self->_name); + } + entry_counts->push_back(0); + parent_nulls->push_back(1); + return Status::OK(); } + entry_counts->push_back(0); + parent_nulls->push_back(0); + if (def_level == self->_nullable_definition_level) { + return Status::OK(); + } + return append_entry(key_batch, level_idx); } - current_offset++; - } - while (current_record < records_read) { - offsets.push_back(current_offset); - current_record++; - } - *rows_read = records_read; + + Status append_repeated(const NestedScalarBatch& key_batch, int64_t level_idx) { + if (entry_counts->empty()) { + return Status::Corruption("Invalid repeated MAP level for column {}", self->_name); + } + return append_entry(key_batch, level_idx); + } + + Status append_entry(const NestedScalarBatch& key_batch, int64_t level_idx) { + if (key_batch.def_levels[level_idx] != key_max_definition_level) { + return Status::Corruption("Parquet MAP column {} contains null map key", + self->_name); + } + RETURN_IF_ERROR( + append_scalar_batch_value(*key_reader, key_batch, level_idx, *key_column)); + if (value_batch.def_levels[level_idx] == value_max_definition_level) { + RETURN_IF_ERROR(append_scalar_batch_value(*value_reader, value_batch, level_idx, + *value_column)); + } else { + if (!value_reader->type()->is_nullable()) { + return Status::Corruption( + "Parquet MAP column {} contains null for non-nullable value", + self->_name); + } 
+ (*value_column)->insert_default(); + } + ++entry_counts->back(); + return Status::OK(); + } + + NestedScalarBatch value_batch; + }; + + MapSink sink {this, + key_reader, + value_reader, + &key_column, + &value_column, + &entry_counts, + &parent_nulls, + key_max_definition_level, + value_max_definition_level}; + RETURN_IF_ERROR(assemble_repeated_levels(*key_reader, _repeated_repetition_level, + entry_definition_level, rows, &_key_overflow, sink, + rows_read)); + if (!_key_overflow.empty()) { + move_nested_scalar_tail( + sink.value_batch, + sink.value_batch.levels_written - _key_overflow.batch.levels_written, + &_value_overflow); + } + + map_column->get_keys_ptr() = std::move(key_column); + map_column->get_values_ptr() = std::move(value_column); + append_offsets(map_column->get_offsets(), entry_counts); + append_parent_nulls(parent_null_map, parent_nulls); return Status::OK(); } @@ -722,8 +1040,73 @@ Status MapColumnReader::skip(int64_t rows) { } DORIS_CHECK(_key_reader != nullptr); DORIS_CHECK(_value_reader != nullptr); - RETURN_IF_ERROR(_key_reader->skip(rows)); - return _value_reader->skip(rows); + auto* key_reader = dynamic_cast(_key_reader.get()); + auto* value_reader = dynamic_cast(_value_reader.get()); + if (key_reader == nullptr || value_reader == nullptr) { + return Status::NotSupported( + "Current parquet MAP reader only supports scalar key/value for column {}", _name); + } + struct SkipSink { + MapColumnReader* self = nullptr; + ScalarColumnReader* value_reader = nullptr; + + Status read_value_batch(int64_t batch_rows, NestedScalarBatch* value_batch) { + if (!self->_value_overflow.empty()) { + *value_batch = std::move(self->_value_overflow.batch); + self->_value_overflow.clear(); + return Status::OK(); + } + return read_nested_scalar_batch(*value_reader, batch_rows, + self->_nullable_definition_level + 1, value_batch); + } + + Status validate_value_alignment(const NestedScalarBatch& key_batch, + const NestedScalarBatch& value_batch) { + if 
(value_batch.records_read != key_batch.records_read || + value_batch.levels_written != key_batch.levels_written) { + return Status::Corruption( + "Parquet MAP key/value levels are not aligned for column {} while " + "skipping", + self->_name); + } + for (int64_t level_idx = 0; level_idx < key_batch.levels_written; ++level_idx) { + if (value_batch.rep_levels[level_idx] != key_batch.rep_levels[level_idx]) { + return Status::Corruption( + "Parquet MAP key/value repetition levels are not aligned for column {}", + self->_name); + } + } + return Status::OK(); + } + + Status start_batch(const NestedScalarBatch& key_batch) { + RETURN_IF_ERROR(read_value_batch(key_batch.records_read, &value_batch)); + RETURN_IF_ERROR(validate_value_alignment(key_batch, value_batch)); + return Status::OK(); + } + + Status start_parent(const NestedScalarBatch&, int64_t) { return Status::OK(); } + + Status append_repeated(const NestedScalarBatch&, int64_t) { return Status::OK(); } + + NestedScalarBatch value_batch; + }; + SkipSink sink {this, value_reader}; + int64_t rows_read = 0; + RETURN_IF_ERROR(assemble_repeated_levels(*key_reader, _repeated_repetition_level, + _nullable_definition_level + 1, rows, &_key_overflow, + sink, &rows_read)); + if (!_key_overflow.empty()) { + move_nested_scalar_tail( + sink.value_batch, + sink.value_batch.levels_written - _key_overflow.batch.levels_written, + &_value_overflow); + } + if (rows_read != rows) { + return Status::Corruption("Failed to skip parquet MAP column {}: skipped {} of {} rows", + _name, rows_read, rows); + } + return Status::OK(); } Status ParquetColumnReader::skip(int64_t rows) { @@ -954,10 +1337,6 @@ Status ParquetColumnReaderFactory::create_list_column_reader( return Status::NotSupported("Parquet LIST projection is not implemented for column {}", column_schema.name); } - if (column_schema.type != nullptr && column_schema.type->is_nullable()) { - return Status::NotSupported("Nullable parquet LIST reader is not implemented for column {}", - 
column_schema.name); - } if (column_schema.children.size() != 1) { return Status::NotSupported("Unsupported parquet LIST layout for column {}", column_schema.name); @@ -980,10 +1359,6 @@ Status ParquetColumnReaderFactory::create_map_column_reader( return Status::NotSupported("Parquet MAP projection is not implemented for column {}", column_schema.name); } - if (column_schema.type != nullptr && column_schema.type->is_nullable()) { - return Status::NotSupported("Nullable parquet MAP reader is not implemented for column {}", - column_schema.name); - } if (column_schema.children.size() != 1 || column_schema.children[0]->children.size() != 2) { return Status::NotSupported("Unsupported parquet MAP layout for column {}", column_schema.name); diff --git a/be/test/format/new_parquet/parquet_column_reader_test.cpp b/be/test/format/new_parquet/parquet_column_reader_test.cpp index 4a187f4f8e0d18..50aa801f4c70e3 100644 --- a/be/test/format/new_parquet/parquet_column_reader_test.cpp +++ b/be/test/format/new_parquet/parquet_column_reader_test.cpp @@ -163,6 +163,38 @@ class ParquetColumnReaderTest : public testing::Test { return finish_array(&builder); } + std::shared_ptr build_nullable_int_list_array() { + auto value_builder = std::make_shared(); + arrow::ListBuilder builder(arrow::default_memory_pool(), value_builder); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(value_builder->Append(10).ok()); + EXPECT_TRUE(value_builder->Append(20).ok()); + EXPECT_TRUE(builder.AppendNull().ok()); + EXPECT_TRUE(builder.AppendEmptyValue().ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(value_builder->AppendNull().ok()); + EXPECT_TRUE(value_builder->Append(30).ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(value_builder->Append(40).ok()); + return finish_array(&builder); + } + + std::shared_ptr build_required_nullable_int_list_array() { + auto value_builder = std::make_shared(); + arrow::ListBuilder builder(arrow::default_memory_pool(), value_builder); + 
EXPECT_TRUE(builder.AppendEmptyValue().ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(value_builder->AppendNull().ok()); + EXPECT_TRUE(value_builder->Append(110).ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(value_builder->Append(120).ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(value_builder->Append(130).ok()); + EXPECT_TRUE(value_builder->AppendNull().ok()); + EXPECT_TRUE(builder.Append().ok()); + return finish_array(&builder); + } + std::shared_ptr build_required_int_string_map_array() { auto key_builder = std::make_shared(); auto value_builder = std::make_shared(); @@ -183,6 +215,50 @@ class ParquetColumnReaderTest : public testing::Test { return finish_array(&builder); } + std::shared_ptr build_nullable_int_string_map_array() { + auto key_builder = std::make_shared(); + auto value_builder = std::make_shared(); + auto map_type = arrow::map(arrow::int32(), arrow::field("value", arrow::utf8(), true)); + arrow::MapBuilder builder(arrow::default_memory_pool(), key_builder, value_builder, + map_type); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(key_builder->Append(10).ok()); + EXPECT_TRUE(value_builder->Append("aa").ok()); + EXPECT_TRUE(key_builder->Append(20).ok()); + EXPECT_TRUE(value_builder->AppendNull().ok()); + EXPECT_TRUE(builder.AppendNull().ok()); + EXPECT_TRUE(builder.AppendEmptyValue().ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(key_builder->Append(30).ok()); + EXPECT_TRUE(value_builder->Append("cc").ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(key_builder->Append(40).ok()); + EXPECT_TRUE(value_builder->AppendNull().ok()); + return finish_array(&builder); + } + + std::shared_ptr build_required_nullable_string_map_array() { + auto key_builder = std::make_shared(); + auto value_builder = std::make_shared(); + auto map_type = arrow::map(arrow::int32(), arrow::field("value", arrow::utf8(), true)); + arrow::MapBuilder builder(arrow::default_memory_pool(), key_builder, value_builder, + 
map_type); + EXPECT_TRUE(builder.AppendEmptyValue().ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(key_builder->Append(101).ok()); + EXPECT_TRUE(value_builder->AppendNull().ok()); + EXPECT_TRUE(key_builder->Append(102).ok()); + EXPECT_TRUE(value_builder->Append("bb").ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(key_builder->Append(103).ok()); + EXPECT_TRUE(value_builder->Append("cc").ok()); + EXPECT_TRUE(builder.AppendEmptyValue().ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(key_builder->Append(104).ok()); + EXPECT_TRUE(value_builder->AppendNull().ok()); + return finish_array(&builder); + } + std::shared_ptr build_time32_array(const std::shared_ptr& type, const std::vector& values) { arrow::Time32Builder builder(type, arrow::default_memory_pool()); @@ -430,6 +506,57 @@ class ParquetColumnReaderTest : public testing::Test { EXPECT_EQ(values.get_element(5), 6); EXPECT_EQ(values.get_element(8), 9); }); + add_field(arrow::field("nullable_list_int_col", + arrow::list(arrow::field("element", arrow::int32(), true)), true), + build_nullable_int_list_array(), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_TRUE(schema.type->is_nullable()); + const auto& nullable_column = assert_cast(column); + ASSERT_EQ(nullable_column.size(), ROW_COUNT); + EXPECT_FALSE(nullable_column.is_null_at(0)); + EXPECT_TRUE(nullable_column.is_null_at(1)); + EXPECT_FALSE(nullable_column.is_null_at(2)); + EXPECT_FALSE(nullable_column.is_null_at(3)); + const auto& array_column = + assert_cast(nullable_column.get_nested_column()); + const auto& offsets = array_column.get_offsets(); + ASSERT_EQ(offsets.size(), ROW_COUNT); + EXPECT_EQ(offsets[0], 2); + EXPECT_EQ(offsets[1], 2); + EXPECT_EQ(offsets[2], 2); + EXPECT_EQ(offsets[3], 4); + EXPECT_EQ(offsets[4], 5); + const auto& elements = + assert_cast(array_column.get_data()); + const auto& values = + assert_cast(elements.get_nested_column()); + ASSERT_EQ(elements.size(), 5); + 
EXPECT_EQ(values.get_element(0), 10); + EXPECT_EQ(values.get_element(1), 20); + EXPECT_TRUE(elements.is_null_at(2)); + EXPECT_EQ(values.get_element(3), 30); + EXPECT_EQ(values.get_element(4), 40); + }); + add_field(arrow::field("required_nullable_list_int_col", + arrow::list(arrow::field("element", arrow::int32(), true)), false), + build_required_nullable_int_list_array(), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_FALSE(schema.type->is_nullable()); + const auto& array_column = assert_cast(column); + const auto& offsets = array_column.get_offsets(); + ASSERT_EQ(offsets.size(), ROW_COUNT); + EXPECT_EQ(offsets[0], 0); + EXPECT_EQ(offsets[1], 2); + EXPECT_EQ(offsets[2], 3); + EXPECT_EQ(offsets[3], 5); + EXPECT_EQ(offsets[4], 5); + const auto& elements = + assert_cast(array_column.get_data()); + ASSERT_EQ(elements.size(), 5); + EXPECT_TRUE(elements.is_null_at(0)); + EXPECT_FALSE(elements.is_null_at(1)); + EXPECT_TRUE(elements.is_null_at(4)); + }); add_field(arrow::field( "map_int_string_col", arrow::map(arrow::int32(), arrow::field("value", arrow::utf8(), false)), @@ -465,6 +592,63 @@ class ParquetColumnReaderTest : public testing::Test { EXPECT_EQ(values.get_data_at(5).to_string(), "f"); EXPECT_EQ(values.get_data_at(8).to_string(), "i"); }); + add_field( + arrow::field("nullable_map_int_string_col", + arrow::map(arrow::int32(), arrow::field("value", arrow::utf8(), true)), + true), + build_nullable_int_string_map_array(), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_TRUE(schema.type->is_nullable()); + const auto& nullable_column = assert_cast(column); + ASSERT_EQ(nullable_column.size(), ROW_COUNT); + EXPECT_FALSE(nullable_column.is_null_at(0)); + EXPECT_TRUE(nullable_column.is_null_at(1)); + EXPECT_FALSE(nullable_column.is_null_at(2)); + const auto& map_column = + assert_cast(nullable_column.get_nested_column()); + const auto& offsets = map_column.get_offsets(); + ASSERT_EQ(offsets.size(), ROW_COUNT); + 
EXPECT_EQ(offsets[0], 2); + EXPECT_EQ(offsets[1], 2); + EXPECT_EQ(offsets[2], 2); + EXPECT_EQ(offsets[3], 3); + EXPECT_EQ(offsets[4], 4); + const auto& keys = assert_cast(map_column.get_keys()); + const auto& values = + assert_cast(map_column.get_values()); + const auto& value_data = + assert_cast(values.get_nested_column()); + ASSERT_EQ(keys.size(), 4); + EXPECT_EQ(keys.get_element(0), 10); + EXPECT_EQ(keys.get_element(1), 20); + EXPECT_EQ(keys.get_element(3), 40); + EXPECT_EQ(value_data.get_data_at(0).to_string(), "aa"); + EXPECT_TRUE(values.is_null_at(1)); + EXPECT_EQ(value_data.get_data_at(2).to_string(), "cc"); + EXPECT_TRUE(values.is_null_at(3)); + }); + add_field( + arrow::field("required_nullable_map_int_string_col", + arrow::map(arrow::int32(), arrow::field("value", arrow::utf8(), true)), + false), + build_required_nullable_string_map_array(), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_FALSE(schema.type->is_nullable()); + const auto& map_column = assert_cast(column); + const auto& offsets = map_column.get_offsets(); + ASSERT_EQ(offsets.size(), ROW_COUNT); + EXPECT_EQ(offsets[0], 0); + EXPECT_EQ(offsets[1], 2); + EXPECT_EQ(offsets[2], 3); + EXPECT_EQ(offsets[3], 3); + EXPECT_EQ(offsets[4], 4); + const auto& values = + assert_cast(map_column.get_values()); + ASSERT_EQ(values.size(), 4); + EXPECT_TRUE(values.is_null_at(0)); + EXPECT_FALSE(values.is_null_at(1)); + EXPECT_TRUE(values.is_null_at(3)); + }); auto schema = arrow::schema(_arrow_fields); auto table = arrow::Table::Make(schema, _arrays); @@ -531,6 +715,16 @@ TEST_F(ParquetColumnReaderTest, ReadAllSupportedPhysicalAndLogicalTypes) { } } +TEST_F(ParquetColumnReaderTest, ReadSupportedComplexTypes) { + read_and_validate(find_field_idx("struct_col")); + read_and_validate(find_field_idx("list_int_col")); + read_and_validate(find_field_idx("nullable_list_int_col")); + read_and_validate(find_field_idx("required_nullable_list_int_col")); + 
read_and_validate(find_field_idx("map_int_string_col")); + read_and_validate(find_field_idx("nullable_map_int_string_col")); + read_and_validate(find_field_idx("required_nullable_map_int_string_col")); +} + TEST_F(ParquetColumnReaderTest, SkipThenRead) { auto reader = create_reader(1); auto st = reader->skip(2); @@ -604,6 +798,134 @@ TEST_F(ParquetColumnReaderTest, ReadProjectedStructChildren) { EXPECT_EQ(values.get_data_at(4).to_string(), "se"); } +TEST_F(ParquetColumnReaderTest, ReadListWithOverflowAcrossChunks) { + const auto field_idx = find_field_idx("nullable_list_int_col"); + auto reader = create_reader(field_idx); + MutableColumnPtr column = reader->type()->create_column(); + + int64_t rows_read = 0; + auto st = reader->read(2, column, &rows_read); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(rows_read, 2); + st = reader->read(3, column, &rows_read); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(rows_read, 3); + + _expected_by_field[field_idx](*_fields[field_idx], *column); +} + +TEST_F(ParquetColumnReaderTest, SkipListWithOverflowThenRead) { + const auto field_idx = find_field_idx("nullable_list_int_col"); + auto reader = create_reader(field_idx); + auto st = reader->skip(1); + ASSERT_TRUE(st.ok()) << st; + + MutableColumnPtr column = reader->type()->create_column(); + int64_t rows_read = 0; + st = reader->read(3, column, &rows_read); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(rows_read, 3); + + const auto& nullable_column = assert_cast(*column); + ASSERT_EQ(nullable_column.size(), 3); + EXPECT_TRUE(nullable_column.is_null_at(0)); + const auto& array_column = assert_cast(nullable_column.get_nested_column()); + const auto& offsets = array_column.get_offsets(); + ASSERT_EQ(offsets.size(), 3); + EXPECT_EQ(offsets[0], 0); + EXPECT_EQ(offsets[1], 0); + EXPECT_EQ(offsets[2], 2); +} + +TEST_F(ParquetColumnReaderTest, SelectListWithOverflow) { + const auto field_idx = find_field_idx("nullable_list_int_col"); + auto reader = create_reader(field_idx); + SelectionVector 
selection(3); + selection.set_index(0, 0); + selection.set_index(1, 3); + selection.set_index(2, 4); + + MutableColumnPtr column = reader->type()->create_column(); + auto st = reader->select(selection, 3, ROW_COUNT, column); + ASSERT_TRUE(st.ok()) << st; + + const auto& nullable_column = assert_cast(*column); + ASSERT_EQ(nullable_column.size(), 3); + EXPECT_FALSE(nullable_column.is_null_at(0)); + EXPECT_FALSE(nullable_column.is_null_at(1)); + EXPECT_FALSE(nullable_column.is_null_at(2)); + const auto& array_column = assert_cast(nullable_column.get_nested_column()); + const auto& offsets = array_column.get_offsets(); + ASSERT_EQ(offsets.size(), 3); + EXPECT_EQ(offsets[0], 2); + EXPECT_EQ(offsets[1], 4); + EXPECT_EQ(offsets[2], 5); +} + +TEST_F(ParquetColumnReaderTest, ReadMapWithOverflowAcrossChunks) { + const auto field_idx = find_field_idx("nullable_map_int_string_col"); + auto reader = create_reader(field_idx); + MutableColumnPtr column = reader->type()->create_column(); + + int64_t rows_read = 0; + auto st = reader->read(2, column, &rows_read); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(rows_read, 2); + st = reader->read(3, column, &rows_read); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(rows_read, 3); + + _expected_by_field[field_idx](*_fields[field_idx], *column); +} + +TEST_F(ParquetColumnReaderTest, SkipMapWithOverflowThenRead) { + const auto field_idx = find_field_idx("nullable_map_int_string_col"); + auto reader = create_reader(field_idx); + auto st = reader->skip(1); + ASSERT_TRUE(st.ok()) << st; + + MutableColumnPtr column = reader->type()->create_column(); + int64_t rows_read = 0; + st = reader->read(3, column, &rows_read); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(rows_read, 3); + + const auto& nullable_column = assert_cast(*column); + ASSERT_EQ(nullable_column.size(), 3); + EXPECT_TRUE(nullable_column.is_null_at(0)); + const auto& map_column = assert_cast(nullable_column.get_nested_column()); + const auto& offsets = map_column.get_offsets(); + 
ASSERT_EQ(offsets.size(), 3); + EXPECT_EQ(offsets[0], 0); + EXPECT_EQ(offsets[1], 0); + EXPECT_EQ(offsets[2], 1); +} + +TEST_F(ParquetColumnReaderTest, SelectMapWithOverflow) { + const auto field_idx = find_field_idx("nullable_map_int_string_col"); + auto reader = create_reader(field_idx); + SelectionVector selection(3); + selection.set_index(0, 0); + selection.set_index(1, 3); + selection.set_index(2, 4); + + MutableColumnPtr column = reader->type()->create_column(); + auto st = reader->select(selection, 3, ROW_COUNT, column); + ASSERT_TRUE(st.ok()) << st; + + const auto& nullable_column = assert_cast(*column); + ASSERT_EQ(nullable_column.size(), 3); + EXPECT_FALSE(nullable_column.is_null_at(0)); + EXPECT_FALSE(nullable_column.is_null_at(1)); + EXPECT_FALSE(nullable_column.is_null_at(2)); + const auto& map_column = assert_cast(nullable_column.get_nested_column()); + const auto& offsets = map_column.get_offsets(); + ASSERT_EQ(offsets.size(), 3); + EXPECT_EQ(offsets[0], 2); + EXPECT_EQ(offsets[1], 3); + EXPECT_EQ(offsets[2], 4); +} + TEST_F(ParquetColumnReaderTest, BuildComplexSchemaPathMetadata) { const auto field_idx = find_field_idx("struct_col"); ASSERT_LT(field_idx, _fields.size()); From 68a29c54444960ec38011874a449c8a86dba53aa Mon Sep 17 00:00:00 2001 From: Socrates Date: Thu, 28 May 2026 15:52:04 +0800 Subject: [PATCH 26/38] [fix](be) Fix parquet map reader build warnings ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Fix warning-as-error failures in the new parquet map reader caused by shadowed local names and aggregate initialization after adding the nested level assembler. ### Release note None ### Check List (For Author) - Test: Manual test - Ran git diff --check locally. Fedora DEBUG BE build will be run after pushing this commit. 
- Behavior changed: No - Does this need documentation: No --- be/src/format/new_parquet/column_reader.cpp | 55 +++++++++++---------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/be/src/format/new_parquet/column_reader.cpp b/be/src/format/new_parquet/column_reader.cpp index b4cc00caa98767..9d9ac98ced99e1 100644 --- a/be/src/format/new_parquet/column_reader.cpp +++ b/be/src/format/new_parquet/column_reader.cpp @@ -920,28 +920,29 @@ Status MapColumnReader::read(int64_t rows, MutableColumnPtr& column, int64_t* ro int16_t key_max_definition_level = 0; int16_t value_max_definition_level = 0; - Status read_value_batch(int64_t batch_rows, NestedScalarBatch* value_batch) { + Status read_value_batch(int64_t batch_rows, NestedScalarBatch* out_value_batch) { if (!self->_value_overflow.empty()) { - *value_batch = std::move(self->_value_overflow.batch); + *out_value_batch = std::move(self->_value_overflow.batch); self->_value_overflow.clear(); return Status::OK(); } return read_nested_scalar_batch(*value_reader, batch_rows, - self->_nullable_definition_level + 1, value_batch); + self->_nullable_definition_level + 1, out_value_batch); } Status validate_value_alignment(const NestedScalarBatch& key_batch, - const NestedScalarBatch& value_batch) { - if (value_batch.records_read != key_batch.records_read || - value_batch.levels_written != key_batch.levels_written) { + const NestedScalarBatch& candidate_value_batch) { + if (candidate_value_batch.records_read != key_batch.records_read || + candidate_value_batch.levels_written != key_batch.levels_written) { return Status::Corruption( "Parquet MAP key/value levels are not aligned for column {}: key rows={}, " "key levels={}, value rows={}, value levels={}", self->_name, key_batch.records_read, key_batch.levels_written, - value_batch.records_read, value_batch.levels_written); + candidate_value_batch.records_read, candidate_value_batch.levels_written); } for (int64_t level_idx = 0; level_idx < key_batch.levels_written; 
++level_idx) { - if (value_batch.rep_levels[level_idx] != key_batch.rep_levels[level_idx]) { + if (candidate_value_batch.rep_levels[level_idx] != + key_batch.rep_levels[level_idx]) { return Status::Corruption( "Parquet MAP key/value repetition levels are not aligned for column {}", self->_name); @@ -1008,15 +1009,16 @@ Status MapColumnReader::read(int64_t rows, MutableColumnPtr& column, int64_t* ro NestedScalarBatch value_batch; }; - MapSink sink {this, - key_reader, - value_reader, - &key_column, - &value_column, - &entry_counts, - &parent_nulls, - key_max_definition_level, - value_max_definition_level}; + MapSink sink; + sink.self = this; + sink.key_reader = key_reader; + sink.value_reader = value_reader; + sink.key_column = &key_column; + sink.value_column = &value_column; + sink.entry_counts = &entry_counts; + sink.parent_nulls = &parent_nulls; + sink.key_max_definition_level = key_max_definition_level; + sink.value_max_definition_level = value_max_definition_level; RETURN_IF_ERROR(assemble_repeated_levels(*key_reader, _repeated_repetition_level, entry_definition_level, rows, &_key_overflow, sink, rows_read)); @@ -1050,27 +1052,28 @@ Status MapColumnReader::skip(int64_t rows) { MapColumnReader* self = nullptr; ScalarColumnReader* value_reader = nullptr; - Status read_value_batch(int64_t batch_rows, NestedScalarBatch* value_batch) { + Status read_value_batch(int64_t batch_rows, NestedScalarBatch* out_value_batch) { if (!self->_value_overflow.empty()) { - *value_batch = std::move(self->_value_overflow.batch); + *out_value_batch = std::move(self->_value_overflow.batch); self->_value_overflow.clear(); return Status::OK(); } return read_nested_scalar_batch(*value_reader, batch_rows, - self->_nullable_definition_level + 1, value_batch); + self->_nullable_definition_level + 1, out_value_batch); } Status validate_value_alignment(const NestedScalarBatch& key_batch, - const NestedScalarBatch& value_batch) { - if (value_batch.records_read != key_batch.records_read || - 
value_batch.levels_written != key_batch.levels_written) { + const NestedScalarBatch& candidate_value_batch) { + if (candidate_value_batch.records_read != key_batch.records_read || + candidate_value_batch.levels_written != key_batch.levels_written) { return Status::Corruption( "Parquet MAP key/value levels are not aligned for column {} while " "skipping", self->_name); } for (int64_t level_idx = 0; level_idx < key_batch.levels_written; ++level_idx) { - if (value_batch.rep_levels[level_idx] != key_batch.rep_levels[level_idx]) { + if (candidate_value_batch.rep_levels[level_idx] != + key_batch.rep_levels[level_idx]) { return Status::Corruption( "Parquet MAP key/value repetition levels are not aligned for column {}", self->_name); @@ -1091,7 +1094,9 @@ Status MapColumnReader::skip(int64_t rows) { NestedScalarBatch value_batch; }; - SkipSink sink {this, value_reader}; + SkipSink sink; + sink.self = this; + sink.value_reader = value_reader; int64_t rows_read = 0; RETURN_IF_ERROR(assemble_repeated_levels(*key_reader, _repeated_repetition_level, _nullable_definition_level + 1, rows, &_key_overflow, From c3c9d3b28ba02688071f5cc0c065c775eb3353bd Mon Sep 17 00:00:00 2001 From: Socrates Date: Thu, 28 May 2026 17:16:33 +0800 Subject: [PATCH 27/38] [feature](be) Support parquet struct scalar assembly ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Support parquet STRUCT reading with scalar children through definition-level assembly, including nullable parent struct handling and projected struct child reads. ### Release note None ### Check List (For Author) - Test: Manual test - Ran git diff --check locally. - Ran BUILD_TYPE=DEBUG ./build.sh --be on Fedora. - Behavior changed: Yes - New parquet reader now supports nullable STRUCT columns with scalar children and projected scalar struct children. 
- Does this need documentation: No --- be/src/format/new_parquet/column_reader.cpp | 133 ++++++++++++++++-- .../parquet_column_reader_test.cpp | 113 +++++++++++++++ 2 files changed, 237 insertions(+), 9 deletions(-) diff --git a/be/src/format/new_parquet/column_reader.cpp b/be/src/format/new_parquet/column_reader.cpp index 9d9ac98ced99e1..37d1efa322ee01 100644 --- a/be/src/format/new_parquet/column_reader.cpp +++ b/be/src/format/new_parquet/column_reader.cpp @@ -110,6 +110,7 @@ class StructColumnReader final : public ParquetColumnReader { StructColumnReader(const ParquetColumnSchema& schema, DataTypePtr type, std::vector> children) : _field_id(schema.top_level_field_id), + _nullable_definition_level(schema.nullable_definition_level), _type(std::move(type)), _name(schema.name), _children(std::move(children)) {} @@ -124,6 +125,7 @@ class StructColumnReader final : public ParquetColumnReader { private: int _field_id = -1; + int16_t _nullable_definition_level = 0; DataTypePtr _type; std::string _name; std::vector> _children; @@ -586,6 +588,13 @@ ColumnMap* map_column_from_output(MutableColumnPtr& column) { return assert_cast(column.get()); } +ColumnStruct* struct_column_from_output(MutableColumnPtr& column) { + if (auto* nullable_column = check_and_get_column(*column)) { + return assert_cast(&nullable_column->get_nested_column()); + } + return assert_cast(column.get()); +} + NullMap* null_map_from_nullable_output(MutableColumnPtr& column) { if (auto* nullable_column = check_and_get_column(*column)) { return &nullable_column->get_null_map_data(); @@ -732,14 +741,120 @@ Status StructColumnReader::read(int64_t rows, MutableColumnPtr& column, int64_t* return Status::OK(); } + auto* struct_column = struct_column_from_output(column); + DORIS_CHECK(struct_column != nullptr); + auto* parent_null_map = null_map_from_nullable_output(column); + DCHECK_EQ(struct_column->get_columns().size(), _children.size()); + + std::vector scalar_children; + 
scalar_children.reserve(_children.size()); + bool all_scalar_children = true; + for (const auto& child_reader : _children) { + DORIS_CHECK(child_reader != nullptr); + auto* scalar_child = dynamic_cast(child_reader.get()); + if (scalar_child == nullptr) { + all_scalar_children = false; + break; + } + scalar_children.push_back(scalar_child); + } + if (all_scalar_children) { + std::vector child_batches(scalar_children.size()); + int64_t expected_rows = -1; + for (size_t child_idx = 0; child_idx < scalar_children.size(); ++child_idx) { + RETURN_IF_ERROR(read_nested_scalar_batch(*scalar_children[child_idx], rows, + _nullable_definition_level, + &child_batches[child_idx])); + if (expected_rows < 0) { + expected_rows = child_batches[child_idx].records_read; + } else if (child_batches[child_idx].records_read != expected_rows) { + return Status::Corruption( + "Parquet struct children returned different row counts in column {}: {} " + "vs {}", + _name, expected_rows, child_batches[child_idx].records_read); + } + if (child_batches[child_idx].levels_written != child_batches[child_idx].records_read) { + return Status::Corruption( + "Parquet struct child {} returned repeated levels in column {}", + scalar_children[child_idx]->name(), _name); + } + } + + if (expected_rows <= 0) { + *rows_read = 0; + return Status::OK(); + } + + std::vector child_columns; + child_columns.reserve(scalar_children.size()); + for (size_t child_idx = 0; child_idx < scalar_children.size(); ++child_idx) { + child_columns.push_back(struct_column->get_column_ptr(child_idx)->assume_mutable()); + } + + NullMap parent_nulls; + parent_nulls.reserve(static_cast(expected_rows)); + for (int64_t row_idx = 0; row_idx < expected_rows; ++row_idx) { + const bool parent_is_null = + child_batches[0].def_levels[row_idx] < _nullable_definition_level; + parent_nulls.push_back(parent_is_null); + for (size_t child_idx = 1; child_idx < child_batches.size(); ++child_idx) { + const bool child_parent_is_null = + 
child_batches[child_idx].def_levels[row_idx] < _nullable_definition_level; + if (child_parent_is_null != parent_is_null) { + return Status::Corruption( + "Parquet struct children returned different null parent shape in " + "column {}", + _name); + } + } + for (size_t child_idx = 0; child_idx < scalar_children.size(); ++child_idx) { + if (parent_is_null) { + child_columns[child_idx]->insert_default(); + } else { + if (!scalar_children[child_idx]->type()->is_nullable() && + child_batches[child_idx].def_levels[row_idx] != + scalar_children[child_idx]->descriptor()->max_definition_level()) { + return Status::Corruption( + "Parquet STRUCT column {} contains null for non-nullable child {}", + _name, scalar_children[child_idx]->name()); + } + RETURN_IF_ERROR(append_scalar_batch_value(*scalar_children[child_idx], + child_batches[child_idx], row_idx, + child_columns[child_idx])); + } + } + } + for (size_t child_idx = 0; child_idx < child_columns.size(); ++child_idx) { + struct_column->get_column_ptr(child_idx) = std::move(child_columns[child_idx]); + } + if (parent_null_map == nullptr) { + for (const auto parent_is_null : parent_nulls) { + if (parent_is_null) { + return Status::Corruption( + "Parquet STRUCT column {} contains null for non-nullable struct", + _name); + } + } + } else { + append_parent_nulls(parent_null_map, parent_nulls); + } + *rows_read = expected_rows; + return Status::OK(); + } + + if (parent_null_map != nullptr) { + return Status::NotSupported( + "Current parquet nullable STRUCT reader only supports scalar children for column " + "{}", + _name); + } + int64_t expected_rows = -1; size_t child_idx = 0; - DCHECK_EQ(assert_cast(*column).get_columns().size(), _children.size()); for (auto& child_reader : _children) { DORIS_CHECK(child_reader != nullptr); int64_t child_rows = 0; - auto child_column = - assert_cast(*column).get_column_ptr(child_idx)->assume_mutable(); + auto child_column = struct_column->get_column_ptr(child_idx)->assume_mutable(); 
RETURN_IF_ERROR(child_reader->read(rows, child_column, &child_rows)); if (expected_rows < 0) { expected_rows = child_rows; @@ -748,6 +863,7 @@ Status StructColumnReader::read(int64_t rows, MutableColumnPtr& column, int64_t* "Parquet struct children returned different row counts in column {}: {} vs {}", _name, expected_rows, child_rows); } + struct_column->get_column_ptr(child_idx) = std::move(child_column); child_idx++; } @@ -1288,11 +1404,6 @@ Status ParquetColumnReaderFactory::create_struct_column_reader( if (reader == nullptr) { return Status::InvalidArgument("reader is null"); } - if (column_schema.type != nullptr && column_schema.type->is_nullable()) { - return Status::NotSupported( - "Nullable parquet STRUCT reader is not implemented for column {}", - column_schema.name); - } std::vector> child_readers; child_readers.reserve(column_schema.children.size()); DataTypes projected_child_types; @@ -1311,7 +1422,11 @@ Status ParquetColumnReaderFactory::create_struct_column_reader( child_projection = &*it; } std::unique_ptr child_reader; - RETURN_IF_ERROR(create(*child_schema, child_projection, &child_reader)); + if (child_schema->kind == ParquetColumnSchemaKind::PRIMITIVE) { + RETURN_IF_ERROR(create_nested_scalar_column_reader(*child_schema, &child_reader)); + } else { + RETURN_IF_ERROR(create(*child_schema, child_projection, &child_reader)); + } projected_child_types.push_back(child_reader->type()); projected_child_names.push_back(child_reader->name()); child_readers.push_back(std::move(child_reader)); diff --git a/be/test/format/new_parquet/parquet_column_reader_test.cpp b/be/test/format/new_parquet/parquet_column_reader_test.cpp index 50aa801f4c70e3..059e9b709aa4ad 100644 --- a/be/test/format/new_parquet/parquet_column_reader_test.cpp +++ b/be/test/format/new_parquet/parquet_column_reader_test.cpp @@ -148,6 +148,33 @@ class ParquetColumnReaderTest : public testing::Test { return finish_array(&builder); } + std::shared_ptr build_nullable_struct_array() { + auto 
struct_type = arrow::struct_( + {arrow::field("a", arrow::int32(), false), arrow::field("b", arrow::utf8(), true)}); + std::vector> field_builders; + auto a_array_builder = std::make_unique(); + field_builders.push_back(std::shared_ptr(std::move(a_array_builder))); + auto b_array_builder = std::make_unique(); + field_builders.push_back(std::shared_ptr(std::move(b_array_builder))); + arrow::StructBuilder builder(struct_type, arrow::default_memory_pool(), + std::move(field_builders)); + auto* a_builder = assert_cast(builder.field_builder(0)); + auto* b_builder = assert_cast(builder.field_builder(1)); + + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(a_builder->Append(201).ok()); + EXPECT_TRUE(b_builder->Append("nsa").ok()); + EXPECT_TRUE(builder.AppendNull().ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(a_builder->Append(203).ok()); + EXPECT_TRUE(b_builder->AppendNull().ok()); + EXPECT_TRUE(builder.Append().ok()); + EXPECT_TRUE(a_builder->Append(204).ok()); + EXPECT_TRUE(b_builder->Append("nsd").ok()); + EXPECT_TRUE(builder.AppendNull().ok()); + return finish_array(&builder); + } + std::shared_ptr build_required_int_list_array() { auto value_builder = std::make_shared(); arrow::ListBuilder builder(arrow::default_memory_pool(), value_builder); @@ -480,6 +507,41 @@ class ParquetColumnReaderTest : public testing::Test { EXPECT_EQ(b_values.get_data_at(1).to_string(), "sb"); EXPECT_EQ(b_values.get_data_at(4).to_string(), "se"); }); + add_field(arrow::field("nullable_struct_col", + arrow::struct_({ + arrow::field("a", arrow::int32(), false), + arrow::field("b", arrow::utf8(), true), + }), + true), + build_nullable_struct_array(), + [](const ParquetColumnSchema& schema, const IColumn& column) { + EXPECT_TRUE(schema.type->is_nullable()); + const auto& nullable_column = assert_cast(column); + ASSERT_EQ(nullable_column.size(), ROW_COUNT); + EXPECT_FALSE(nullable_column.is_null_at(0)); + EXPECT_TRUE(nullable_column.is_null_at(1)); + 
EXPECT_FALSE(nullable_column.is_null_at(2)); + EXPECT_FALSE(nullable_column.is_null_at(3)); + EXPECT_TRUE(nullable_column.is_null_at(4)); + + const auto& struct_column = + assert_cast(nullable_column.get_nested_column()); + ASSERT_EQ(struct_column.get_columns().size(), 2); + const auto& a_values = + assert_cast(struct_column.get_column(0)); + const auto& b_values = + assert_cast(struct_column.get_column(1)); + const auto& b_nested = + assert_cast(b_values.get_nested_column()); + EXPECT_EQ(a_values.get_element(0), 201); + EXPECT_EQ(a_values.get_element(2), 203); + EXPECT_EQ(a_values.get_element(3), 204); + EXPECT_FALSE(b_values.is_null_at(0)); + EXPECT_TRUE(b_values.is_null_at(2)); + EXPECT_FALSE(b_values.is_null_at(3)); + EXPECT_EQ(b_nested.get_data_at(0).to_string(), "nsa"); + EXPECT_EQ(b_nested.get_data_at(3).to_string(), "nsd"); + }); add_field(arrow::field("list_int_col", arrow::list(arrow::field("element", arrow::int32(), false)), false), build_required_int_list_array(), @@ -717,6 +779,7 @@ TEST_F(ParquetColumnReaderTest, ReadAllSupportedPhysicalAndLogicalTypes) { TEST_F(ParquetColumnReaderTest, ReadSupportedComplexTypes) { read_and_validate(find_field_idx("struct_col")); + read_and_validate(find_field_idx("nullable_struct_col")); read_and_validate(find_field_idx("list_int_col")); read_and_validate(find_field_idx("nullable_list_int_col")); read_and_validate(find_field_idx("required_nullable_list_int_col")); @@ -798,6 +861,56 @@ TEST_F(ParquetColumnReaderTest, ReadProjectedStructChildren) { EXPECT_EQ(values.get_data_at(4).to_string(), "se"); } +TEST_F(ParquetColumnReaderTest, ReadProjectedNullableStructChildren) { + const auto field_idx = find_field_idx("nullable_struct_col"); + ASSERT_LT(field_idx, _fields.size()); + const auto& struct_schema = *_fields[field_idx]; + ASSERT_EQ(struct_schema.name, "nullable_struct_col"); + ASSERT_EQ(struct_schema.children.size(), 2); + + reader::FieldProjection projection; + projection.file_column_id = 
struct_schema.top_level_field_id; + projection.file_path = struct_schema.file_path; + projection.project_all_children = false; + reader::FieldProjection child_projection; + child_projection.file_column_id = struct_schema.top_level_field_id; + child_projection.file_path = struct_schema.children[1]->file_path; + projection.children.push_back(std::move(child_projection)); + + ParquetColumnReaderFactory factory(_row_group, _file_reader->metadata()->num_columns()); + std::unique_ptr reader; + auto st = factory.create(struct_schema, &projection, &reader); + ASSERT_TRUE(st.ok()) << st; + ASSERT_TRUE(reader->type()->is_nullable()); + ASSERT_EQ(remove_nullable(reader->type())->get_primitive_type(), TYPE_STRUCT); + const auto* projected_type = + assert_cast(remove_nullable(reader->type()).get()); + ASSERT_EQ(projected_type->get_elements().size(), 1); + EXPECT_EQ(projected_type->get_element_name(0), "b"); + + MutableColumnPtr column = reader->type()->create_column(); + int64_t rows_read = 0; + st = reader->read(ROW_COUNT, column, &rows_read); + ASSERT_TRUE(st.ok()) << st; + ASSERT_EQ(rows_read, ROW_COUNT); + const auto& nullable_column = assert_cast(*column); + EXPECT_FALSE(nullable_column.is_null_at(0)); + EXPECT_TRUE(nullable_column.is_null_at(1)); + EXPECT_FALSE(nullable_column.is_null_at(2)); + EXPECT_FALSE(nullable_column.is_null_at(3)); + EXPECT_TRUE(nullable_column.is_null_at(4)); + const auto& struct_column = + assert_cast(nullable_column.get_nested_column()); + ASSERT_EQ(struct_column.get_columns().size(), 1); + const auto& values = assert_cast(struct_column.get_column(0)); + const auto& nested_values = assert_cast(values.get_nested_column()); + EXPECT_FALSE(values.is_null_at(0)); + EXPECT_TRUE(values.is_null_at(2)); + EXPECT_FALSE(values.is_null_at(3)); + EXPECT_EQ(nested_values.get_data_at(0).to_string(), "nsa"); + EXPECT_EQ(nested_values.get_data_at(3).to_string(), "nsd"); +} + TEST_F(ParquetColumnReaderTest, ReadListWithOverflowAcrossChunks) { const auto 
field_idx = find_field_idx("nullable_list_int_col"); auto reader = create_reader(field_idx); From 2cfd5031169f2f37ec7260958cbf928386366ffd Mon Sep 17 00:00:00 2001 From: Gabriel Date: Thu, 28 May 2026 17:36:46 +0800 Subject: [PATCH 28/38] [feature](be) Support Iceberg position delete predicates (#63799) Add file-layer DeletePredicate execution for Parquet row positions and wire IcebergTableReader v2 to convert Iceberg position deletes and deletion vectors into file-local deleted row positions. Equality delete files are detected and fail explicitly instead of being silently ignored. --- be/src/format/new_parquet/parquet_reader.cpp | 43 +- .../format/reader/expr/delete_predicate.cpp | 14 +- be/src/format/reader/table/paimon_reader.cpp | 35 +- be/src/format/reader/table/paimon_reader.h | 3 +- be/src/format/reader/table_reader.cpp | 118 ++++-- be/src/format/reader/table_reader.h | 163 +++++-- be/src/format/table/deletion_vector_reader.h | 6 + be/src/format/table/iceberg_reader_v2.cpp | 396 ++++++++++++++++- be/src/format/table/iceberg_reader_v2.h | 198 ++++----- .../new_parquet/parquet_reader_test.cpp | 94 ++++ be/test/format/reader/table_reader_test.cpp | 401 ++++++++++++++++++ 11 files changed, 1234 insertions(+), 237 deletions(-) diff --git a/be/src/format/new_parquet/parquet_reader.cpp b/be/src/format/new_parquet/parquet_reader.cpp index 043f155dd8588f..489e184cd2b889 100644 --- a/be/src/format/new_parquet/parquet_reader.cpp +++ b/be/src/format/new_parquet/parquet_reader.cpp @@ -327,18 +327,47 @@ Status ParquetReader::_execute_filter_conjuncts(int64_t batch_rows, Block* file_ // predicate columns in the file-local block have been materialized. 
for (const auto& expression_filter : _request->expression_filters) { if (expression_filter.conjunct == nullptr) { - continue; + if (expression_filter.delete_conjunct == nullptr) { + continue; + } + } else { + if (*selected_rows == 0) { + break; + } + IColumn::Filter filter(static_cast(batch_rows), 1); + bool can_filter_all = false; + RETURN_IF_ERROR(expression_filter.conjunct->execute_filter( + file_block, filter.data(), static_cast(batch_rows), false, + &can_filter_all)); + *selected_rows = can_filter_all ? 0 + : _apply_filter_to_selection(filter, selection, + *selected_rows); } if (*selected_rows == 0) { break; } - IColumn::Filter filter(static_cast(batch_rows), 1); - bool can_filter_all = false; - RETURN_IF_ERROR(expression_filter.conjunct->execute_filter(file_block, filter.data(), - static_cast(batch_rows), - false, &can_filter_all)); + if (expression_filter.delete_conjunct == nullptr) { + continue; + } + int result_column_id = -1; + RETURN_IF_ERROR(expression_filter.delete_conjunct->root()->execute( + expression_filter.delete_conjunct.get(), file_block, &result_column_id)); + DORIS_CHECK(result_column_id >= 0 && + result_column_id < static_cast(file_block->columns())); + const auto& delete_filter = assert_cast( + *file_block->get_by_position(result_column_id).column) + .get_data(); + DORIS_CHECK(delete_filter.size() == static_cast(batch_rows)); + IColumn::Filter keep_filter(static_cast(batch_rows), 1); + bool has_kept_row = false; + for (size_t row = 0; row < static_cast(batch_rows); ++row) { + keep_filter[row] = !delete_filter[row]; + has_kept_row |= keep_filter[row] != 0; + } + file_block->erase(result_column_id); *selected_rows = - can_filter_all ? 0 : _apply_filter_to_selection(filter, selection, *selected_rows); + !has_kept_row ? 
0 + : _apply_filter_to_selection(keep_filter, selection, *selected_rows); } return Status::OK(); } diff --git a/be/src/format/reader/expr/delete_predicate.cpp b/be/src/format/reader/expr/delete_predicate.cpp index 01844fa8a07069..31c6a057afd213 100644 --- a/be/src/format/reader/expr/delete_predicate.cpp +++ b/be/src/format/reader/expr/delete_predicate.cpp @@ -69,26 +69,26 @@ void DeletePredicate::close(VExprContext* context, FunctionContext::FunctionStat * Row IDs should be generated by file reader as a virtual column in `block`. **/ Status DeletePredicate::execute(VExprContext* context, Block* block, int* result_column_id) const { - if (block->empty()) { - return Status::OK(); - } - DCHECK(_open_finished || block == nullptr); if (_children.size() != 1) { return Status::InternalError(fmt::format( "DeletePredicate should have exactly 1 child expr, but got {}", _children.size())); } int slot = -1; RETURN_IF_ERROR(_children[0]->execute(context, block, &slot)); - const auto count = block->rows(); - auto res_col = ColumnBool::create(block->rows(), 0); const auto& row_ids = assert_cast(*block->get_by_position(slot).column).get_data(); - DCHECK_EQ(row_ids.size(), count); + const auto count = row_ids.size(); + auto res_col = ColumnBool::create(count, 0); if (_deleted_rows.empty()) { block->insert({std::move(res_col), std::make_shared(), expr_name()}); *result_column_id = static_cast(block->get_columns().size() - 1); return Status::OK(); } + if (count == 0) { + block->insert({std::move(res_col), std::make_shared(), expr_name()}); + *result_column_id = static_cast(block->get_columns().size() - 1); + return Status::OK(); + } const int64_t* delete_rows = _deleted_rows.data(); const int64_t* delete_rows_end = delete_rows + _deleted_rows.size(); const int64_t* start_pos = std::lower_bound(delete_rows, delete_rows_end, row_ids[0]); diff --git a/be/src/format/reader/table/paimon_reader.cpp b/be/src/format/reader/table/paimon_reader.cpp index 713d1a97e68983..d5c450b2c0172b 100644 
--- a/be/src/format/reader/table/paimon_reader.cpp +++ b/be/src/format/reader/table/paimon_reader.cpp @@ -17,26 +17,39 @@ #include "format/reader/table/paimon_reader.h" +#include +#include + #include "format/table/deletion_vector_reader.h" namespace doris::paimon { -bool PaimonReader::_parse_delete_file(const TTableFormatFileDesc& t_desc, DeleteFileDesc& desc) { +Status PaimonReader::_parse_deletion_vector_file(const TTableFormatFileDesc& t_desc, + DeleteFileDesc* desc, bool* has_delete_file) { + DORIS_CHECK(desc != nullptr); + DORIS_CHECK(has_delete_file != nullptr); + *has_delete_file = false; const auto& table_desc = t_desc.paimon_params; if (!table_desc.__isset.deletion_file) { - return false; + return Status::OK(); } const auto& deletion_file = table_desc.deletion_file; - desc.key.resize(deletion_file.path.size() + sizeof(deletion_file.offset)); - memcpy(desc.key.data(), deletion_file.path.data(), deletion_file.path.size()); - memcpy(desc.key.data() + deletion_file.path.size(), &deletion_file.offset, - sizeof(deletion_file.offset)); - desc.path = deletion_file.path; - desc.start_offset = deletion_file.offset; - desc.size = deletion_file.length + 4; - desc.file_size = -1; - return true; + const std::string key_prefix = "paimon_dv:"; + desc->key.resize(key_prefix.size() + deletion_file.path.size() + sizeof(deletion_file.offset)); + char* key_data = desc->key.data(); + memcpy(key_data, key_prefix.data(), key_prefix.size()); + key_data += key_prefix.size(); + memcpy(key_data, deletion_file.path.data(), deletion_file.path.size()); + key_data += deletion_file.path.size(); + memcpy(key_data, &deletion_file.offset, sizeof(deletion_file.offset)); + desc->path = deletion_file.path; + desc->start_offset = deletion_file.offset; + desc->size = deletion_file.length + 4; + desc->file_size = -1; + desc->format = DeleteFileDesc::Format::PAIMON; + *has_delete_file = true; + return Status::OK(); } } // namespace doris::paimon diff --git 
a/be/src/format/reader/table/paimon_reader.h b/be/src/format/reader/table/paimon_reader.h index d0f33c7a90c0b6..ce386460a6e681 100644 --- a/be/src/format/reader/table/paimon_reader.h +++ b/be/src/format/reader/table/paimon_reader.h @@ -30,7 +30,8 @@ class PaimonReader final : public reader::TableReader { ~PaimonReader() final = default; protected: - bool _parse_delete_file(const TTableFormatFileDesc& t_desc, DeleteFileDesc& desc) override; + Status _parse_deletion_vector_file(const TTableFormatFileDesc& t_desc, DeleteFileDesc* desc, + bool* has_delete_file) override; }; } // namespace doris::paimon diff --git a/be/src/format/reader/table_reader.cpp b/be/src/format/reader/table_reader.cpp index 86868b97b0bba3..0735cc51f383b7 100644 --- a/be/src/format/reader/table_reader.cpp +++ b/be/src/format/reader/table_reader.cpp @@ -20,16 +20,21 @@ #include #include +#include +#include #include #include +#include "common/cast_set.h" #include "common/status.h" #include "core/assert_cast.h" +#include "exec/common/endian.h" #include "exprs/vslot_ref.h" #include "format/new_parquet/parquet_reader.h" #include "format/reader/column_mapper.h" #include "format/table/deletion_vector_reader.h" #include "io/io_common.h" +#include "roaring/roaring64map.hh" namespace doris::reader { namespace { @@ -66,10 +71,63 @@ void build_table_filters_from_conjunct(const VExprSPtr& conjunct, table_filter.conjunct = VExprContext::create_shared(conjunct); table_filter.slot_ids.assign(slot_ids.begin(), slot_ids.end()); table_filters->push_back(std::move(table_filter)); - return; } } +Status parse_deletion_vector(const char* buf, size_t buffer_size, DeleteFileDesc::Format format, + DeleteRows* delete_rows) { + DORIS_CHECK(buf != nullptr); + DORIS_CHECK(delete_rows != nullptr); + DORIS_CHECK(format == DeleteFileDesc::Format::PAIMON || + format == DeleteFileDesc::Format::ICEBERG); + + const size_t checksum_size = format == DeleteFileDesc::Format::ICEBERG ? 
4 : 0; + if (buffer_size < 8 + checksum_size) [[unlikely]] { + return Status::DataQualityError("Deletion vector file size too small: {}", buffer_size); + } + + auto total_length = BigEndian::Load32(buf); + if (total_length + 4 + checksum_size != buffer_size) [[unlikely]] { + return Status::DataQualityError("Deletion vector length mismatch, expected: {}, actual: {}", + total_length + 4 + checksum_size, buffer_size); + } + + constexpr static char MAGIC_NUMBER[] = {'\xD1', '\xD3', '\x39', '\x64'}; + if (memcmp(buf + sizeof(total_length), MAGIC_NUMBER, 4) != 0) [[unlikely]] { + return Status::DataQualityError("Deletion vector magic number mismatch"); + } + + const char* bitmap_buf = buf + 8; + const size_t bitmap_size = buffer_size - 8 - checksum_size; + if (format == DeleteFileDesc::Format::PAIMON) { + roaring::Roaring bitmap; + try { + bitmap = roaring::Roaring::readSafe(bitmap_buf, bitmap_size); + } catch (const std::runtime_error& e) { + return Status::DataQualityError("Decode roaring bitmap failed, {}", e.what()); + } + + delete_rows->reserve(bitmap.cardinality()); + for (auto it = bitmap.begin(); it != bitmap.end(); it++) { + delete_rows->push_back(*it); + } + return Status::OK(); + } + + roaring::Roaring64Map bitmap; + try { + bitmap = roaring::Roaring64Map::readSafe(bitmap_buf, bitmap_size); + } catch (const std::runtime_error& e) { + return Status::DataQualityError("Decode roaring bitmap failed, {}", e.what()); + } + + delete_rows->reserve(bitmap.cardinality()); + for (auto it = bitmap.begin(); it != bitmap.end(); it++) { + delete_rows->push_back(cast_set(*it)); + } + return Status::OK(); +} + } // namespace std::shared_ptr create_system_properties( @@ -117,10 +175,17 @@ Status TableReader::_open_local_filter_exprs(const FileScanRequest& file_request RowDescriptor row_desc; for (const auto& expression_filter : file_request.expression_filters) { if (expression_filter.conjunct == nullptr) { - continue; + if (expression_filter.delete_conjunct == nullptr) { + 
continue; + } + } else { + RETURN_IF_ERROR(expression_filter.conjunct->prepare(_runtime_state, row_desc)); + RETURN_IF_ERROR(expression_filter.conjunct->open(_runtime_state)); + } + if (expression_filter.delete_conjunct != nullptr) { + RETURN_IF_ERROR(expression_filter.delete_conjunct->prepare(_runtime_state, row_desc)); + RETURN_IF_ERROR(expression_filter.delete_conjunct->open(_runtime_state)); } - RETURN_IF_ERROR(expression_filter.conjunct->prepare(_runtime_state, row_desc)); - RETURN_IF_ERROR(expression_filter.conjunct->open(_runtime_state)); } return Status::OK(); } @@ -169,12 +234,17 @@ Status TableReader::prepare_split(const SplitReadOptions& options) { _partition_values = std::move(options.partition_values); _current_task = std::make_unique(); _current_task->data_file = create_file_description(options.current_range); + _delete_rows = nullptr; return _parse_delete_predicates(options); } Status TableReader::_parse_delete_predicates(const SplitReadOptions& options) { DeleteFileDesc desc {.fs_name = options.current_range.fs_name}; - if (_parse_delete_file(options.current_range.table_format_params, desc)) { + bool has_delete_file = false; + RETURN_IF_ERROR(_parse_deletion_vector_file(options.current_range.table_format_params, &desc, + &has_delete_file)); + if (has_delete_file) { + DORIS_CHECK(options.cache != nullptr); Status create_status = Status::OK(); _delete_rows = options.cache->get(desc.key, [&]() -> DeleteRows* { @@ -195,45 +265,11 @@ Status TableReader::_parse_delete_predicates(const SplitReadOptions& options) { } const char* buf = buffer.data(); - uint32_t actual_length; - std::memcpy(reinterpret_cast(&actual_length), buf, 4); - std::reverse(reinterpret_cast(&actual_length), - reinterpret_cast(&actual_length) + 4); - buf += 4; - if (actual_length != bytes_read - 4) [[unlikely]] { - create_status = Status::RuntimeError( - "DeletionVector deserialize error: length not match, " - "actual length: {}, expect length: {}", - actual_length, bytes_read - 4); - 
return nullptr; - } - uint32_t magic_number; - std::memcpy(reinterpret_cast(&magic_number), buf, 4); - std::reverse(reinterpret_cast(&magic_number), - reinterpret_cast(&magic_number) + 4); - buf += 4; - const static uint32_t MAGIC_NUMBER = 1581511376; - if (magic_number != MAGIC_NUMBER) [[unlikely]] { - create_status = Status::RuntimeError( - "DeletionVector deserialize error: invalid magic number {}", magic_number); - return nullptr; - } - - roaring::Roaring roaring_bitmap; SCOPED_TIMER(_profile->parse_delete_file_time); - try { - roaring_bitmap = roaring::Roaring::readSafe(buf, bytes_read - 4); - } catch (const std::runtime_error& e) { - create_status = Status::RuntimeError( - "DeletionVector deserialize error: failed to deserialize roaring bitmap, " - "{}", - e.what()); + create_status = parse_deletion_vector(buf, bytes_read, desc.format, delete_rows); + if (!create_status.ok()) [[unlikely]] { return nullptr; } - delete_rows->reserve(roaring_bitmap.cardinality()); - for (auto it = roaring_bitmap.begin(); it != roaring_bitmap.end(); it++) { - delete_rows->push_back(*it); - } COUNTER_UPDATE(_profile->num_delete_rows, delete_rows->size()); return delete_rows; }); diff --git a/be/src/format/reader/table_reader.h b/be/src/format/reader/table_reader.h index 5441995e18c35e..f94e98bd83798e 100644 --- a/be/src/format/reader/table_reader.h +++ b/be/src/format/reader/table_reader.h @@ -19,12 +19,14 @@ #include +#include #include #include #include #include #include +#include "common/cast_set.h" #include "common/status.h" #include "core/assert_cast.h" #include "core/block/block.h" @@ -32,11 +34,14 @@ #include "core/data_type/data_type_array.h" #include "core/data_type/data_type_map.h" #include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_number.h" #include "core/data_type/data_type_struct.h" #include "exprs/vexpr_context.h" #include "exprs/vexpr_fwd.h" +#include "format/new_parquet/column_reader.h" #include "format/reader/column_mapper.h" 
#include "format/reader/expr/delete_predicate.h" +#include "format/reader/expr/slot_ref.h" #include "format/reader/file_reader.h" #include "runtime/descriptors.h" @@ -179,19 +184,9 @@ class TableReader { } continue; } - DCHECK_EQ(_data_reader.block_template.columns(), _data_reader.scan_schema.size()); - + DCHECK_EQ(_data_reader.block_template.columns(), _data_reader.block_schema.size()); DORIS_CHECK(block->columns() == _data_reader.column_mapper.mappings().size()); - size_t idx = 0; - for (const auto& mapping : _data_reader.column_mapper.mappings()) { - ColumnPtr column; - RETURN_IF_ERROR(_materialize_mapping_column(mapping, &_data_reader.block_template, - current_rows, &column)); - block->replace_by_position(idx, std::move(column)); - idx++; - } - RETURN_IF_ERROR(finalize_chunk(block)); - RETURN_IF_ERROR(materialize_virtual_columns(block)); + RETURN_IF_ERROR(finalize_chunk(block, current_rows)); if (current_eof) { RETURN_IF_ERROR(close_current_reader()); } @@ -209,9 +204,13 @@ class TableReader { } protected: - virtual bool _parse_delete_file(const TTableFormatFileDesc& t_desc, DeleteFileDesc& desc) { - return false; + // Parse deletion vector information from table format specific file description. + virtual Status _parse_deletion_vector_file(const TTableFormatFileDesc& t_desc, + DeleteFileDesc* desc, bool* has_delete_file) { + *has_delete_file = false; + return Status::OK(); } + // 切换到下一个 reader 的通用流程。 // 该方法先关闭当前 reader,再打开下一个具体 reader;子类不应重复实现这个循环。 Status create_next_reader(bool* eos); @@ -219,36 +218,45 @@ class TableReader { // 打开当前具体 reader。 // 子类在这里基于当前 split/task 初始化底层 FileReader。 virtual Status open_reader() { + // 1. Get file schema and create column mapping. 
std::vector file_schema; RETURN_IF_ERROR(_data_reader.reader->get_schema(&file_schema)); - _data_reader.block_schema = file_schema; + _data_reader.file_schema = file_schema; RETURN_IF_ERROR(_data_reader.column_mapper.create_mapping(_projected_columns, _partition_values, file_schema)); DORIS_CHECK(_data_reader.column_mapper.mappings().size() == _projected_columns.size()); + + // 2. Build table filters based on conjuncts and column predicates. RETURN_IF_ERROR(_build_table_filters_from_conjuncts()); + // 3. Create file scan request based on column mapping and table filters, then open file reader with the request. + // file scan request is the main carrier of file-level pruning information, including column mapping, column-level filters and expression filters. The file reader will evaluate the filters and only return rows that satisfy the filters to table reader. auto file_request = std::make_unique(); RETURN_IF_ERROR(_data_reader.column_mapper.create_scan_request( _table_filters, _table_column_predicates, _projected_columns, file_request.get())); RETURN_IF_ERROR(customize_file_scan_request(file_request.get())); RETURN_IF_ERROR(_open_local_filter_exprs(*file_request)); - _data_reader.scan_schema.clear(); + _data_reader.block_schema.clear(); _data_reader.block_template.clear(); - _data_reader.scan_schema.resize(file_request->column_positions.size()); + _data_reader.block_schema.resize(file_request->column_positions.size()); + + // 4. Build block schema based on file schema and column mapping. The scan schema describes the column layout of the block returned by file reader, which is determined by the column mapping and file schema. 
for (const auto& [file_column_id, block_position] : file_request->column_positions) { - DORIS_CHECK(block_position < _data_reader.scan_schema.size()); - const auto* field = _find_schema_field(_data_reader.block_schema, file_column_id); + DORIS_CHECK(block_position < _data_reader.block_schema.size()); + const auto* field = _find_schema_field(_data_reader.file_schema, file_column_id); DORIS_CHECK(field != nullptr); auto projection_it = file_request->complex_projections.find(file_column_id); if (projection_it == file_request->complex_projections.end()) { - _data_reader.scan_schema[block_position] = *field; + _data_reader.block_schema[block_position] = *field; } else { RETURN_IF_ERROR(_project_schema_field(*field, projection_it->second, - &_data_reader.scan_schema[block_position])); + &_data_reader.block_schema[block_position])); } } - _data_reader.block_template.reserve(_data_reader.scan_schema.size()); - for (const auto& field : _data_reader.scan_schema) { + + // 5. Prepare block template based on block schema. The block template is used to store the block returned by file reader before finalize; it has the same column layout as the file reader output block, which is determined by the column mapping and file schema. 
+ _data_reader.block_template.reserve(_data_reader.block_schema.size()); + for (const auto& field : _data_reader.block_schema) { _data_reader.block_template.insert( {field.type->create_column(), field.type, field.name}); } @@ -261,6 +269,68 @@ class TableReader { Status _open_local_filter_exprs(const FileScanRequest& file_request); virtual Status customize_file_scan_request(FileScanRequest* file_request) { + return _append_delete_predicate(file_request); + } + + static size_t _next_block_position(const FileScanRequest& request) { + size_t next_position = 0; + for (const auto& [_, block_position] : request.column_positions) { + next_position = std::max(next_position, block_position + 1); + } + return next_position; + } + + void _append_file_scan_column(FileScanRequest* request, ColumnId column_id, + std::vector* scan_columns) { + DORIS_CHECK(request != nullptr); + DORIS_CHECK(scan_columns != nullptr); + if (scan_columns == &request->non_predicate_columns && + std::find(request->predicate_columns.begin(), request->predicate_columns.end(), + column_id) != request->predicate_columns.end()) { + return; + } + const bool newly_added = request->column_positions.count(column_id) == 0; + if (newly_added) { + request->column_positions.emplace(column_id, _next_block_position(*request)); + scan_columns->push_back(column_id); + } else if (std::find(scan_columns->begin(), scan_columns->end(), column_id) == + scan_columns->end()) { + scan_columns->push_back(column_id); + } + if (scan_columns == &request->predicate_columns) { + request->non_predicate_columns.erase( + std::remove(request->non_predicate_columns.begin(), + request->non_predicate_columns.end(), column_id), + request->non_predicate_columns.end()); + } + if (column_id == doris::parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID && + _find_schema_field(_data_reader.file_schema, column_id) == nullptr) { + _data_reader.file_schema.push_back( + 
doris::parquet::ParquetColumnReaderFactory::row_position_schema_field()); + } + } + + // Append DeletePredicate to file scan request if there are deletes. The predicate will be evaluated in file reader level and filter out deleted rows before returning data to table reader. + Status _append_delete_predicate(FileScanRequest* request) { + DORIS_CHECK(request != nullptr); + if (_delete_rows == nullptr || _delete_rows->empty()) { + return Status::OK(); + } + const auto row_position_column_id = + parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID; + _append_file_scan_column(request, row_position_column_id, &request->predicate_columns); + + auto delete_predicate = std::make_shared(*_delete_rows); + const auto block_position = request->column_positions.at(row_position_column_id); + delete_predicate->add_child(TableSlotRef::create_shared( + cast_set(block_position), cast_set(block_position), -1, + std::make_shared(), + parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_NAME)); + + FileExpressionFilter delete_filter; + delete_filter.delete_conjunct = VExprContext::create_shared(std::move(delete_predicate)); + delete_filter.file_column_ids.push_back(row_position_column_id); + request->expression_filters.push_back(std::move(delete_filter)); return Status::OK(); } @@ -272,27 +342,32 @@ class TableReader { _data_reader.column_mapper.clear(); _table_filters.clear(); _table_column_predicates.clear(); + _data_reader.file_schema.clear(); _data_reader.block_schema.clear(); - _data_reader.scan_schema.clear(); _data_reader.block_template.clear(); _current_task.reset(); return Status::OK(); } - // 将 file-local block 转换为 table/global schema block。 - // 这里执行 ColumnMapping 中的 finalize_expr、缺失列填充、partition/generated 列 - // 物化以及复杂列 remap。 - virtual Status finalize_chunk(Block* block) { return Status::OK(); } - - // 物化虚拟列。 - // 例如 _row_id、_last_updated_sequence_number 等,它们不来自文件物理列。 - virtual Status materialize_virtual_columns(Block* table_block) { - // 真实实现会物化 
_row_id、_last_updated_sequence_number 等 Iceberg 虚拟列。 + // Finalize file-local block to table/global schema block. + virtual Status finalize_chunk(Block* block, const size_t rows) { + size_t idx = 0; + for (const auto& mapping : _data_reader.column_mapper.mappings()) { + ColumnPtr column; + RETURN_IF_ERROR(_materialize_mapping_column(mapping, &_data_reader.block_template, rows, + &column)); + block->replace_by_position(idx, std::move(column)); + idx++; + } + RETURN_IF_ERROR(materialize_virtual_columns(block)); return Status::OK(); } + // Materialize virtual columns in table block, such as _row_id and _last_updated_sequence_number in Iceberg. This is called after finalize_chunk, so the virtual column can be referenced in finalize_expr. + virtual Status materialize_virtual_columns(Block* table_block) { return Status::OK(); } + Status _materialize_mapping_column(const ColumnMapping& mapping, Block* current_block, - size_t current_rows, ColumnPtr* column) { + const size_t rows, ColumnPtr* column) { if (mapping.projection != nullptr) { int res_id; RETURN_IF_ERROR(mapping.projection->execute(current_block, &res_id)); @@ -300,23 +375,22 @@ class TableReader { return Status::OK(); } if (mapping.default_expr != nullptr) { - if (current_block->rows() == current_rows) { + if (current_block->rows() == rows) { int res_id; RETURN_IF_ERROR(mapping.default_expr->execute(current_block, &res_id)); *column = current_block->get_columns()[res_id]; } else { DORIS_CHECK(mapping.is_constant); Block eval_block; - eval_block.insert( - {mapping.table_type->create_column_const_with_default_value(current_rows), - mapping.table_type, "__table_reader_const_rows"}); + eval_block.insert({mapping.table_type->create_column_const_with_default_value(rows), + mapping.table_type, "__table_reader_const_rows"}); int res_id; RETURN_IF_ERROR(mapping.default_expr->execute(&eval_block, &res_id)); *column = eval_block.get_columns()[res_id]; } return Status::OK(); } - *column = 
mapping.table_type->create_column_const_with_default_value(current_rows); + *column = mapping.table_type->create_column_const_with_default_value(rows); return Status::OK(); } @@ -338,8 +412,10 @@ class TableReader { struct DataReader { std::unique_ptr reader; TableColumnMapper column_mapper; - std::vector block_schema; - std::vector scan_schema; + std::vector + file_schema; // Schema of the data file, also including virtual column (row position). + std::vector + block_schema; // Schema of the block returned by file reader, determined by column mapping and file schema. It is used for file reader to materialize columns into correct type and position. Block block_template; }; DataReader _data_reader; @@ -352,8 +428,8 @@ class TableReader { TableColumnPredicates _table_column_predicates; VExprContext _conjuncts {nullptr}; std::unique_ptr _profile; - // Parsed from DELETION_VECTOR in Iceberg and Paimon - DeleteRows* _delete_rows; + // Parsed from row-position based delete files, including position delete and deletion vector. + DeleteRows* _delete_rows = nullptr; TFileScanRangeParams* _scan_params; std::shared_ptr _io_ctx; RuntimeState* _runtime_state; @@ -451,6 +527,7 @@ class TableReader { return Status::OK(); } + // Parse row-position deletes from table format specific parameters, and fill in _delete_rows. 
Status _parse_delete_predicates(const SplitReadOptions& options); }; diff --git a/be/src/format/table/deletion_vector_reader.h b/be/src/format/table/deletion_vector_reader.h index b030f048415bf1..968344a8496bc7 100644 --- a/be/src/format/table/deletion_vector_reader.h +++ b/be/src/format/table/deletion_vector_reader.h @@ -37,6 +37,11 @@ struct IOContext; namespace doris { struct DeleteFileDesc { + enum class Format { + PAIMON, + ICEBERG, + }; + std::string key = ""; std::string path = ""; std::string fs_name = ""; @@ -44,6 +49,7 @@ struct DeleteFileDesc { int64_t size = 0; int64_t file_size = -1; int64_t modification_time = 0; + Format format = Format::PAIMON; }; class DeletionVectorReader { diff --git a/be/src/format/table/iceberg_reader_v2.cpp b/be/src/format/table/iceberg_reader_v2.cpp index 220f153e93fc67..ad72313cc89990 100644 --- a/be/src/format/table/iceberg_reader_v2.cpp +++ b/be/src/format/table/iceberg_reader_v2.cpp @@ -17,4 +17,398 @@ #include "format/table/iceberg_reader_v2.h" -namespace doris::iceberg {} // namespace doris::iceberg +#include +#include +#include +#include + +#include "core/assert_cast.h" +#include "core/block/block.h" +#include "core/column/column_const.h" +#include "core/column/column_nullable.h" +#include "core/column/column_string.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_number.h" +#include "core/data_type/define_primitive_type.h" +#include "core/field.h" +#include "format/new_parquet/column_reader.h" +#include "format/new_parquet/parquet_reader.h" +#include "format/reader/table_reader.h" +#include "format/table/deletion_vector_reader.h" +#include "io/file_factory.h" + +namespace doris::iceberg { + +IcebergTableReader::PositionDeleteBlockCollector::PositionDeleteBlockCollector( + std::string data_file_path, std::map* rows) + : _data_file_path(std::move(data_file_path)), _rows(rows) {} + +Status IcebergTableReader::PositionDeleteBlockCollector::collect(const Block& block, + size_t read_rows) { + 
if (read_rows == 0) { + return Status::OK(); + } + const auto& file_path_column = assert_cast( + *block.get_by_position(ICEBERG_FILE_PATH_BLOCK_POSITION).column); + const auto& pos_column = + assert_cast(*block.get_by_position(ICEBERG_ROW_POS_BLOCK_POSITION) + .column); + for (size_t row = 0; row < read_rows; ++row) { + const auto file_path = file_path_column.get_data_at(row).to_string(); + if (file_path == _data_file_path) { + (*_rows)[file_path].push_back(pos_column.get_element(row)); + } + } + return Status::OK(); +} + +Status IcebergTableReader::prepare_split(const reader::SplitReadOptions& options) { + _row_lineage_columns = {}; + _iceberg_params = nullptr; + _delete_predicates_initialized = false; + _position_delete_rows_storage.clear(); + _equality_delete_files.clear(); + if (options.current_range.__isset.table_format_params && + options.current_range.table_format_params.__isset.iceberg_params) { + const auto& iceberg_params = options.current_range.table_format_params.iceberg_params; + _iceberg_params = &iceberg_params; + if (iceberg_params.__isset.first_row_id) { + _row_lineage_columns.first_row_id = iceberg_params.first_row_id; + } + if (iceberg_params.__isset.last_updated_sequence_number) { + _row_lineage_columns.last_updated_sequence_number = + iceberg_params.last_updated_sequence_number; + } + } + RETURN_IF_ERROR(TableReader::prepare_split(options)); + return _collect_position_delete_rows(options.current_range.table_format_params); +} + +Status IcebergTableReader::finalize_chunk(Block* block, const size_t rows) { + RETURN_IF_ERROR(reader::TableReader::finalize_chunk(block, rows)); + RETURN_IF_ERROR(apply_equality_deletes(block)); + return Status::OK(); +} + +Status IcebergTableReader::materialize_virtual_columns(Block* table_block) { + for (size_t column_idx = 0; column_idx < _data_reader.column_mapper.mappings().size(); + ++column_idx) { + const auto& mapping = _data_reader.column_mapper.mappings()[column_idx]; + switch (mapping.virtual_column_type) { 
+ case reader::TableVirtualColumnType::ROW_ID: + RETURN_IF_ERROR(_materialize_row_lineage_row_id(table_block, column_idx)); + break; + case reader::TableVirtualColumnType::LAST_UPDATED_SEQUENCE_NUMBER: + RETURN_IF_ERROR( + _materialize_row_lineage_last_updated_sequence_number(table_block, column_idx)); + break; + case reader::TableVirtualColumnType::INVALID: + break; + } + } + return Status::OK(); +} + +Status IcebergTableReader::customize_file_scan_request(reader::FileScanRequest* file_request) { + RETURN_IF_ERROR(TableReader::customize_file_scan_request(file_request)); + if (_row_lineage_columns.first_row_id >= 0 && _need_row_lineage_row_id()) { + RETURN_IF_ERROR(_append_row_position_output_column(file_request)); + } + return Status::OK(); +} + +Status IcebergTableReader::_parse_deletion_vector_file(const TTableFormatFileDesc& t_desc, + DeleteFileDesc* desc, + bool* has_delete_file) { + DORIS_CHECK(desc != nullptr); + DORIS_CHECK(has_delete_file != nullptr); + *has_delete_file = false; + if (!t_desc.__isset.iceberg_params) { + return Status::OK(); + } + const auto& iceberg_params = t_desc.iceberg_params; + if (!iceberg_params.__isset.format_version || + iceberg_params.format_version < MIN_SUPPORT_DELETE_FILES_VERSION || + !iceberg_params.__isset.delete_files || iceberg_params.delete_files.empty()) { + return Status::OK(); + } + + const TIcebergDeleteFileDesc* deletion_vector = nullptr; + for (const auto& delete_file : iceberg_params.delete_files) { + if (!delete_file.__isset.content || delete_file.content != DELETION_VECTOR) { + continue; + } + if (deletion_vector != nullptr) { + return Status::DataQualityError("This iceberg data file has multiple DVs."); + } + deletion_vector = &delete_file; + } + if (deletion_vector == nullptr) { + return Status::OK(); + } + if (!deletion_vector->__isset.content_offset || + !deletion_vector->__isset.content_size_in_bytes) { + return Status::InternalError("Deletion vector is missing content offset or length"); + } + + desc->key 
= _iceberg_delete_vector_cache_key(*deletion_vector); + desc->path = deletion_vector->path; + desc->start_offset = deletion_vector->content_offset; + desc->size = deletion_vector->content_size_in_bytes; + desc->file_size = -1; + desc->format = DeleteFileDesc::Format::ICEBERG; + *has_delete_file = true; + return Status::OK(); +} + +Status IcebergTableReader::_collect_position_delete_rows(const TTableFormatFileDesc& t_desc) { + if (!t_desc.__isset.iceberg_params || _delete_predicates_initialized) { + _delete_predicates_initialized = true; + return Status::OK(); + } + const auto& iceberg_params = t_desc.iceberg_params; + if (!iceberg_params.__isset.format_version || + iceberg_params.format_version < MIN_SUPPORT_DELETE_FILES_VERSION || + !iceberg_params.__isset.delete_files || iceberg_params.delete_files.empty()) { + _delete_predicates_initialized = true; + return Status::OK(); + } + + std::vector position_delete_files; + for (const auto& delete_file : iceberg_params.delete_files) { + if (!delete_file.__isset.content) { + continue; + } + if (delete_file.content == POSITION_DELETE) { + position_delete_files.push_back(delete_file); + } else if (delete_file.content == EQUALITY_DELETE) { + _equality_delete_files.push_back(delete_file); + } + } + + if (_delete_rows != nullptr) { + _position_delete_rows_storage = *_delete_rows; + _delete_rows = &_position_delete_rows_storage; + } + if (!position_delete_files.empty()) { + RETURN_IF_ERROR(_read_position_delete_files(position_delete_files)); + } + + _delete_predicates_initialized = true; + return Status::OK(); +} + +Status IcebergTableReader::apply_equality_deletes(Block* block) { + if (!_equality_delete_files.empty()) { + return Status::NotSupported("Iceberg equality delete is not supported by TableReader"); + } + return Status::OK(); +} + +std::string IcebergTableReader::_iceberg_delete_vector_cache_key( + const TIcebergDeleteFileDesc& delete_file) { + const std::string key_prefix = "iceberg_dv:"; + std::string key; + 
key.resize(key_prefix.size() + delete_file.path.size() + sizeof(delete_file.content_offset) + + sizeof(delete_file.content_size_in_bytes)); + char* data = key.data(); + memcpy(data, key_prefix.data(), key_prefix.size()); + data += key_prefix.size(); + memcpy(data, delete_file.path.data(), delete_file.path.size()); + data += delete_file.path.size(); + memcpy(data, &delete_file.content_offset, sizeof(delete_file.content_offset)); + data += sizeof(delete_file.content_offset); + memcpy(data, &delete_file.content_size_in_bytes, sizeof(delete_file.content_size_in_bytes)); + return key; +} + +std::shared_ptr IcebergTableReader::_delete_file_system_properties( + const TFileScanRangeParams& scan_params) { + auto system_properties = std::make_shared(); + system_properties->system_type = + scan_params.__isset.file_type ? scan_params.file_type : TFileType::FILE_LOCAL; + system_properties->properties = scan_params.properties; + system_properties->hdfs_params = scan_params.hdfs_params; + if (scan_params.__isset.broker_addresses) { + system_properties->broker_addresses.assign(scan_params.broker_addresses.begin(), + scan_params.broker_addresses.end()); + } + return system_properties; +} + +std::unique_ptr IcebergTableReader::_delete_file_description( + const TFileRangeDesc& range) { + auto file_description = std::make_unique(); + file_description->path = range.path; + file_description->file_size = range.__isset.file_size ? range.file_size : -1; + file_description->range_start_offset = range.__isset.start_offset ? range.start_offset : 0; + file_description->range_size = range.__isset.size ? 
range.size : -1; + if (range.__isset.fs_name) { + file_description->fs_name = range.fs_name; + } + return file_description; +} + +const reader::SchemaField* IcebergTableReader::_find_delete_field( + const std::vector& schema, const std::string& name) { + for (const auto& field : schema) { + if (field.name == name) { + return &field; + } + } + return nullptr; +} + +Block IcebergTableReader::_build_position_delete_block(const reader::SchemaField& file_path_field, + const reader::SchemaField& pos_field) { + Block block; + block.insert({file_path_field.type->create_column(), file_path_field.type, ICEBERG_FILE_PATH}); + block.insert({pos_field.type->create_column(), pos_field.type, ICEBERG_ROW_POS}); + return block; +} + +Status IcebergTableReader::_append_row_position_output_column(reader::FileScanRequest* request) { + const auto row_position_column_id = + doris::parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID; + _append_file_scan_column(request, row_position_column_id, &request->non_predicate_columns); + _row_position_block_position = request->column_positions.at(row_position_column_id); + return Status::OK(); +} + +std::string IcebergTableReader::_data_file_path() const { + if (_iceberg_params != nullptr && _iceberg_params->__isset.original_file_path) { + return _iceberg_params->original_file_path; + } + DORIS_CHECK(_current_task != nullptr); + DORIS_CHECK(_current_task->data_file != nullptr); + return _current_task->data_file->path; +} + +Status IcebergTableReader::_read_parquet_position_delete_file( + const TIcebergDeleteFileDesc& delete_file, const TFileScanRangeParams& scan_params, + IcebergDeleteFileIOContext* delete_io_ctx, PositionDeleteBlockCollector* collector) { + if (!delete_file.__isset.file_format) { + return Status::InternalError("Iceberg position delete file is missing file format"); + } + if (delete_file.file_format == TFileFormatType::FORMAT_ORC) { + return Status::NotSupported("Iceberg ORC position delete file is not supported"); + } + 
if (delete_file.file_format != TFileFormatType::FORMAT_PARQUET) { + return Status::NotSupported("Unsupported Iceberg delete file format {}", + delete_file.file_format); + } + + auto delete_range = build_iceberg_delete_file_range(delete_file.path); + if (_current_task != nullptr && _current_task->data_file != nullptr && + !_current_task->data_file->fs_name.empty()) { + delete_range.__set_fs_name(_current_task->data_file->fs_name); + } + auto system_properties = _delete_file_system_properties(scan_params); + auto file_description = _delete_file_description(delete_range); + std::shared_ptr io_ctx(&delete_io_ctx->io_ctx, [](io::IOContext*) {}); + parquet::ParquetReader reader(system_properties, file_description, io_ctx, _scanner_profile); + RETURN_IF_ERROR(reader.init(_runtime_state)); + + std::vector schema; + RETURN_IF_ERROR(reader.get_schema(&schema)); + const auto* file_path_field = _find_delete_field(schema, ICEBERG_FILE_PATH); + const auto* pos_field = _find_delete_field(schema, ICEBERG_ROW_POS); + if (file_path_field == nullptr || pos_field == nullptr) { + return Status::InternalError("Position delete parquet file is missing required columns"); + } + + auto request = std::make_unique(); + request->non_predicate_columns = {file_path_field->id, pos_field->id}; + request->column_positions = { + {file_path_field->id, ICEBERG_FILE_PATH_BLOCK_POSITION}, + {pos_field->id, ICEBERG_ROW_POS_BLOCK_POSITION}, + }; + RETURN_IF_ERROR(reader.open(request)); + + bool eof = false; + while (!eof) { + Block block = _build_position_delete_block(*file_path_field, *pos_field); + size_t read_rows = 0; + RETURN_IF_ERROR(reader.get_block(&block, &read_rows, &eof)); + RETURN_IF_ERROR(collector->collect(block, read_rows)); + } + return reader.close(); +} + +Status IcebergTableReader::_read_position_delete_files( + const std::vector& delete_files) { + TFileScanRangeParams delete_scan_params = + _scan_params == nullptr ? 
TFileScanRangeParams() : *_scan_params; + std::map rows_by_file; + const auto data_file_path = _data_file_path(); + IcebergDeleteFileIOContext delete_io_ctx(_runtime_state); + PositionDeleteBlockCollector collector(data_file_path, &rows_by_file); + for (const auto& delete_file : delete_files) { + RETURN_IF_ERROR(_read_parquet_position_delete_file(delete_file, delete_scan_params, + &delete_io_ctx, &collector)); + } + auto rows_it = rows_by_file.find(data_file_path); + if (rows_it == rows_by_file.end()) { + return Status::OK(); + } + // Position delete files and deletion vectors both become row-position deletes for the + // common TableReader DeletePredicate path. Keep the merged rows in a member vector because + // DeletePredicate stores a reference to the vector used by _delete_rows. + _position_delete_rows_storage.insert(_position_delete_rows_storage.end(), + rows_it->second.begin(), rows_it->second.end()); + std::sort(_position_delete_rows_storage.begin(), _position_delete_rows_storage.end()); + _position_delete_rows_storage.erase(std::unique(_position_delete_rows_storage.begin(), + _position_delete_rows_storage.end()), + _position_delete_rows_storage.end()); + _delete_rows = &_position_delete_rows_storage; + return Status::OK(); +} + +Status IcebergTableReader::_materialize_row_lineage_row_id(Block* table_block, size_t column_idx) { + if (_row_lineage_columns.first_row_id < 0) { + return Status::OK(); + } + DORIS_CHECK(_row_position_block_position < _data_reader.block_template.columns()); + const auto& row_position_column = assert_cast( + *_data_reader.block_template.get_by_position(_row_position_block_position).column); + DORIS_CHECK(row_position_column.size() == table_block->rows()); + auto column = + table_block->get_by_position(column_idx).column->convert_to_full_column_if_const() + ->assume_mutable(); + auto* nullable_column = assert_cast(column.get()); + auto& null_map = nullable_column->get_null_map_data(); + auto& data = 
assert_cast(*nullable_column->get_nested_column_ptr()).get_data(); + null_map.resize(row_position_column.size()); + std::fill(null_map.begin(), null_map.end(), 0); + data.resize(row_position_column.size()); + for (size_t row = 0; row < row_position_column.size(); ++row) { + data[row] = _row_lineage_columns.first_row_id + row_position_column.get_element(row); + } + table_block->replace_by_position(column_idx, std::move(column)); + return Status::OK(); +} + +Status IcebergTableReader::_materialize_row_lineage_last_updated_sequence_number( + Block* table_block, size_t column_idx) { + if (_row_lineage_columns.last_updated_sequence_number < 0) { + return Status::OK(); + } + const auto rows = table_block->rows(); + auto data_column = table_block->get_by_position(column_idx).type->create_column(); + data_column->insert( + Field::create_field(_row_lineage_columns.last_updated_sequence_number)); + auto column = ColumnConst::create(std::move(data_column), rows); + table_block->replace_by_position(column_idx, std::move(column)); + return Status::OK(); +} + +bool IcebergTableReader::_need_row_lineage_row_id() const { + for (const auto& mapping : _data_reader.column_mapper.mappings()) { + if (mapping.virtual_column_type == reader::TableVirtualColumnType::ROW_ID) { + return true; + } + } + return false; +} + +} // namespace doris::iceberg diff --git a/be/src/format/table/iceberg_reader_v2.h b/be/src/format/table/iceberg_reader_v2.h index 6c6f4416717eb5..fbc8e28441b661 100644 --- a/be/src/format/table/iceberg_reader_v2.h +++ b/be/src/format/table/iceberg_reader_v2.h @@ -19,26 +19,24 @@ #include #include +#include #include #include -#include #include #include "common/status.h" -#include "core/assert_cast.h" -#include "core/block/block.h" -#include "core/column/column_const.h" -#include "core/column/column_nullable.h" -#include "core/column/column_vector.h" -#include "core/data_type/define_primitive_type.h" -#include "core/field.h" -#include "format/new_parquet/column_reader.h" 
#include "format/reader/file_reader.h" #include "format/reader/table_reader.h" +#include "format/table/iceberg_delete_file_reader_helper.h" #include "gen_cpp/PlanNodes_types.h" namespace doris { class Block; +struct DeleteFileDesc; +namespace io { +struct FileDescription; +struct FileSystemProperties; +} // namespace io } // namespace doris namespace doris::iceberg { @@ -50,144 +48,92 @@ class IcebergTableReader : public reader::TableReader { public: ~IcebergTableReader() override = default; - Status prepare_split(const reader::SplitReadOptions& options) override { - _row_lineage_columns = {}; - if (options.current_range.__isset.table_format_params && - options.current_range.table_format_params.__isset.iceberg_params) { - const auto& iceberg_params = options.current_range.table_format_params.iceberg_params; - if (iceberg_params.__isset.first_row_id) { - _row_lineage_columns.first_row_id = iceberg_params.first_row_id; - } - if (iceberg_params.__isset.last_updated_sequence_number) { - _row_lineage_columns.last_updated_sequence_number = - iceberg_params.last_updated_sequence_number; - } - } - return TableReader::prepare_split(options); - } + Status prepare_split(const reader::SplitReadOptions& options) override; protected: // 将 file-local block 转换为 table/global schema block。 // 这里执行 ColumnMapping 中的 finalize_expr、缺失列填充、partition/generated 列 // 物化以及复杂列 remap。 - Status finalize_chunk(Block* block) override { - // 真实实现会根据 ColumnMapping 执行 finalize_expr/default/partition/generated - // expressions,把 file-local block 写成 table block。 - RETURN_IF_ERROR(apply_equality_deletes(block)); - return Status::OK(); - } - - // 物化 Iceberg 虚拟列。 - // 例如 _row_id、_last_updated_sequence_number 等,它们不来自 Parquet 文件物理列。 - Status materialize_virtual_columns(Block* table_block) override { - for (size_t column_idx = 0; column_idx < _data_reader.column_mapper.mappings().size(); - ++column_idx) { - const auto& mapping = _data_reader.column_mapper.mappings()[column_idx]; - switch 
(mapping.virtual_column_type) { - case reader::TableVirtualColumnType::ROW_ID: - RETURN_IF_ERROR(_materialize_row_lineage_row_id(table_block, column_idx)); - break; - case reader::TableVirtualColumnType::LAST_UPDATED_SEQUENCE_NUMBER: - RETURN_IF_ERROR(_materialize_row_lineage_last_updated_sequence_number(table_block, - column_idx)); - break; - case reader::TableVirtualColumnType::INVALID: - break; - } - } - return Status::OK(); - } - - // 将 Iceberg position delete / deletion vector 转换成底层 reader 可消费的删除信息。 - // 这一步发生在读取 data file 前,因此会修改 FileScanRequest。 - Status apply_position_deletes(reader::FileScanRequest* request) { - // 真实实现会把 position delete / deletion vector 转换成 file-local delete 信息。 - (void)request; - return Status::OK(); - } - - Status customize_file_scan_request(reader::FileScanRequest* file_request) override { - if (_row_lineage_columns.first_row_id < 0 || !_need_row_lineage_row_id()) { - return Status::OK(); - } - DORIS_CHECK(file_request != nullptr); - const auto row_position_column_id = - doris::parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID; - if (file_request->column_positions.count(row_position_column_id) > 0) { - return Status::OK(); - } - _row_position_block_position = file_request->column_positions.size(); - file_request->non_predicate_columns.push_back(row_position_column_id); - file_request->column_positions.emplace(row_position_column_id, - _row_position_block_position); - _data_reader.block_schema.push_back( - doris::parquet::ParquetColumnReaderFactory::row_position_schema_field()); - return Status::OK(); - } + Status finalize_chunk(Block* block, const size_t rows) override; + + Status materialize_virtual_columns(Block* table_block) override; + + Status customize_file_scan_request(reader::FileScanRequest* file_request) override; + + Status _parse_deletion_vector_file(const TTableFormatFileDesc& t_desc, DeleteFileDesc* desc, + bool* has_delete_file) override; + + Status _collect_position_delete_rows(const TTableFormatFileDesc& 
t_desc); // 在 table block 上应用 equality delete。 // equality delete 依赖 table-level 列语义,因此不能下沉到 ParquetReader。 - Status apply_equality_deletes(Block* block) { - // 真实实现会在 table block 上应用 equality delete。 - return Status::OK(); - } + Status apply_equality_deletes(Block* block); private: + static constexpr int MIN_SUPPORT_DELETE_FILES_VERSION = 2; + static constexpr int POSITION_DELETE = 1; + static constexpr int EQUALITY_DELETE = 2; + static constexpr int DELETION_VECTOR = 3; + struct RowLineageColumns { int64_t first_row_id = -1; int64_t last_updated_sequence_number = -1; }; - Status _materialize_row_lineage_row_id(Block* table_block, size_t column_idx) { - if (_row_lineage_columns.first_row_id < 0) { - return Status::OK(); - } - DORIS_CHECK(_row_position_block_position < _data_reader.block_template.columns()); - const auto& row_position_column = assert_cast( - *_data_reader.block_template.get_by_position(_row_position_block_position).column); - DORIS_CHECK(row_position_column.size() == table_block->rows()); - auto column = table_block->get_by_position(column_idx) - .column->convert_to_full_column_if_const() - ->assume_mutable(); - auto* nullable_column = assert_cast(column.get()); - auto& null_map = nullable_column->get_null_map_data(); - auto& data = - assert_cast(*nullable_column->get_nested_column_ptr()).get_data(); - null_map.resize(row_position_column.size()); - std::fill(null_map.begin(), null_map.end(), 0); - data.resize(row_position_column.size()); - for (size_t row = 0; row < row_position_column.size(); ++row) { - data[row] = _row_lineage_columns.first_row_id + row_position_column.get_element(row); - } - table_block->replace_by_position(column_idx, std::move(column)); - return Status::OK(); - } + static constexpr const char* ICEBERG_FILE_PATH = "file_path"; + static constexpr const char* ICEBERG_ROW_POS = "pos"; + static constexpr size_t ICEBERG_FILE_PATH_BLOCK_POSITION = 0; + static constexpr size_t ICEBERG_ROW_POS_BLOCK_POSITION = 1; + + class 
PositionDeleteBlockCollector final { + public: + PositionDeleteBlockCollector(std::string data_file_path, + std::map* rows); + + Status collect(const Block& block, size_t read_rows); + + private: + std::string _data_file_path; + std::map* _rows = nullptr; + }; + + static std::string _iceberg_delete_vector_cache_key(const TIcebergDeleteFileDesc& delete_file); + + static std::shared_ptr _delete_file_system_properties( + const TFileScanRangeParams& scan_params); + + static std::unique_ptr _delete_file_description(const TFileRangeDesc& range); + + static const reader::SchemaField* _find_delete_field( + const std::vector& schema, const std::string& name); + + static Block _build_position_delete_block(const reader::SchemaField& file_path_field, + const reader::SchemaField& pos_field); + + Status _append_row_position_output_column(reader::FileScanRequest* request); + + std::string _data_file_path() const; + + Status _read_parquet_position_delete_file(const TIcebergDeleteFileDesc& delete_file, + const TFileScanRangeParams& scan_params, + IcebergDeleteFileIOContext* delete_io_ctx, + PositionDeleteBlockCollector* collector); + + Status _read_position_delete_files(const std::vector& delete_files); + + Status _materialize_row_lineage_row_id(Block* table_block, size_t column_idx); Status _materialize_row_lineage_last_updated_sequence_number(Block* table_block, - size_t column_idx) { - if (_row_lineage_columns.last_updated_sequence_number < 0) { - return Status::OK(); - } - const auto rows = table_block->rows(); - auto data_column = table_block->get_by_position(column_idx).type->create_column(); - data_column->insert(Field::create_field( - _row_lineage_columns.last_updated_sequence_number)); - auto column = ColumnConst::create(std::move(data_column), rows); - table_block->replace_by_position(column_idx, std::move(column)); - return Status::OK(); - } + size_t column_idx); RowLineageColumns _row_lineage_columns; size_t _row_position_block_position = 0; + const TIcebergFileDesc* 
_iceberg_params = nullptr; + bool _delete_predicates_initialized = false; + reader::DeleteRows _position_delete_rows_storage; + std::vector _equality_delete_files; - bool _need_row_lineage_row_id() const { - for (const auto& mapping : _data_reader.column_mapper.mappings()) { - if (mapping.virtual_column_type == reader::TableVirtualColumnType::ROW_ID) { - return true; - } - } - return false; - } + bool _need_row_lineage_row_id() const; }; } // namespace doris::iceberg diff --git a/be/test/format/new_parquet/parquet_reader_test.cpp b/be/test/format/new_parquet/parquet_reader_test.cpp index 00938482d6c3c0..0be12c271293cc 100644 --- a/be/test/format/new_parquet/parquet_reader_test.cpp +++ b/be/test/format/new_parquet/parquet_reader_test.cpp @@ -42,6 +42,8 @@ #include "exprs/vexpr_context.h" #include "format/new_parquet/column_reader.h" #include "format/reader/column_mapper.h" +#include "format/reader/expr/delete_predicate.h" +#include "format/reader/expr/slot_ref.h" #include "format/reader/file_reader.h" #include "format/reader/table_reader.h" #include "gen_cpp/Types_types.h" @@ -655,6 +657,98 @@ TEST_F(NewParquetReaderTest, RowPositionReaderKeepsPositionsAfterSelection) { EXPECT_EQ(row_position_column.get_element(2), 4); } +TEST_F(NewParquetReaderTest, DeletePredicateFiltersRowPositions) { + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + Block block = build_file_block_with_row_position(schema); + + static const std::vector deleted_rows {1, 3}; + auto delete_predicate = std::make_shared(deleted_rows); + delete_predicate->add_child(TableSlotRef::create_shared( + 2, 2, -1, std::make_shared(), + parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_NAME)); + + auto request = std::make_unique(); + request->predicate_columns = {parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID}; + 
request->non_predicate_columns = {0}; + request->column_positions = { + {0, 0}, + {parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID, 2}, + }; + reader::FileExpressionFilter delete_filter; + delete_filter.delete_conjunct = VExprContext::create_shared(std::move(delete_predicate)); + delete_filter.file_column_ids.push_back( + parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID); + request->expression_filters.push_back(std::move(delete_filter)); + ASSERT_TRUE(reader->open(request).ok()); + + size_t rows = 0; + bool eof = false; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + EXPECT_FALSE(eof); + ASSERT_EQ(rows, 3); + + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& row_position_column = + assert_cast(*block.get_by_position(2).column); + EXPECT_EQ(id_column.get_element(0), 1); + EXPECT_EQ(id_column.get_element(1), 3); + EXPECT_EQ(id_column.get_element(2), 5); + EXPECT_EQ(row_position_column.get_element(0), 0); + EXPECT_EQ(row_position_column.get_element(1), 2); + EXPECT_EQ(row_position_column.get_element(2), 4); +} + +TEST_F(NewParquetReaderTest, QueryPredicateAndDeletePredicateFilterRowPositions) { + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + Block block = build_file_block_with_row_position(schema); + + static const std::vector deleted_rows {3}; + auto delete_predicate = std::make_shared(deleted_rows); + delete_predicate->add_child(TableSlotRef::create_shared( + 2, 2, -1, std::make_shared(), + parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_NAME)); + + auto request = std::make_unique(); + request->predicate_columns = {0, parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID}; + request->non_predicate_columns = {}; + request->column_positions = { + {0, 0}, + {parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID, 2}, + 
}; + reader::FileExpressionFilter expression_filter; + expression_filter.conjunct = create_int32_greater_than_conjunct(0, 2); + expression_filter.delete_conjunct = VExprContext::create_shared(std::move(delete_predicate)); + expression_filter.file_column_ids.push_back(0); + expression_filter.file_column_ids.push_back( + parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID); + request->expression_filters.push_back(std::move(expression_filter)); + ASSERT_TRUE(reader->open(request).ok()); + + size_t rows = 0; + bool eof = false; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + EXPECT_FALSE(eof); + ASSERT_EQ(rows, 2); + + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& row_position_column = + assert_cast(*block.get_by_position(2).column); + EXPECT_EQ(id_column.get_element(0), 3); + EXPECT_EQ(id_column.get_element(1), 5); + EXPECT_EQ(row_position_column.get_element(0), 2); + EXPECT_EQ(row_position_column.get_element(1), 4); +} + TEST_F(NewParquetReaderTest, RowPositionReaderUsesFileLocalPositionsForScanRange) { write_parquet_file(_file_path, 2); auto parquet_file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); diff --git a/be/test/format/reader/table_reader_test.cpp b/be/test/format/reader/table_reader_test.cpp index dc050976836b93..8705775485f02f 100644 --- a/be/test/format/reader/table_reader_test.cpp +++ b/be/test/format/reader/table_reader_test.cpp @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -37,10 +38,16 @@ #include "core/data_type/data_type_nullable.h" #include "core/data_type/data_type_number.h" #include "core/data_type/data_type_string.h" +#include "exec/common/endian.h" #include "exprs/vexpr.h" +#include "format/format_common.h" #include "format/reader/expr/slot_ref.h" +#include "format/table/deletion_vector_reader.h" #include "format/table/iceberg_reader_v2.h" #include "gen_cpp/PlanNodes_types.h" +#include "io/io_common.h" +#include 
"roaring/roaring64map.hh" +#include "runtime/runtime_profile.h" #include "runtime/runtime_state.h" #include "storage/predicate/predicate_creator.h" @@ -80,6 +87,58 @@ class TableInt32GreaterThanExpr final : public VExpr { const std::string _expr_name = "TableInt32GreaterThanExpr"; }; +class IcebergTableReaderDeleteFileTestHelper final : public doris::iceberg::IcebergTableReader { +public: + Status parse_deletion_vector_file(const TTableFormatFileDesc& t_desc, DeleteFileDesc* desc, + bool* has_delete_file) { + return _parse_deletion_vector_file(t_desc, desc, has_delete_file); + } +}; + +class IcebergTableReaderScanRequestTestHelper final : public doris::iceberg::IcebergTableReader { +public: + Status init_for_scan_request_test(std::vector projected_columns) { + _query_options = std::make_unique(); + _query_globals = std::make_unique(); + _state = std::make_unique(*_query_options, *_query_globals); + RETURN_IF_ERROR(init({ + .projected_columns = std::move(projected_columns), + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = _state.get(), + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + })); + + SplitReadOptions split_options; + split_options.current_range.__set_path("scan-request-test.parquet"); + TTableFormatFileDesc table_format_params; + TIcebergFileDesc iceberg_params; + iceberg_params.__set_first_row_id(1000); + table_format_params.__set_iceberg_params(iceberg_params); + split_options.current_range.__set_table_format_params(table_format_params); + RETURN_IF_ERROR(prepare_split(split_options)); + + _delete_rows_storage = {1}; + _delete_rows = &_delete_rows_storage; + return Status::OK(); + } + + Status customize_request(FileScanRequest* request) { + return customize_file_scan_request(request); + } + +private: + std::unique_ptr _query_options; + std::unique_ptr _query_globals; + std::unique_ptr _state; + DeleteRows 
_delete_rows_storage; +}; + class TableInt32SumGreaterThanExpr final : public VExpr { public: TableInt32SumGreaterThanExpr(int left_slot_id, int left_column_id, int right_slot_id, @@ -174,6 +233,14 @@ std::shared_ptr build_int32_array(const std::vector& valu return finish_array(&builder); } +std::shared_ptr build_int64_array(const std::vector& values) { + arrow::Int64Builder builder; + for (const auto value : values) { + EXPECT_TRUE(builder.Append(value).ok()); + } + return finish_array(&builder); +} + std::shared_ptr build_string_array(const std::vector& values) { arrow::StringBuilder builder; for (const auto& value : values) { @@ -227,6 +294,54 @@ void write_int_pair_parquet_file(const std::string& file_path, const std::vector write_row_group_size, builder.build())); } +void write_position_delete_parquet_file(const std::string& file_path, + const std::vector& data_file_paths, + const std::vector& positions) { + auto schema = arrow::schema({ + arrow::field("file_path", arrow::utf8(), false), + arrow::field("pos", arrow::int64(), false), + }); + auto table = arrow::Table::Make(schema, + {build_string_array(data_file_paths), + build_int64_array(positions)}); + + auto file_result = arrow::io::FileOutputStream::Open(file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable( + *table, arrow::default_memory_pool(), out, static_cast(positions.size()), + builder.build())); +} + +int64_t write_iceberg_deletion_vector_file(const std::string& file_path, + const std::vector& deleted_positions) { + roaring::Roaring64Map rows; + for (const auto position : deleted_positions) { + rows.add(position); + } + + const size_t bitmap_size = 
rows.getSizeInBytes(); + std::vector blob(4 + 4 + bitmap_size + 4); + rows.write(blob.data() + 8); + + const uint32_t total_length = static_cast(4 + bitmap_size); + BigEndian::Store32(blob.data(), total_length); + constexpr char DV_MAGIC[] = {'\xD1', '\xD3', '\x39', '\x64'}; + memcpy(blob.data() + 4, DV_MAGIC, 4); + BigEndian::Store32(blob.data() + 8 + bitmap_size, 0); + + std::ofstream output(file_path, std::ios::binary); + EXPECT_TRUE(output.is_open()); + output.write(blob.data(), static_cast(blob.size())); + EXPECT_TRUE(output.good()); + return static_cast(blob.size()); +} + Block build_table_block(const std::vector& columns) { Block block; for (const auto& column : columns) { @@ -266,6 +381,81 @@ void set_iceberg_row_lineage_params(SplitReadOptions* split_options, int64_t fir split_options->current_range.__set_table_format_params(table_format_params); } +TIcebergDeleteFileDesc make_iceberg_deletion_vector(const std::string& path, int64_t offset, + int64_t size) { + TIcebergDeleteFileDesc delete_file; + delete_file.__set_content(3); + delete_file.__set_path(path); + delete_file.__set_content_offset(offset); + delete_file.__set_content_size_in_bytes(size); + return delete_file; +} + +TIcebergDeleteFileDesc make_iceberg_position_delete_file(const std::string& path) { + TIcebergDeleteFileDesc delete_file; + delete_file.__set_content(1); + delete_file.__set_path(path); + delete_file.__set_file_format(TFileFormatType::FORMAT_PARQUET); + return delete_file; +} + +TFileScanRangeParams make_local_parquet_scan_params() { + TFileScanRangeParams scan_params; + scan_params.__set_file_type(TFileType::FILE_LOCAL); + scan_params.__set_format_type(TFileFormatType::FORMAT_PARQUET); + return scan_params; +} + +std::shared_ptr make_io_context(io::FileReaderStats* file_reader_stats, + io::FileCacheStatistics* file_cache_stats) { + auto io_ctx = std::make_shared(); + io_ctx->file_reader_stats = file_reader_stats; + io_ctx->file_cache_stats = file_cache_stats; + return io_ctx; +} + 
+std::unique_ptr make_table_read_profile(RuntimeProfile* profile) { + auto read_profile = std::make_unique(); + read_profile->num_delete_files = ADD_COUNTER(profile, "NumDeleteFiles", TUnit::UNIT); + read_profile->num_delete_rows = ADD_COUNTER(profile, "NumDeleteRows", TUnit::UNIT); + read_profile->parse_delete_file_time = ADD_TIMER(profile, "ParseDeleteFileTime"); + return read_profile; +} + +TTableFormatFileDesc make_iceberg_table_format_desc( + const std::string& data_file_path, const std::vector& delete_files) { + TTableFormatFileDesc table_format_params; + TIcebergFileDesc iceberg_params; + iceberg_params.__set_format_version(2); + iceberg_params.__set_original_file_path(data_file_path); + iceberg_params.__set_delete_files(delete_files); + table_format_params.__set_iceberg_params(iceberg_params); + return table_format_params; +} + +std::vector read_iceberg_ids( + doris::iceberg::IcebergTableReader* reader, + const std::vector& projected_columns) { + std::vector ids; + bool eos = false; + while (!eos) { + Block block = build_table_block(projected_columns); + auto status = reader->get_block(&block, &eos); + if (!status.ok()) { + ADD_FAILURE() << status; + return ids; + } + if (block.rows() == 0) { + continue; + } + const auto& id_column = assert_cast(*block.get_by_position(0).column); + for (size_t row = 0; row < block.rows(); ++row) { + ids.push_back(id_column.get_element(row)); + } + } + return ids; +} + int64_t parquet_column_start_offset(const ::parquet::ColumnChunkMetaData& column_metadata) { return column_metadata.has_dictionary_page() ? 
static_cast(column_metadata.dictionary_page_offset()) @@ -936,6 +1126,217 @@ TEST(TableReaderTest, IcebergVirtualColumnsKeepRowLineageAfterRowGroupPredicateP std::filesystem::remove_all(test_dir); } +TEST(TableReaderTest, IcebergDeletionVectorUsesTableReaderDeleteFileInterface) { + TTableFormatFileDesc table_format_desc; + TIcebergFileDesc iceberg_desc; + iceberg_desc.__set_format_version(2); + iceberg_desc.__set_delete_files({make_iceberg_deletion_vector("dv.bin", 8, 128)}); + table_format_desc.__set_iceberg_params(iceberg_desc); + + IcebergTableReaderDeleteFileTestHelper reader; + DeleteFileDesc desc; + bool has_delete_file = false; + ASSERT_TRUE(reader.parse_deletion_vector_file(table_format_desc, &desc, &has_delete_file).ok()); + + EXPECT_TRUE(has_delete_file); + EXPECT_EQ(desc.path, "dv.bin"); + EXPECT_EQ(desc.start_offset, 8); + EXPECT_EQ(desc.size, 128); + EXPECT_EQ(desc.file_size, -1); + EXPECT_EQ(desc.format, DeleteFileDesc::Format::ICEBERG); +} + +TEST(TableReaderTest, IcebergDeletionVectorRejectsMultipleDeleteFiles) { + TTableFormatFileDesc table_format_desc; + TIcebergFileDesc iceberg_desc; + iceberg_desc.__set_format_version(2); + iceberg_desc.__set_delete_files({make_iceberg_deletion_vector("dv-a.bin", 8, 128), + make_iceberg_deletion_vector("dv-b.bin", 16, 256)}); + table_format_desc.__set_iceberg_params(iceberg_desc); + + IcebergTableReaderDeleteFileTestHelper reader; + DeleteFileDesc desc; + bool has_delete_file = false; + auto status = reader.parse_deletion_vector_file(table_format_desc, &desc, &has_delete_file); + + EXPECT_FALSE(status.ok()); +} + +TEST(TableReaderTest, IcebergTableReaderAppliesDeletionVectorFile) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_deletion_vector_file_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + const auto dv_path = (test_dir / "delete-vector.bin").string(); + 
write_int_pair_parquet_file(file_path, {1, 2, 3, 4, 5}, {10, 20, 30, 40, 50}, + {"one", "two", "three", "four", "five"}); + const auto dv_size = write_iceberg_deletion_vector_file(dv_path, {0, 4}); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeProfile profile("test_profile"); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + auto scan_params = make_local_parquet_scan_params(); + io::FileReaderStats file_reader_stats; + io::FileCacheStatistics file_cache_stats; + auto io_ctx = make_io_context(&file_reader_stats, &file_cache_stats); + ShardedKVCache cache(1); + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = &scan_params, + .io_ctx = io_ctx, + .runtime_state = &state, + .scanner_profile = &profile, + .allow_missing_columns = true, + .profile = make_table_read_profile(&profile), + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.cache = &cache; + split_options.current_range.__set_table_format_params(make_iceberg_table_format_desc( + file_path, {make_iceberg_deletion_vector(dv_path, 0, dv_size)})); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + EXPECT_EQ(read_iceberg_ids(&reader, projected_columns), std::vector({2, 3, 4})); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergTableReaderAppliesPositionDeleteFile) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_position_delete_file_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + const auto delete_file_path = (test_dir / "position-delete.parquet").string(); + write_int_pair_parquet_file(file_path, 
{1, 2, 3, 4, 5}, {10, 20, 30, 40, 50}, + {"one", "two", "three", "four", "five"}); + write_position_delete_parquet_file(delete_file_path, {file_path, file_path}, {1, 3}); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeProfile profile("test_profile"); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + auto scan_params = make_local_parquet_scan_params(); + io::FileReaderStats file_reader_stats; + io::FileCacheStatistics file_cache_stats; + auto io_ctx = make_io_context(&file_reader_stats, &file_cache_stats); + ShardedKVCache cache(1); + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = &scan_params, + .io_ctx = io_ctx, + .runtime_state = &state, + .scanner_profile = &profile, + .allow_missing_columns = true, + .profile = make_table_read_profile(&profile), + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.cache = &cache; + split_options.current_range.__set_table_format_params(make_iceberg_table_format_desc( + file_path, {make_iceberg_position_delete_file(delete_file_path)})); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + EXPECT_EQ(read_iceberg_ids(&reader, projected_columns), std::vector({1, 3, 5})); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergTableReaderMergesDeletionVectorAndPositionDeleteFiles) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_delete_files_merge_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + const auto dv_path = (test_dir / "delete-vector.bin").string(); + const auto position_delete_path = (test_dir / 
"position-delete.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3, 4, 5}, {10, 20, 30, 40, 50}, + {"one", "two", "three", "four", "five"}); + const auto dv_size = write_iceberg_deletion_vector_file(dv_path, {0}); + write_position_delete_parquet_file(position_delete_path, {file_path, file_path}, {3, 3}); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeProfile profile("test_profile"); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + auto scan_params = make_local_parquet_scan_params(); + io::FileReaderStats file_reader_stats; + io::FileCacheStatistics file_cache_stats; + auto io_ctx = make_io_context(&file_reader_stats, &file_cache_stats); + ShardedKVCache cache(1); + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = &scan_params, + .io_ctx = io_ctx, + .runtime_state = &state, + .scanner_profile = &profile, + .allow_missing_columns = true, + .profile = make_table_read_profile(&profile), + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.cache = &cache; + split_options.current_range.__set_table_format_params(make_iceberg_table_format_desc( + file_path, {make_iceberg_deletion_vector(dv_path, 0, dv_size), + make_iceberg_position_delete_file(position_delete_path)})); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + EXPECT_EQ(read_iceberg_ids(&reader, projected_columns), std::vector({2, 3, 5})); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, RowPositionDeletePredicateColumnIsNotRepeatedAsOutputColumn) { + const auto row_position_column_id = + doris::parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID; + std::vector projected_columns; + projected_columns.push_back( + 
make_table_column(100, "_row_id", make_nullable(std::make_shared()))); + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + IcebergTableReaderScanRequestTestHelper reader; + ASSERT_TRUE(reader.init_for_scan_request_test(projected_columns).ok()); + + FileScanRequest request; + request.non_predicate_columns.push_back(0); + request.column_positions.emplace(0, 0); + + ASSERT_TRUE(reader.customize_request(&request).ok()); + + EXPECT_EQ(request.predicate_columns, std::vector({row_position_column_id})); + EXPECT_EQ(request.non_predicate_columns, std::vector({0})); + ASSERT_TRUE(request.column_positions.contains(row_position_column_id)); + EXPECT_EQ(request.column_positions.at(row_position_column_id), 1); + ASSERT_EQ(request.expression_filters.size(), 1); + EXPECT_NE(request.expression_filters[0].delete_conjunct, nullptr); +} + TEST(TableReaderTest, ParquetReaderReadsOnlyRowGroupsInFileRange) { const auto test_dir = std::filesystem::temp_directory_path() / "doris_table_reader_file_range_test"; From 6b3ce8c9185f7f837e2f26645f444e316fbd2c5b Mon Sep 17 00:00:00 2001 From: Gabriel Date: Thu, 28 May 2026 23:16:41 +0800 Subject: [PATCH 29/38] [feature](be) Support Iceberg equality deletes in reader (#63852) ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Implement Iceberg equality delete filtering in the v2 Iceberg reader by materializing equality delete keys as delete predicate expressions and applying them through the file reader filter path. ### Release note Support reading Iceberg equality delete files in the BE Iceberg reader. ### Check List (For Author) - Test: Unit Test / Manual test - Added EqualityDeletePredicateTest for single-column, multi-column, null matching, and error handling. - Manual test: git diff --check. - Not run: run-be-ut.sh failed because this environment only has JDK 11 and requires JDK 17; clang-format script failed because llvm@16 is not installed. 
- Behavior changed: Yes, Iceberg reader now filters equality-deleted rows. - Does this need documentation: No ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: ### Release note None ### Check List (For Author) - Test - [ ] Regression test - [ ] Unit Test - [ ] Manual test (add detailed scripts or steps below) - [ ] No need to test or manual test. Explain why: - [ ] This is a refactor/code format and no logic has been changed. - [ ] Previous test can cover this change. - [ ] No code files have been changed. - [ ] Other reason - Behavior changed: - [ ] No. - [ ] Yes. - Does this need documentation? - [ ] No. - [ ] Yes. ### Check List (For Reviewer who merge this PR) - [ ] Confirm the release note - [ ] Confirm test cases - [ ] Confirm document - [ ] Add branch pick label --- be/src/format/new_parquet/parquet_reader.cpp | 8 +- .../reader/expr/equality_delete_predicate.cpp | 158 ++++++++++++ .../reader/expr/equality_delete_predicate.h | 71 ++++++ be/src/format/reader/table_reader.cpp | 2 +- be/src/format/reader/table_reader.h | 4 +- be/src/format/table/iceberg_reader_v2.cpp | 229 ++++++++++++++---- be/src/format/table/iceberg_reader_v2.h | 51 ++-- .../expr/equality_delete_predicate_test.cpp | 181 ++++++++++++++ be/test/format/reader/table_reader_test.cpp | 19 +- 9 files changed, 634 insertions(+), 89 deletions(-) create mode 100644 be/src/format/reader/expr/equality_delete_predicate.cpp create mode 100644 be/src/format/reader/expr/equality_delete_predicate.h create mode 100644 be/test/format/reader/expr/equality_delete_predicate_test.cpp diff --git a/be/src/format/new_parquet/parquet_reader.cpp b/be/src/format/new_parquet/parquet_reader.cpp index 489e184cd2b889..5e4107d727d749 100644 --- a/be/src/format/new_parquet/parquet_reader.cpp +++ b/be/src/format/new_parquet/parquet_reader.cpp @@ -339,9 +339,9 @@ Status ParquetReader::_execute_filter_conjuncts(int64_t batch_rows, Block* file_ 
RETURN_IF_ERROR(expression_filter.conjunct->execute_filter( file_block, filter.data(), static_cast(batch_rows), false, &can_filter_all)); - *selected_rows = can_filter_all ? 0 - : _apply_filter_to_selection(filter, selection, - *selected_rows); + *selected_rows = + can_filter_all ? 0 + : _apply_filter_to_selection(filter, selection, *selected_rows); } if (*selected_rows == 0) { break; @@ -367,7 +367,7 @@ Status ParquetReader::_execute_filter_conjuncts(int64_t batch_rows, Block* file_ file_block->erase(result_column_id); *selected_rows = !has_kept_row ? 0 - : _apply_filter_to_selection(keep_filter, selection, *selected_rows); + : _apply_filter_to_selection(keep_filter, selection, *selected_rows); } return Status::OK(); } diff --git a/be/src/format/reader/expr/equality_delete_predicate.cpp b/be/src/format/reader/expr/equality_delete_predicate.cpp new file mode 100644 index 00000000000000..2b714abade7cac --- /dev/null +++ b/be/src/format/reader/expr/equality_delete_predicate.cpp @@ -0,0 +1,158 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "format/reader/expr/equality_delete_predicate.h" + +#include + +#include + +#include "common/status.h" +#include "core/assert_cast.h" +#include "core/block/column_with_type_and_name.h" +#include "core/column/column_nullable.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_number.h" + +namespace doris { +namespace { + +bool column_value_equal(const ColumnPtr& lhs, size_t lhs_row, const ColumnPtr& rhs, + size_t rhs_row) { + if (lhs->is_nullable() && rhs->is_nullable()) { + return lhs->compare_at(lhs_row, rhs_row, *rhs, -1) == 0; + } + if (lhs->is_nullable()) { + const auto& nullable_lhs = assert_cast(*lhs); + return !nullable_lhs.is_null_at(lhs_row) && + nullable_lhs.get_nested_column().compare_at(lhs_row, rhs_row, *rhs, -1) == 0; + } + if (rhs->is_nullable()) { + const auto& nullable_rhs = assert_cast(*rhs); + return !nullable_rhs.is_null_at(rhs_row) && + lhs->compare_at(lhs_row, rhs_row, nullable_rhs.get_nested_column(), -1) == 0; + } + return lhs->compare_at(lhs_row, rhs_row, *rhs, -1) == 0; +} + +} // namespace + +EqualityDeletePredicate::EqualityDeletePredicate(Block delete_block, std::vector field_ids) + : VExpr(), _delete_block(std::move(delete_block)), _field_ids(std::move(field_ids)) { + _node_type = TExprNodeType::PREDICATE; + _opcode = TExprOpcode::DELETE; + _data_type = std::make_shared(); + _expr_name = "EqualityDeletePredicate"; + DCHECK_EQ(_delete_block.columns(), _field_ids.size()); + _delete_hashes = _build_hashes(_delete_block); + for (size_t row = 0; row < _delete_hashes.size(); ++row) { + _delete_hash_map.emplace(_delete_hashes[row], row); + } +} + +Status EqualityDeletePredicate::prepare(RuntimeState* state, const RowDescriptor& desc, + VExprContext* context) { + RETURN_IF_ERROR_OR_PREPARED(VExpr::prepare(state, desc, context)); + _expr_name = "EqualityDeletePredicate"; + _prepare_finished = true; + return Status::OK(); +} + +Status EqualityDeletePredicate::open(RuntimeState* state, VExprContext* context, 
+ FunctionContext::FunctionStateScope scope) { + DCHECK(_prepare_finished); + for (auto& child : _children) { + RETURN_IF_ERROR(child->open(state, context, scope)); + } + if (scope == FunctionContext::FRAGMENT_LOCAL) { + RETURN_IF_ERROR(VExpr::get_const_col(context, nullptr)); + } + _open_finished = true; + return Status::OK(); +} + +void EqualityDeletePredicate::close(VExprContext* context, + FunctionContext::FunctionStateScope scope) { + VExpr::close(context, scope); +} + +Status EqualityDeletePredicate::execute(VExprContext* context, Block* block, + int* result_column_id) const { + if (_children.size() != _field_ids.size()) { + return Status::InternalError( + "EqualityDeletePredicate should have {} child exprs, but got {}", _field_ids.size(), + _children.size()); + } + + Block data_key_block; + for (const auto& child : _children) { + int slot = -1; + RETURN_IF_ERROR(child->execute(context, block, &slot)); + const auto& key_column = block->get_by_position(slot); + data_key_block.insert({key_column.column, key_column.type, key_column.name}); + } + + const auto rows = data_key_block.rows(); + auto res_col = ColumnBool::create(rows, 0); + if (_delete_hash_map.empty() || rows == 0) { + block->insert({std::move(res_col), std::make_shared(), expr_name()}); + *result_column_id = static_cast(block->columns() - 1); + return Status::OK(); + } + + auto data_hashes = _build_hashes(data_key_block); + auto& result_data = res_col->get_data(); + for (size_t row = 0; row < rows; ++row) { + const auto range = _delete_hash_map.equal_range(data_hashes[row]); + for (auto it = range.first; it != range.second; ++it) { + if (_equal(data_key_block, row, it->second)) { + result_data[row] = true; + break; + } + } + } + + block->insert({std::move(res_col), std::make_shared(), expr_name()}); + *result_column_id = static_cast(block->columns() - 1); + return Status::OK(); +} + +std::vector EqualityDeletePredicate::_build_hashes(const Block& block) { + std::vector hashes(block.rows(), 0); + for 
(const auto& column : block.get_columns()) { + column->update_hashes_with_value(hashes.data(), nullptr); + } + return hashes; +} + +bool EqualityDeletePredicate::_equal(const Block& data_block, size_t data_row, + size_t delete_row) const { + for (size_t column_idx = 0; column_idx < _delete_block.columns(); ++column_idx) { + const auto& data_column = data_block.get_by_position(column_idx).column; + const auto& delete_column = _delete_block.get_by_position(column_idx).column; + if (!column_value_equal(data_column, data_row, delete_column, delete_row)) { + return false; + } + } + return true; +} + +std::string EqualityDeletePredicate::debug_string() const { + return _expr_name; +} + +} // namespace doris diff --git a/be/src/format/reader/expr/equality_delete_predicate.h b/be/src/format/reader/expr/equality_delete_predicate.h new file mode 100644 index 00000000000000..2e33cffb3985df --- /dev/null +++ b/be/src/format/reader/expr/equality_delete_predicate.h @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "common/status.h" +#include "core/block/block.h" +#include "exprs/function_context.h" +#include "exprs/vexpr.h" + +namespace doris { +class RowDescriptor; +class RuntimeState; +class VExprContext; +} // namespace doris + +namespace doris { + +class EqualityDeletePredicate final : public VExpr { + ENABLE_FACTORY_CREATOR(EqualityDeletePredicate); + +public: + EqualityDeletePredicate(Block delete_block, std::vector field_ids); + ~EqualityDeletePredicate() override = default; + + Status execute(VExprContext* context, Block* block, int* result_column_id) const override; + Status execute_column_impl(VExprContext* context, const Block* block, const Selector* selector, + size_t count, ColumnPtr& result_column) const override { + return Status::InternalError("Not implement EqualityDeletePredicate::execute_column_impl"); + } + Status prepare(RuntimeState* state, const RowDescriptor& desc, VExprContext* context) override; + Status open(RuntimeState* state, VExprContext* context, + FunctionContext::FunctionStateScope scope) override; + void close(VExprContext* context, FunctionContext::FunctionStateScope scope) override; + std::string debug_string() const override; + uint64_t get_digest(uint64_t seed) const override { return 0; } + const std::string& expr_name() const override { return _expr_name; } + +private: + static std::vector _build_hashes(const Block& block); + bool _equal(const Block& data_block, size_t data_row, size_t delete_row) const; + + std::string _expr_name; + Block _delete_block; + std::vector _field_ids; + std::vector _delete_hashes; + std::multimap _delete_hash_map; +}; + +} // namespace doris diff --git a/be/src/format/reader/table_reader.cpp b/be/src/format/reader/table_reader.cpp index 0735cc51f383b7..8289d637d78b14 100644 --- a/be/src/format/reader/table_reader.cpp +++ b/be/src/format/reader/table_reader.cpp @@ -21,8 +21,8 @@ #include #include -#include #include 
+#include #include #include "common/cast_set.h" diff --git a/be/src/format/reader/table_reader.h b/be/src/format/reader/table_reader.h index f94e98bd83798e..de7626dfb2418d 100644 --- a/be/src/format/reader/table_reader.h +++ b/be/src/format/reader/table_reader.h @@ -139,7 +139,9 @@ class TableReader { // 子类可以在自己的 init(options) 中调用该方法;这里不接收具体表格式 schema/task。 virtual Status init(TableReadOptions options); - // 读取当前 split/partition 之前初始化。 + // Prepare for reading a new split/task. + // 1. Pass a new split/task to reader, which will be used in subsequent open_reader() to initialize the underlying file reader. + // 2. Parse delete predicates from split/task information, which will be used for later dynamic filtering and delete handling. virtual Status prepare_split(const SplitReadOptions& options); // table-level 动态过滤入口。 diff --git a/be/src/format/table/iceberg_reader_v2.cpp b/be/src/format/table/iceberg_reader_v2.cpp index ad72313cc89990..ed6649fce2c9ed 100644 --- a/be/src/format/table/iceberg_reader_v2.cpp +++ b/be/src/format/table/iceberg_reader_v2.cpp @@ -22,6 +22,7 @@ #include #include +#include "common/cast_set.h" #include "core/assert_cast.h" #include "core/block/block.h" #include "core/column/column_const.h" @@ -33,30 +34,32 @@ #include "core/field.h" #include "format/new_parquet/column_reader.h" #include "format/new_parquet/parquet_reader.h" +#include "format/reader/expr/cast.h" +#include "format/reader/expr/equality_delete_predicate.h" +#include "format/reader/expr/slot_ref.h" #include "format/reader/table_reader.h" #include "format/table/deletion_vector_reader.h" #include "io/file_factory.h" namespace doris::iceberg { -IcebergTableReader::PositionDeleteBlockCollector::PositionDeleteBlockCollector( - std::string data_file_path, std::map* rows) +IcebergTableReader::PositionDeleteRowsCollector::PositionDeleteRowsCollector( + std::string data_file_path, reader::DeleteRows* rows) : _data_file_path(std::move(data_file_path)), _rows(rows) {} -Status 
IcebergTableReader::PositionDeleteBlockCollector::collect(const Block& block, - size_t read_rows) { +Status IcebergTableReader::PositionDeleteRowsCollector::collect(const Block& block, + size_t read_rows) { if (read_rows == 0) { return Status::OK(); } const auto& file_path_column = assert_cast( *block.get_by_position(ICEBERG_FILE_PATH_BLOCK_POSITION).column); - const auto& pos_column = - assert_cast(*block.get_by_position(ICEBERG_ROW_POS_BLOCK_POSITION) - .column); + const auto& pos_column = assert_cast( + *block.get_by_position(ICEBERG_ROW_POS_BLOCK_POSITION).column); for (size_t row = 0; row < read_rows; ++row) { const auto file_path = file_path_column.get_data_at(row).to_string(); if (file_path == _data_file_path) { - (*_rows)[file_path].push_back(pos_column.get_element(row)); + _rows->push_back(pos_column.get_element(row)); } } return Status::OK(); @@ -67,7 +70,7 @@ Status IcebergTableReader::prepare_split(const reader::SplitReadOptions& options _iceberg_params = nullptr; _delete_predicates_initialized = false; _position_delete_rows_storage.clear(); - _equality_delete_files.clear(); + _equality_delete_filters.clear(); if (options.current_range.__isset.table_format_params && options.current_range.table_format_params.__isset.iceberg_params) { const auto& iceberg_params = options.current_range.table_format_params.iceberg_params; @@ -81,13 +84,7 @@ Status IcebergTableReader::prepare_split(const reader::SplitReadOptions& options } } RETURN_IF_ERROR(TableReader::prepare_split(options)); - return _collect_position_delete_rows(options.current_range.table_format_params); -} - -Status IcebergTableReader::finalize_chunk(Block* block, const size_t rows) { - RETURN_IF_ERROR(reader::TableReader::finalize_chunk(block, rows)); - RETURN_IF_ERROR(apply_equality_deletes(block)); - return Status::OK(); + return _init_delete_predicates(options.current_range.table_format_params); } Status IcebergTableReader::materialize_virtual_columns(Block* table_block) { @@ -114,6 +111,7 @@ 
Status IcebergTableReader::customize_file_scan_request(reader::FileScanRequest* if (_row_lineage_columns.first_row_id >= 0 && _need_row_lineage_row_id()) { RETURN_IF_ERROR(_append_row_position_output_column(file_request)); } + RETURN_IF_ERROR(_append_equality_delete_predicates(file_request)); return Status::OK(); } @@ -161,7 +159,7 @@ Status IcebergTableReader::_parse_deletion_vector_file(const TTableFormatFileDes return Status::OK(); } -Status IcebergTableReader::_collect_position_delete_rows(const TTableFormatFileDesc& t_desc) { +Status IcebergTableReader::_init_delete_predicates(const TTableFormatFileDesc& t_desc) { if (!t_desc.__isset.iceberg_params || _delete_predicates_initialized) { _delete_predicates_initialized = true; return Status::OK(); @@ -175,6 +173,7 @@ Status IcebergTableReader::_collect_position_delete_rows(const TTableFormatFileD } std::vector position_delete_files; + std::vector equality_delete_files; for (const auto& delete_file : iceberg_params.delete_files) { if (!delete_file.__isset.content) { continue; @@ -182,29 +181,31 @@ Status IcebergTableReader::_collect_position_delete_rows(const TTableFormatFileD if (delete_file.content == POSITION_DELETE) { position_delete_files.push_back(delete_file); } else if (delete_file.content == EQUALITY_DELETE) { - _equality_delete_files.push_back(delete_file); + equality_delete_files.push_back(delete_file); } } + // `_delete_rows != nullptr` means DeleteVector is parsed if (_delete_rows != nullptr) { _position_delete_rows_storage = *_delete_rows; _delete_rows = &_position_delete_rows_storage; } + // Combine position delete rows from both deletion vector and position delete files, and + // initialize equality delete predicates. Position delete files contain row positions of + // deleted rows, which can be directly added to `_delete_rows`. Equality delete files contain + // values of deleted rows, which require reading the files and building predicates for later + // filtering. 
if (!position_delete_files.empty()) { - RETURN_IF_ERROR(_read_position_delete_files(position_delete_files)); + RETURN_IF_ERROR(_init_position_delete_rows(position_delete_files)); + } + if (!equality_delete_files.empty()) { + RETURN_IF_ERROR(_init_equality_delete_predicates(equality_delete_files)); } _delete_predicates_initialized = true; return Status::OK(); } -Status IcebergTableReader::apply_equality_deletes(Block* block) { - if (!_equality_delete_files.empty()) { - return Status::NotSupported("Iceberg equality delete is not supported by TableReader"); - } - return Status::OK(); -} - std::string IcebergTableReader::_iceberg_delete_vector_cache_key( const TIcebergDeleteFileDesc& delete_file) { const std::string key_prefix = "iceberg_dv:"; @@ -259,14 +260,6 @@ const reader::SchemaField* IcebergTableReader::_find_delete_field( return nullptr; } -Block IcebergTableReader::_build_position_delete_block(const reader::SchemaField& file_path_field, - const reader::SchemaField& pos_field) { - Block block; - block.insert({file_path_field.type->create_column(), file_path_field.type, ICEBERG_FILE_PATH}); - block.insert({pos_field.type->create_column(), pos_field.type, ICEBERG_ROW_POS}); - return block; -} - Status IcebergTableReader::_append_row_position_output_column(reader::FileScanRequest* request) { const auto row_position_column_id = doris::parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID; @@ -275,6 +268,46 @@ Status IcebergTableReader::_append_row_position_output_column(reader::FileScanRe return Status::OK(); } +Status IcebergTableReader::_append_equality_delete_predicates(reader::FileScanRequest* request) { + DORIS_CHECK(request != nullptr); + for (const auto& filter : _equality_delete_filters) { + auto delete_predicate = + std::make_shared(filter.delete_block, filter.field_ids); + reader::FileExpressionFilter expression_filter; + expression_filter.delete_conjunct = VExprContext::create_shared(delete_predicate); + DCHECK_EQ(filter.field_ids.size(), 
filter.key_types.size()); + for (size_t idx = 0; idx < filter.field_ids.size(); ++idx) { + const int field_id = filter.field_ids[idx]; + auto field_it = + std::find_if(_data_reader.file_schema.begin(), _data_reader.file_schema.end(), + [field_id](const reader::SchemaField& field) { + return !field.field_id_path.empty() && + field.field_id_path.back() == field_id; + }); + if (field_it == _data_reader.file_schema.end()) { + return Status::InternalError( + "Can not find equality delete column field id {} in data file schema", + field_id); + } + _append_file_scan_column(request, field_it->id, &request->predicate_columns); + const auto block_position = request->column_positions.at(field_it->id); + auto slot = TableSlotRef::create_shared(cast_set(block_position), + cast_set(block_position), -1, + field_it->type, field_it->name); + if (field_it->type->equals(*filter.key_types[idx])) { + delete_predicate->add_child(std::move(slot)); + } else { + auto cast_expr = Cast::create_shared(filter.key_types[idx]); + cast_expr->add_child(std::move(slot)); + delete_predicate->add_child(std::move(cast_expr)); + } + expression_filter.file_column_ids.push_back(field_it->id); + } + request->expression_filters.push_back(std::move(expression_filter)); + } + return Status::OK(); +} + std::string IcebergTableReader::_data_file_path() const { if (_iceberg_params != nullptr && _iceberg_params->__isset.original_file_path) { return _iceberg_params->original_file_path; @@ -286,7 +319,7 @@ std::string IcebergTableReader::_data_file_path() const { Status IcebergTableReader::_read_parquet_position_delete_file( const TIcebergDeleteFileDesc& delete_file, const TFileScanRangeParams& scan_params, - IcebergDeleteFileIOContext* delete_io_ctx, PositionDeleteBlockCollector* collector) { + IcebergDeleteFileIOContext* delete_io_ctx, PositionDeleteRowsCollector* collector) { if (!delete_file.__isset.file_format) { return Status::InternalError("Iceberg position delete file is missing file format"); } @@ -326,8 
+359,16 @@ Status IcebergTableReader::_read_parquet_position_delete_file( RETURN_IF_ERROR(reader.open(request)); bool eof = false; + auto build_position_delete_block = [](const reader::SchemaField& file_path_field, + const reader::SchemaField& pos_field) -> Block { + Block block; + block.insert( + {file_path_field.type->create_column(), file_path_field.type, ICEBERG_FILE_PATH}); + block.insert({pos_field.type->create_column(), pos_field.type, ICEBERG_ROW_POS}); + return block; + }; while (!eof) { - Block block = _build_position_delete_block(*file_path_field, *pos_field); + Block block = build_position_delete_block(*file_path_field, *pos_field); size_t read_rows = 0; RETURN_IF_ERROR(reader.get_block(&block, &read_rows, &eof)); RETURN_IF_ERROR(collector->collect(block, read_rows)); @@ -335,35 +376,127 @@ Status IcebergTableReader::_read_parquet_position_delete_file( return reader.close(); } -Status IcebergTableReader::_read_position_delete_files( +Status IcebergTableReader::_init_position_delete_rows( const std::vector& delete_files) { TFileScanRangeParams delete_scan_params = _scan_params == nullptr ? TFileScanRangeParams() : *_scan_params; - std::map rows_by_file; + reader::DeleteRows position_delete_rows; const auto data_file_path = _data_file_path(); IcebergDeleteFileIOContext delete_io_ctx(_runtime_state); - PositionDeleteBlockCollector collector(data_file_path, &rows_by_file); + PositionDeleteRowsCollector collector(data_file_path, &position_delete_rows); for (const auto& delete_file : delete_files) { RETURN_IF_ERROR(_read_parquet_position_delete_file(delete_file, delete_scan_params, &delete_io_ctx, &collector)); } - auto rows_it = rows_by_file.find(data_file_path); - if (rows_it == rows_by_file.end()) { + if (position_delete_rows.empty()) { return Status::OK(); } // Position delete files and deletion vectors both become row-position deletes for the // common TableReader DeletePredicate path. 
Keep the merged rows in a member vector because // DeletePredicate stores a reference to the vector used by _delete_rows. _position_delete_rows_storage.insert(_position_delete_rows_storage.end(), - rows_it->second.begin(), rows_it->second.end()); + position_delete_rows.begin(), position_delete_rows.end()); std::sort(_position_delete_rows_storage.begin(), _position_delete_rows_storage.end()); - _position_delete_rows_storage.erase(std::unique(_position_delete_rows_storage.begin(), - _position_delete_rows_storage.end()), - _position_delete_rows_storage.end()); + _position_delete_rows_storage.erase( + std::unique(_position_delete_rows_storage.begin(), _position_delete_rows_storage.end()), + _position_delete_rows_storage.end()); _delete_rows = &_position_delete_rows_storage; return Status::OK(); } +Status IcebergTableReader::_init_equality_delete_predicates( + const std::vector& delete_files) { + TFileScanRangeParams delete_scan_params = + _scan_params == nullptr ? TFileScanRangeParams() : *_scan_params; + IcebergDeleteFileIOContext delete_io_ctx(_runtime_state); + for (const auto& delete_file : delete_files) { + RETURN_IF_ERROR(_read_parquet_equality_delete_file(delete_file, delete_scan_params, + &delete_io_ctx)); + } + return Status::OK(); +} + +Status IcebergTableReader::_read_parquet_equality_delete_file( + const TIcebergDeleteFileDesc& delete_file, const TFileScanRangeParams& scan_params, + IcebergDeleteFileIOContext* delete_io_ctx) { + if (!delete_file.__isset.file_format) { + return Status::InternalError("Iceberg equality delete file is missing file format"); + } + if (delete_file.file_format != TFileFormatType::FORMAT_PARQUET) { + return Status::NotSupported("Unsupported Iceberg equality delete file format {}", + delete_file.file_format); + } + if (!delete_file.__isset.field_ids || delete_file.field_ids.empty()) { + return Status::InternalError("Iceberg equality delete file is missing field ids"); + } + + auto delete_range = 
build_iceberg_delete_file_range(delete_file.path); + if (_current_task != nullptr && _current_task->data_file != nullptr && + !_current_task->data_file->fs_name.empty()) { + delete_range.__set_fs_name(_current_task->data_file->fs_name); + } + auto system_properties = _delete_file_system_properties(scan_params); + auto file_description = _delete_file_description(delete_range); + std::shared_ptr io_ctx(&delete_io_ctx->io_ctx, [](io::IOContext*) {}); + parquet::ParquetReader reader(system_properties, file_description, io_ctx, _scanner_profile); + RETURN_IF_ERROR(reader.init(_runtime_state)); + + std::vector schema; + RETURN_IF_ERROR(reader.get_schema(&schema)); + std::vector delete_fields; + std::vector delete_field_ids; + std::vector delete_key_types; + for (const auto field_id : delete_file.field_ids) { + auto field_it = std::find_if( + schema.begin(), schema.end(), [field_id](const reader::SchemaField& field) { + return !field.field_id_path.empty() && field.field_id_path.back() == field_id; + }); + if (field_it == schema.end()) { + return Status::InternalError("Can not find field id {} in equality delete file {}", + field_id, delete_file.path); + } + if (!field_it->children.empty()) { + return Status::NotSupported( + "Iceberg equality delete does not support complex column {}", field_it->name); + } + delete_fields.push_back(*field_it); + delete_field_ids.push_back(field_id); + delete_key_types.push_back(field_it->type); + } + + auto request = std::make_unique(); + for (size_t idx = 0; idx < delete_fields.size(); ++idx) { + request->non_predicate_columns.push_back(delete_fields[idx].id); + request->column_positions.emplace(delete_fields[idx].id, idx); + } + RETURN_IF_ERROR(reader.open(request)); + + auto build_equality_delete_block = [](const std::vector fields) -> Block { + Block block; + for (const auto& field : fields) { + block.insert({field.type->create_column(), field.type, field.name}); + } + return block; + }; + Block delete_block = 
build_equality_delete_block(delete_fields); + bool eof = false; + while (!eof) { + Block block = build_equality_delete_block(delete_fields); + size_t read_rows = 0; + RETURN_IF_ERROR(reader.get_block(&block, &read_rows, &eof)); + if (read_rows > 0) { + MutableBlock mutable_block(&delete_block); + RETURN_IF_ERROR(mutable_block.merge(block)); + } + } + RETURN_IF_ERROR(reader.close()); + _equality_delete_filters.push_back( + EqualityDeleteFilter {.field_ids = std::move(delete_field_ids), + .key_types = std::move(delete_key_types), + .delete_block = std::move(delete_block)}); + return Status::OK(); +} + Status IcebergTableReader::_materialize_row_lineage_row_id(Block* table_block, size_t column_idx) { if (_row_lineage_columns.first_row_id < 0) { return Status::OK(); @@ -372,9 +505,9 @@ Status IcebergTableReader::_materialize_row_lineage_row_id(Block* table_block, s const auto& row_position_column = assert_cast( *_data_reader.block_template.get_by_position(_row_position_block_position).column); DORIS_CHECK(row_position_column.size() == table_block->rows()); - auto column = - table_block->get_by_position(column_idx).column->convert_to_full_column_if_const() - ->assume_mutable(); + auto column = table_block->get_by_position(column_idx) + .column->convert_to_full_column_if_const() + ->assume_mutable(); auto* nullable_column = assert_cast(column.get()); auto& null_map = nullable_column->get_null_map_data(); auto& data = assert_cast(*nullable_column->get_nested_column_ptr()).get_data(); diff --git a/be/src/format/table/iceberg_reader_v2.h b/be/src/format/table/iceberg_reader_v2.h index fbc8e28441b661..497a989289a14d 100644 --- a/be/src/format/table/iceberg_reader_v2.h +++ b/be/src/format/table/iceberg_reader_v2.h @@ -17,14 +17,12 @@ #pragma once -#include -#include -#include #include #include #include #include "common/status.h" +#include "core/block/block.h" #include "format/reader/file_reader.h" #include "format/reader/table_reader.h" #include 
"format/table/iceberg_delete_file_reader_helper.h" @@ -51,11 +49,6 @@ class IcebergTableReader : public reader::TableReader { Status prepare_split(const reader::SplitReadOptions& options) override; protected: - // 将 file-local block 转换为 table/global schema block。 - // 这里执行 ColumnMapping 中的 finalize_expr、缺失列填充、partition/generated 列 - // 物化以及复杂列 remap。 - Status finalize_chunk(Block* block, const size_t rows) override; - Status materialize_virtual_columns(Block* table_block) override; Status customize_file_scan_request(reader::FileScanRequest* file_request) override; @@ -63,11 +56,7 @@ class IcebergTableReader : public reader::TableReader { Status _parse_deletion_vector_file(const TTableFormatFileDesc& t_desc, DeleteFileDesc* desc, bool* has_delete_file) override; - Status _collect_position_delete_rows(const TTableFormatFileDesc& t_desc); - - // 在 table block 上应用 equality delete。 - // equality delete 依赖 table-level 列语义,因此不能下沉到 ParquetReader。 - Status apply_equality_deletes(Block* block); + Status _init_delete_predicates(const TTableFormatFileDesc& t_desc); private: static constexpr int MIN_SUPPORT_DELETE_FILES_VERSION = 2; @@ -85,16 +74,15 @@ class IcebergTableReader : public reader::TableReader { static constexpr size_t ICEBERG_FILE_PATH_BLOCK_POSITION = 0; static constexpr size_t ICEBERG_ROW_POS_BLOCK_POSITION = 1; - class PositionDeleteBlockCollector final { + class PositionDeleteRowsCollector final { public: - PositionDeleteBlockCollector(std::string data_file_path, - std::map* rows); + PositionDeleteRowsCollector(std::string data_file_path, reader::DeleteRows* rows); Status collect(const Block& block, size_t read_rows); private: std::string _data_file_path; - std::map* _rows = nullptr; + reader::DeleteRows* _rows = nullptr; }; static std::string _iceberg_delete_vector_cache_key(const TIcebergDeleteFileDesc& delete_file); @@ -102,27 +90,35 @@ class IcebergTableReader : public reader::TableReader { static std::shared_ptr _delete_file_system_properties( const 
TFileScanRangeParams& scan_params); - static std::unique_ptr _delete_file_description(const TFileRangeDesc& range); + static std::unique_ptr _delete_file_description( + const TFileRangeDesc& range); static const reader::SchemaField* _find_delete_field( const std::vector& schema, const std::string& name); - static Block _build_position_delete_block(const reader::SchemaField& file_path_field, - const reader::SchemaField& pos_field); - Status _append_row_position_output_column(reader::FileScanRequest* request); + Status _append_equality_delete_predicates(reader::FileScanRequest* request); + + Status _init_equality_delete_predicates( + const std::vector& delete_files); + std::string _data_file_path() const; + // Read equality/position delete files. + Status _read_parquet_equality_delete_file(const TIcebergDeleteFileDesc& delete_file, + const TFileScanRangeParams& scan_params, + IcebergDeleteFileIOContext* delete_io_ctx); Status _read_parquet_position_delete_file(const TIcebergDeleteFileDesc& delete_file, const TFileScanRangeParams& scan_params, IcebergDeleteFileIOContext* delete_io_ctx, - PositionDeleteBlockCollector* collector); + PositionDeleteRowsCollector* collector); - Status _read_position_delete_files(const std::vector& delete_files); + // Read position delete files and collect deleted row positions to update DeletePredicate. + Status _init_position_delete_rows(const std::vector& delete_files); + // Materialize row lineage virtual columns based on the position delete file. 
Status _materialize_row_lineage_row_id(Block* table_block, size_t column_idx); - Status _materialize_row_lineage_last_updated_sequence_number(Block* table_block, size_t column_idx); @@ -131,7 +127,12 @@ class IcebergTableReader : public reader::TableReader { const TIcebergFileDesc* _iceberg_params = nullptr; bool _delete_predicates_initialized = false; reader::DeleteRows _position_delete_rows_storage; - std::vector _equality_delete_files; + struct EqualityDeleteFilter { + std::vector field_ids; + std::vector key_types; + Block delete_block; + }; + std::vector _equality_delete_filters; bool _need_row_lineage_row_id() const; }; diff --git a/be/test/format/reader/expr/equality_delete_predicate_test.cpp b/be/test/format/reader/expr/equality_delete_predicate_test.cpp new file mode 100644 index 00000000000000..07ff0f78f81e88 --- /dev/null +++ b/be/test/format/reader/expr/equality_delete_predicate_test.cpp @@ -0,0 +1,181 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "format/reader/expr/equality_delete_predicate.h" + +#include + +#include +#include +#include +#include + +#include "common/status.h" +#include "core/assert_cast.h" +#include "core/block/block.h" +#include "core/column/column_nullable.h" +#include "core/column/column_string.h" +#include "core/column/column_vector.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_number.h" +#include "core/data_type/data_type_string.h" +#include "exprs/vexpr_context.h" +#include "format/reader/expr/cast.h" +#include "runtime/descriptors.h" +#include "testutil/column_helper.h" +#include "testutil/mock/mock_runtime_state.h" +#include "testutil/mock/mock_slot_ref.h" + +namespace doris { + +class EqualityDeletePredicateTest : public testing::Test { +protected: + static ColumnWithTypeAndName make_nullable_int_column( + const std::string& name, const std::vector>& values) { + auto data = ColumnInt32::create(); + auto null_map = ColumnUInt8::create(); + for (const auto& value : values) { + data->insert_value(value.value_or(0)); + null_map->insert_value(!value.has_value()); + } + auto type = make_nullable(std::make_shared()); + return {ColumnNullable::create(std::move(data), std::move(null_map)), type, name}; + } + + static ColumnWithTypeAndName make_nullable_string_column( + const std::string& name, const std::vector>& values) { + auto data = ColumnString::create(); + auto null_map = ColumnUInt8::create(); + for (const auto& value : values) { + const std::string data_value = value.value_or(""); + data->insert_data(data_value.data(), data_value.size()); + null_map->insert_value(!value.has_value()); + } + auto type = make_nullable(std::make_shared()); + return {ColumnNullable::create(std::move(data), std::move(null_map)), type, name}; + } + + static std::vector result_column_data(const Block& block, int result_column_id) { + const auto& result_column = + assert_cast(*block.get_by_position(result_column_id).column); + return 
{result_column.get_data().begin(), result_column.get_data().end()}; + } + + static Status execute_equality_delete_predicate(Block delete_block, std::vector field_ids, + Block* data_block, int* result_column_id) { + auto predicate = + std::make_shared(std::move(delete_block), field_ids); + predicate->_open_finished = true; + for (size_t idx = 0; idx < field_ids.size(); ++idx) { + predicate->add_child( + std::make_shared(idx, data_block->get_by_position(idx).type)); + } + + VExprContext context(predicate); + return predicate->execute(&context, data_block, result_column_id); + } + + static Status execute_prepared_equality_delete_predicate(const VExprContextSPtr& context, + MockRuntimeState* state, + Block* data_block, + int* result_column_id) { + RETURN_IF_ERROR(context->prepare(state, RowDescriptor())); + RETURN_IF_ERROR(context->open(state)); + return context->execute(data_block, result_column_id); + } +}; + +TEST_F(EqualityDeletePredicateTest, MatchSingleColumn) { + Block delete_block; + delete_block.insert(make_nullable_int_column("id", {1, 4})); + Block data_block; + data_block.insert(make_nullable_int_column("id", {1, 2, 3, 4})); + + int result_column_id = -1; + auto status = execute_equality_delete_predicate(std::move(delete_block), {1}, &data_block, + &result_column_id); + ASSERT_TRUE(status.ok()) << status; + EXPECT_EQ(result_column_data(data_block, result_column_id), std::vector({1, 0, 0, 1})); +} + +TEST_F(EqualityDeletePredicateTest, MatchMultipleColumns) { + Block delete_block; + delete_block.insert(make_nullable_int_column("id", {1, 2})); + delete_block.insert(make_nullable_string_column("name", {"a", "b"})); + Block data_block; + data_block.insert(make_nullable_int_column("id", {1, 1, 2, 2})); + data_block.insert(make_nullable_string_column("name", {"a", "b", "a", "b"})); + + int result_column_id = -1; + auto status = execute_equality_delete_predicate(std::move(delete_block), {1, 2}, &data_block, + &result_column_id); + ASSERT_TRUE(status.ok()) << 
status; + EXPECT_EQ(result_column_data(data_block, result_column_id), std::vector({1, 0, 0, 1})); +} + +TEST_F(EqualityDeletePredicateTest, MatchNullValues) { + Block delete_block; + delete_block.insert(make_nullable_int_column("id", {std::nullopt})); + Block data_block; + data_block.insert(make_nullable_int_column("id", {1, std::nullopt, 3})); + + int result_column_id = -1; + auto status = execute_equality_delete_predicate(std::move(delete_block), {1}, &data_block, + &result_column_id); + ASSERT_TRUE(status.ok()) << status; + EXPECT_EQ(result_column_data(data_block, result_column_id), std::vector({0, 1, 0})); +} + +TEST_F(EqualityDeletePredicateTest, MatchAfterCastToDeleteKeyType) { + Block delete_block; + delete_block.insert(make_nullable_int_column("id", {1, 4})); + Block data_block; + data_block.insert(ColumnHelper::create_column_with_name({1, 2, 4})); + + auto predicate = std::make_shared(std::move(delete_block), + std::vector {1}); + auto cast_expr = Cast::create_shared(make_nullable(std::make_shared())); + cast_expr->add_child(std::make_shared(0, data_block.get_by_position(0).type)); + predicate->add_child(std::move(cast_expr)); + auto context = VExprContext::create_shared(predicate); + MockRuntimeState state; + + int result_column_id = -1; + auto status = execute_prepared_equality_delete_predicate(context, &state, &data_block, + &result_column_id); + ASSERT_TRUE(status.ok()) << status; + EXPECT_EQ(result_column_data(data_block, result_column_id), std::vector({1, 0, 1})); + context->close(); +} + +TEST_F(EqualityDeletePredicateTest, ChildCountMismatchReturnsError) { + Block delete_block; + delete_block.insert(make_nullable_int_column("id", {1})); + auto predicate = std::make_shared(std::move(delete_block), + std::vector {1}); + predicate->_open_finished = true; + Block data_block; + data_block.insert(make_nullable_int_column("id", {1})); + VExprContext context(predicate); + + int result_column_id = -1; + auto status = predicate->execute(&context, 
&data_block, &result_column_id); + ASSERT_FALSE(status.ok()); + EXPECT_NE(status.to_string().find("should have 1 child exprs"), std::string::npos); +} + +} // namespace doris diff --git a/be/test/format/reader/table_reader_test.cpp b/be/test/format/reader/table_reader_test.cpp index 8705775485f02f..8a72937002dba4 100644 --- a/be/test/format/reader/table_reader_test.cpp +++ b/be/test/format/reader/table_reader_test.cpp @@ -301,9 +301,8 @@ void write_position_delete_parquet_file(const std::string& file_path, arrow::field("file_path", arrow::utf8(), false), arrow::field("pos", arrow::int64(), false), }); - auto table = arrow::Table::Make(schema, - {build_string_array(data_file_paths), - build_int64_array(positions)}); + auto table = arrow::Table::Make( + schema, {build_string_array(data_file_paths), build_int64_array(positions)}); auto file_result = arrow::io::FileOutputStream::Open(file_path); ASSERT_TRUE(file_result.ok()) << file_result.status(); @@ -313,9 +312,9 @@ void write_position_delete_parquet_file(const std::string& file_path, builder.version(::parquet::ParquetVersion::PARQUET_2_6); builder.data_page_version(::parquet::ParquetDataPageVersion::V2); builder.compression(::parquet::Compression::UNCOMPRESSED); - PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable( - *table, arrow::default_memory_pool(), out, static_cast(positions.size()), - builder.build())); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, + static_cast(positions.size()), + builder.build())); } int64_t write_iceberg_deletion_vector_file(const std::string& file_path, @@ -423,7 +422,8 @@ std::unique_ptr make_table_read_profile(RuntimeProfile* profile) { } TTableFormatFileDesc make_iceberg_table_format_desc( - const std::string& data_file_path, const std::vector& delete_files) { + const std::string& data_file_path, + const std::vector& delete_files) { TTableFormatFileDesc table_format_params; TIcebergFileDesc iceberg_params; 
iceberg_params.__set_format_version(2); @@ -433,9 +433,8 @@ TTableFormatFileDesc make_iceberg_table_format_desc( return table_format_params; } -std::vector read_iceberg_ids( - doris::iceberg::IcebergTableReader* reader, - const std::vector& projected_columns) { +std::vector read_iceberg_ids(doris::iceberg::IcebergTableReader* reader, + const std::vector& projected_columns) { std::vector ids; bool eos = false; while (!eos) { From 23697072651f3875d65813f496f7f32b2fc3f03f Mon Sep 17 00:00:00 2001 From: Socrates Date: Thu, 28 May 2026 17:36:36 +0800 Subject: [PATCH 30/38] [doc](be) Update parquet complex reader status ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Update the new parquet reader implementation document with the current complex type support status, validation results, remaining gaps, and next implementation priorities. ### Release note None ### Check List (For Author) - Test: No need to test - Documentation-only change. - Behavior changed: No - Does this need documentation: No --- ...ris-arrow-parquet-reader-implementation.md | 85 ++++++++++++++----- 1 file changed, 62 insertions(+), 23 deletions(-) diff --git a/docs/doris-arrow-parquet-reader-implementation.md b/docs/doris-arrow-parquet-reader-implementation.md index d191229e44562c..c3acb5d8f1e7f4 100644 --- a/docs/doris-arrow-parquet-reader-implementation.md +++ b/docs/doris-arrow-parquet-reader-implementation.md @@ -142,7 +142,9 @@ select(selection, selected_rows, batch_rows, column) 当前实现: - `ScalarColumnReader`:基于 Arrow internal `RecordReader` 读取 flat primitive/string/decimal/time/timestamp。 -- `StructColumnReader`:递归读取 children,支持非常基础的 struct 组装。 +- `StructColumnReader`:支持 top-level struct 的 scalar child 组装,包含 nullable parent struct、nullable scalar child 和 struct child projection。 +- `ListColumnReader`:支持 scalar element 的 LIST level 组装,包含 null list、empty list、nullable element 和 overflow state。 +- `MapColumnReader`:支持 scalar key/value 的 MAP level 组装,包含 
null map、empty map、nullable scalar value 和 overflow state。 `select()` 在基类中统一实现:把 `SelectionVector` 合并成连续 row ranges,然后交替调用 `skip()` 和 `read()`。当前不实现整批 read 后再 filter 的 fallback。 @@ -218,7 +220,8 @@ DataTypeSerDe::read_column_from_decoded_values(...) - selection index 当前是 `uint16_t`,需要显式约束 batch size; - selected read 依赖 Arrow internal `RecordReader::SkipRecords` 和 `ReadRecords`,需要继续隔离在 `column_reader.*`; - 没有 page-level row range selection; -- 复杂列延时物化尚未实现。 +- LIST/MAP 的 `select()` 已经复用 `skip() + read()` range 策略,并通过 nested overflow state 保持 cursor 正确; +- Struct 的 complex child selected read 仍依赖 child reader 自身能力,后续需要补多 stream assembler。 ## Schema Change 当前状态 @@ -248,20 +251,44 @@ DataTypeSerDe::read_column_from_decoded_values(...) - schema builder 能识别 `STRUCT`、`LIST`、`MAP`。 - 可以把复杂 Parquet schema 组合成 Doris `DataTypeStruct`、`DataTypeArray`、`DataTypeMap`。 -- `StructColumnReader` 可以递归读取 children,支持非常基础的非 nullable struct。 +- `ParquetColumnSchema` 已记录 file path、field id path、name path、definition level、repetition level、nullable definition level 和 repeated repetition level,为后续 child-level mapping/schema change 留入口。 +- `TableColumnMapper` 可以为 struct child 生成 `FieldProjection`,`ParquetReader` 会把 projected file-local schema 暴露给上层。 +- `StructColumnReader` 支持 top-level struct 的 scalar children: + - required struct; + - nullable struct; + - required scalar child; + - nullable scalar child; + - projected scalar child,例如只读 `s.b` 时仍能根据该 leaf 的 definition level 还原 parent null map。 +- `LIST` 支持 scalar element: + - required / nullable list; + - null list; + - empty list; + - required / nullable scalar element; + - 小批量 read 下跨 batch 的 overflow; + - `skip()` / `select()` 通过同一个 level assembler 推进。 +- `MAP` 支持 scalar key/value: + - required / nullable map; + - null map; + - empty map; + - required key; + - required / nullable scalar value; + - key leaf 作为 shape driver,value leaf 校验 row count、level count 和 repetition level 对齐; + - `skip()` / `select()` 通过同一个 level assembler 推进。 +- 
`NestedScalarBatch` 在每次 `RecordReader::ReadRecords()` 后复制 def/rep levels,并把 defined values materialize 到 Doris-owned 临时列,避免保存 Arrow buffer 或 `StringRef`。 +- `NestedScalarOverflow` 保存未消费的 level tail 和 compact 后的 Doris-owned value column,LIST/MAP read-ahead 不再假设 child records 等于 output rows。 +- `RepeatedLevelAssembler` 统一折叠 repeated level stream,生成 parent row、entry count、parent null map,并由 sink 写入 list/map child column。 主要缺口: -- nullable struct 未实现。 -- list reader 未实现。 -- map reader 未实现。 -- repeated / nested definition level assembler 未实现。 -- primitive reader 当前只支持 `max_repetition_level == 0 && max_definition_level <= 1` 的 RecordReader 路径。 -- 复杂列裁剪未实现。 -- 复杂列延时物化未实现。 -- 复杂列 schema evolution / child remap 未实现。 +- `Array(Struct)`、`Map` 还未实现。当前 Struct reader 可以组装 scalar child,但 LIST/MAP assembler 还没有接 complex child sink。 +- 嵌套 list/map 还未实现,例如 `Array(Array)`、`Map>`。 +- nullable struct 如果包含 complex child,目前仍返回 `NotSupported`,避免在缺少多 stream assembler 时误读。 +- LIST/MAP 的 nested projection 还未实现。当前只支持完整读取 scalar element/value,不支持只投影 `array.element.x` 或 `map.value.y`。 +- 复杂类型 schema change 还未实现 child-level remap/default/cast。当前 schema/path/projection 结构按后续扩展预留,但缺失 child、rename、field id remap、default child、nested cast 都还没有接入。 +- primitive reader 的 flat scalar 路径仍只支持 `max_repetition_level == 0 && max_definition_level <= 1`;nested scalar 只能通过 complex reader 使用。 +- complex child 的 lazy materialization 还不完整,尤其是 Struct complex child 和未来多 leaf value 需要统一 cursor/overflow。 -结论:当前复杂列“schema 可见”,但“读取能力不完整”。真正可用还需要实现 Dremel assembler 或等价的 nested column assembler。 +结论:当前复杂列已经从“schema 可见”推进到“scalar-child LIST/MAP/STRUCT 可读”。下一阶段重点不是再补单个特殊 case,而是把 Struct child 接入 LIST/MAP assembler,并建立多 leaf stream 的统一 cursor/overflow 模型。 ## 当前可用能力总结 @@ -277,7 +304,10 @@ DataTypeSerDe::read_column_from_decoded_values(...) 
- 通过 `DataTypeSerDe::read_column_from_decoded_values()` 写入 Doris column; - 基础 predicate-first scan; - flat column selected read; -- 非 nullable struct 的初步读取框架。 +- non-nullable / nullable struct 的 scalar child 读取; +- struct scalar child projection; +- scalar LIST / MAP 读取; +- LIST / MAP 的 skip/select overflow 推进。 当前还不具备完整生产能力,尤其缺少: @@ -286,22 +316,31 @@ DataTypeSerDe::read_column_from_decoded_values(...) - batch 内 `ColumnPredicate` 执行; - `reader_expression_map`; - page index / bloom filter / dictionary pruning; -- list/map/nullable struct; -- nested column pruning; -- nested lazy materialization; +- `Array(Struct)` / `Map`; +- nested list/map; +- LIST/MAP child projection; +- 复杂类型 schema change; +- complex child nested lazy materialization; - 充分单测覆盖。 +最近验证状态: + +- `git diff --check` 通过。 +- Fedora `/home/socrates/code/doris` 上 `BUILD_TYPE=DEBUG ./build.sh --be` 通过。 +- 本地 macOS 运行 `./run-be-ut.sh --run '--filter=ParquetColumnReaderTest.*'` 被环境阻断,CMake 检查 clang++ 时失败:`ld: library 'c++' not found`,未进入测试体。 + ## 下一步优先级 建议按以下顺序推进: -1. 收敛 `SchemaField` 和 `ColumnMapping` 的 id 语义,区分 Iceberg field id、Parquet leaf column id 和 file-local output position。 -2. 补齐 batch 内 `ColumnPredicate` 执行,让 row group pruning 之后仍有正确 residual filter。 -3. 实现 `reader_expression_map`,支撑 schema change 下无法安全下推的 filter fallback。 -4. 补 flat primitive/string/decimal/timestamp 的 selected read 单测。 -5. 实现 nullable struct,再实现 list/map assembler。 -6. 在复杂列 assembler 稳定后,再做 nested pruning 和 nested lazy materialization。 -7. 后续再接 page index、bloom filter、dictionary pruning。 +1. 抽象 Struct child sink,把 `Array(Struct)` 和 `Map` 接到现有 LIST/MAP level assembler。 +2. 将 LIST/MAP projection 从 top-level projection 扩展到 child projection,先支持 `array.element.` 和 `map.value.` 这类 Struct child 裁剪。 +3. 为多 leaf stream 引入统一 cursor/overflow 状态,避免 Struct、Array、Map 各自维护不兼容的 read-ahead。 +4. 收敛 `SchemaField` 和 `ColumnMapping` 的 id 语义,区分 Iceberg field id、Parquet leaf column id 和 file-local output position。 +5. 
设计复杂类型 schema change 的 child-level mapping 接口,先预留缺失 child/default/null/cast sink,不立即实现完整语义。 +6. 补齐 batch 内 `ColumnPredicate` 执行,让 row group pruning 之后仍有正确 residual filter。 +7. 实现 `reader_expression_map`,支撑 schema change 下无法安全下推的 filter fallback。 +8. 在复杂列 assembler 稳定后,再做 nested pruning、nested lazy materialization、page index、bloom filter、dictionary pruning。 ## 核心规则 From 4ca4e217a29d80f45e83690aacd03c8f61dd8c22 Mon Sep 17 00:00:00 2001 From: Socrates Date: Fri, 29 May 2026 10:00:21 +0800 Subject: [PATCH 31/38] [feature](be) Add parquet dictionary row group pruning ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Add first-stage dictionary predicate pushdown for the new Parquet reader. It conservatively prunes fully dictionary encoded string-like row groups for EQ and IN predicates by evaluating owned dictionary values before reading data pages. ### Release note None ### Check List (For Author) - Test: Manual test - Ran build-support/clang-format.sh on modified BE files. - Ran git diff --check. - Local targeted BE UT could not run because the Mac toolchain fails CMake compiler detection with ld: library 'c++' not found. 
- Behavior changed: No - Does this need documentation: No --- be/src/format/new_parquet/parquet_reader.cpp | 5 +- .../format/new_parquet/parquet_statistics.cpp | 207 +++++++++++++++++- .../format/new_parquet/parquet_statistics.h | 6 +- .../new_parquet/parquet_reader_test.cpp | 121 ++++++++++ 4 files changed, 323 insertions(+), 16 deletions(-) diff --git a/be/src/format/new_parquet/parquet_reader.cpp b/be/src/format/new_parquet/parquet_reader.cpp index 5e4107d727d749..2626df205fab2a 100644 --- a/be/src/format/new_parquet/parquet_reader.cpp +++ b/be/src/format/new_parquet/parquet_reader.cpp @@ -675,8 +675,9 @@ Status ParquetReader::open(std::unique_ptr& request) { reader::SchemaField projected_field; RETURN_IF_ERROR(_get_projected_schema_field(file_column_id, &projection, &projected_field)); } - RETURN_IF_ERROR(select_row_groups_by_statistics(*_state->metadata, _state->file_schema, - *_request, &_state->selected_row_groups)); + RETURN_IF_ERROR(select_row_groups_by_statistics(*_state->metadata, _state->file_reader.get(), + _state->file_schema, *_request, + &_state->selected_row_groups)); std::vector range_selected_row_groups; range_selected_row_groups.reserve(_state->selected_row_groups.size()); for (const auto row_group_idx : _state->selected_row_groups) { diff --git a/be/src/format/new_parquet/parquet_statistics.cpp b/be/src/format/new_parquet/parquet_statistics.cpp index a28ccb8ae25cd0..b7a4ad9b096070 100644 --- a/be/src/format/new_parquet/parquet_statistics.cpp +++ b/be/src/format/new_parquet/parquet_statistics.cpp @@ -19,9 +19,12 @@ #include #include +#include #include #include +#include +#include #include #include #include @@ -103,6 +106,165 @@ bool is_null_only_predicate(const ColumnPredicate& predicate) { predicate.type() == PredicateType::IS_NOT_NULL; } +bool is_supported_dictionary_predicate(const ColumnPredicate& predicate) { + switch (predicate.type()) { + case PredicateType::EQ: + case PredicateType::IN_LIST: + return true; + default: + return false; + } 
+} + +bool is_dictionary_data_encoding(::parquet::Encoding::type encoding) { + return encoding == ::parquet::Encoding::PLAIN_DICTIONARY || + encoding == ::parquet::Encoding::RLE_DICTIONARY; +} + +bool is_level_encoding(::parquet::Encoding::type encoding) { + return encoding == ::parquet::Encoding::RLE || encoding == ::parquet::Encoding::BIT_PACKED; +} + +bool is_data_page_type(::parquet::PageType::type page_type) { + return page_type == ::parquet::PageType::DATA_PAGE || + page_type == ::parquet::PageType::DATA_PAGE_V2; +} + +bool is_dictionary_encoded_chunk(const ::parquet::ColumnChunkMetaData& column_metadata) { + if (!column_metadata.has_dictionary_page()) { + return false; + } + + const auto& encoding_stats = column_metadata.encoding_stats(); + if (!encoding_stats.empty()) { + bool has_dictionary_data_page = false; + for (const auto& encoding_stat : encoding_stats) { + if (!is_data_page_type(encoding_stat.page_type) || encoding_stat.count <= 0) { + continue; + } + if (!is_dictionary_data_encoding(encoding_stat.encoding)) { + return false; + } + has_dictionary_data_page = true; + } + return has_dictionary_data_page; + } + + bool has_dictionary_encoding = false; + for (const auto encoding : column_metadata.encodings()) { + if (is_dictionary_data_encoding(encoding)) { + has_dictionary_encoding = true; + continue; + } + if (!is_level_encoding(encoding)) { + return false; + } + } + return has_dictionary_encoding; +} + +bool supports_dictionary_pruning(const ParquetColumnSchema& column_schema, + const ::parquet::ColumnChunkMetaData& column_metadata, + const reader::FileColumnPredicateFilter& column_filter) { + if (column_schema.kind != ParquetColumnSchemaKind::PRIMITIVE || + column_schema.descriptor == nullptr || column_schema.type == nullptr) { + return false; + } + if (!column_schema.type_descriptor.is_string_like) { + return false; + } + if (column_metadata.type() != ::parquet::Type::BYTE_ARRAY && + column_metadata.type() != ::parquet::Type::FIXED_LEN_BYTE_ARRAY) { 
+ return false; + } + for (const auto& column_predicate : column_filter.predicates) { + if (column_predicate == nullptr || !is_supported_dictionary_predicate(*column_predicate)) { + return false; + } + } + return true; +} + +struct OwnedDictionaryWords { + std::vector values; + std::vector refs; + + void clear() { + values.clear(); + refs.clear(); + } + + void build_refs() { + refs.reserve(values.size()); + for (const auto& value : values) { + refs.emplace_back(value.data(), value.size()); + } + } +}; + +bool read_dictionary_words(::parquet::ParquetFileReader* file_reader, int row_group_idx, + int leaf_column_id, const ParquetColumnSchema& column_schema, + OwnedDictionaryWords* dict_words) { + DORIS_CHECK(dict_words != nullptr); + dict_words->clear(); + if (file_reader == nullptr || leaf_column_id < 0) { + return false; + } + + auto row_group_reader = file_reader->RowGroup(row_group_idx); + if (row_group_reader == nullptr) { + return false; + } + auto page_reader = row_group_reader->GetColumnPageReader(leaf_column_id); + if (page_reader == nullptr) { + return false; + } + auto column_reader = + ::parquet::ColumnReader::Make(column_schema.descriptor, std::move(page_reader)); + if (column_reader == nullptr) { + return false; + } + + int32_t dictionary_length = 0; + const void* dictionary = nullptr; + try { + dictionary = column_reader->ReadDictionary(&dictionary_length); + } catch (const ::parquet::ParquetException&) { + return false; + } catch (const std::exception&) { + return false; + } + if (dictionary == nullptr || dictionary_length <= 0) { + return false; + } + + dict_words->values.reserve(static_cast(dictionary_length)); + if (column_schema.descriptor->physical_type() == ::parquet::Type::BYTE_ARRAY) { + const auto* byte_array_values = reinterpret_cast(dictionary); + for (int32_t dict_idx = 0; dict_idx < dictionary_length; ++dict_idx) { + dict_words->values.emplace_back( + reinterpret_cast(byte_array_values[dict_idx].ptr), + byte_array_values[dict_idx].len); + 
} + dict_words->build_refs(); + return true; + } + if (column_schema.descriptor->physical_type() == ::parquet::Type::FIXED_LEN_BYTE_ARRAY) { + const int type_length = column_schema.descriptor->type_length(); + if (type_length <= 0) { + return false; + } + const auto* flba_values = reinterpret_cast(dictionary); + for (int32_t dict_idx = 0; dict_idx < dictionary_length; ++dict_idx) { + dict_words->values.emplace_back( + reinterpret_cast(flba_values[dict_idx].ptr), type_length); + } + dict_words->build_refs(); + return true; + } + return false; +} + segment_v2::ZoneMap to_column_predicate_statistics(const ParquetColumnStatistics& statistics) { segment_v2::ZoneMap predicate_statistics; predicate_statistics.min_value = statistics.min_value; @@ -181,26 +343,46 @@ bool ParquetStatisticsUtils::CheckStatistics(const reader::FileColumnPredicateFi } bool ParquetStatisticsUtils::RowGroupExcludes( - const ::parquet::RowGroupMetaData& row_group, - const std::vector>& schema, + const ::parquet::RowGroupMetaData& row_group, ::parquet::ParquetFileReader* file_reader, + int row_group_idx, const std::vector>& schema, const reader::FileColumnPredicateFilter& column_filter) { if (column_filter.predicates.empty()) { return false; } - DCHECK(column_filter.file_column_id >= 0 && - column_filter.file_column_id < row_group.num_columns()); DCHECK_LT(column_filter.file_column_id, schema.size()); - auto column_chunk = row_group.ColumnChunk(column_filter.file_column_id); + const auto& column_schema = *schema[column_filter.file_column_id]; + if (column_schema.kind != ParquetColumnSchemaKind::PRIMITIVE || + column_schema.leaf_column_id < 0) { + return false; + } + DCHECK_LT(column_schema.leaf_column_id, row_group.num_columns()); + auto column_chunk = row_group.ColumnChunk(column_schema.leaf_column_id); if (column_chunk == nullptr) { return false; } - return CheckStatistics(column_filter, - TransformColumnStatistics(*schema[column_filter.file_column_id], - column_chunk->statistics())); + if 
(CheckStatistics(column_filter, + TransformColumnStatistics(column_schema, column_chunk->statistics()))) { + return true; + } + if (!supports_dictionary_pruning(column_schema, *column_chunk, column_filter) || + !is_dictionary_encoded_chunk(*column_chunk)) { + return false; + } + OwnedDictionaryWords dict_words; + if (!read_dictionary_words(file_reader, row_group_idx, column_schema.leaf_column_id, + column_schema, &dict_words)) { + return false; + } + for (const auto& column_predicate : column_filter.predicates) { + if (!column_predicate->evaluate_and(dict_words.refs.data(), dict_words.refs.size())) { + return true; + } + } + return false; } Status ParquetStatisticsUtils::SelectRowGroups( - const ::parquet::FileMetaData& metadata, + const ::parquet::FileMetaData& metadata, ::parquet::ParquetFileReader* file_reader, const std::vector>& file_schema, const reader::FileScanRequest& request, std::vector* selected_row_groups) { if (selected_row_groups == nullptr) { @@ -218,7 +400,8 @@ Status ParquetStatisticsUtils::SelectRowGroups( } bool drop = false; for (const auto& column_filter : request.column_predicate_filters) { - if (RowGroupExcludes(*row_group, file_schema, column_filter)) { + if (RowGroupExcludes(*row_group, file_reader, row_group_idx, file_schema, + column_filter)) { drop = true; break; } @@ -246,10 +429,10 @@ bool ParquetStatisticsUtils::BloomFilterSupported(const ParquetColumnSchema& col } Status select_row_groups_by_statistics( - const ::parquet::FileMetaData& metadata, + const ::parquet::FileMetaData& metadata, ::parquet::ParquetFileReader* file_reader, const std::vector>& file_schema, const reader::FileScanRequest& request, std::vector* selected_row_groups) { - return ParquetStatisticsUtils::SelectRowGroups(metadata, file_schema, request, + return ParquetStatisticsUtils::SelectRowGroups(metadata, file_reader, file_schema, request, selected_row_groups); } diff --git a/be/src/format/new_parquet/parquet_statistics.h 
b/be/src/format/new_parquet/parquet_statistics.h index 4f43ae245b57bf..ff1c300e84ca6f 100644 --- a/be/src/format/new_parquet/parquet_statistics.h +++ b/be/src/format/new_parquet/parquet_statistics.h @@ -26,6 +26,7 @@ namespace parquet { class FileMetaData; +class ParquetFileReader; class RowGroupMetaData; class Statistics; } // namespace parquet @@ -66,11 +67,12 @@ struct ParquetStatisticsUtils { const ParquetColumnStatistics& statistics); static bool RowGroupExcludes(const ::parquet::RowGroupMetaData& row_group, + ::parquet::ParquetFileReader* file_reader, int row_group_idx, const std::vector>& schema, const reader::FileColumnPredicateFilter& column_filter); static Status SelectRowGroups( - const ::parquet::FileMetaData& metadata, + const ::parquet::FileMetaData& metadata, ::parquet::ParquetFileReader* file_reader, const std::vector>& file_schema, const reader::FileScanRequest& request, std::vector* selected_row_groups); @@ -82,7 +84,7 @@ struct ParquetStatisticsUtils { // 后续 page index、dictionary、bloom filter 等文件格式优化也应继续收敛在这一层,避免污染 // ParquetReader 的 scan 调度代码。 Status select_row_groups_by_statistics( - const ::parquet::FileMetaData& metadata, + const ::parquet::FileMetaData& metadata, ::parquet::ParquetFileReader* file_reader, const std::vector>& file_schema, const reader::FileScanRequest& request, std::vector* selected_row_groups); diff --git a/be/test/format/new_parquet/parquet_reader_test.cpp b/be/test/format/new_parquet/parquet_reader_test.cpp index 0be12c271293cc..255ad574a26177 100644 --- a/be/test/format/new_parquet/parquet_reader_test.cpp +++ b/be/test/format/new_parquet/parquet_reader_test.cpp @@ -203,6 +203,30 @@ void write_int_pair_parquet_file(const std::string& file_path, int64_t row_group row_group_size, builder.build())); } +void write_dictionary_filter_parquet_file(const std::string& file_path) { + auto schema = arrow::schema({ + arrow::field("id", arrow::int32(), false), + arrow::field("value", arrow::utf8(), false), + }); + auto table = + 
arrow::Table::Make(schema, {build_int32_array({1, 2, 3, 4, 5, 6}), + build_string_array({"aa", "az", "lm", "lz", "za", "zz"})}); + + auto file_result = arrow::io::FileOutputStream::Open(file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + builder.enable_dictionary("value"); + builder.disable_dictionary("id"); + builder.disable_statistics(); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, 1, + builder.build())); +} + Block build_file_block(const std::vector& schema) { Block block; for (const auto& field : schema) { @@ -576,6 +600,103 @@ TEST_F(NewParquetReaderTest, PredicateFiltersRowGroupsByStatistics) { EXPECT_EQ(values, std::vector({"three", "four", "five"})); } +TEST_F(NewParquetReaderTest, PredicateFiltersRowGroupsByDictionary) { + write_dictionary_filter_parquet_file(_file_path); + auto parquet_file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); + ASSERT_EQ(parquet_file_reader->metadata()->num_row_groups(), 6); + for (int row_group_idx = 0; row_group_idx < 6; ++row_group_idx) { + auto row_group = parquet_file_reader->metadata()->RowGroup(row_group_idx); + ASSERT_NE(row_group, nullptr); + auto value_chunk = row_group->ColumnChunk(1); + ASSERT_NE(value_chunk, nullptr); + ASSERT_TRUE(value_chunk->has_dictionary_page()); + ASSERT_TRUE(value_chunk->statistics() == nullptr || + !value_chunk->statistics()->HasMinMax()); + } + + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + auto request = std::make_unique(); + request->predicate_columns = {1}; + 
request->non_predicate_columns = {0}; + reader::FileColumnPredicateFilter column_filter; + column_filter.file_column_id = 1; + column_filter.predicates.push_back(create_comparison_predicate( + 1, "value", schema[1].type, Field::create_field("lm"), false)); + request->column_predicate_filters.push_back(std::move(column_filter)); + ASSERT_TRUE(reader->open(request).ok()); + + std::vector ids; + std::vector values; + bool eof = false; + while (!eof) { + Block block = build_file_block(schema); + size_t rows = 0; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + if (rows == 0) { + continue; + } + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& value_column = + assert_cast(*block.get_by_position(1).column); + for (size_t row = 0; row < rows; ++row) { + ids.push_back(id_column.get_element(row)); + values.push_back(value_column.get_data_at(row).to_string()); + } + } + + EXPECT_EQ(ids, std::vector({3})); + EXPECT_EQ(values, std::vector({"lm"})); +} + +TEST_F(NewParquetReaderTest, InPredicateFiltersRowGroupsByDictionary) { + write_dictionary_filter_parquet_file(_file_path); + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + auto request = std::make_unique(); + request->predicate_columns = {1}; + request->non_predicate_columns = {0}; + auto set = build_set(); + set->insert(const_cast("az"), 2); + set->insert(const_cast("za"), 2); + reader::FileColumnPredicateFilter column_filter; + column_filter.file_column_id = 1; + column_filter.predicates.push_back(create_in_list_predicate( + 1, "value", schema[1].type, set, false)); + request->column_predicate_filters.push_back(std::move(column_filter)); + ASSERT_TRUE(reader->open(request).ok()); + + std::vector ids; + std::vector values; + bool eof = false; + while (!eof) { + Block block = build_file_block(schema); + size_t rows = 0; + 
ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + if (rows == 0) { + continue; + } + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& value_column = + assert_cast(*block.get_by_position(1).column); + for (size_t row = 0; row < rows; ++row) { + ids.push_back(id_column.get_element(row)); + values.push_back(value_column.get_data_at(row).to_string()); + } + } + + EXPECT_EQ(ids, std::vector({2, 5})); + EXPECT_EQ(values, std::vector({"az", "za"})); +} + TEST_F(NewParquetReaderTest, RowPositionReaderReturnsFileLocalPositions) { write_parquet_file(_file_path, 2); auto parquet_file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); From 97dc0f1ed3b224f35eaa5297b967678d987bf2c2 Mon Sep 17 00:00:00 2001 From: Socrates Date: Fri, 29 May 2026 10:06:01 +0800 Subject: [PATCH 32/38] [fix](be) Decode parquet dictionary page directly ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Avoid relying on an unavailable Arrow Parquet ColumnReader::ReadDictionary API by reading the dictionary page directly and decoding PLAIN byte array dictionaries for row group pruning. ### Release note None ### Check List (For Author) - Test: Manual test - Ran build-support/clang-format.sh on parquet_statistics.cpp. - Ran git diff --check. - Fedora DEBUG BE build is rerun after this fix. 
- Behavior changed: No - Does this need documentation: No --- .../format/new_parquet/parquet_statistics.cpp | 43 +++++++++++++------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/be/src/format/new_parquet/parquet_statistics.cpp b/be/src/format/new_parquet/parquet_statistics.cpp index b7a4ad9b096070..17c4e4911f5ddf 100644 --- a/be/src/format/new_parquet/parquet_statistics.cpp +++ b/be/src/format/new_parquet/parquet_statistics.cpp @@ -19,7 +19,8 @@ #include #include -#include +#include +#include #include #include @@ -219,28 +220,39 @@ bool read_dictionary_words(::parquet::ParquetFileReader* file_reader, int row_gr if (page_reader == nullptr) { return false; } - auto column_reader = - ::parquet::ColumnReader::Make(column_schema.descriptor, std::move(page_reader)); - if (column_reader == nullptr) { - return false; - } - int32_t dictionary_length = 0; - const void* dictionary = nullptr; + std::shared_ptr<::parquet::Page> page; try { - dictionary = column_reader->ReadDictionary(&dictionary_length); + page = page_reader->NextPage(); } catch (const ::parquet::ParquetException&) { return false; } catch (const std::exception&) { return false; } - if (dictionary == nullptr || dictionary_length <= 0) { + if (page == nullptr || page->type() != ::parquet::PageType::DICTIONARY_PAGE) { + return false; + } + const auto* dictionary_page = static_cast(page.get()); + if (dictionary_page->encoding() != ::parquet::Encoding::PLAIN && + dictionary_page->encoding() != ::parquet::Encoding::PLAIN_DICTIONARY) { + return false; + } + const int32_t dictionary_length = dictionary_page->num_values(); + if (dictionary_length <= 0) { return false; } + const auto* dictionary_data = dictionary_page->data(); + const int dictionary_size = dictionary_page->size(); dict_words->values.reserve(static_cast(dictionary_length)); if (column_schema.descriptor->physical_type() == ::parquet::Type::BYTE_ARRAY) { - const auto* byte_array_values = reinterpret_cast(dictionary); + auto decoder = 
::parquet::MakeTypedDecoder<::parquet::ByteArrayType>( + ::parquet::Encoding::PLAIN, column_schema.descriptor); + decoder->SetData(dictionary_length, dictionary_data, dictionary_size); + std::vector<::parquet::ByteArray> byte_array_values(static_cast(dictionary_length)); + if (decoder->Decode(byte_array_values.data(), dictionary_length) != dictionary_length) { + return false; + } for (int32_t dict_idx = 0; dict_idx < dictionary_length; ++dict_idx) { dict_words->values.emplace_back( reinterpret_cast(byte_array_values[dict_idx].ptr), @@ -254,7 +266,14 @@ bool read_dictionary_words(::parquet::ParquetFileReader* file_reader, int row_gr if (type_length <= 0) { return false; } - const auto* flba_values = reinterpret_cast(dictionary); + auto decoder = ::parquet::MakeTypedDecoder<::parquet::FLBAType>(::parquet::Encoding::PLAIN, + column_schema.descriptor); + decoder->SetData(dictionary_length, dictionary_data, dictionary_size); + std::vector<::parquet::FixedLenByteArray> flba_values( + static_cast(dictionary_length)); + if (decoder->Decode(flba_values.data(), dictionary_length) != dictionary_length) { + return false; + } for (int32_t dict_idx = 0; dict_idx < dictionary_length; ++dict_idx) { dict_words->values.emplace_back( reinterpret_cast(flba_values[dict_idx].ptr), type_length); From d8075142a42e88061204c32a32c1264fb04e7011 Mon Sep 17 00:00:00 2001 From: Socrates Date: Fri, 29 May 2026 10:24:45 +0800 Subject: [PATCH 33/38] [fix](be) Match parquet list test schema nullability ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Construct Arrow list arrays in ParquetColumnReaderTest with explicit element field nullability so the generated arrays match the declared table schema. ### Release note None ### Check List (For Author) - Test: Manual test - Ran build-support/clang-format.sh on parquet_column_reader_test.cpp. - Ran git diff --check. - Fedora ParquetColumnReaderTest is rerun after this fix. 
- Behavior changed: No - Does this need documentation: No --- .../format/new_parquet/parquet_column_reader_test.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/be/test/format/new_parquet/parquet_column_reader_test.cpp b/be/test/format/new_parquet/parquet_column_reader_test.cpp index 059e9b709aa4ad..1bacab7cf8ed17 100644 --- a/be/test/format/new_parquet/parquet_column_reader_test.cpp +++ b/be/test/format/new_parquet/parquet_column_reader_test.cpp @@ -177,7 +177,8 @@ class ParquetColumnReaderTest : public testing::Test { std::shared_ptr build_required_int_list_array() { auto value_builder = std::make_shared(); - arrow::ListBuilder builder(arrow::default_memory_pool(), value_builder); + arrow::ListBuilder builder(arrow::default_memory_pool(), value_builder, + arrow::list(arrow::field("element", arrow::int32(), false))); const std::vector> values = { {1, 2}, {3}, {4, 5, 6}, {7}, {8, 9}, }; @@ -192,7 +193,8 @@ class ParquetColumnReaderTest : public testing::Test { std::shared_ptr build_nullable_int_list_array() { auto value_builder = std::make_shared(); - arrow::ListBuilder builder(arrow::default_memory_pool(), value_builder); + arrow::ListBuilder builder(arrow::default_memory_pool(), value_builder, + arrow::list(arrow::field("element", arrow::int32(), true))); EXPECT_TRUE(builder.Append().ok()); EXPECT_TRUE(value_builder->Append(10).ok()); EXPECT_TRUE(value_builder->Append(20).ok()); @@ -208,7 +210,8 @@ class ParquetColumnReaderTest : public testing::Test { std::shared_ptr build_required_nullable_int_list_array() { auto value_builder = std::make_shared(); - arrow::ListBuilder builder(arrow::default_memory_pool(), value_builder); + arrow::ListBuilder builder(arrow::default_memory_pool(), value_builder, + arrow::list(arrow::field("element", arrow::int32(), true))); EXPECT_TRUE(builder.AppendEmptyValue().ok()); EXPECT_TRUE(builder.Append().ok()); EXPECT_TRUE(value_builder->AppendNull().ok()); From cf14d1e05a7e705dccb28e72f6fa9a5f6a93ff29 Mon 
Sep 17 00:00:00 2001 From: Socrates Date: Fri, 29 May 2026 10:31:47 +0800 Subject: [PATCH 34/38] [fix](be) Read required nested parquet scalars ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Support required nested scalar leaves that Arrow RecordReader reports without level buffers, and only consume materialized values when nested definition levels reach the leaf max definition level. ### Release note None ### Check List (For Author) - Test: Manual test - Ran git diff --check. Fedora BE unit test validation follows with ./run-be-ut.sh --run '--filter=ParquetColumnReaderTest.*'. - Behavior changed: No - Does this need documentation: No --- be/src/format/new_parquet/column_reader.cpp | 41 +++++++++++++++++-- .../parquet_column_reader_test.cpp | 1 + 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/be/src/format/new_parquet/column_reader.cpp b/be/src/format/new_parquet/column_reader.cpp index 37d1efa322ee01..a177e025879ebd 100644 --- a/be/src/format/new_parquet/column_reader.cpp +++ b/be/src/format/new_parquet/column_reader.cpp @@ -458,6 +458,12 @@ Status read_nested_scalar_batch(ScalarColumnReader& column_reader, int64_t batch } batch->levels_written = record_reader->levels_written(); batch->values_written = record_reader->values_written(); + if (batch->levels_written == 0 && batch->records_read > 0 && + batch->values_written == batch->records_read && + column_reader.descriptor()->max_definition_level() == 0 && + column_reader.descriptor()->max_repetition_level() == 0) { + batch->levels_written = batch->records_read; + } if (batch->levels_written < batch->records_read || batch->values_written < 0 || batch->values_written > batch->levels_written) { return Status::Corruption( @@ -501,7 +507,8 @@ Status read_nested_scalar_batch(ScalarColumnReader& column_reader, int64_t batch const int16_t max_definition_level = column_reader.descriptor()->max_definition_level(); NullMap value_null_map; for (int64_t level_idx 
= 0; level_idx < batch->levels_written; ++level_idx) { - if (batch->def_levels[level_idx] >= value_slot_definition_level) { + const bool has_value = batch->def_levels[level_idx] == max_definition_level; + if (batch->def_levels[level_idx] >= value_slot_definition_level && has_value) { if (value_idx >= batch->values_written) { return Status::Corruption( "Nested parquet reader returned fewer values than definition levels for " @@ -509,8 +516,10 @@ Status read_nested_scalar_batch(ScalarColumnReader& column_reader, int64_t batch column_reader.name()); } batch->value_indices[level_idx] = value_idx++; + } + if (batch->def_levels[level_idx] >= value_slot_definition_level) { if (column_reader.type()->is_nullable()) { - value_null_map.push_back(batch->def_levels[level_idx] != max_definition_level); + value_null_map.push_back(!has_value); } } } @@ -574,6 +583,32 @@ Status append_scalar_batch_value(const ScalarColumnReader& column_reader, return Status::OK(); } +bool supports_nested_scalar_record_reader(const ParquetColumnSchema& column_schema) { + if (supports_record_reader(column_schema.type_descriptor)) { + return true; + } + const auto& type_descriptor = column_schema.type_descriptor; + if (type_descriptor.extra_type_info != ParquetExtraTypeInfo::NONE || + type_descriptor.is_decimal || type_descriptor.is_timestamp || + type_descriptor.is_string_like) { + return false; + } + if (type_descriptor.converted_type != ::parquet::ConvertedType::NONE && + type_descriptor.converted_type != ::parquet::ConvertedType::UNDEFINED) { + return false; + } + switch (type_descriptor.physical_type) { + case ::parquet::Type::BOOLEAN: + case ::parquet::Type::INT32: + case ::parquet::Type::INT64: + case ::parquet::Type::FLOAT: + case ::parquet::Type::DOUBLE: + return true; + default: + return false; + } +} + ColumnArray* array_column_from_output(MutableColumnPtr& column) { if (auto* nullable_column = check_and_get_column(*column)) { return assert_cast(&nullable_column->get_nested_column()); @@ 
-1349,7 +1384,7 @@ Status ParquetColumnReaderFactory::create_nested_scalar_column_reader( return Status::InvalidArgument("Invalid parquet leaf column id {} for column {}", column_schema.leaf_column_id, column_schema.name); } - if (!supports_record_reader(column_schema.type_descriptor)) { + if (!supports_nested_scalar_record_reader(column_schema)) { return Status::NotSupported( "Current parquet nested scalar reader does not support column {}", column_schema.name); diff --git a/be/test/format/new_parquet/parquet_column_reader_test.cpp b/be/test/format/new_parquet/parquet_column_reader_test.cpp index 1bacab7cf8ed17..ca4003cf3772b6 100644 --- a/be/test/format/new_parquet/parquet_column_reader_test.cpp +++ b/be/test/format/new_parquet/parquet_column_reader_test.cpp @@ -740,6 +740,7 @@ class ParquetColumnReaderTest : public testing::Test { void read_and_validate(size_t field_idx) const { auto reader = create_reader(field_idx); + ASSERT_NE(reader, nullptr); MutableColumnPtr column = reader->type()->create_column(); int64_t rows_read = 0; auto st = reader->read(ROW_COUNT, column, &rows_read); From aa1381cf73e5049d0999dc0523c5c4a9d97da0ef Mon Sep 17 00:00:00 2001 From: Socrates Date: Fri, 29 May 2026 10:34:54 +0800 Subject: [PATCH 35/38] [fix](be) Stabilize parquet nested scalar levels ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Avoid stale level buffers for required nested leaves and preserve nullable nested scalar value slot mapping expected by Arrow RecordReader. ### Release note None ### Check List (For Author) - Test: Manual test - Ran git diff --check. Fedora BE unit test validation follows with ./run-be-ut.sh --run '--filter=ParquetColumnReaderTest.*'. 
- Behavior changed: No - Does this need documentation: No --- be/src/format/new_parquet/column_reader.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/be/src/format/new_parquet/column_reader.cpp b/be/src/format/new_parquet/column_reader.cpp index a177e025879ebd..4d24100685eb15 100644 --- a/be/src/format/new_parquet/column_reader.cpp +++ b/be/src/format/new_parquet/column_reader.cpp @@ -482,7 +482,7 @@ Status read_nested_scalar_batch(ScalarColumnReader& column_reader, int64_t batch column_reader.name()); } batch->def_levels.resize(static_cast(batch->levels_written)); - if (def_levels == nullptr) { + if (column_reader.descriptor()->max_definition_level() == 0 || def_levels == nullptr) { std::fill(batch->def_levels.begin(), batch->def_levels.end(), column_reader.descriptor()->max_definition_level()); } else { @@ -496,7 +496,7 @@ Status read_nested_scalar_batch(ScalarColumnReader& column_reader, int64_t batch column_reader.name()); } batch->rep_levels.resize(static_cast(batch->levels_written)); - if (rep_levels == nullptr) { + if (column_reader.descriptor()->max_repetition_level() == 0 || rep_levels == nullptr) { std::fill(batch->rep_levels.begin(), batch->rep_levels.end(), 0); } else { std::copy(rep_levels, rep_levels + batch->levels_written, batch->rep_levels.begin()); @@ -507,8 +507,7 @@ Status read_nested_scalar_batch(ScalarColumnReader& column_reader, int64_t batch const int16_t max_definition_level = column_reader.descriptor()->max_definition_level(); NullMap value_null_map; for (int64_t level_idx = 0; level_idx < batch->levels_written; ++level_idx) { - const bool has_value = batch->def_levels[level_idx] == max_definition_level; - if (batch->def_levels[level_idx] >= value_slot_definition_level && has_value) { + if (batch->def_levels[level_idx] >= value_slot_definition_level) { if (value_idx >= batch->values_written) { return Status::Corruption( "Nested parquet reader returned fewer values than definition levels for " @@ -516,10 
+515,8 @@ Status read_nested_scalar_batch(ScalarColumnReader& column_reader, int64_t batch column_reader.name()); } batch->value_indices[level_idx] = value_idx++; - } - if (batch->def_levels[level_idx] >= value_slot_definition_level) { if (column_reader.type()->is_nullable()) { - value_null_map.push_back(!has_value); + value_null_map.push_back(batch->def_levels[level_idx] != max_definition_level); } } } From 5ddf3f26562dd7727fbe1b0ad5270606285288ff Mon Sep 17 00:00:00 2001 From: Socrates Date: Fri, 29 May 2026 10:36:42 +0800 Subject: [PATCH 36/38] [fix](be) Align nullable parquet struct child slots ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Read scalar struct children as row-aligned slots so nullable parent struct rows keep child value buffers aligned. ### Release note None ### Check List (For Author) - Test: Manual test - Ran git diff --check. Fedora BE unit test validation follows with ./run-be-ut.sh --run '--filter=ParquetColumnReaderTest.*'. 
- Behavior changed: No - Does this need documentation: No --- be/src/format/new_parquet/column_reader.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/be/src/format/new_parquet/column_reader.cpp b/be/src/format/new_parquet/column_reader.cpp index 4d24100685eb15..143cddd831aec9 100644 --- a/be/src/format/new_parquet/column_reader.cpp +++ b/be/src/format/new_parquet/column_reader.cpp @@ -794,8 +794,7 @@ Status StructColumnReader::read(int64_t rows, MutableColumnPtr& column, int64_t* std::vector child_batches(scalar_children.size()); int64_t expected_rows = -1; for (size_t child_idx = 0; child_idx < scalar_children.size(); ++child_idx) { - RETURN_IF_ERROR(read_nested_scalar_batch(*scalar_children[child_idx], rows, - _nullable_definition_level, + RETURN_IF_ERROR(read_nested_scalar_batch(*scalar_children[child_idx], rows, 0, &child_batches[child_idx])); if (expected_rows < 0) { expected_rows = child_batches[child_idx].records_read; From 0ae7e1ce9869a2950a924e398fa29e38d4ce23ce Mon Sep 17 00:00:00 2001 From: Gabriel Date: Fri, 29 May 2026 14:32:40 +0800 Subject: [PATCH 37/38] [feature](be) Support parquet minmax aggregate pushdown (#63868) ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: Add a metadata-backed MIN/MAX aggregate pushdown path for external Parquet readers and gate Iceberg v2 aggregate pushdown when delete files are present. ### Release note Support min/max aggregate pushdown for eligible external Parquet scans. ### Check List (For Author) - Test: Unit Test / Manual test - Added AggregateReaderTest and ParquetReaderTest.minmax_pushdown_from_statistics. - Manual test: git diff --check and git diff --cached --check. - Not run: run-be-ut.sh failed because this environment only has JDK 11 and requires JDK 17; clang-format script failed because llvm@16 is not installed. 
- Behavior changed: Yes, eligible Parquet scans can return min/max aggregate rows from footer statistics; unsafe Iceberg delete-file scans disable aggregate pushdown. - Does this need documentation: No ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: ### Release note None ### Check List (For Author) - Test - [ ] Regression test - [ ] Unit Test - [ ] Manual test (add detailed scripts or steps below) - [ ] No need to test or manual test. Explain why: - [ ] This is a refactor/code format and no logic has been changed. - [ ] Previous test can cover this change. - [ ] No code files have been changed. - [ ] Other reason - Behavior changed: - [ ] No. - [ ] Yes. - Does this need documentation? - [ ] No. - [ ] Yes. ### Check List (For Reviewer who merge this PR) - [ ] Confirm the release note - [ ] Confirm test cases - [ ] Confirm document - [ ] Add branch pick label --- be/src/format/new_parquet/parquet_reader.cpp | 108 ++- be/src/format/new_parquet/parquet_reader.h | 3 + be/src/format/reader/column_mapper.cpp | 66 +- be/src/format/reader/column_mapper.h | 6 +- be/src/format/reader/file_reader.h | 43 +- be/src/format/reader/table_reader.cpp | 22 +- be/src/format/reader/table_reader.h | 170 ++++- be/src/format/table/iceberg_reader_v2.cpp | 50 +- be/src/format/table/iceberg_reader_v2.h | 11 +- .../new_parquet/parquet_reader_test.cpp | 31 +- be/test/format/reader/expr/cast_test.cpp | 193 +++++- be/test/format/reader/table_reader_test.cpp | 654 +++++++++++++++++- 12 files changed, 1208 insertions(+), 149 deletions(-) diff --git a/be/src/format/new_parquet/parquet_reader.cpp b/be/src/format/new_parquet/parquet_reader.cpp index 2626df205fab2a..26093575c1194c 100644 --- a/be/src/format/new_parquet/parquet_reader.cpp +++ b/be/src/format/new_parquet/parquet_reader.cpp @@ -323,35 +323,27 @@ Status ParquetReader::_read_filter_columns(int64_t batch_rows, Block* file_block Status ParquetReader::_execute_filter_conjuncts(int64_t batch_rows, 
Block* file_block, SelectionVector* selection, uint16_t* selected_rows) { - // Expression filters may reference several predicate columns. Execute them only after all + // Conjuncts may reference several predicate columns. Execute them only after all referenced // predicate columns in the file-local block have been materialized. - for (const auto& expression_filter : _request->expression_filters) { - if (expression_filter.conjunct == nullptr) { - if (expression_filter.delete_conjunct == nullptr) { - continue; - } - } else { - if (*selected_rows == 0) { - break; - } - IColumn::Filter filter(static_cast(batch_rows), 1); - bool can_filter_all = false; - RETURN_IF_ERROR(expression_filter.conjunct->execute_filter( - file_block, filter.data(), static_cast(batch_rows), false, - &can_filter_all)); - *selected_rows = - can_filter_all ? 0 - : _apply_filter_to_selection(filter, selection, *selected_rows); - } + for (const auto& conjunct : _request->conjuncts) { if (*selected_rows == 0) { break; } - if (expression_filter.delete_conjunct == nullptr) { - continue; + IColumn::Filter filter(static_cast(batch_rows), 1); + bool can_filter_all = false; + RETURN_IF_ERROR(conjunct->execute_filter(file_block, filter.data(), + static_cast(batch_rows), false, + &can_filter_all)); + *selected_rows = + can_filter_all ? 
0 : _apply_filter_to_selection(filter, selection, *selected_rows); + } + for (const auto& delete_conjunct : _request->delete_conjuncts) { + if (*selected_rows == 0) { + break; } int result_column_id = -1; - RETURN_IF_ERROR(expression_filter.delete_conjunct->root()->execute( - expression_filter.delete_conjunct.get(), file_block, &result_column_id)); + RETURN_IF_ERROR(delete_conjunct->root()->execute(delete_conjunct.get(), file_block, + &result_column_id)); DORIS_CHECK(result_column_id >= 0 && result_column_id < static_cast(file_block->columns())); const auto& delete_filter = assert_cast( @@ -745,6 +737,76 @@ Status ParquetReader::get_block(Block* file_block, size_t* rows, bool* eof) { } } +Status ParquetReader::get_aggregate_result(const reader::FileAggregateRequest& request, + reader::FileAggregateResult* result) { + DORIS_CHECK(result != nullptr); + if (_state == nullptr || _state->metadata == nullptr || _state->schema == nullptr) { + return Status::Uninitialized("ParquetReader is not open"); + } + result->count = 0; + result->columns.clear(); + if (request.agg_type != TPushAggOp::type::COUNT && + request.agg_type != TPushAggOp::type::MINMAX) { + return Status::NotSupported("Unsupported parquet aggregate pushdown type {}", + request.agg_type); + } + + // Aggregate row count in all selected row groups. For MIN/MAX aggregate, this is used to determine whether there is no row group selected. 
+ for (const auto row_group_idx : _state->selected_row_groups) { + auto row_group_metadata = _state->metadata->RowGroup(row_group_idx); + DORIS_CHECK(row_group_metadata != nullptr); + result->count += row_group_metadata->num_rows(); + } + if (request.agg_type == TPushAggOp::type::COUNT) { + return Status::OK(); + } + + result->columns.resize(request.columns.size()); + for (size_t request_column_idx = 0; request_column_idx < request.columns.size(); + ++request_column_idx) { + const auto file_column_id = request.columns[request_column_idx].file_column_id; + if (file_column_id < 0 || + file_column_id >= static_cast(_state->file_schema.size())) { + return Status::InvalidArgument("Invalid parquet aggregate column id {}", + file_column_id); + } + const auto& column_schema = _state->file_schema[file_column_id]; + DORIS_CHECK(column_schema != nullptr); + // TODO: Support min/max pushdown for complex column by traversing down to the leaf column readers. This requires supporting complex column statistics in parquet file reader, which is currently not implemented in parquet-cpp. 
+ if (column_schema->leaf_column_id < 0) { + return Status::NotSupported( + "Parquet aggregate pushdown only supports primitive column {}", + column_schema->name); + } + + auto& aggregate_column = result->columns[request_column_idx]; + for (const auto row_group_idx : _state->selected_row_groups) { + auto row_group_metadata = _state->metadata->RowGroup(row_group_idx); + DORIS_CHECK(row_group_metadata != nullptr); + auto column_chunk = row_group_metadata->ColumnChunk(column_schema->leaf_column_id); + DORIS_CHECK(column_chunk != nullptr); + const auto statistics = ParquetStatisticsUtils::TransformColumnStatistics( + *column_schema, column_chunk->statistics()); + if (!statistics.has_min_max) { + return Status::NotSupported("Missing parquet min/max statistics for column {}", + column_schema->name); + } + if (!aggregate_column.has_min || statistics.min_value < aggregate_column.min_value) { + aggregate_column.min_value = statistics.min_value; + aggregate_column.has_min = true; + } + if (!aggregate_column.has_max || aggregate_column.max_value < statistics.max_value) { + aggregate_column.max_value = statistics.max_value; + aggregate_column.has_max = true; + } + } + if (!aggregate_column.has_min || !aggregate_column.has_max) { + return Status::NotSupported("No parquet row group selected for min/max pushdown"); + } + } + return Status::OK(); +} + Status ParquetReader::close() { if (_state != nullptr) { if (_state->file_reader != nullptr) { diff --git a/be/src/format/new_parquet/parquet_reader.h b/be/src/format/new_parquet/parquet_reader.h index 14a891c75e1dcf..85d766f88820ce 100644 --- a/be/src/format/new_parquet/parquet_reader.h +++ b/be/src/format/new_parquet/parquet_reader.h @@ -69,6 +69,9 @@ class ParquetReader : public reader::FileReader { // 返回列必须保持 file-local 语义,不能在这里补 default/generated/partition 列。 Status get_block(Block* file_block, size_t* rows, bool* eof) override; + Status get_aggregate_result(const reader::FileAggregateRequest& request, + 
reader::FileAggregateResult* result) override; + Status close() override; protected: diff --git a/be/src/format/reader/column_mapper.cpp b/be/src/format/reader/column_mapper.cpp index e8e7442a8d798e..c6114b20df31cb 100644 --- a/be/src/format/reader/column_mapper.cpp +++ b/be/src/format/reader/column_mapper.cpp @@ -43,6 +43,13 @@ struct FileSlotRewriteInfo { std::string file_column_name; }; +static VExprSPtr create_file_slot_ref(const VSlotRef& slot_ref, + const FileSlotRewriteInfo& rewrite_info) { + return TableSlotRef::create_shared(slot_ref.slot_id(), + cast_set(rewrite_info.block_position), -1, + rewrite_info.file_type, rewrite_info.file_column_name); +} + static VExprSPtr rewrite_table_expr_to_file_expr( const VExprSPtr& expr, const std::map& table_column_to_file_slot) { @@ -54,9 +61,7 @@ static VExprSPtr rewrite_table_expr_to_file_expr( const auto rewrite_it = table_column_to_file_slot.find(slot_ref->slot_id()); if (rewrite_it != table_column_to_file_slot.end()) { const auto& rewrite_info = rewrite_it->second; - auto file_slot = TableSlotRef::create_shared( - slot_ref->slot_id(), cast_set(rewrite_info.block_position), -1, - rewrite_info.file_type, rewrite_info.file_column_name); + auto file_slot = create_file_slot_ref(*slot_ref, rewrite_info); if (rewrite_info.file_type->equals(*rewrite_info.table_type)) { return file_slot; } @@ -66,6 +71,27 @@ static VExprSPtr rewrite_table_expr_to_file_expr( } return expr; } + // rewrite_table_expr_to_file_expr localizes the expression tree in-place because VExpr does + // not provide a generic deep-clone API. A previous split may already have inserted Cast(slot) + // for the same table-level conjunct. Keep that rewrite idempotent: rewrite the cast child + // from table slot to the current split's file slot, and drop the cast when the current split + // no longer needs it. 
+ if (dynamic_cast(expr.get()) != nullptr && expr->get_num_children() == 1) { + const auto& child = expr->children()[0]; + if (child->is_slot_ref()) { + const auto* slot_ref = assert_cast(child.get()); + const auto rewrite_it = table_column_to_file_slot.find(slot_ref->slot_id()); + if (rewrite_it != table_column_to_file_slot.end() && + expr->data_type()->equals(*rewrite_it->second.table_type)) { + auto rewritten_child = create_file_slot_ref(*slot_ref, rewrite_it->second); + if (rewrite_it->second.file_type->equals(*rewrite_it->second.table_type)) { + return rewritten_child; + } + expr->set_children({std::move(rewritten_child)}); + return expr; + } + } + } // VExpr currently does not provide a generic deep-clone API for arbitrary expression types. // Keep all slot-localization mutation inside ColumnMapper and rebuild it for every split @@ -85,13 +111,28 @@ static constexpr const char* ROW_LINEAGE_LAST_UPDATED_SEQ_NUMBER = "_last_update static void add_scan_column(FileScanRequest* file_request, ColumnId file_column_id, std::vector* scan_columns) { + if (scan_columns == &file_request->non_predicate_columns && + std::find(file_request->predicate_columns.begin(), file_request->predicate_columns.end(), + file_column_id) != file_request->predicate_columns.end()) { + return; + } // column_positions is the global read-column index for this scan request, so it also // deduplicates predicate_columns and non_predicate_columns across all filter/projection paths. 
- if (file_request->column_positions.count(file_column_id) == 0) { + const bool newly_added = file_request->column_positions.count(file_column_id) == 0; + if (newly_added) { file_request->column_positions.emplace(file_column_id, file_request->column_positions.size()); + } + if (std::find(scan_columns->begin(), scan_columns->end(), file_column_id) == + scan_columns->end()) { scan_columns->push_back(file_column_id); } + if (scan_columns == &file_request->predicate_columns) { + file_request->non_predicate_columns.erase( + std::remove(file_request->non_predicate_columns.begin(), + file_request->non_predicate_columns.end(), file_column_id), + file_request->non_predicate_columns.end()); + } } static void rebuild_projection(ColumnMapping* mapping, size_t block_position) { @@ -293,7 +334,8 @@ Status TableColumnMapper::create_scan_request(const std::vector& ta file_request->non_predicate_columns.clear(); file_request->column_positions.clear(); file_request->complex_projections.clear(); - file_request->expression_filters.clear(); + file_request->conjuncts.clear(); + file_request->delete_conjuncts.clear(); file_request->column_predicate_filters.clear(); file_request->reader_expression_map.clear(); // 1. 
Build referenced non-predicate columns @@ -379,19 +421,9 @@ Status TableColumnMapper::localize_filters(const std::vector& table continue; } if (table_filter.conjunct != nullptr) { - FileExpressionFilter expression_filter; - expression_filter.conjunct = + file_request->conjuncts.push_back( VExprContext::create_shared(rewrite_table_expr_to_file_expr( - table_filter.conjunct->root(), table_column_to_file_slot)); - expression_filter.file_column_ids.reserve(table_filter.slot_ids.size()); - for (const auto table_column_id : table_filter.slot_ids) { - const auto* mapping = _find_mapping(table_column_id); - if (mapping == nullptr || !mapping->file_column_id.has_value()) { - continue; - } - expression_filter.file_column_ids.push_back(*mapping->file_column_id); - } - file_request->expression_filters.push_back(std::move(expression_filter)); + table_filter.conjunct->root(), table_column_to_file_slot))); } } for (const auto& [table_column_id, predicates] : table_column_predicates) { diff --git a/be/src/format/reader/column_mapper.h b/be/src/format/reader/column_mapper.h index 75b53f68d2d09e..e1839652a4799a 100644 --- a/be/src/format/reader/column_mapper.h +++ b/be/src/format/reader/column_mapper.h @@ -106,7 +106,7 @@ class TableColumnMapper { // 把 table-level scan 请求转换成 file-local scan 请求。 // table_request 使用 table/global schema;file_request 只包含 FileReader 能理解的 - // projected_file_columns、expression_filters、column_predicate_filters 和 + // projected_file_columns、conjuncts、delete_conjuncts、column_predicate_filters 和 // reader_expression_map。 virtual Status create_scan_request(const std::vector& table_filters, const TableColumnPredicates& table_column_predicates, @@ -149,7 +149,9 @@ class TableColumnMapper { } bool _is_same_type(const DataTypePtr& table_type, const DataTypePtr& file_type) const { - return table_type == file_type; + DORIS_CHECK(table_type != nullptr); + DORIS_CHECK(file_type != nullptr); + return table_type->equals(*file_type); } TableColumnMapperOptions _options; 
diff --git a/be/src/format/reader/file_reader.h b/be/src/format/reader/file_reader.h index 28de8f068b0f6c..7e6d18acedc2d4 100644 --- a/be/src/format/reader/file_reader.h +++ b/be/src/format/reader/file_reader.h @@ -27,7 +27,9 @@ #include "common/status.h" #include "core/data_type/data_type.h" +#include "core/field.h" #include "exprs/vexpr_fwd.h" +#include "gen_cpp/PlanNodes_types.h" #include "io/file_factory.h" #include "io/fs/file_reader_writer_fwd.h" @@ -75,15 +77,6 @@ struct FieldProjection { std::vector children; }; -// File-local expression filter. It may reference multiple predicate_columns, so FileReader should -// evaluate it after all referenced predicate columns have been materialized in the file-local block. -struct FileExpressionFilter { - VExprContextSPtr conjunct; - // DeletePredicate - VExprContextSPtr delete_conjunct; - std::vector file_column_ids; -}; - // File-local single-column predicates for file-layer pruning, such as min/max, page index, // dictionary and bloom filter. Predicates must all belong to file_column_id. struct FileColumnPredicateFilter { @@ -108,12 +101,37 @@ struct FileScanRequest { std::vector non_predicate_columns; std::map column_positions; // file_column_id -> file-local block position std::map complex_projections; - std::vector expression_filters; + // Complex conjuncts converted to file-local predicates from table-level predicates. + VExprContextSPtrs conjuncts; + // Delete predicates converted to file-local predicates. + VExprContextSPtrs delete_conjuncts; + // Only simple predicates that can be directly evaluated on column, such as `a` > 1. Now we use it for zone-map filtering. std::vector column_predicate_filters; // fallback path if filters cannot be localized to file-local predicates. The expression can reference projected_file_columns and partition columns. 
std::vector> reader_expression_map; }; +struct FileAggregateRequest { + struct Column { + ColumnId file_column_id = -1; + }; + + TPushAggOp::type agg_type = TPushAggOp::type::NONE; + std::vector columns; +}; + +struct FileAggregateResult { + struct Column { + bool has_min = false; + bool has_max = false; + Field min_value; + Field max_value; + }; + + int64_t count = 0; + std::vector columns; +}; + // 文件物理读取层通用接口。 // 该接口只描述 file-local schema、file-local scan request 和 file-local block。 // TableReader/IcebergTableReader 可以通过它组合不同文件格式 reader。 @@ -188,6 +206,11 @@ class FileReader { return Status::OK(); } + virtual Status get_aggregate_result(const FileAggregateRequest& request, + FileAggregateResult* result) { + return Status::NotSupported("FileReader does not support aggregate pushdown"); + } + // 关闭当前物理文件 reader 并释放文件层状态。 // 该方法不处理 table-level delete/finalize 状态,后者由 TableReader 子类管理。 virtual Status close() { diff --git a/be/src/format/reader/table_reader.cpp b/be/src/format/reader/table_reader.cpp index 8289d637d78b14..2c92b9ca1a1d0c 100644 --- a/be/src/format/reader/table_reader.cpp +++ b/be/src/format/reader/table_reader.cpp @@ -153,6 +153,7 @@ Status TableReader::init(TableReadOptions options) { _io_ctx = options.io_ctx; _runtime_state = options.runtime_state; _scanner_profile = options.scanner_profile; + _push_down_agg_type = options.push_down_agg_type; _projected_columns = std::move(options.projected_columns); _system_properties = create_system_properties(_scan_params); _profile = std::move(options.profile); @@ -173,19 +174,13 @@ Status TableReader::_build_table_filters_from_conjuncts() { Status TableReader::_open_local_filter_exprs(const FileScanRequest& file_request) { RowDescriptor row_desc; - for (const auto& expression_filter : file_request.expression_filters) { - if (expression_filter.conjunct == nullptr) { - if (expression_filter.delete_conjunct == nullptr) { - continue; - } - } else { - RETURN_IF_ERROR(expression_filter.conjunct->prepare(_runtime_state, 
row_desc)); - RETURN_IF_ERROR(expression_filter.conjunct->open(_runtime_state)); - } - if (expression_filter.delete_conjunct != nullptr) { - RETURN_IF_ERROR(expression_filter.delete_conjunct->prepare(_runtime_state, row_desc)); - RETURN_IF_ERROR(expression_filter.delete_conjunct->open(_runtime_state)); - } + for (const auto& conjunct : file_request.conjuncts) { + RETURN_IF_ERROR(conjunct->prepare(_runtime_state, row_desc)); + RETURN_IF_ERROR(conjunct->open(_runtime_state)); + } + for (const auto& delete_conjunct : file_request.delete_conjuncts) { + RETURN_IF_ERROR(delete_conjunct->prepare(_runtime_state, row_desc)); + RETURN_IF_ERROR(delete_conjunct->open(_runtime_state)); } return Status::OK(); } @@ -235,6 +230,7 @@ Status TableReader::prepare_split(const SplitReadOptions& options) { _current_task = std::make_unique(); _current_task->data_file = create_file_description(options.current_range); _delete_rows = nullptr; + _aggregate_pushdown_tried = false; return _parse_delete_predicates(options); } diff --git a/be/src/format/reader/table_reader.h b/be/src/format/reader/table_reader.h index de7626dfb2418d..83e0ec44fc80fe 100644 --- a/be/src/format/reader/table_reader.h +++ b/be/src/format/reader/table_reader.h @@ -36,6 +36,7 @@ #include "core/data_type/data_type_nullable.h" #include "core/data_type/data_type_number.h" #include "core/data_type/data_type_struct.h" +#include "core/field.h" #include "exprs/vexpr_context.h" #include "exprs/vexpr_fwd.h" #include "format/new_parquet/column_reader.h" @@ -43,6 +44,7 @@ #include "format/reader/expr/delete_predicate.h" #include "format/reader/expr/slot_ref.h" #include "format/reader/file_reader.h" +#include "gen_cpp/PlanNodes_types.h" #include "runtime/descriptors.h" namespace doris { @@ -66,8 +68,8 @@ struct TableColumn { bool is_partition_key = false; }; -// table-level filter。 -// TableColumnMapper 负责把它转换成 FileExpressionFilter 或 reader_expression_map。 +// All complex predicates on table/global schema, which cannot be directly 
localized to file + // schema. They will be evaluated at table level and may depend on multiple columns. struct TableFilter { // 表达式过滤,适合表达 cast、复杂表达式、复杂列提取等语义。 VExprContextSPtr conjunct; @@ -108,21 +110,29 @@ struct ReadProfile { }; struct TableReadOptions { + // Columns that need to be read from the file and output by the table reader. They are all in table/global + // schema semantics. const std::vector projected_columns; + // Simple predicates for a single column, which are parsed by the scan operator. const TableColumnPredicates column_predicates; - // All conjuncts from scan operator + // All complex conjuncts from scan operator const VExprContext conjuncts; + // File format of the underlying data files, needed for reader initialization and reader-level + // filter pushdown. const FileFormat format; TFileScanRangeParams* scan_params; std::shared_ptr io_ctx; RuntimeState* runtime_state; RuntimeProfile* scanner_profile; const bool allow_missing_columns = true; + // Push-down aggregate type. + const TPushAggOp::type push_down_agg_type = TPushAggOp::type::NONE; std::unique_ptr profile; }; struct SplitReadOptions { + // Split-level information for reader initialization, which may include file path, partition values, delete file info, etc. The content is table format specific and opaque to table reader base class; it's the responsibility of the concrete table reader implementation to parse necessary information for reader initialization and filter pushdown. std::map partition_values; ShardedKVCache* cache; TFileRangeDesc current_range; @@ -175,6 +185,18 @@ class TableReader { } } + // Materialize a reduced row set for upper aggregate operators when aggregate + // pushdown can be applied. This is not the final aggregate result: COUNT emits + // `count` default rows for the upper COUNT(*), and MIN/MAX emits two rows containing + // file-level min/max values for the upper MIN/MAX. 
+ if (!_aggregate_pushdown_tried) { + bool pushed_down = false; + RETURN_IF_ERROR(_try_materialize_aggregate_pushdown_rows(block, &pushed_down)); + if (pushed_down) { + return Status::OK(); + } + } + bool current_eof = false; _data_reader.block_template.clear_column_data(); size_t current_rows = 0; @@ -329,10 +351,8 @@ class TableReader { std::make_shared(), parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_NAME)); - FileExpressionFilter delete_filter; - delete_filter.delete_conjunct = VExprContext::create_shared(std::move(delete_predicate)); - delete_filter.file_column_ids.push_back(row_position_column_id); - request->expression_filters.push_back(std::move(delete_filter)); + request->delete_conjuncts.push_back( + VExprContext::create_shared(std::move(delete_predicate))); return Status::OK(); } @@ -343,7 +363,6 @@ class TableReader { _data_reader.reader.reset(); _data_reader.column_mapper.clear(); _table_filters.clear(); - _table_column_predicates.clear(); _data_reader.file_schema.clear(); _data_reader.block_schema.clear(); _data_reader.block_template.clear(); @@ -368,6 +387,62 @@ class TableReader { // Materialize virtual columns in table block, such as _row_id and _last_updated_sequence_number in Iceberg. This is called after finalize_chunk, so the virtual column can be referenced in finalize_expr. 
virtual Status materialize_virtual_columns(Block* table_block) { return Status::OK(); } + Status _try_materialize_aggregate_pushdown_rows(Block* block, bool* pushed_down) { + DORIS_CHECK(block != nullptr); + DORIS_CHECK(pushed_down != nullptr); + *pushed_down = false; + block->clear_column_data(_projected_columns.size()); + _aggregate_pushdown_tried = true; + if (!_supports_aggregate_pushdown(_push_down_agg_type)) { + return Status::OK(); + } + + FileAggregateRequest file_request; + _build_file_aggregate_request(_push_down_agg_type, &file_request); + FileAggregateResult file_result; + const auto status = _data_reader.reader->get_aggregate_result(file_request, &file_result); + if (status.is()) { + return Status::OK(); + } + RETURN_IF_ERROR(status); + RETURN_IF_ERROR( + _materialize_aggregate_pushdown_rows(_push_down_agg_type, file_result, block)); + *pushed_down = true; + RETURN_IF_ERROR(close_current_reader()); + return Status::OK(); + } + + virtual bool _supports_aggregate_pushdown(TPushAggOp::type agg_type) const { + // Only COUNT and MIN/MAX can be pushed down. + if (agg_type != TPushAggOp::type::COUNT && agg_type != TPushAggOp::type::MINMAX) { + return false; + } + // Only support aggregate pushdown when there is no delete, filter, or column predicate, so + // the reduced rows consumed by the upper aggregate remain semantically equivalent to a + // normal scan. + if (_delete_rows != nullptr && !_delete_rows->empty()) { + return false; + } + if (!_table_filters.empty() || !_table_column_predicates.empty()) { + return false; + } + if (agg_type == TPushAggOp::type::COUNT) { + return true; + } + // For MIN/MAX, only support direct file-to-table column mappings. The two emitted rows + // must be enough for the upper MIN/MAX aggregate without evaluating projections, default + // expressions or virtual columns. 
+ for (const auto& mapping : _data_reader.column_mapper.mappings()) { + if (!mapping.file_column_id.has_value() || mapping.has_complex_projection || + mapping.virtual_column_type != TableVirtualColumnType::INVALID || + mapping.default_expr != nullptr || mapping.file_type == nullptr || + mapping.table_type == nullptr) { + return false; + } + } + return true; + } + Status _materialize_mapping_column(const ColumnMapping& mapping, Block* current_block, const size_t rows, ColumnPtr* column) { if (mapping.projection != nullptr) { @@ -411,6 +486,82 @@ class TableReader { return Status::OK(); } + void _build_file_aggregate_request(TPushAggOp::type agg_type, + FileAggregateRequest* request) const { + DORIS_CHECK(request != nullptr); + DORIS_CHECK(_supports_aggregate_pushdown(agg_type)); + request->agg_type = agg_type; + request->columns.clear(); + if (agg_type == TPushAggOp::type::COUNT) { + return; + } + request->columns.reserve(_data_reader.column_mapper.mappings().size()); + for (const auto& mapping : _data_reader.column_mapper.mappings()) { + DORIS_CHECK(mapping.file_column_id.has_value()); + request->columns.push_back({*mapping.file_column_id}); + } + } + + Status _materialize_aggregate_pushdown_rows(TPushAggOp::type agg_type, + const FileAggregateResult& file_result, + Block* block) { + if (agg_type == TPushAggOp::type::COUNT) { + // COUNT pushdown is not a final count value. It emits `count` default rows so the + // upper COUNT(*) aggregate can count them and produce the final result, including + // zero rows when count is 0. + for (size_t column_idx = 0; column_idx < block->columns(); ++column_idx) { + block->replace_by_position(column_idx, + block->get_by_position(column_idx) + .type->create_column_const_with_default_value( + cast_set(file_result.count))); + } + return Status::OK(); + } + // MIN/MAX pushdown emits two rows, min first and max second, for each projected column. 
+ // The upper MIN/MAX aggregate consumes those two rows to produce the final aggregate value. + DORIS_CHECK(file_result.columns.size() == _data_reader.column_mapper.mappings().size()); + DORIS_CHECK(block->columns() == _data_reader.column_mapper.mappings().size()); + Block file_block; + file_block.reserve(_data_reader.block_schema.size()); + for (const auto& field : _data_reader.block_schema) { + file_block.insert({field.type->create_column(), field.type, field.name}); + } + for (size_t column_idx = 0; column_idx < file_result.columns.size(); ++column_idx) { + const auto& result_column = file_result.columns[column_idx]; + if (!result_column.has_min || !result_column.has_max) { + return Status::NotSupported("Missing min/max aggregate result for column {}", + _projected_columns[column_idx].name); + } + const auto& mapping = _data_reader.column_mapper.mappings()[column_idx]; + DORIS_CHECK(mapping.file_column_id.has_value()); + bool found_file_column = false; + for (size_t block_position = 0; block_position < _data_reader.block_schema.size(); + ++block_position) { + if (_data_reader.block_schema[block_position].id == *mapping.file_column_id) { + found_file_column = true; + auto column = + file_block.get_by_position(block_position).column->assume_mutable(); + if (column->empty()) { + column->insert(result_column.min_value); + column->insert(result_column.max_value); + file_block.replace_by_position(block_position, std::move(column)); + } + break; + } + } + DORIS_CHECK(found_file_column); + } + for (size_t column_idx = 0; column_idx < _data_reader.column_mapper.mappings().size(); + ++column_idx) { + ColumnPtr table_column; + RETURN_IF_ERROR( + _materialize_mapping_column(_data_reader.column_mapper.mappings()[column_idx], + &file_block, 2, &table_column)); + block->replace_by_position(column_idx, std::move(table_column)); + } + return Status::OK(); + } + struct DataReader { std::unique_ptr reader; TableColumnMapper column_mapper; @@ -426,6 +577,7 @@ class TableReader { 
std::shared_ptr _system_properties; // partition key -> value std::map _partition_values; + // Predicates built from scan conjuncts before file-level localization. std::vector _table_filters; TableColumnPredicates _table_column_predicates; VExprContext _conjuncts {nullptr}; @@ -437,6 +589,8 @@ class TableReader { RuntimeState* _runtime_state; RuntimeProfile* _scanner_profile; FileFormat _format; + TPushAggOp::type _push_down_agg_type = TPushAggOp::type::NONE; + bool _aggregate_pushdown_tried = false; private: static const SchemaField* _find_schema_field(const std::vector& schema, diff --git a/be/src/format/table/iceberg_reader_v2.cpp b/be/src/format/table/iceberg_reader_v2.cpp index ed6649fce2c9ed..f9587361dcf89d 100644 --- a/be/src/format/table/iceberg_reader_v2.cpp +++ b/be/src/format/table/iceberg_reader_v2.cpp @@ -115,6 +115,13 @@ Status IcebergTableReader::customize_file_scan_request(reader::FileScanRequest* return Status::OK(); } +bool IcebergTableReader::_supports_aggregate_pushdown(TPushAggOp::type agg_type) const { + if (!TableReader::_supports_aggregate_pushdown(agg_type)) { + return false; + } + return _equality_delete_filters.empty(); +} + Status IcebergTableReader::_parse_deletion_vector_file(const TTableFormatFileDesc& t_desc, DeleteFileDesc* desc, bool* has_delete_file) { @@ -184,7 +191,6 @@ Status IcebergTableReader::_init_delete_predicates(const TTableFormatFileDesc& t equality_delete_files.push_back(delete_file); } } - // `_delete_rows != nullptr` means DeleteVector is parsed if (_delete_rows != nullptr) { _position_delete_rows_storage = *_delete_rows; @@ -250,14 +256,13 @@ std::unique_ptr IcebergTableReader::_delete_file_descriptio return file_description; } -const reader::SchemaField* IcebergTableReader::_find_delete_field( - const std::vector& schema, const std::string& name) { - for (const auto& field : schema) { - if (field.name == name) { - return &field; - } +std::string IcebergTableReader::_data_file_path() const { + if (_iceberg_params != 
nullptr && _iceberg_params->__isset.original_file_path) { + return _iceberg_params->original_file_path; } - return nullptr; + DORIS_CHECK(_current_task != nullptr); + DORIS_CHECK(_current_task->data_file != nullptr); + return _current_task->data_file->path; } Status IcebergTableReader::_append_row_position_output_column(reader::FileScanRequest* request) { @@ -273,8 +278,6 @@ Status IcebergTableReader::_append_equality_delete_predicates(reader::FileScanRe for (const auto& filter : _equality_delete_filters) { auto delete_predicate = std::make_shared(filter.delete_block, filter.field_ids); - reader::FileExpressionFilter expression_filter; - expression_filter.delete_conjunct = VExprContext::create_shared(delete_predicate); DCHECK_EQ(filter.field_ids.size(), filter.key_types.size()); for (size_t idx = 0; idx < filter.field_ids.size(); ++idx) { const int field_id = filter.field_ids[idx]; @@ -301,22 +304,13 @@ Status IcebergTableReader::_append_equality_delete_predicates(reader::FileScanRe cast_expr->add_child(std::move(slot)); delete_predicate->add_child(std::move(cast_expr)); } - expression_filter.file_column_ids.push_back(field_it->id); } - request->expression_filters.push_back(std::move(expression_filter)); + request->delete_conjuncts.push_back( + VExprContext::create_shared(std::move(delete_predicate))); } return Status::OK(); } -std::string IcebergTableReader::_data_file_path() const { - if (_iceberg_params != nullptr && _iceberg_params->__isset.original_file_path) { - return _iceberg_params->original_file_path; - } - DORIS_CHECK(_current_task != nullptr); - DORIS_CHECK(_current_task->data_file != nullptr); - return _current_task->data_file->path; -} - Status IcebergTableReader::_read_parquet_position_delete_file( const TIcebergDeleteFileDesc& delete_file, const TFileScanRangeParams& scan_params, IcebergDeleteFileIOContext* delete_io_ctx, PositionDeleteRowsCollector* collector) { @@ -344,8 +338,15 @@ Status IcebergTableReader::_read_parquet_position_delete_file( 
std::vector schema; RETURN_IF_ERROR(reader.get_schema(&schema)); - const auto* file_path_field = _find_delete_field(schema, ICEBERG_FILE_PATH); - const auto* pos_field = _find_delete_field(schema, ICEBERG_ROW_POS); + reader::SchemaField* file_path_field = nullptr; + reader::SchemaField* pos_field = nullptr; + for (auto& field : schema) { + if (field.name == ICEBERG_FILE_PATH) { + file_path_field = &field; + } else if (field.name == ICEBERG_ROW_POS) { + pos_field = &field; + } + } if (file_path_field == nullptr || pos_field == nullptr) { return Status::InternalError("Position delete parquet file is missing required columns"); } @@ -381,9 +382,8 @@ Status IcebergTableReader::_init_position_delete_rows( TFileScanRangeParams delete_scan_params = _scan_params == nullptr ? TFileScanRangeParams() : *_scan_params; reader::DeleteRows position_delete_rows; - const auto data_file_path = _data_file_path(); IcebergDeleteFileIOContext delete_io_ctx(_runtime_state); - PositionDeleteRowsCollector collector(data_file_path, &position_delete_rows); + PositionDeleteRowsCollector collector(_data_file_path(), &position_delete_rows); for (const auto& delete_file : delete_files) { RETURN_IF_ERROR(_read_parquet_position_delete_file(delete_file, delete_scan_params, &delete_io_ctx, &collector)); diff --git a/be/src/format/table/iceberg_reader_v2.h b/be/src/format/table/iceberg_reader_v2.h index 497a989289a14d..a543ae0797dec4 100644 --- a/be/src/format/table/iceberg_reader_v2.h +++ b/be/src/format/table/iceberg_reader_v2.h @@ -53,6 +53,8 @@ class IcebergTableReader : public reader::TableReader { Status customize_file_scan_request(reader::FileScanRequest* file_request) override; + bool _supports_aggregate_pushdown(TPushAggOp::type agg_type) const override; + Status _parse_deletion_vector_file(const TTableFormatFileDesc& t_desc, DeleteFileDesc* desc, bool* has_delete_file) override; @@ -93,18 +95,17 @@ class IcebergTableReader : public reader::TableReader { static std::unique_ptr 
_delete_file_description( const TFileRangeDesc& range); - static const reader::SchemaField* _find_delete_field( - const std::vector& schema, const std::string& name); + std::string _data_file_path() const; + // Append row position column to file scan request for position delete handling. Status _append_row_position_output_column(reader::FileScanRequest* request); - + // Append equality delete predicates to file scan request based on the delete files in iceberg + // params. DeleteVector and position delete files use the common DeleteRows path in TableReader. Status _append_equality_delete_predicates(reader::FileScanRequest* request); Status _init_equality_delete_predicates( const std::vector& delete_files); - std::string _data_file_path() const; - // Read equality/position delete files. Status _read_parquet_equality_delete_file(const TIcebergDeleteFileDesc& delete_file, const TFileScanRangeParams& scan_params, diff --git a/be/test/format/new_parquet/parquet_reader_test.cpp b/be/test/format/new_parquet/parquet_reader_test.cpp index 255ad574a26177..6d0156af9ce2db 100644 --- a/be/test/format/new_parquet/parquet_reader_test.cpp +++ b/be/test/format/new_parquet/parquet_reader_test.cpp @@ -485,9 +485,7 @@ TEST_F(NewParquetReaderTest, ReadPredicateAndNonPredicateColumnsWithSelection) { auto request = std::make_unique(); request->predicate_columns = {0}; request->non_predicate_columns = {1}; - reader::FileExpressionFilter expression_filter; - expression_filter.conjunct = create_int32_greater_than_conjunct(0, 2); - request->expression_filters.push_back(std::move(expression_filter)); + request->conjuncts.push_back(create_int32_greater_than_conjunct(0, 2)); reader::FileColumnPredicateFilter column_filter; column_filter.file_column_id = 0; column_filter.predicates.push_back(create_comparison_predicate( @@ -532,9 +530,7 @@ TEST_F(NewParquetReaderTest, ReadMultiPredicateColumnsBeforeExpressionFilter) { auto request = std::make_unique(); request->predicate_columns = {0, 1}; 
request->non_predicate_columns = {}; - reader::FileExpressionFilter expression_filter; - expression_filter.conjunct = create_int32_sum_greater_than_conjunct(0, 1, 7); - request->expression_filters.push_back(std::move(expression_filter)); + request->conjuncts.push_back(create_int32_sum_greater_than_conjunct(0, 1, 7)); ASSERT_TRUE(reader->open(request).ok()); size_t rows = 0; @@ -567,9 +563,7 @@ TEST_F(NewParquetReaderTest, PredicateFiltersRowGroupsByStatistics) { auto request = std::make_unique(); request->predicate_columns = {0}; request->non_predicate_columns = {1}; - reader::FileExpressionFilter expression_filter; - expression_filter.conjunct = create_int32_greater_than_conjunct(0, 2); - request->expression_filters.push_back(std::move(expression_filter)); + request->conjuncts.push_back(create_int32_greater_than_conjunct(0, 2)); reader::FileColumnPredicateFilter column_filter; column_filter.file_column_id = 0; column_filter.predicates.push_back(create_comparison_predicate( @@ -756,9 +750,7 @@ TEST_F(NewParquetReaderTest, RowPositionReaderKeepsPositionsAfterSelection) { {0, 0}, {parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID, 2}, }; - reader::FileExpressionFilter expression_filter; - expression_filter.conjunct = create_int32_greater_than_conjunct(0, 2); - request->expression_filters.push_back(std::move(expression_filter)); + request->conjuncts.push_back(create_int32_greater_than_conjunct(0, 2)); ASSERT_TRUE(reader->open(request).ok()); size_t rows = 0; @@ -800,11 +792,7 @@ TEST_F(NewParquetReaderTest, DeletePredicateFiltersRowPositions) { {0, 0}, {parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID, 2}, }; - reader::FileExpressionFilter delete_filter; - delete_filter.delete_conjunct = VExprContext::create_shared(std::move(delete_predicate)); - delete_filter.file_column_ids.push_back( - parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID); - request->expression_filters.push_back(std::move(delete_filter)); + 
request->delete_conjuncts.push_back(VExprContext::create_shared(std::move(delete_predicate))); ASSERT_TRUE(reader->open(request).ok()); size_t rows = 0; @@ -846,13 +834,8 @@ TEST_F(NewParquetReaderTest, QueryPredicateAndDeletePredicateFilterRowPositions) {0, 0}, {parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID, 2}, }; - reader::FileExpressionFilter expression_filter; - expression_filter.conjunct = create_int32_greater_than_conjunct(0, 2); - expression_filter.delete_conjunct = VExprContext::create_shared(std::move(delete_predicate)); - expression_filter.file_column_ids.push_back(0); - expression_filter.file_column_ids.push_back( - parquet::ParquetColumnReaderFactory::ROW_POSITION_COLUMN_ID); - request->expression_filters.push_back(std::move(expression_filter)); + request->conjuncts.push_back(create_int32_greater_than_conjunct(0, 2)); + request->delete_conjuncts.push_back(VExprContext::create_shared(std::move(delete_predicate))); ASSERT_TRUE(reader->open(request).ok()); size_t rows = 0; diff --git a/be/test/format/reader/expr/cast_test.cpp b/be/test/format/reader/expr/cast_test.cpp index a236d327a1f2c4..93858dbf53ef85 100644 --- a/be/test/format/reader/expr/cast_test.cpp +++ b/be/test/format/reader/expr/cast_test.cpp @@ -238,6 +238,26 @@ TEST_F(CastTest, ColumnMapperBuildsCastProjectionForTypeMismatch) { mapping.projection->close(); } +TEST_F(CastTest, ColumnMapperTreatsEquivalentTypesAsTrivial) { + reader::TableColumnMapper mapper; + reader::TableColumn table_column; + table_column.id = 7; + table_column.name = "value"; + table_column.type = std::make_shared(); + std::vector projected_columns {table_column}; + + reader::SchemaField file_field; + file_field.id = 0; + file_field.name = "value"; + file_field.type = std::make_shared(); + std::vector file_schema {file_field}; + + auto status = mapper.create_mapping(projected_columns, {}, file_schema); + ASSERT_TRUE(status.ok()) << status; + ASSERT_EQ(mapper.mappings().size(), 1); + 
EXPECT_TRUE(mapper.mappings()[0].is_trivial); +} + TEST_F(CastTest, ColumnMapperBuildsCastFilterForTypeMismatch) { reader::TableColumnMapper mapper; reader::TableColumn table_column; @@ -264,9 +284,9 @@ TEST_F(CastTest, ColumnMapperBuildsCastFilterForTypeMismatch) { reader::FileScanRequest file_request; ASSERT_TRUE( mapper.create_scan_request({table_filter}, {}, projected_columns, &file_request).ok()); - ASSERT_EQ(file_request.expression_filters.size(), 1); + ASSERT_EQ(file_request.conjuncts.size(), 1); ASSERT_EQ(file_request.predicate_columns, std::vector({0})); - const auto& localized_expr = file_request.expression_filters[0].conjunct->root(); + const auto& localized_expr = file_request.conjuncts[0]->root(); ASSERT_EQ(localized_expr->get_num_children(), 1); const auto& localized_child = localized_expr->children()[0]; ASSERT_NE(dynamic_cast(localized_child.get()), nullptr); @@ -279,7 +299,7 @@ TEST_F(CastTest, ColumnMapperBuildsCastFilterForTypeMismatch) { Block block; block.insert(ColumnHelper::create_column_with_name({11, 22})); - auto* conjunct = file_request.expression_filters[0].conjunct.get(); + auto* conjunct = file_request.conjuncts[0].get(); status = conjunct->prepare(&state, RowDescriptor()); ASSERT_TRUE(status.ok()) << status; status = conjunct->open(&state); @@ -293,7 +313,172 @@ TEST_F(CastTest, ColumnMapperBuildsCastFilterForTypeMismatch) { EXPECT_EQ(filter[0], 0); EXPECT_EQ(filter[1], 1); - file_request.expression_filters[0].conjunct->close(); + file_request.conjuncts[0]->close(); +} + +TEST_F(CastTest, ColumnMapperDoesNotNestCastFilterAcrossScanRequests) { + reader::TableColumnMapper mapper; + reader::TableColumn table_column; + table_column.id = 7; + table_column.name = "value"; + table_column.type = std::make_shared(); + std::vector projected_columns {table_column}; + + reader::SchemaField file_field; + file_field.id = 0; + file_field.name = "value"; + file_field.type = std::make_shared(); + std::vector file_schema {file_field}; + + auto status = 
mapper.create_mapping(projected_columns, {}, file_schema); + ASSERT_TRUE(status.ok()) << status; + + auto predicate = std::make_shared(15); + predicate->add_child(TableSlotRef::create_shared(7, 7, -1, table_column.type, "value")); + reader::TableFilter table_filter; + table_filter.conjunct = VExprContext::create_shared(predicate); + table_filter.slot_ids = {7}; + + reader::FileScanRequest first_request; + ASSERT_TRUE( + mapper.create_scan_request({table_filter}, {}, projected_columns, &first_request).ok()); + reader::FileScanRequest second_request; + ASSERT_TRUE(mapper.create_scan_request({table_filter}, {}, projected_columns, &second_request) + .ok()); + + ASSERT_EQ(second_request.conjuncts.size(), 1); + const auto& localized_expr = second_request.conjuncts[0]->root(); + ASSERT_EQ(localized_expr->get_num_children(), 1); + const auto& localized_child = localized_expr->children()[0]; + ASSERT_NE(dynamic_cast(localized_child.get()), nullptr); + ASSERT_EQ(localized_child->get_num_children(), 1); + const auto* localized_slot = + assert_cast(localized_child->children()[0].get()); + EXPECT_EQ(localized_slot->column_id(), 0); +} + +TEST_F(CastTest, ColumnMapperRewritesPreviousCastFilterToMatchingSplitType) { + reader::TableColumn table_column; + table_column.id = 7; + table_column.name = "value"; + table_column.type = std::make_shared(); + std::vector projected_columns {table_column}; + + auto predicate = std::make_shared(15); + predicate->add_child(TableSlotRef::create_shared(7, 7, -1, table_column.type, "value")); + reader::TableFilter table_filter; + table_filter.conjunct = VExprContext::create_shared(predicate); + table_filter.slot_ids = {7}; + + reader::SchemaField int_file_field; + int_file_field.id = 0; + int_file_field.name = "value"; + int_file_field.type = std::make_shared(); + + reader::TableColumnMapper int_mapper; + ASSERT_TRUE(int_mapper.create_mapping(projected_columns, {}, {int_file_field}).ok()); + reader::FileScanRequest int_request; + 
ASSERT_TRUE(int_mapper.create_scan_request({table_filter}, {}, projected_columns, &int_request) + .ok()); + + const auto& int_localized_expr = int_request.conjuncts[0]->root(); + ASSERT_EQ(int_localized_expr->get_num_children(), 1); + ASSERT_NE(dynamic_cast(int_localized_expr->children()[0].get()), nullptr); + + reader::SchemaField bigint_file_field; + bigint_file_field.id = 0; + bigint_file_field.name = "value"; + bigint_file_field.type = std::make_shared(); + + reader::TableColumnMapper bigint_mapper; + ASSERT_TRUE(bigint_mapper.create_mapping(projected_columns, {}, {bigint_file_field}).ok()); + reader::FileScanRequest bigint_request; + ASSERT_TRUE(bigint_mapper + .create_scan_request({table_filter}, {}, projected_columns, &bigint_request) + .ok()); + + const auto& bigint_localized_expr = bigint_request.conjuncts[0]->root(); + ASSERT_EQ(bigint_localized_expr->get_num_children(), 1); + const auto& bigint_localized_child = bigint_localized_expr->children()[0]; + const auto* localized_slot = assert_cast(bigint_localized_child.get()); + EXPECT_EQ(localized_slot->column_id(), 0); + EXPECT_TRUE(localized_slot->data_type()->equals(*bigint_file_field.type)); + + Block block; + block.insert(ColumnHelper::create_column_with_name({11, 22})); + auto* conjunct = bigint_request.conjuncts[0].get(); + auto status = conjunct->prepare(&state, RowDescriptor()); + ASSERT_TRUE(status.ok()) << status; + status = conjunct->open(&state); + ASSERT_TRUE(status.ok()) << status; + IColumn::Filter filter(block.rows(), 1); + bool can_filter_all = false; + status = conjunct->execute_filter(&block, filter.data(), block.rows(), false, &can_filter_all); + ASSERT_TRUE(status.ok()) << status; + EXPECT_FALSE(can_filter_all); + ASSERT_EQ(filter.size(), 2); + EXPECT_EQ(filter[0], 0); + EXPECT_EQ(filter[1], 1); + conjunct->close(); +} + +TEST_F(CastTest, ColumnMapperKeepsTableSlotIdWhenFileBlockPositionChanges) { + reader::TableColumn table_column; + table_column.id = 7; + table_column.name = "value"; 
+ table_column.type = std::make_shared(); + std::vector projected_columns {table_column}; + + reader::SchemaField file_field; + file_field.id = 10; + file_field.name = "value"; + file_field.type = std::make_shared(); + + reader::TableColumnMapper mapper; + ASSERT_TRUE(mapper.create_mapping(projected_columns, {}, {file_field}).ok()); + + auto predicate = std::make_shared(15); + predicate->add_child(TableSlotRef::create_shared(7, 7, -1, table_column.type, "value")); + reader::TableFilter table_filter; + table_filter.conjunct = VExprContext::create_shared(predicate); + table_filter.slot_ids = {7}; + + reader::FileScanRequest first_request; + ASSERT_TRUE(mapper.localize_filters({table_filter}, {}, &first_request).ok()); + ASSERT_EQ(first_request.conjuncts.size(), 1); + const auto* first_slot = assert_cast( + first_request.conjuncts[0]->root()->children()[0].get()); + EXPECT_EQ(first_slot->slot_id(), 7); + EXPECT_EQ(first_slot->column_id(), 0); + + reader::FileScanRequest second_request; + second_request.column_positions.emplace(9, 0); + second_request.column_positions.emplace(10, 1); + second_request.non_predicate_columns.push_back(9); + ASSERT_TRUE(mapper.localize_filters({table_filter}, {}, &second_request).ok()); + ASSERT_EQ(second_request.conjuncts.size(), 1); + const auto* second_slot = assert_cast( + second_request.conjuncts[0]->root()->children()[0].get()); + EXPECT_EQ(second_slot->slot_id(), 7); + EXPECT_EQ(second_slot->column_id(), 1); + + Block block; + block.insert(ColumnHelper::create_column_with_name({100, 100})); + block.insert(ColumnHelper::create_column_with_name({11, 22})); + auto* conjunct = second_request.conjuncts[0].get(); + auto status = conjunct->prepare(&state, RowDescriptor()); + ASSERT_TRUE(status.ok()) << status; + status = conjunct->open(&state); + ASSERT_TRUE(status.ok()) << status; + IColumn::Filter filter(block.rows(), 1); + bool can_filter_all = false; + status = conjunct->execute_filter(&block, filter.data(), block.rows(), false, 
&can_filter_all); + ASSERT_TRUE(status.ok()) << status; + EXPECT_FALSE(can_filter_all); + ASSERT_EQ(filter.size(), 2); + EXPECT_EQ(filter[0], 0); + EXPECT_EQ(filter[1], 1); + conjunct->close(); } } // namespace doris diff --git a/be/test/format/reader/table_reader_test.cpp b/be/test/format/reader/table_reader_test.cpp index 8a72937002dba4..1bb6eaf26be6fb 100644 --- a/be/test/format/reader/table_reader_test.cpp +++ b/be/test/format/reader/table_reader_test.cpp @@ -268,14 +268,38 @@ void write_parquet_file(const std::string& file_path, int32_t id, const std::str builder.build())); } +void write_iceberg_equality_delete_parquet_file(const std::string& file_path, int32_t field_id, + int32_t value) { + const auto metadata = + arrow::key_value_metadata({"PARQUET:field_id"}, {std::to_string(field_id)}); + auto schema = arrow::schema({ + arrow::field("id", arrow::int32(), false)->WithMetadata(metadata), + }); + auto table = arrow::Table::Make(schema, {build_int32_array({value})}); + + auto file_result = arrow::io::FileOutputStream::Open(file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, 1, + builder.build())); +} + void write_int_pair_parquet_file(const std::string& file_path, const std::vector& ids, const std::vector& scores, const std::vector& values, int64_t row_group_size = -1) { + const auto id_metadata = arrow::key_value_metadata({"PARQUET:field_id"}, {"0"}); + const auto score_metadata = arrow::key_value_metadata({"PARQUET:field_id"}, {"1"}); + const auto value_metadata = arrow::key_value_metadata({"PARQUET:field_id"}, {"2"}); auto schema = arrow::schema({ - arrow::field("id", 
arrow::int32(), false), - arrow::field("score", arrow::int32(), false), - arrow::field("value", arrow::utf8(), false), + arrow::field("id", arrow::int32(), false)->WithMetadata(id_metadata), + arrow::field("score", arrow::int32(), false)->WithMetadata(score_metadata), + arrow::field("value", arrow::utf8(), false)->WithMetadata(value_metadata), }); auto table = arrow::Table::Make(schema, {build_int32_array(ids), build_int32_array(scores), build_string_array(values)}); @@ -398,6 +422,16 @@ TIcebergDeleteFileDesc make_iceberg_position_delete_file(const std::string& path return delete_file; } +TIcebergDeleteFileDesc make_iceberg_equality_delete_file(const std::string& path, + const std::vector& field_ids) { + TIcebergDeleteFileDesc delete_file; + delete_file.__set_content(2); + delete_file.__set_path(path); + delete_file.__set_field_ids(field_ids); + delete_file.__set_file_format(TFileFormatType::FORMAT_PARQUET); + return delete_file; +} + TFileScanRangeParams make_local_parquet_scan_params() { TFileScanRangeParams scan_params; scan_params.__set_file_type(TFileType::FILE_LOCAL); @@ -557,6 +591,268 @@ TEST(TableReaderTest, ReopenSplitAfterClose) { std::filesystem::remove_all(test_dir); } +TEST(TableReaderTest, PushDownCountFromNewParquetReader) { + const auto test_dir = std::filesystem::temp_directory_path() / "doris_table_reader_count_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3, 4, 5}, {10, 20, 30, 40, 50}, + {"one", "two", "three", "four", "five"}, 2); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = 
FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::COUNT, + .profile = nullptr, + }) + .ok()); + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 5); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, PushDownMinMaxFromNewParquetReader) { + const auto test_dir = std::filesystem::temp_directory_path() / "doris_table_reader_minmax_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {3, 1, 5, 2}, {30, 10, 50, 20}, + {"three", "one", "five", "two"}, 2); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + projected_columns.push_back(make_table_column(1, "score", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::MINMAX, + .profile = nullptr, + }) + .ok()); + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 2); + const auto& id_column = 
assert_cast(*block.get_by_position(0).column); + const auto& score_column = assert_cast(*block.get_by_position(1).column); + EXPECT_EQ(id_column.get_element(0), 1); + EXPECT_EQ(id_column.get_element(1), 5); + EXPECT_EQ(score_column.get_element(0), 10); + EXPECT_EQ(score_column.get_element(1), 50); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, PushDownMinMaxCastsFileValueToTableType) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_minmax_cast_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {3, 1, 5, 2}, {30, 10, 50, 20}, + {"three", "one", "five", "two"}, 2); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::MINMAX, + .profile = nullptr, + }) + .ok()); + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 2); + const auto& id_column = assert_cast(*block.get_by_position(0).column); + EXPECT_EQ(id_column.get_element(0), 1); + EXPECT_EQ(id_column.get_element(1), 5); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, PushDownCountFallsBackWithTableConjunct) { + const auto test_dir = + 
std::filesystem::temp_directory_path() / "doris_table_reader_count_conjunct_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext( + std::make_shared(0, 0, 2)), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::COUNT, + .profile = nullptr, + }) + .ok()); + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 1); + const auto& id_column = assert_cast(*block.get_by_position(0).column); + EXPECT_EQ(id_column.get_element(0), 3); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, PushDownCountFallsBackWithColumnPredicate) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_count_predicate_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}, 1); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + TableColumnPredicates column_predicates; + 
column_predicates[0].push_back(create_comparison_predicate( + 0, "id", std::make_shared(), Field::create_field(2), false)); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = std::move(column_predicates), + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::COUNT, + .profile = nullptr, + }) + .ok()); + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 1); + const auto& id_column = assert_cast(*block.get_by_position(0).column); + EXPECT_EQ(id_column.get_element(0), 3); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, PushDownMinMaxFallsBackWithoutDirectFileMapping) { + const auto test_dir = std::filesystem::temp_directory_path() / + "doris_table_reader_minmax_missing_mapping_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_parquet_file(file_path, 1, "one"); + + std::vector projected_columns; + projected_columns.push_back( + make_table_column(99, "missing_id", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .push_down_agg_type = 
TPushAggOp::type::MINMAX, + .profile = nullptr, + }) + .ok()); + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 1); + EXPECT_EQ(block.get_by_position(0).column->get_int(0), 0); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + TEST(TableReaderTest, OpenReaderBuildsTableFiltersFromConjuncts) { const auto test_dir = std::filesystem::temp_directory_path() / "doris_table_reader_conjunct_filter_test"; @@ -644,7 +940,61 @@ TEST(TableReaderTest, OpenReaderBuildsColumnPredicateFilters) { write_int_pair_parquet_file(file_path, {1, 2, 3}, {1, 5, 8}, {"one", "two", "three"}, 1); std::vector projected_columns; - projected_columns.push_back(make_table_column(2, "value", std::make_shared())); + projected_columns.push_back(make_table_column(2, "value", std::make_shared())); + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + TableColumnPredicates column_predicates; + column_predicates[0].push_back(create_comparison_predicate( + 0, "id", std::make_shared(), Field::create_field(2), false)); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = std::move(column_predicates), + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .profile = nullptr, + }) + .ok()); + + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + + const auto& value_column = 
assert_cast(*block.get_by_position(0).column); + const auto& id_column = assert_cast(*block.get_by_position(1).column); + ASSERT_EQ(id_column.size(), 1); + ASSERT_EQ(value_column.size(), 1); + EXPECT_EQ(id_column.get_element(0), 3); + EXPECT_EQ(value_column.get_data_at(0).to_string(), "three"); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, ColumnPredicateSurvivesReopenSplit) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_predicate_reopen_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const std::vector file_paths = { + (test_dir / "split_1.parquet").string(), + (test_dir / "split_2.parquet").string(), + }; + write_int_pair_parquet_file(file_paths[0], {1, 3}, {10, 30}, {"one", "three"}, 1); + write_int_pair_parquet_file(file_paths[1], {2, 4}, {20, 40}, {"two", "four"}, 1); + + std::vector projected_columns; projected_columns.push_back(make_table_column(0, "id", std::make_shared())); TableColumnPredicates column_predicates; @@ -667,21 +1017,22 @@ TEST(TableReaderTest, OpenReaderBuildsColumnPredicateFilters) { }) .ok()); - ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); + std::vector ids; + for (const auto& file_path : file_paths) { + ASSERT_TRUE(reader.prepare_split(build_split_options(file_path)).ok()); - Block block = build_table_block(projected_columns); - bool eos = false; - ASSERT_TRUE(reader.get_block(&block, &eos).ok()); - ASSERT_FALSE(eos); + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + const auto& id_column = assert_cast(*block.get_by_position(0).column); + ASSERT_EQ(id_column.size(), 1); + ids.push_back(id_column.get_element(0)); - const auto& value_column = assert_cast(*block.get_by_position(0).column); - const auto& id_column = assert_cast(*block.get_by_position(1).column); - 
ASSERT_EQ(id_column.size(), 1); - ASSERT_EQ(value_column.size(), 1); - EXPECT_EQ(id_column.get_element(0), 3); - EXPECT_EQ(value_column.get_data_at(0).to_string(), "three"); + ASSERT_TRUE(reader.close().ok()); + } - ASSERT_TRUE(reader.close().ok()); + EXPECT_EQ(ids, std::vector({3, 4})); std::filesystem::remove_all(test_dir); } @@ -763,6 +1114,51 @@ TEST(TableReaderTest, CreateScanRequestDeduplicatesSharedPredicateColumns) { } } +TEST(TableReaderTest, CreateScanRequestPromotesProjectedColumnToPredicateColumn) { + const auto int_type = std::make_shared(); + const std::vector projected_columns = { + make_table_column(0, "id", int_type), + make_table_column(1, "score", int_type), + }; + const std::vector file_schema = { + {.id = 0, + .name = "id", + .type = int_type, + .children = {}, + .file_path = {0}, + .field_id_path = {0}, + .name_path = {"id"}, + .column_type = DATA_COLUMN}, + {.id = 1, + .name = "score", + .type = int_type, + .children = {}, + .file_path = {1}, + .field_id_path = {1}, + .name_path = {"score"}, + .column_type = DATA_COLUMN}, + }; + + TableColumnMapper mapper; + ASSERT_TRUE(mapper.create_mapping(projected_columns, {}, file_schema).ok()); + + TableFilter table_filter { + .conjunct = VExprContext::create_shared( + std::make_shared(0, 0, 1)), + .slot_ids = {0}, + }; + + FileScanRequest file_request; + ASSERT_TRUE( + mapper.create_scan_request({table_filter}, {}, projected_columns, &file_request).ok()); + + EXPECT_EQ(file_request.predicate_columns, std::vector({0})); + EXPECT_EQ(file_request.non_predicate_columns, std::vector({1})); + ASSERT_EQ(file_request.column_positions.size(), 2); + EXPECT_EQ(file_request.column_positions.at(0), 1); + EXPECT_EQ(file_request.column_positions.at(1), 0); +} + TEST(TableReaderTest, OpenReaderPushesMultiColumnConjunctToParquetReader) { const auto test_dir = std::filesystem::temp_directory_path() / "doris_table_reader_multi_conjunct_test"; @@ -1194,6 +1590,7 @@ TEST(TableReaderTest, 
IcebergTableReaderAppliesDeletionVectorFile) { .runtime_state = &state, .scanner_profile = &profile, .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::COUNT, .profile = make_table_read_profile(&profile), }) .ok()); @@ -1210,6 +1607,226 @@ TEST(TableReaderTest, IcebergTableReaderAppliesDeletionVectorFile) { std::filesystem::remove_all(test_dir); } +TEST(TableReaderTest, IcebergTableReaderDoesNotPushDownAggregateWithDeletes) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_aggregate_delete_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + const auto dv_path = (test_dir / "delete-vector.bin").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}); + const auto dv_size = write_iceberg_deletion_vector_file(dv_path, {0}); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeProfile profile("test_profile"); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + auto scan_params = make_local_parquet_scan_params(); + io::FileReaderStats file_reader_stats; + io::FileCacheStatistics file_cache_stats; + auto io_ctx = make_io_context(&file_reader_stats, &file_cache_stats); + ShardedKVCache cache(1); + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = &scan_params, + .io_ctx = io_ctx, + .runtime_state = &state, + .scanner_profile = &profile, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::COUNT, + .profile = make_table_read_profile(&profile), + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.cache = &cache; + 
split_options.current_range.__set_table_format_params(make_iceberg_table_format_desc( + file_path, {make_iceberg_deletion_vector(dv_path, 0, dv_size)})); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 2); + const auto& id_column = assert_cast(*block.get_by_position(0).column); + EXPECT_EQ(id_column.get_element(0), 2); + EXPECT_EQ(id_column.get_element(1), 3); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergTableReaderDoesNotPushDownAggregateWithPositionDelete) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_aggregate_position_delete_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + const auto delete_file_path = (test_dir / "position-delete.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}); + write_position_delete_parquet_file(delete_file_path, {file_path}, {1}); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeProfile profile("test_profile"); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + auto scan_params = make_local_parquet_scan_params(); + io::FileReaderStats file_reader_stats; + io::FileCacheStatistics file_cache_stats; + auto io_ctx = make_io_context(&file_reader_stats, &file_cache_stats); + ShardedKVCache cache(1); + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = &scan_params, + .io_ctx = io_ctx, + .runtime_state = &state, + .scanner_profile 
= &profile, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::COUNT, + .profile = make_table_read_profile(&profile), + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.cache = &cache; + split_options.current_range.__set_table_format_params(make_iceberg_table_format_desc( + file_path, {make_iceberg_position_delete_file(delete_file_path)})); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 2); + const auto& id_column = assert_cast(*block.get_by_position(0).column); + EXPECT_EQ(id_column.get_element(0), 1); + EXPECT_EQ(id_column.get_element(1), 3); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergPositionDeleteFallsBackToSplitPath) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_position_delete_path_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + const auto delete_file_path = (test_dir / "position-delete.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}); + write_position_delete_parquet_file(delete_file_path, {file_path}, {1}); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeProfile profile("test_profile"); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + auto scan_params = make_local_parquet_scan_params(); + io::FileReaderStats file_reader_stats; + io::FileCacheStatistics file_cache_stats; + auto io_ctx = make_io_context(&file_reader_stats, &file_cache_stats); + ShardedKVCache cache(1); + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + 
.projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = &scan_params, + .io_ctx = io_ctx, + .runtime_state = &state, + .scanner_profile = &profile, + .allow_missing_columns = true, + .profile = make_table_read_profile(&profile), + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.cache = &cache; + TTableFormatFileDesc table_format_params; + TIcebergFileDesc iceberg_params; + iceberg_params.__set_format_version(2); + iceberg_params.__set_delete_files({make_iceberg_position_delete_file(delete_file_path)}); + table_format_params.__set_iceberg_params(iceberg_params); + split_options.current_range.__set_table_format_params(table_format_params); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + EXPECT_EQ(read_iceberg_ids(&reader, projected_columns), std::vector({1, 3})); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergTableReaderDoesNotPushDownAggregateWithEqualityDelete) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_aggregate_equality_delete_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + const auto delete_file_path = (test_dir / "equality-delete.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}); + write_iceberg_equality_delete_parquet_file(delete_file_path, 0, 2); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeProfile profile("test_profile"); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + auto scan_params = make_local_parquet_scan_params(); + io::FileReaderStats file_reader_stats; + io::FileCacheStatistics file_cache_stats; + auto io_ctx = 
make_io_context(&file_reader_stats, &file_cache_stats); + ShardedKVCache cache(1); + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = &scan_params, + .io_ctx = io_ctx, + .runtime_state = &state, + .scanner_profile = &profile, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::COUNT, + .profile = make_table_read_profile(&profile), + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.cache = &cache; + split_options.current_range.__set_table_format_params(make_iceberg_table_format_desc( + file_path, {make_iceberg_equality_delete_file(delete_file_path, {0})})); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 2); + const auto& id_column = assert_cast(*block.get_by_position(0).column); + EXPECT_EQ(id_column.get_element(0), 1); + EXPECT_EQ(id_column.get_element(1), 3); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + TEST(TableReaderTest, IcebergTableReaderAppliesPositionDeleteFile) { const auto test_dir = std::filesystem::temp_directory_path() / "doris_iceberg_position_delete_file_test"; @@ -1332,8 +1949,9 @@ TEST(TableReaderTest, RowPositionDeletePredicateColumnIsNotRepeatedAsOutputColum EXPECT_EQ(request.non_predicate_columns, std::vector({0})); ASSERT_TRUE(request.column_positions.contains(row_position_column_id)); EXPECT_EQ(request.column_positions.at(row_position_column_id), 1); - ASSERT_EQ(request.expression_filters.size(), 1); - EXPECT_NE(request.expression_filters[0].delete_conjunct, nullptr); + ASSERT_TRUE(request.conjuncts.empty()); + ASSERT_EQ(request.delete_conjuncts.size(), 1); + 
EXPECT_NE(request.delete_conjuncts[0], nullptr); } TEST(TableReaderTest, ParquetReaderReadsOnlyRowGroupsInFileRange) { From 6b7ae2f1d92d73543869968d28b115bc54475cf2 Mon Sep 17 00:00:00 2001 From: Gabriel Date: Fri, 29 May 2026 15:32:56 +0800 Subject: [PATCH 38/38] [test](be) Add table reader edge case unit tests (#63895) ### What problem does this PR solve? Issue Number: close #xxx Related PR: #63893 Problem Summary: Add focused BE unit coverage for new table reader and new parquet reader edge cases, including aggregate pushdown over split ranges, Iceberg equality/position deletes, row lineage after delete filtering, Parquet dictionary/statistics pruning, and IOContext release. Also clean up temporary delete predicate expression columns in the new Parquet reader so equality delete predicates with cast children do not alter the returned file block schema. ### Release note None ### Check List (For Author) - Test: Unit Test - Added BE UT cases in table_reader_test and parquet_reader_test. - Ran git diff --check. - Tried ./run-be-ut.sh with focused filters, but local JAVA_HOME points to JDK 11 and JDK_17 is not set; the runner requires JDK 17. - Behavior changed: No - Does this need documentation: No ### What problem does this PR solve? Issue Number: close #xxx Related PR: #xxx Problem Summary: ### Release note None ### Check List (For Author) - Test - [ ] Regression test - [ ] Unit Test - [ ] Manual test (add detailed scripts or steps below) - [ ] No need to test or manual test. Explain why: - [ ] This is a refactor/code format and no logic has been changed. - [ ] Previous test can cover this change. - [ ] No code files have been changed. - [ ] Other reason - Behavior changed: - [ ] No. - [ ] Yes. - Does this need documentation? - [ ] No. - [ ] Yes. 
### Check List (For Reviewer who merge this PR) - [ ] Confirm the release note - [ ] Confirm test cases - [ ] Confirm document - [ ] Add branch pick label --- be/src/format/new_parquet/parquet_reader.cpp | 3 +- .../new_parquet/parquet_reader_test.cpp | 144 ++++++++++ be/test/format/reader/table_reader_test.cpp | 262 ++++++++++++++++++ 3 files changed, 408 insertions(+), 1 deletion(-) diff --git a/be/src/format/new_parquet/parquet_reader.cpp b/be/src/format/new_parquet/parquet_reader.cpp index 26093575c1194c..c38d8b810a9b2d 100644 --- a/be/src/format/new_parquet/parquet_reader.cpp +++ b/be/src/format/new_parquet/parquet_reader.cpp @@ -341,6 +341,7 @@ Status ParquetReader::_execute_filter_conjuncts(int64_t batch_rows, Block* file_ if (*selected_rows == 0) { break; } + const size_t original_columns = file_block->columns(); int result_column_id = -1; RETURN_IF_ERROR(delete_conjunct->root()->execute(delete_conjunct.get(), file_block, &result_column_id)); @@ -356,7 +357,7 @@ Status ParquetReader::_execute_filter_conjuncts(int64_t batch_rows, Block* file_ keep_filter[row] = !delete_filter[row]; has_kept_row |= keep_filter[row] != 0; } - file_block->erase(result_column_id); + file_block->erase_tail(original_columns); *selected_rows = !has_kept_row ? 
0 : _apply_filter_to_selection(keep_filter, selection, *selected_rows); diff --git a/be/test/format/new_parquet/parquet_reader_test.cpp b/be/test/format/new_parquet/parquet_reader_test.cpp index 6d0156af9ce2db..fb6c6d8ab35707 100644 --- a/be/test/format/new_parquet/parquet_reader_test.cpp +++ b/be/test/format/new_parquet/parquet_reader_test.cpp @@ -227,6 +227,31 @@ void write_dictionary_filter_parquet_file(const std::string& file_path) { builder.build())); } +void write_dictionary_edge_parquet_file(const std::string& file_path) { + auto schema = arrow::schema({ + arrow::field("id", arrow::int32(), false), + arrow::field("value", arrow::utf8(), false), + }); + auto table = arrow::Table::Make( + schema, + {build_int32_array({1, 2, 3, 4, 5, 6, 7, 8}), + build_string_array({"", "same", "other", "long-value", "", "tail", "same", "last"})}); + + auto file_result = arrow::io::FileOutputStream::Open(file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + builder.enable_dictionary("value"); + builder.disable_dictionary("id"); + builder.disable_statistics(); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), out, 2, + builder.build())); +} + Block build_file_block(const std::vector& schema) { Block block; for (const auto& field : schema) { @@ -284,6 +309,10 @@ class TestFileReader final : public reader::FileReader { bool has_request() const { return _request != nullptr; } bool eof() const { return _eof; } + + bool has_io_context() const { return _io_ctx != nullptr; } + + long io_context_use_count() const { return _io_ctx.use_count(); } }; TEST(FileReaderTest, OpenStoresRequestAndCloseClearsState) { @@ -304,6 +333,25 @@ TEST(FileReaderTest, 
OpenStoresRequestAndCloseClearsState) { EXPECT_TRUE(reader.eof()); } +TEST(FileReaderTest, CloseReleasesSharedIOContext) { + auto system_properties = std::make_shared(); + system_properties->system_type = TFileType::FILE_LOCAL; + auto file_description = std::make_unique(); + auto io_ctx = std::make_shared(); + std::weak_ptr weak_io_ctx = io_ctx; + TestFileReader reader(system_properties, file_description, io_ctx); + + EXPECT_TRUE(reader.has_io_context()); + EXPECT_EQ(reader.io_context_use_count(), 2); + io_ctx.reset(); + EXPECT_FALSE(weak_io_ctx.expired()); + EXPECT_EQ(reader.io_context_use_count(), 1); + + ASSERT_TRUE(reader.close().ok()); + EXPECT_FALSE(reader.has_io_context()); + EXPECT_TRUE(weak_io_ctx.expired()); +} + TEST(TableColumnMapperTest, CreatesComplexProjectionForStructChildren) { reader::SchemaField struct_field; struct_field.id = 0; @@ -691,6 +739,102 @@ TEST_F(NewParquetReaderTest, InPredicateFiltersRowGroupsByDictionary) { EXPECT_EQ(values, std::vector({"az", "za"})); } +TEST_F(NewParquetReaderTest, DictionaryPageV2StringEdgesSurviveSelection) { + write_dictionary_edge_parquet_file(_file_path); + auto parquet_file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); + ASSERT_EQ(parquet_file_reader->metadata()->num_row_groups(), 4); + for (int row_group_idx = 0; row_group_idx < 4; ++row_group_idx) { + auto row_group = parquet_file_reader->metadata()->RowGroup(row_group_idx); + ASSERT_NE(row_group, nullptr); + ASSERT_TRUE(row_group->ColumnChunk(1)->has_dictionary_page()); + } + + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + auto request = std::make_unique(); + request->predicate_columns = {1}; + request->non_predicate_columns = {0}; + auto set = build_set(); + set->insert(const_cast(""), 0); + set->insert(const_cast("same"), 4); + reader::FileColumnPredicateFilter 
column_filter; + column_filter.file_column_id = 1; + column_filter.predicates.push_back(create_in_list_predicate( + 1, "value", schema[1].type, set, false)); + request->column_predicate_filters.push_back(std::move(column_filter)); + ASSERT_TRUE(reader->open(request).ok()); + + std::vector ids; + std::vector values; + bool eof = false; + while (!eof) { + Block block = build_file_block(schema); + size_t rows = 0; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + if (rows == 0) { + continue; + } + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& value_column = + assert_cast(*block.get_by_position(1).column); + for (size_t row = 0; row < rows; ++row) { + ids.push_back(id_column.get_element(row)); + values.push_back(value_column.get_data_at(row).to_string()); + } + } + + EXPECT_EQ(ids, std::vector({1, 2, 5, 6, 7, 8})); + EXPECT_EQ(values, std::vector({"", "same", "", "tail", "same", "last"})); +} + +TEST_F(NewParquetReaderTest, StatisticsPruningSkipsPrefixRowGroupsAndReadsLaterGroups) { + write_parquet_file(_file_path, 1); + auto parquet_file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); + ASSERT_EQ(parquet_file_reader->metadata()->num_row_groups(), 5); + + auto reader = create_reader(); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + ASSERT_TRUE(reader->init(&state).ok()); + + std::vector schema; + ASSERT_TRUE(reader->get_schema(&schema).ok()); + auto request = std::make_unique(); + request->predicate_columns = {0}; + request->non_predicate_columns = {1}; + reader::FileColumnPredicateFilter column_filter; + column_filter.file_column_id = 0; + column_filter.predicates.push_back(create_comparison_predicate( + 0, "id", schema[0].type, Field::create_field(4), false)); + request->column_predicate_filters.push_back(std::move(column_filter)); + ASSERT_TRUE(reader->open(request).ok()); + + std::vector ids; + std::vector values; + bool eof = false; + while (!eof) { + Block block = 
build_file_block(schema); + size_t rows = 0; + ASSERT_TRUE(reader->get_block(&block, &rows, &eof).ok()); + if (rows == 0) { + continue; + } + const auto& id_column = assert_cast(*block.get_by_position(0).column); + const auto& value_column = + assert_cast(*block.get_by_position(1).column); + for (size_t row = 0; row < rows; ++row) { + ids.push_back(id_column.get_element(row)); + values.push_back(value_column.get_data_at(row).to_string()); + } + } + + EXPECT_EQ(ids, std::vector({4, 5})); + EXPECT_EQ(values, std::vector({"four", "five"})); +} + TEST_F(NewParquetReaderTest, RowPositionReaderReturnsFileLocalPositions) { write_parquet_file(_file_path, 2); auto parquet_file_reader = ::parquet::ParquetFileReader::OpenFile(_file_path, false); diff --git a/be/test/format/reader/table_reader_test.cpp b/be/test/format/reader/table_reader_test.cpp index 1bb6eaf26be6fb..c5efa0512e603f 100644 --- a/be/test/format/reader/table_reader_test.cpp +++ b/be/test/format/reader/table_reader_test.cpp @@ -289,6 +289,27 @@ void write_iceberg_equality_delete_parquet_file(const std::string& file_path, in builder.build())); } +void write_iceberg_equality_delete_bigint_parquet_file(const std::string& file_path, + int32_t field_id, int64_t value) { + const auto metadata = + arrow::key_value_metadata({"PARQUET:field_id"}, {std::to_string(field_id)}); + auto schema = arrow::schema({ + arrow::field("id", arrow::int64(), false)->WithMetadata(metadata), + }); + auto table = arrow::Table::Make(schema, {build_int64_array({value})}); + + auto file_result = arrow::io::FileOutputStream::Open(file_path); + ASSERT_TRUE(file_result.ok()) << file_result.status(); + std::shared_ptr out = *file_result; + + ::parquet::WriterProperties::Builder builder; + builder.version(::parquet::ParquetVersion::PARQUET_2_6); + builder.data_page_version(::parquet::ParquetDataPageVersion::V2); + builder.compression(::parquet::Compression::UNCOMPRESSED); + PARQUET_THROW_NOT_OK(::parquet::arrow::WriteTable(*table, 
arrow::default_memory_pool(), out, 1, + builder.build())); +} + void write_int_pair_parquet_file(const std::string& file_path, const std::vector& ids, const std::vector& scores, const std::vector& values, @@ -722,6 +743,90 @@ TEST(TableReaderTest, PushDownMinMaxCastsFileValueToTableType) { std::filesystem::remove_all(test_dir); } +TEST(TableReaderTest, PushDownMinMaxOnlyUsesSelectedRowGroupInFileRange) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_table_reader_minmax_range_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {10, 1, 100}, {100, 10, 1000}, {"ten", "one", "hundred"}, + 1); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::MINMAX, + .profile = nullptr, + }) + .ok()); + ASSERT_TRUE(reader.prepare_split(build_split_options_for_row_group_mid(file_path, 1)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 2); + const auto& id_column = assert_cast(*block.get_by_position(0).column); + EXPECT_EQ(id_column.get_element(0), 1); + EXPECT_EQ(id_column.get_element(1), 1); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, PushDownCountOnlyUsesSelectedRowGroupInFileRange) { + const auto test_dir = + 
std::filesystem::temp_directory_path() / "doris_table_reader_count_range_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}, 1); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + TableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = nullptr, + .io_ctx = nullptr, + .runtime_state = &state, + .scanner_profile = nullptr, + .allow_missing_columns = true, + .push_down_agg_type = TPushAggOp::type::COUNT, + .profile = nullptr, + }) + .ok()); + ASSERT_TRUE(reader.prepare_split(build_split_options_for_row_group_mid(file_path, 2)).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 1); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + TEST(TableReaderTest, PushDownCountFallsBackWithTableConjunct) { const auto test_dir = std::filesystem::temp_directory_path() / "doris_table_reader_count_conjunct_test"; @@ -1827,6 +1932,163 @@ TEST(TableReaderTest, IcebergTableReaderDoesNotPushDownAggregateWithEqualityDele std::filesystem::remove_all(test_dir); } +TEST(TableReaderTest, IcebergEqualityDeleteCastsDataColumnToDeleteKeyType) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_equality_delete_cast_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + const auto delete_file_path = (test_dir / 
"equality-delete.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}); + write_iceberg_equality_delete_bigint_parquet_file(delete_file_path, 0, 2); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeProfile profile("test_profile"); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + auto scan_params = make_local_parquet_scan_params(); + io::FileReaderStats file_reader_stats; + io::FileCacheStatistics file_cache_stats; + auto io_ctx = make_io_context(&file_reader_stats, &file_cache_stats); + ShardedKVCache cache(1); + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = &scan_params, + .io_ctx = io_ctx, + .runtime_state = &state, + .scanner_profile = &profile, + .allow_missing_columns = true, + .profile = make_table_read_profile(&profile), + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.cache = &cache; + split_options.current_range.__set_table_format_params(make_iceberg_table_format_desc( + file_path, {make_iceberg_equality_delete_file(delete_file_path, {0})})); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + EXPECT_EQ(read_iceberg_ids(&reader, projected_columns), std::vector({1, 3})); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergPositionDeleteOnlyMatchesOriginalDataFilePath) { + const auto test_dir = std::filesystem::temp_directory_path() / + "doris_iceberg_position_delete_path_match_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + const auto other_file_path = (test_dir / "other.parquet").string(); + const auto 
delete_file_path = (test_dir / "position-delete.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}); + write_position_delete_parquet_file(delete_file_path, {other_file_path, file_path}, {0, 1}); + + std::vector projected_columns; + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeProfile profile("test_profile"); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + auto scan_params = make_local_parquet_scan_params(); + io::FileReaderStats file_reader_stats; + io::FileCacheStatistics file_cache_stats; + auto io_ctx = make_io_context(&file_reader_stats, &file_cache_stats); + ShardedKVCache cache(1); + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = &scan_params, + .io_ctx = io_ctx, + .runtime_state = &state, + .scanner_profile = &profile, + .allow_missing_columns = true, + .profile = make_table_read_profile(&profile), + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.cache = &cache; + split_options.current_range.__set_table_format_params(make_iceberg_table_format_desc( + file_path, {make_iceberg_position_delete_file(delete_file_path)})); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + EXPECT_EQ(read_iceberg_ids(&reader, projected_columns), std::vector({1, 3})); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + +TEST(TableReaderTest, IcebergRowLineageRemainsFileLocalAfterDeleteFiltering) { + const auto test_dir = + std::filesystem::temp_directory_path() / "doris_iceberg_row_lineage_delete_test"; + std::filesystem::remove_all(test_dir); + std::filesystem::create_directories(test_dir); + + const auto file_path = (test_dir / "split.parquet").string(); + const auto delete_file_path = (test_dir / 
"position-delete.parquet").string(); + write_int_pair_parquet_file(file_path, {1, 2, 3}, {10, 20, 30}, {"one", "two", "three"}); + write_position_delete_parquet_file(delete_file_path, {file_path}, {1}); + + std::vector projected_columns; + projected_columns.push_back( + make_table_column(100, "_row_id", make_nullable(std::make_shared()))); + projected_columns.push_back(make_table_column(0, "id", std::make_shared())); + + RuntimeProfile profile("test_profile"); + RuntimeState state {TQueryOptions(), TQueryGlobals()}; + auto scan_params = make_local_parquet_scan_params(); + io::FileReaderStats file_reader_stats; + io::FileCacheStatistics file_cache_stats; + auto io_ctx = make_io_context(&file_reader_stats, &file_cache_stats); + ShardedKVCache cache(1); + doris::iceberg::IcebergTableReader reader; + ASSERT_TRUE(reader.init({ + .projected_columns = projected_columns, + .column_predicates = {}, + .conjuncts = VExprContext(nullptr), + .format = FileFormat::PARQUET, + .scan_params = &scan_params, + .io_ctx = io_ctx, + .runtime_state = &state, + .scanner_profile = &profile, + .allow_missing_columns = true, + .profile = make_table_read_profile(&profile), + }) + .ok()); + + auto split_options = build_split_options(file_path); + split_options.cache = &cache; + TTableFormatFileDesc table_format_params = make_iceberg_table_format_desc( + file_path, {make_iceberg_position_delete_file(delete_file_path)}); + table_format_params.iceberg_params.__set_first_row_id(1000); + split_options.current_range.__set_table_format_params(table_format_params); + ASSERT_TRUE(reader.prepare_split(split_options).ok()); + + Block block = build_table_block(projected_columns); + bool eos = false; + ASSERT_TRUE(reader.get_block(&block, &eos).ok()); + ASSERT_FALSE(eos); + ASSERT_EQ(block.rows(), 2); + expect_nullable_int64_column_values(*block.get_by_position(0).column, {1000, 1002}); + const auto& id_column = assert_cast(*block.get_by_position(1).column); + EXPECT_EQ(id_column.get_element(0), 1); + 
EXPECT_EQ(id_column.get_element(1), 3); + + ASSERT_TRUE(reader.close().ok()); + std::filesystem::remove_all(test_dir); +} + TEST(TableReaderTest, IcebergTableReaderAppliesPositionDeleteFile) { const auto test_dir = std::filesystem::temp_directory_path() / "doris_iceberg_position_delete_file_test";