From b849d2648482a0d8e9b35801bf9fedb8089d8620 Mon Sep 17 00:00:00 2001 From: yongman Date: Tue, 26 May 2026 18:37:30 +0800 Subject: [PATCH 01/19] columnar: support bucket parallel read in region Signed-off-by: yongman --- .../ffi/src/RaftStoreProxyFFI/ProxyFFI.h | 2 + .../hub-runtime/src/cloud_helper.rs | 14 + .../hub-runtime/src/columnar_impls.rs | 20 +- .../hub-runtime/src/interfaces.rs | 7 + .../hub-runtime/src/run.rs | 5 +- .../Storages/StorageDisaggregatedColumnar.cpp | 663 ++++++++++++------ .../Storages/StorageDisaggregatedColumnar.h | 109 ++- 7 files changed, 565 insertions(+), 255 deletions(-) diff --git a/contrib/tiflash-columnar-hub/hub-runtime/ffi/src/RaftStoreProxyFFI/ProxyFFI.h b/contrib/tiflash-columnar-hub/hub-runtime/ffi/src/RaftStoreProxyFFI/ProxyFFI.h index ce4e60648a8..f4621ee0af6 100644 --- a/contrib/tiflash-columnar-hub/hub-runtime/ffi/src/RaftStoreProxyFFI/ProxyFFI.h +++ b/contrib/tiflash-columnar-hub/hub-runtime/ffi/src/RaftStoreProxyFFI/ProxyFFI.h @@ -228,6 +228,8 @@ struct SSTReaderInterfaces { struct CloudStorageEngineInterfaces { bool (*fn_get_keyspace_encryption)(RaftStoreProxyPtr, uint32_t); RawCppStringPtr (*fn_get_master_key)(RaftStoreProxyPtr); + RustStrWithViewVec (*fn_get_region_bucket_keys)(uint64_t, uint64_t, + RaftStoreProxyPtr); ColumnarReaderPtr (*fn_get_columnar_reader)(uint64_t, uint64_t, uint64_t, BaseBuffView, BaseBuffView, BaseBuffView, BaseBuffView, diff --git a/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs b/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs index dbbcb525000..03277c5227c 100644 --- a/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs +++ b/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs @@ -176,6 +176,16 @@ impl PdClientWithCache { pub fn get_security_mgr(&self) -> Arc { self.pd_client.get_security_mgr() } + + pub fn get_region_bucket_keys(&self, region_id: u64, region_ver: u64) -> Vec> { + let Some(bucket_stat) = self.pd_client.get_buckets(region_id) 
else { + return Vec::new(); + }; + if bucket_stat.meta.region_epoch.get_version() != region_ver { + return Vec::new(); + } + bucket_stat.meta.keys.clone() + } } #[derive(Clone)] @@ -444,6 +454,10 @@ impl CloudHelper { } } } + + pub fn get_region_bucket_keys(&self, region_id: u64, region_ver: u64) -> Vec> { + self.pd_client.get_region_bucket_keys(region_id, region_ver) + } } fn collect_ia_meta_files(meta_paths: &[PathBuf]) -> std::io::Result> { diff --git a/contrib/tiflash-columnar-hub/hub-runtime/src/columnar_impls.rs b/contrib/tiflash-columnar-hub/hub-runtime/src/columnar_impls.rs index 6316c2107a6..3d063267c3f 100644 --- a/contrib/tiflash-columnar-hub/hub-runtime/src/columnar_impls.rs +++ b/contrib/tiflash-columnar-hub/hub-runtime/src/columnar_impls.rs @@ -18,10 +18,10 @@ use kvengine::{CloudColumnarReaders, TableCtx}; use protobuf::{parse_from_bytes, Message}; use crate::{ - build_from_string, + build_from_string, build_from_vec_string, interfaces_ffi::{ BaseBuffView, ColumnarReaderErrorType, ColumnarReaderPtr, RaftStoreProxyPtr, RawRustPtr, - RawVoidPtr, RustStrWithView, + RawVoidPtr, RustStrWithView, RustStrWithViewVec, }, RawRustPtrType, }; @@ -73,6 +73,22 @@ impl From for ColumnarReaderPtr { } } +pub unsafe extern "C" fn ffi_get_region_bucket_keys( + region_id: u64, + region_ver: u64, + hub_ptr: RaftStoreProxyPtr, +) -> RustStrWithViewVec { + let hub = hub_ptr.as_ref(); + let bucket_keys = hub + .cloud_helper + .get_region_bucket_keys(region_id, region_ver); + if bucket_keys.is_empty() { + RustStrWithViewVec::default() + } else { + build_from_vec_string(bucket_keys) + } +} + pub unsafe extern "C" fn ffi_make_columnar_reader( shard_id: u64, shard_ver: u64, diff --git a/contrib/tiflash-columnar-hub/hub-runtime/src/interfaces.rs b/contrib/tiflash-columnar-hub/hub-runtime/src/interfaces.rs index a52c1b0233e..5f9c9485b37 100644 --- a/contrib/tiflash-columnar-hub/hub-runtime/src/interfaces.rs +++ b/contrib/tiflash-columnar-hub/hub-runtime/src/interfaces.rs @@ 
-355,6 +355,13 @@ pub mod root { arg1: root::DB::RaftStoreProxyPtr, ) -> root::DB::RawCppStringPtr, >, + pub fn_get_region_bucket_keys: ::std::option::Option< + unsafe extern "C" fn( + arg1: u64, + arg2: u64, + arg3: root::DB::RaftStoreProxyPtr, + ) -> root::DB::RustStrWithViewVec, + >, pub fn_get_columnar_reader: ::std::option::Option< unsafe extern "C" fn( arg1: u64, diff --git a/contrib/tiflash-columnar-hub/hub-runtime/src/run.rs b/contrib/tiflash-columnar-hub/hub-runtime/src/run.rs index bbafede8796..22973d597b9 100644 --- a/contrib/tiflash-columnar-hub/hub-runtime/src/run.rs +++ b/contrib/tiflash-columnar-hub/hub-runtime/src/run.rs @@ -55,8 +55,8 @@ use tikv_util::{ use crate::{ cloud_helper::{CloudEngineBackends, CloudHelper}, columnar_impls::{ - ffi_make_columnar_reader, ffi_physical_table_id, ffi_read_block, ffi_read_column, - ffi_read_handle, ffi_read_version, + ffi_get_region_bucket_keys, ffi_make_columnar_reader, ffi_physical_table_id, + ffi_read_block, ffi_read_column, ffi_read_handle, ffi_read_version, }, domain_impls::ffi_gc_rust_ptr, engine_store_helper::{ @@ -1147,6 +1147,7 @@ fn build_hub_ffi_helper(hub: &ColumnarHub) -> RaftStoreProxyFFIHelper { cloud_storage_engine_interfaces: CloudStorageEngineInterfaces { fn_get_keyspace_encryption: Some(ffi_get_keyspace_encryption), fn_get_master_key: Some(ffi_get_master_key), + fn_get_region_bucket_keys: Some(ffi_get_region_bucket_keys), fn_get_columnar_reader: Some(ffi_make_columnar_reader), fn_read_block: Some(ffi_read_block), fn_read_handle: Some(ffi_read_handle), diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp index c83f976ddb3..0f671800921 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp @@ -55,6 +55,7 @@ #include #include +#include #include #include @@ -65,8 +66,151 @@ namespace ErrorCodes extern const int COLUMNAR_SNAPSHOT_ERROR; } // namespace ErrorCodes +struct 
RNProxyReaderSharedContext +{ + LoggerPtr log; + const Context * context = nullptr; + UInt64 start_ts = 0; + DM::ColumnDefinesPtr column_defines; + int extra_table_id_index = -1; + TableID logical_table_id = 0; + String executor_id; + String table_scan_data; + String filter_conditions_data; + String table_info_data; + String ann_query_info_data; + String fts_query_info_data; + std::shared_ptr output_lock = std::make_shared(); +}; + namespace { +using ProxyPhysicalTableRanges = std::vector>; +using BucketSplitUnit = std::pair; + +void normalizeTimestampCompareDateTimeLiteralToUTC(tipb::Expr & expr, const TimezoneInfo & timezone_info); + +struct BucketSplitResult +{ + bool has_bucket_split = false; + std::vector units; +}; + +struct RegionReaderPlan +{ + RegionID region_id; + pingcap::kv::RegionVerID region_ver_id; + ProxyPhysicalTableRanges physical_table_ranges; + std::vector bucket_units; +}; + +bool isBucketBoundaryInsideRange(const String & bucket_key, const pingcap::coprocessor::KeyRange & range) +{ + if (bucket_key.empty()) + return false; + if (!range.start_key.empty() && bucket_key <= range.start_key) + return false; + if (!range.end_key.empty() && bucket_key >= range.end_key) + return false; + return true; +} + +BucketSplitResult splitRangesByBucketKeys( + const ProxyPhysicalTableRanges & physical_table_ranges, + const std::vector & bucket_keys) +{ + BucketSplitResult result; + if (bucket_keys.size() <= 2) + return result; + + for (const auto & [table_id, ranges] : physical_table_ranges) + { + for (const auto & range : ranges) + { + String current_start = range.start_key; + bool current_range_split = false; + for (const auto & bucket_key : bucket_keys) + { + if (!isBucketBoundaryInsideRange(bucket_key, range)) + continue; + result.units.emplace_back(table_id, pingcap::coprocessor::KeyRange{current_start, bucket_key}); + current_start = bucket_key; + current_range_split = true; + } + if (!range.end_key.empty() && current_start >= range.end_key) + continue; + 
result.units.emplace_back(table_id, pingcap::coprocessor::KeyRange{current_start, range.end_key}); + result.has_bucket_split = result.has_bucket_split || current_range_split; + } + } + return result; +} + +void appendRangeToReaderRanges( + ProxyPhysicalTableRanges & reader_ranges, + TableID table_id, + pingcap::coprocessor::KeyRange range) +{ + auto it = std::find_if(reader_ranges.begin(), reader_ranges.end(), [&](const auto & entry) { + return std::get<0>(entry) == table_id; + }); + if (it == reader_ranges.end()) + { + reader_ranges.emplace_back(table_id, pingcap::coprocessor::KeyRanges{std::move(range)}); + return; + } + std::get<1>(*it).push_back(std::move(range)); +} + +std::vector packBucketUnitsIntoReaders( + const std::vector & units, + size_t reader_count) +{ + std::vector reader_groups; + if (units.empty() || reader_count == 0) + return reader_groups; + + reader_count = std::min(reader_count, units.size()); + reader_groups.resize(reader_count); + size_t base_unit_count = units.size() / reader_count; + size_t remainder = units.size() % reader_count; + size_t unit_index = 0; + for (size_t reader_index = 0; reader_index < reader_count; ++reader_index) + { + size_t current_unit_count = base_unit_count + (reader_index < remainder ? 
1 : 0); + auto & reader_ranges = reader_groups[reader_index]; + for (size_t i = 0; i < current_unit_count; ++i) + { + const auto & [table_id, range] = units[unit_index++]; + appendRangeToReaderRanges(reader_ranges, table_id, range); + } + } + return reader_groups; +} + +std::vector getRegionBucketKeysFromProxy(const Context & context, RegionID region_id, UInt64 region_ver) +{ + const Context & global_ctx = context.getGlobalContext(); + const TiFlashRaftProxyHelper * proxy_helper = global_ctx.getSharedContextDisagg()->getColumnarProxyHelper(); + if (proxy_helper == nullptr || proxy_helper->cloud_storage_engine_interfaces.fn_get_region_bucket_keys == nullptr) + return {}; + + RustStrWithViewVec bucket_keys = proxy_helper->cloud_storage_engine_interfaces.fn_get_region_bucket_keys( + region_id, + region_ver, + proxy_helper->proxy_ptr); + SCOPE_EXIT({ + if (bucket_keys.inner.ptr != nullptr) + RustGcHelper::instance().gcRustPtr(bucket_keys.inner.ptr, bucket_keys.inner.type); + }); + + std::vector res; + res.reserve(static_cast(bucket_keys.len)); + for (size_t i = 0; i < bucket_keys.len; ++i) + res.emplace_back(bucket_keys.buffs[i].data, bucket_keys.buffs[i].len); + return res; +} + std::vector> genGeneratedColumnInfosForDisaggregatedRead( const TiDBTableScan & table_scan) { @@ -109,6 +253,87 @@ std::tuple genColumnDefinesForDisaggregatedReadThroug return {std::move(column_defines), extra_table_id_index}; } +std::shared_ptr buildProxyReaderSharedContext( + const LoggerPtr & log, + const Context & context, + UInt64 start_ts, + const TiDBTableScan & table_scan, + const FilterConditions & filter_conditions) +{ + auto shared_context = std::make_shared(); + shared_context->log = log; + shared_context->context = &context; + shared_context->start_ts = start_ts; + shared_context->logical_table_id = table_scan.getLogicalTableID(); + shared_context->executor_id = table_scan.getTableScanExecutorID(); + std::tie(shared_context->column_defines, shared_context->extra_table_id_index) + 
= genColumnDefinesForDisaggregatedReadThroughColumnar(table_scan); + + auto table_scan_pb = *table_scan.getTableScanPB(); + const auto & timezone_info = context.getTimezoneInfo(); + if (table_scan_pb.tp() == tipb::TypePartitionTableScan) + { + auto * pushed_down_filters + = table_scan_pb.mutable_partition_table_scan()->mutable_pushed_down_filter_conditions(); + for (int i = 0; i < pushed_down_filters->size(); ++i) + normalizeTimestampCompareDateTimeLiteralToUTC(*pushed_down_filters->Mutable(i), timezone_info); + } + else + { + auto * pushed_down_filters = table_scan_pb.mutable_tbl_scan()->mutable_pushed_down_filter_conditions(); + for (int i = 0; i < pushed_down_filters->size(); ++i) + normalizeTimestampCompareDateTimeLiteralToUTC(*pushed_down_filters->Mutable(i), timezone_info); + } + shared_context->table_scan_data = table_scan_pb.SerializeAsString(); + + auto conditions = filter_conditions.conditions; + for (int i = 0; i < conditions.size(); ++i) + normalizeTimestampCompareDateTimeLiteralToUTC(*conditions.Mutable(i), timezone_info); + for (const auto & condition : conditions) + { + auto data = condition.SerializeAsString(); + uint32_t len = data.size(); + shared_context->filter_conditions_data.append(reinterpret_cast(&len), sizeof(len)); + shared_context->filter_conditions_data.append(data.data(), data.size()); + } + + tipb::TableInfo table_info; + bool is_partition_scan = table_scan.isPartitionTableScan(); + const auto & tidb_columns = table_scan.getColumns(); + const auto should_skip_column_for_columnar_table_info = [&](ColumnID column_id) { + if (column_id == MutSup::extra_table_id_col_id) + return true; + for (const auto & ci : tidb_columns) + { + if (ci.id == column_id && ci.hasGeneratedColumnFlag()) + return true; + } + return false; + }; + if (is_partition_scan) + { + for (const auto & column : table_scan_pb.partition_table_scan().columns()) + { + if (should_skip_column_for_columnar_table_info(column.column_id())) + continue; + *table_info.add_columns() = 
column; + } + } + else + { + for (const auto & column : table_scan_pb.tbl_scan().columns()) + { + if (should_skip_column_for_columnar_table_info(column.column_id())) + continue; + *table_info.add_columns() = column; + } + } + shared_context->table_info_data = table_info.SerializeAsString(); + shared_context->ann_query_info_data = table_scan.getANNQueryInfo().SerializeAsString(); + shared_context->fts_query_info_data = table_scan.getFTSQueryInfo().SerializeAsString(); + return shared_context; +} + bool isProxyFilterComparableExpr(tipb::ScalarFuncSig sig) { // Keep this aligned with proxy columnar filter supported signatures: @@ -317,20 +542,15 @@ void StorageDisaggregated::readThroughColumnar( filter_conditions, remote_table_ranges, num_streams); - auto [column_defines, extra_table_id_index] = genColumnDefinesForDisaggregatedReadThroughColumnar(table_scan); + const auto generated_column_infos = genGeneratedColumnInfosForDisaggregatedRead(table_scan); for (auto & task : read_proxy_tasks) { group_builder.addConcurrency(RNProxySourceOp::create({ - .context = context, - .debug_tag = log->identifier(), .exec_context = exec_context, - .columns_to_read = *column_defines, .task = task, - .extra_table_id_index = extra_table_id_index, })); } - const auto generated_column_infos = genGeneratedColumnInfosForDisaggregatedRead(table_scan); executeGeneratedColumnPlaceholder(exec_context, group_builder, generated_column_infos, log); NamesAndTypes source_columns; @@ -346,44 +566,14 @@ void StorageDisaggregated::readThroughColumnar( filterConditionsWithPushedDownFilters(exec_context, group_builder, *analyzer); } -// RNProxyReaderPtr -RNProxyReaderPtr RNProxyReader::createProxyReader( - const LoggerPtr & log, - const Context & context, - RegionID region_id, - RegionVersion region_ver, - UInt64 region_conf_ver, - const std::vector> & physical_table_ranges, - UInt64 start_ts, - const TiDBTableScan & table_scan, - const FilterConditions & filter_conditions, - std::mutex & output_lock) 
+ColumnarReaderPtr createProxyColumnarReader( + const RNProxyReaderSharedContext & shared_context, + const RNProxyReaderPlan & reader_plan) { - auto table_scan_pb = *table_scan.getTableScanPB(); - const auto & timezone_info = context.getTimezoneInfo(); - if (table_scan_pb.tp() == tipb::TypePartitionTableScan) - { - auto * pushed_down_filters - = table_scan_pb.mutable_partition_table_scan()->mutable_pushed_down_filter_conditions(); - for (int i = 0; i < pushed_down_filters->size(); ++i) - normalizeTimestampCompareDateTimeLiteralToUTC(*pushed_down_filters->Mutable(i), timezone_info); - } - else - { - auto * pushed_down_filters = table_scan_pb.mutable_tbl_scan()->mutable_pushed_down_filter_conditions(); - for (int i = 0; i < pushed_down_filters->size(); ++i) - normalizeTimestampCompareDateTimeLiteralToUTC(*pushed_down_filters->Mutable(i), timezone_info); - } - auto table_scan_data = table_scan_pb.SerializeAsString(); - auto table_scan_view = BaseBuffView{table_scan_data.data(), table_scan_data.size()}; - auto conditions = filter_conditions.conditions; - for (int i = 0; i < conditions.size(); ++i) - normalizeTimestampCompareDateTimeLiteralToUTC(*conditions.Mutable(i), timezone_info); - // Copy pushed down filters to filter_conditions to make filterConditions works properly. - // Proxy columnar reader use pushed down filters to reduce packs load from disk and has no - // guarantee to filter all useless data, so we rely on the filterConditions to filter data. 
+ const auto & log = shared_context.log; + const auto & context = *shared_context.context; String tables_range_data; - for (const auto & [physical_table_id, ranges] : physical_table_ranges) + for (const auto & [physical_table_id, ranges] : reader_plan.physical_table_ranges) { tables_range_data.append(reinterpret_cast(&physical_table_id), sizeof(physical_table_id)); @@ -402,65 +592,24 @@ RNProxyReaderPtr RNProxyReader::createProxyReader( tables_range_data.append(reinterpret_cast(&ranges_data_size), sizeof(ranges_data_size)); tables_range_data.append(ranges_data.data(), ranges_data.size()); } - auto tables_range_view = BaseBuffView{tables_range_data.data(), tables_range_data.size()}; - String filter_conditions_data; - for (const auto & condition : conditions) - { - auto data = condition.SerializeAsString(); - uint32_t len = data.size(); - filter_conditions_data.append(reinterpret_cast(&len), sizeof(len)); - filter_conditions_data.append(data.data(), data.size()); - } - tipb::TableInfo table_info; - bool is_partition_scan = table_scan.isPartitionTableScan(); - const auto & tidb_columns = table_scan.getColumns(); - const auto should_skip_column_for_columnar_table_info = [&](ColumnID column_id) { - // _tidb_tid is filled locally by TiFlash, consistent with genColumnDefinesForDisaggregatedRead(). - if (column_id == MutSup::extra_table_id_col_id) - return true; - // Generated columns are not stored in kvengine; executeGeneratedColumnPlaceholder fills them later. 
- for (const auto & ci : tidb_columns) - { - if (ci.id == column_id && ci.hasGeneratedColumnFlag()) - return true; - } - return false; - }; - if (is_partition_scan) - { - for (const auto & column : table_scan_pb.partition_table_scan().columns()) - { - if (should_skip_column_for_columnar_table_info(column.column_id())) - continue; - *table_info.add_columns() = column; - } - } - else - { - for (const auto & column : table_scan_pb.tbl_scan().columns()) - { - if (should_skip_column_for_columnar_table_info(column.column_id())) - continue; - *table_info.add_columns() = column; - } - } - auto table_info_data = table_info.SerializeAsString(); - auto columns = BaseBuffView{table_info_data.data(), table_info_data.size()}; - auto filter_conditions_view = BaseBuffView{filter_conditions_data.data(), filter_conditions_data.size()}; - const auto & ann_query_info_pb = table_scan.getANNQueryInfo(); - const auto & fts_query_info_pb = table_scan.getFTSQueryInfo(); - auto ann_query_info_data = ann_query_info_pb.SerializeAsString(); - auto fts_query_info_data = fts_query_info_pb.SerializeAsString(); - auto ann_query_info_view = BaseBuffView{ann_query_info_data.data(), ann_query_info_data.size()}; - auto fts_query_info_view = BaseBuffView{fts_query_info_data.data(), fts_query_info_data.size()}; + BaseBuffView tables_range_view = BaseBuffView{tables_range_data.data(), tables_range_data.size()}; + BaseBuffView columns = BaseBuffView{shared_context.table_info_data.data(), shared_context.table_info_data.size()}; + BaseBuffView filter_conditions_view + = BaseBuffView{shared_context.filter_conditions_data.data(), shared_context.filter_conditions_data.size()}; + BaseBuffView table_scan_view + = BaseBuffView{shared_context.table_scan_data.data(), shared_context.table_scan_data.size()}; + BaseBuffView ann_query_info_view + = BaseBuffView{shared_context.ann_query_info_data.data(), shared_context.ann_query_info_data.size()}; + BaseBuffView fts_query_info_view + = 
BaseBuffView{shared_context.fts_query_info_data.data(), shared_context.fts_query_info_data.size()}; const Context & global_ctx = context.getGlobalContext(); auto * cluster = global_ctx.getTMTContext().getKVCluster(); const TiFlashRaftProxyHelper * proxy_helper = global_ctx.getSharedContextDisagg()->getColumnarProxyHelper(); RUNTIME_CHECK_MSG(proxy_helper != nullptr, "columnar proxy helper is not initialized"); ColumnarReaderPtr columnar_reader = proxy_helper->cloud_storage_engine_interfaces.fn_get_columnar_reader( - region_id, - region_ver, - start_ts, + reader_plan.region_id, + reader_plan.region_ver, + shared_context.start_ts, std::move(tables_range_view), std::move(columns), std::move(table_scan_view), @@ -468,13 +617,14 @@ RNProxyReaderPtr RNProxyReader::createProxyReader( std::move(ann_query_info_view), std::move(fts_query_info_view), proxy_helper->proxy_ptr); - bool reader_transferred = false; + bool reader_returned = false; SCOPE_EXIT({ - if (!reader_transferred) + if (!reader_returned && columnar_reader.inner.ptr != nullptr) RustGcHelper::instance().gcRustPtr(columnar_reader.inner.ptr, columnar_reader.inner.type); }); SCOPE_EXIT({ - if (!reader_transferred && columnar_reader.error_type != ColumnarReaderErrorType::OK) + if (!reader_returned && columnar_reader.error_type != ColumnarReaderErrorType::OK + && columnar_reader.error.inner.ptr != nullptr) RustGcHelper::instance().gcRustPtr(columnar_reader.error.inner.ptr, columnar_reader.error.inner.type); }); if (columnar_reader.error_type == ColumnarReaderErrorType::RegionError) @@ -482,26 +632,25 @@ RNProxyReaderPtr RNProxyReader::createProxyReader( auto error_msg = String(columnar_reader.error.buff.data, columnar_reader.error.buff.len); errorpb::Error region_error; region_error.ParseFromString(error_msg); - auto region_ver_id = pingcap::kv::RegionVerID(region_id, region_conf_ver, region_ver); + auto region_ver_id + = pingcap::kv::RegionVerID(reader_plan.region_id, reader_plan.region_conf_ver, 
reader_plan.region_ver); // Refresh region cache and throw an exception for retrying. if (region_error.has_epoch_not_match()) { RegionException::UnavailableRegions unavailable_regions; String region_id_ver; // region_id:region_ver:conf_ver - std::unordered_set retry_regions; for (const auto & region : region_error.epoch_not_match().current_regions()) { unavailable_regions.insert(region.id()); - retry_regions.insert(region.id()); - region_id_ver = std::to_string(region.id()) + ":" + std::to_string(region_ver) + ":" + region_id_ver = std::to_string(region.id()) + ":" + std::to_string(reader_plan.region_ver) + ":" + std::to_string(region.region_epoch().conf_ver()); } - auto guard = std::lock_guard(output_lock); + auto _guard = std::lock_guard(*shared_context.output_lock); cluster->region_cache->dropRegion(region_ver_id); LOG_WARNING( log, "create columnar reader failed region_id={}, epoch not match {}", - std::to_string(region_id), + std::to_string(reader_plan.region_id), region_ver_id.toString()); throw RegionException( std::move(unavailable_regions), @@ -511,17 +660,15 @@ RNProxyReaderPtr RNProxyReader::createProxyReader( else { RegionException::UnavailableRegions unavailable_regions; - std::unordered_set retry_regions; auto err_region_id = 0; if (region_error.has_region_not_found()) { err_region_id = region_error.region_not_found().region_id(); unavailable_regions.insert(err_region_id); - retry_regions.insert(err_region_id); LOG_WARNING( log, "create columnar reader failed region_id={}, region not found {}", - std::to_string(region_id), + std::to_string(reader_plan.region_id), std::to_string(err_region_id)); } else @@ -529,15 +676,15 @@ RNProxyReaderPtr RNProxyReader::createProxyReader( LOG_WARNING( log, "create columnar reader failed region_id={}, {}", - std::to_string(region_id), + std::to_string(reader_plan.region_id), region_error.ShortDebugString()); } - auto guard = std::lock_guard(output_lock); + auto _guard = std::lock_guard(*shared_context.output_lock); 
cluster->region_cache->dropRegion(region_ver_id); throw RegionException( std::move(unavailable_regions), RegionException::RegionReadStatus::NOT_FOUND, - std::to_string(region_id).c_str()); + std::to_string(reader_plan.region_id).c_str()); } } else if (columnar_reader.error_type == ColumnarReaderErrorType::LockedError) @@ -549,8 +696,8 @@ RNProxyReaderPtr RNProxyReader::createProxyReader( pingcap::kv::Backoffer bo(pingcap::kv::copNextMaxBackoff); std::vector pushed; std::vector locks{std::make_shared(lock_info)}; - auto guard = std::lock_guard(output_lock); - auto before_expired = cluster->lock_resolver->resolveLocks(bo, start_ts, locks, pushed); + auto _guard = std::lock_guard(*shared_context.output_lock); + auto before_expired = cluster->lock_resolver->resolveLocks(bo, shared_context.start_ts, locks, pushed); LOG_WARNING(log, "Finished resolve locks, before_expired={}", before_expired); throw Exception("lock error", ErrorCodes::COLUMNAR_SNAPSHOT_ERROR); } @@ -572,22 +719,99 @@ RNProxyReaderPtr RNProxyReader::createProxyReader( throw Exception(ErrorCodes::COLUMNAR_SNAPSHOT_ERROR, "{}", error_msg); } - // Create input stream. 
- auto [column_defines, extra_table_id_index] = genColumnDefinesForDisaggregatedReadThroughColumnar(table_scan); - BlockInputStreamPtr input_stream = RNProxyInputStream::create({ - .context = context, - .debug_tag = log->identifier(), - .columns_to_read = *column_defines, - .reader = columnar_reader, - .extra_table_id_index = extra_table_id_index, - .table_id = table_scan.getLogicalTableID(), - .executor_id = table_scan.getTableScanExecutorID(), - }); - reader_transferred = true; - return std::make_shared(input_stream); + reader_returned = true; + return columnar_reader; } // RNProxyReadTask +RNProxyReadTask::RNProxyReadTask( + std::vector reader_plans_, + std::shared_ptr shared_reader_context_) + : reader_plans(std::move(reader_plans_)) + , shared_reader_context(std::move(shared_reader_context_)) +{} + +size_t RNProxyReadTask::getReaderCount() const +{ + return reader_plans.size(); +} + +const Context & RNProxyReadTask::getContext() const +{ + return *shared_reader_context->context; +} + +const LoggerPtr & RNProxyReadTask::getLog() const +{ + return shared_reader_context->log; +} + +const DM::ColumnDefines & RNProxyReadTask::getColumnsToRead() const +{ + return *shared_reader_context->column_defines; +} + +int RNProxyReadTask::getExtraTableIDIndex() const +{ + return shared_reader_context->extra_table_id_index; +} + +TableID RNProxyReadTask::getLogicalTableID() const +{ + return shared_reader_context->logical_table_id; +} + +const String & RNProxyReadTask::getExecutorID() const +{ + return shared_reader_context->executor_id; +} + +ColumnarReaderPtr RNProxyReadTask::createColumnarReaderWithBackoff(size_t reader_index) const +{ + RUNTIME_CHECK(reader_index < reader_plans.size()); + const auto & reader_plan = reader_plans[reader_index]; + pingcap::kv::Backoffer bo(pingcap::kv::copNextMaxBackoff); + while (true) + { + try + { + LOG_INFO( + getLog(), + "materialize proxy reader for tables in region, region_id={}, table_num={}", + reader_plan.region_id, + 
reader_plan.physical_table_ranges.size()); + return createProxyColumnarReader(*shared_reader_context, reader_plan); + } + catch (RegionException & e) + { + LOG_WARNING(getLog(), "create proxy reader failed, backoff and retry, {}", e.message()); + bo.backoff(pingcap::kv::boRegionMiss, pingcap::Exception(e.message(), e.code())); + } + catch (Exception & e) + { + if (e.code() != ErrorCodes::COLUMNAR_SNAPSHOT_ERROR) + throw; + LOG_WARNING(getLog(), "create proxy reader failed, backoff and retry, {}", e.message()); + bo.backoff(pingcap::kv::boRegionMiss, pingcap::Exception(e.message(), e.code())); + } + } +} + +BlockInputStreamPtr RNProxyReadTask::createInputStream(size_t reader_index) +{ + RUNTIME_CHECK(reader_index < reader_plans.size()); + return RNProxyInputStream::create({ + .context = getContext(), + .log = getLog(), + .task = shared_from_this(), + .reader_index = reader_index, + .columns_to_read = getColumnsToRead(), + .extra_table_id_index = getExtraTableIDIndex(), + .table_id = getLogicalTableID(), + .executor_id = getExecutorID(), + }); +} + std::vector RNProxyReadTask::buildProxyReadTaskWithBackoff( const LoggerPtr & log, const Context & context, @@ -642,6 +866,7 @@ std::vector RNProxyReadTask::buildProxyReadTask( auto scan_context = std::make_shared(dag_context->getKeyspaceID(), dag_context->getResourceGroupName()); dag_context->scan_context_map[table_scan.getTableScanExecutorID()] = scan_context; + auto shared_reader_context = buildProxyReaderSharedContext(log, context, start_ts, table_scan, filter_conditions); std::vector tasks; // Collect all regions in the table scan. @@ -697,102 +922,141 @@ std::vector RNProxyReadTask::buildProxyReadTask( } unsigned region_num = all_remote_regions_by_region.size(); unsigned physical_table_num = physical_table_ids.size(); - unsigned real_num_streams = std::min(num_streams, region_num); - // Regions per RNProxyReader, it should be ceil of region_num / real_num_streams. 
- // `regions_per_reader` is the ceil of the division, so the concurrency may be less than `real_num_streams`. - unsigned regions_per_reader = (region_num + real_num_streams - 1) / real_num_streams; + const bool enable_bucket_parallel = !table_scan.keepOrder() && num_streams > region_num; + std::vector region_reader_plans; + region_reader_plans.reserve(region_num); + size_t total_max_reader_num = region_num; + for (const auto & [region_id, physical_table_ranges] : all_remote_regions_by_region) + { + RegionReaderPlan plan{ + .region_id = region_id, + .region_ver_id = region_ver_ids[region_id], + .physical_table_ranges = physical_table_ranges, + }; + if (enable_bucket_parallel) + { + auto bucket_keys = getRegionBucketKeysFromProxy(context, region_id, plan.region_ver_id.ver); + auto split_result = splitRangesByBucketKeys(physical_table_ranges, bucket_keys); + if (split_result.has_bucket_split && split_result.units.size() > 1) + { + total_max_reader_num += split_result.units.size() - 1; + plan.bucket_units = std::move(split_result.units); + } + } + region_reader_plans.emplace_back(std::move(plan)); + } + + std::vector reader_count_per_region(region_reader_plans.size(), 1); + if (enable_bucket_parallel) + { + size_t target_reader_num = std::min(total_max_reader_num, static_cast(num_streams)); + size_t extra_reader_budget = target_reader_num > region_num ? target_reader_num - region_num : 0; + while (extra_reader_budget > 0) + { + bool allocated = false; + for (size_t i = 0; i < region_reader_plans.size() && extra_reader_budget > 0; ++i) + { + const auto max_reader_count + = region_reader_plans[i].bucket_units.empty() ? 
1 : region_reader_plans[i].bucket_units.size(); + if (reader_count_per_region[i] >= max_reader_count) + continue; + ++reader_count_per_region[i]; + --extra_reader_budget; + allocated = true; + } + if (!allocated) + break; + } + } + + size_t planned_reader_num = 0; + for (auto reader_count : reader_count_per_region) + planned_reader_num += reader_count; LOG_INFO( log, - "region_num={}, table_num={}, num_streams={}, real_num_streams={}, regions_per_reader={}", + "region_num={}, table_num={}, num_streams={}, keep_order={}, bucket_parallel={}, planned_reader_num={}, " + "max_reader_num={}", region_num, physical_table_num, num_streams, - real_num_streams, - regions_per_reader); - unsigned reader_idx = 0; - std::vector all_readers; - std::mutex output_lock; - auto thread_manager = newThreadManager(); + table_scan.keepOrder(), + enable_bucket_parallel, + planned_reader_num, + total_max_reader_num); - for (const auto & [region_id, physical_table_ranges] : all_remote_regions_by_region) + std::vector all_reader_plans; + all_reader_plans.reserve(planned_reader_num); + + for (size_t i = 0; i < region_reader_plans.size(); ++i) { - auto region_ver = region_ver_ids[region_id].ver; - auto region_conf_ver = region_ver_ids[region_id].conf_ver; - thread_manager->schedule( - true, - "createProxyReader", - [log, - &context, - region_id, - region_ver, - region_conf_ver, - physical_table_ranges, - start_ts, - &table_scan, - &filter_conditions, - &output_lock, - &all_readers] { - LOG_INFO( - log, - "create proxy reader for tables in region, region_id={}, table_num={}", - region_id, - physical_table_ranges.size()); - auto reader_ptr = RNProxyReader::createProxyReader( - log, - context, - region_id, - region_ver, - region_conf_ver, - physical_table_ranges, - start_ts, - table_scan, - filter_conditions, - output_lock); - { - std::lock_guard lock(output_lock); - all_readers.push_back(reader_ptr); - } + const auto & plan = region_reader_plans[i]; + auto reader_groups = 
plan.bucket_units.empty() || reader_count_per_region[i] <= 1 + ? std::vector{plan.physical_table_ranges} + : packBucketUnitsIntoReaders(plan.bucket_units, reader_count_per_region[i]); + for (const auto & physical_table_ranges : reader_groups) + { + all_reader_plans.push_back(RNProxyReaderPlan{ + .region_id = plan.region_id, + .region_ver = plan.region_ver_id.ver, + .region_conf_ver = plan.region_ver_id.conf_ver, + .physical_table_ranges = physical_table_ranges, }); + } } - thread_manager->wait(); - - std::vector readers; - for (auto & reader : all_readers) + unsigned reader_num = all_reader_plans.size(); + if (reader_num == 0) + return tasks; + unsigned real_num_streams = std::min(num_streams, reader_num); + // Readers per RNProxyReadTask, it should be ceil of reader_num / real_num_streams. + unsigned readers_per_task = (reader_num + real_num_streams - 1) / real_num_streams; + unsigned reader_idx = 0; + std::vector readers; + for (auto & reader_plan : all_reader_plans) { ++reader_idx; - readers.push_back(reader); - if (reader_idx == regions_per_reader) + readers.push_back(std::move(reader_plan)); + if (reader_idx == readers_per_task) { reader_idx = 0; - tasks.push_back(std::make_shared(std::move(readers))); + tasks.push_back(std::make_shared(std::move(readers), shared_reader_context)); readers.clear(); } } if (!readers.empty()) { - tasks.push_back(std::make_shared(std::move(readers))); + tasks.push_back(std::make_shared(std::move(readers), shared_reader_context)); } return tasks; } -BlockInputStreams RNProxyReadTask::getInputStreams() const +BlockInputStreams RNProxyReadTask::getInputStreams() { BlockInputStreams streams; - streams.reserve(proxy_readers.size()); - for (const auto & reader : proxy_readers) + streams.reserve(reader_plans.size()); + for (size_t reader_index = 0; reader_index < reader_plans.size(); ++reader_index) { - streams.push_back(reader->getInputStream()); + streams.push_back(createInputStream(reader_index)); } return streams; } // 
RNProxyInputStream +void RNProxyInputStream::ensureReader() +{ + if (reader.has_value()) + return; + reader.emplace(task->createColumnarReaderWithBackoff(reader_index)); +} + RNProxyInputStream::~RNProxyInputStream() { - SCOPE_EXIT({ RustGcHelper::instance().gcRustPtr(reader.inner.ptr, reader.inner.type); }); + SCOPE_EXIT({ + if (reader.has_value() && reader->inner.ptr != nullptr) + RustGcHelper::instance().gcRustPtr(reader->inner.ptr, reader->inner.type); + }); try { LOG_INFO( @@ -836,11 +1100,12 @@ Block RNProxyInputStream::readImpl([[maybe_unused]] FilterPtr & res_filter, [[ma { if (done) return {}; + ensureReader(); const Context & global_ctx = context.getGlobalContext(); const TiFlashRaftProxyHelper * proxy_helper = global_ctx.getSharedContextDisagg()->getColumnarProxyHelper(); RUNTIME_CHECK_MSG(proxy_helper != nullptr, "columnar proxy helper is not initialized"); Stopwatch w{CLOCK_MONOTONIC_COARSE}; - UInt64 rows = proxy_helper->cloud_storage_engine_interfaces.fn_read_block(reader, batch_size); + UInt64 rows = proxy_helper->cloud_storage_engine_interfaces.fn_read_block(reader.value(), batch_size); duration_read_sec += w.elapsedSecondsFromLastTime(); LOG_DEBUG(log, "Read {} rows from proxy", rows); if (rows == std::numeric_limits::max()) @@ -849,7 +1114,10 @@ Block RNProxyInputStream::readImpl([[maybe_unused]] FilterPtr & res_filter, [[ma throw Exception("read_block failed in tiflash-proxy", ErrorCodes::LOGICAL_ERROR); } if (rows == 0) + { + done = true; return {}; + } TableID physical_table_id = -1; Block header = getHeader(); @@ -868,9 +1136,9 @@ Block RNProxyInputStream::readImpl([[maybe_unused]] FilterPtr & res_filter, [[ma Int64 col_id = col_type_and_name[i].column_id; if (col_id == MutSup::extra_handle_id) { - RustStrWithView col_data = proxy_helper->cloud_storage_engine_interfaces.fn_read_handle(reader); + RustStrWithView col_data = proxy_helper->cloud_storage_engine_interfaces.fn_read_handle(reader.value()); SCOPE_EXIT({ 
RustGcHelper::instance().gcRustPtr(col_data.inner.ptr, col_data.inner.type); }); - physical_table_id = proxy_helper->cloud_storage_engine_interfaces.fn_physical_table_id(reader); + physical_table_id = proxy_helper->cloud_storage_engine_interfaces.fn_physical_table_id(reader.value()); ReadBufferFromMemory buf(col_data.buff.data, static_cast(col_data.buff.len)); auto & col = *columns[i]; col_type_and_name[i].type->deserializeBinaryBulkWithMultipleStreams( @@ -887,9 +1155,10 @@ Block RNProxyInputStream::readImpl([[maybe_unused]] FilterPtr & res_filter, [[ma } else { - RustStrWithView col_data = proxy_helper->cloud_storage_engine_interfaces.fn_read_column(reader, col_id); + RustStrWithView col_data + = proxy_helper->cloud_storage_engine_interfaces.fn_read_column(reader.value(), col_id); SCOPE_EXIT({ RustGcHelper::instance().gcRustPtr(col_data.inner.ptr, col_data.inner.type); }); - physical_table_id = proxy_helper->cloud_storage_engine_interfaces.fn_physical_table_id(reader); + physical_table_id = proxy_helper->cloud_storage_engine_interfaces.fn_physical_table_id(reader.value()); ReadBufferFromMemory buf(col_data.buff.data, static_cast(col_data.buff.len)); auto & col = *columns[i]; col_type_and_name[i].type->deserializeBinaryBulkWithMultipleStreams( @@ -976,9 +1245,12 @@ OperatorStatus RNProxySourceOp::executeIOImpl() return awaitImpl(); } + if (!current_input_stream) + current_input_stream = task->createInputStream(static_cast(current_reader_idx)); + FilterPtr filter_ignored = nullptr; Stopwatch w{CLOCK_MONOTONIC_COARSE}; - Block block = task->getProxyReaders()[current_reader_idx]->getInputStream()->read(filter_ignored, false); + Block block = current_input_stream->read(filter_ignored, false); duration_read_sec += w.elapsedSeconds(); if likely (block && block.rows() > 0) { @@ -988,11 +1260,12 @@ OperatorStatus RNProxySourceOp::executeIOImpl() } else { - if (current_reader_idx == static_cast(task->getProxyReaders().size() - 1)) + current_input_stream.reset(); + if 
(current_reader_idx == static_cast(task->getReaderCount() - 1)) { done = true; } - else if (current_reader_idx < static_cast(task->getProxyReaders().size() - 1)) + else if (current_reader_idx < static_cast(task->getReaderCount() - 1)) { ++current_reader_idx; } diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.h b/dbms/src/Storages/StorageDisaggregatedColumnar.h index fb1e0094b16..80641a3401b 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.h +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.h @@ -48,49 +48,24 @@ class RSOperator; using RSOperatorPtr = std::shared_ptr; } // namespace DM -class RNProxyReader; -using RNProxyReaderPtr = std::shared_ptr; -class RNProxyReader : boost::noncopyable -{ -public: - static RNProxyReaderPtr createProxyReader( - const LoggerPtr & log, - const Context & context, - RegionID region_id, - RegionVersion region_ver, - UInt64 region_conf_ver, - const std::vector> & physical_table_ranges, - UInt64 start_ts, - const TiDBTableScan & table_scan, - const FilterConditions & filter_conditions, - std::mutex & output_lock); - - BlockInputStreamPtr getInputStream() const - { - RUNTIME_CHECK(input_stream != nullptr); - return input_stream; - } +struct RNProxyReaderSharedContext; - RNProxyReader(BlockInputStreamPtr input_stream) - : input_stream(input_stream) - {} - -private: - BlockInputStreamPtr input_stream; +struct RNProxyReaderPlan +{ + RegionID region_id; + RegionVersion region_ver; + UInt64 region_conf_ver; + std::vector> physical_table_ranges; }; class RNProxyReadTask; using RNProxyReadTaskPtr = std::shared_ptr; -class RNProxyReadTask : boost::noncopyable +class RNProxyReadTask + : public boost::noncopyable + , public std::enable_shared_from_this { public: using RemoteTableRange = std::pair; - const std::vector proxy_readers; - - static RNProxyReadTaskPtr create(const std::vector & proxy_readers) - { - return std::shared_ptr(new RNProxyReadTask(proxy_readers)); - } static std::vector buildProxyReadTaskWithBackoff( 
const LoggerPtr & log, @@ -110,13 +85,33 @@ class RNProxyReadTask : boost::noncopyable const std::vector & remote_table_ranges, unsigned num_streams); - BlockInputStreams getInputStreams() const; + BlockInputStreams getInputStreams(); + + BlockInputStreamPtr createInputStream(size_t reader_index); + + ColumnarReaderPtr createColumnarReaderWithBackoff(size_t reader_index) const; + + size_t getReaderCount() const; - std::vector getProxyReaders() { return proxy_readers; } + const Context & getContext() const; - RNProxyReadTask(const std::vector & proxy_readers) - : proxy_readers(proxy_readers) - {} + const LoggerPtr & getLog() const; + + const DM::ColumnDefines & getColumnsToRead() const; + + int getExtraTableIDIndex() const; + + TableID getLogicalTableID() const; + + const String & getExecutorID() const; + + RNProxyReadTask( + std::vector reader_plans, + std::shared_ptr shared_reader_context); + +private: + std::vector reader_plans; + std::shared_ptr shared_reader_context; }; class RNProxyInputStream : public IProfilingBlockInputStream @@ -139,9 +134,10 @@ class RNProxyInputStream : public IProfilingBlockInputStream struct Options { const Context & context; - std::string_view debug_tag; + LoggerPtr log; + RNProxyReadTaskPtr task; + size_t reader_index; const DM::ColumnDefines & columns_to_read; - ColumnarReaderPtr reader; int extra_table_id_index; TableID table_id; const String & executor_id; @@ -149,8 +145,9 @@ class RNProxyInputStream : public IProfilingBlockInputStream explicit RNProxyInputStream(const Options & options) : context(options.context) - , log(Logger::get(options.debug_tag)) - , reader(options.reader) + , log(options.log) + , task(options.task) + , reader_index(options.reader_index) , action(options.columns_to_read, options.extra_table_id_index) , table_id(options.table_id) , executor_id(options.executor_id) @@ -162,9 +159,13 @@ class RNProxyInputStream : public IProfilingBlockInputStream static BlockInputStreamPtr create(const Options & options) { 
return std::make_shared(options); } private: + void ensureReader(); + const Context & context; const LoggerPtr log; - ColumnarReaderPtr reader; + RNProxyReadTaskPtr task; + size_t reader_index; + std::optional reader; AddExtraTableIDColumnTransformAction action; TableID table_id; const String executor_id; @@ -185,23 +186,19 @@ class RNProxySourceOp : public SourceOp public: struct Options { - const Context & context; - std::string_view debug_tag; PipelineExecutorContext & exec_context; - const DM::ColumnDefines & columns_to_read; RNProxyReadTaskPtr task; - int extra_table_id_index; }; explicit RNProxySourceOp(const Options & options) - : SourceOp(options.exec_context, String(options.debug_tag)) - , context(options.context) - , log(Logger::get(options.debug_tag)) + : SourceOp(options.exec_context, options.task->getLog()->identifier()) + , context(options.task->getContext()) + , log(options.task->getLog()) , task(options.task) - , action(options.columns_to_read, options.extra_table_id_index) { - // Keep header aligned with genNamesAndTypesForTableScan when TiDB requests _tidb_tid on partition scans. - setHeader(action.getHeader()); + setHeader(AddExtraTableIDColumnTransformAction::buildHeader( + options.task->getColumnsToRead(), + options.task->getExtraTableIDIndex())); } static SourceOpPtr create(const Options & options) { return std::make_unique(options); } @@ -225,10 +222,10 @@ class RNProxySourceOp : public SourceOp const Context & context; const LoggerPtr log; RNProxyReadTaskPtr task; - AddExtraTableIDColumnTransformAction action; size_t total_rows = 0; Int32 current_reader_idx = -1; + BlockInputStreamPtr current_input_stream; // Temporarily store the block read from current_seg_task->stream and pass it to downstream operators in readImpl. 
std::optional t_block = std::nullopt; From 7eab8f1470689774bfcfc2d89533af82d53c418f Mon Sep 17 00:00:00 2001 From: yongman Date: Tue, 2 Jun 2026 21:45:26 +0800 Subject: [PATCH 02/19] optimize tasks dispatch Signed-off-by: yongman --- .../Storages/StorageDisaggregatedColumnar.cpp | 241 ++++++++++++++---- .../Storages/StorageDisaggregatedColumnar.h | 42 ++- 2 files changed, 229 insertions(+), 54 deletions(-) diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp index 0f671800921..6a1a86cc1b0 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp @@ -543,12 +543,22 @@ void StorageDisaggregated::readThroughColumnar( remote_table_ranges, num_streams); const auto generated_column_infos = genGeneratedColumnInfosForDisaggregatedRead(table_scan); - for (auto & task : read_proxy_tasks) + if (!read_proxy_tasks.empty()) { - group_builder.addConcurrency(RNProxySourceOp::create({ - .exec_context = exec_context, - .task = task, - })); + auto & task_pool = read_proxy_tasks.front(); + const size_t source_num = std::min(num_streams, task_pool->getReaderCount()); + LOG_INFO( + log, + "use shared proxy reader task pool, reader_num={}, source_num={}", + task_pool->getReaderCount(), + source_num); + for (size_t i = 0; i < source_num; ++i) + { + group_builder.addConcurrency(RNProxySourceOp::create({ + .exec_context = exec_context, + .task = task_pool, + })); + } } executeGeneratedColumnPlaceholder(exec_context, group_builder, generated_column_infos, log); @@ -724,12 +734,22 @@ ColumnarReaderPtr createProxyColumnarReader( } // RNProxyReadTask +RNProxyReaderSlot::~RNProxyReaderSlot() +{ + if (reader.has_value() && reader->inner.ptr != nullptr) + RustGcHelper::instance().gcRustPtr(reader->inner.ptr, reader->inner.type); +} + RNProxyReadTask::RNProxyReadTask( std::vector reader_plans_, std::shared_ptr shared_reader_context_) : reader_plans(std::move(reader_plans_)) 
, shared_reader_context(std::move(shared_reader_context_)) -{} +{ + reader_slots.reserve(reader_plans.size()); + for (size_t i = 0; i < reader_plans.size(); ++i) + reader_slots.emplace_back(std::make_shared()); +} size_t RNProxyReadTask::getReaderCount() const { @@ -797,6 +817,134 @@ ColumnarReaderPtr RNProxyReadTask::createColumnarReaderWithBackoff(size_t reader } } +ColumnarReaderPtr RNProxyReadTask::getOrCreateReader(size_t reader_index) +{ + RUNTIME_CHECK(reader_index < reader_slots.size()); + auto slot = reader_slots[reader_index]; + bool should_create_inline = false; + { + std::unique_lock lock(slot->mutex); + switch (slot->state) + { + case RNProxyReaderMaterializeState::Ready: + { + auto reader = std::move(slot->reader); + slot->reader.reset(); + slot->state = RNProxyReaderMaterializeState::Consumed; + return reader.value(); + } + case RNProxyReaderMaterializeState::Failed: + std::rethrow_exception(slot->exception); + case RNProxyReaderMaterializeState::Consumed: + throw Exception(ErrorCodes::LOGICAL_ERROR, "proxy reader {} is already consumed", reader_index); + case RNProxyReaderMaterializeState::Creating: + slot->cv.wait(lock, [&] { return slot->state != RNProxyReaderMaterializeState::Creating; }); + if (slot->state == RNProxyReaderMaterializeState::Ready) + { + auto reader = std::move(slot->reader); + slot->reader.reset(); + slot->state = RNProxyReaderMaterializeState::Consumed; + return reader.value(); + } + if (slot->state == RNProxyReaderMaterializeState::Failed) + std::rethrow_exception(slot->exception); + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "proxy reader {} becomes invalid after wait, state={}", + reader_index, + static_cast(slot->state)); + case RNProxyReaderMaterializeState::NotStarted: + slot->state = RNProxyReaderMaterializeState::Creating; + should_create_inline = true; + break; + } + } + + RUNTIME_CHECK(should_create_inline); + LOG_INFO( + getLog(), + "materialize proxy reader synchronously, reader_index={}, region_id={}", + 
reader_index, + reader_plans[reader_index].region_id); + try + { + auto reader = createColumnarReaderWithBackoff(reader_index); + { + auto guard = std::lock_guard(slot->mutex); + slot->state = RNProxyReaderMaterializeState::Consumed; + } + slot->cv.notify_all(); + return reader; + } + catch (...) + { + { + auto guard = std::lock_guard(slot->mutex); + slot->exception = std::current_exception(); + slot->state = RNProxyReaderMaterializeState::Failed; + } + slot->cv.notify_all(); + throw; + } +} + +void RNProxyReadTask::prefetchReader(size_t reader_index) +{ + if (reader_index >= reader_slots.size()) + return; + + std::call_once(prefetch_thread_manager_once, [&] { prefetch_thread_manager = newThreadManager(); }); + + auto slot = reader_slots[reader_index]; + { + auto guard = std::lock_guard(slot->mutex); + if (slot->state != RNProxyReaderMaterializeState::NotStarted) + return; + slot->state = RNProxyReaderMaterializeState::Creating; + } + + prefetch_thread_manager->scheduleThenDetach( + true, + "PrefetchRNProxyReader", + [self = shared_from_this(), slot, reader_index] { + LOG_INFO( + self->getLog(), + "materialize proxy reader asynchronously, reader_index={}, region_id={}", + reader_index, + self->reader_plans[reader_index].region_id); + try + { + auto reader = self->createColumnarReaderWithBackoff(reader_index); + { + auto guard = std::lock_guard(slot->mutex); + if (slot->state == RNProxyReaderMaterializeState::Consumed) + return; + slot->reader.emplace(std::move(reader)); + slot->state = RNProxyReaderMaterializeState::Ready; + } + } + catch (...) 
+ { + { + auto guard = std::lock_guard(slot->mutex); + if (slot->state == RNProxyReaderMaterializeState::Consumed) + return; + slot->exception = std::current_exception(); + slot->state = RNProxyReaderMaterializeState::Failed; + } + } + slot->cv.notify_all(); + }); +} + +std::optional RNProxyReadTask::tryAcquireReaderIndex() +{ + const size_t reader_index = next_reader_index.fetch_add(1, std::memory_order_relaxed); + if (reader_index >= reader_plans.size()) + return std::nullopt; + return reader_index; +} + BlockInputStreamPtr RNProxyReadTask::createInputStream(size_t reader_index) { RUNTIME_CHECK(reader_index < reader_plans.size()); @@ -1004,31 +1152,9 @@ std::vector RNProxyReadTask::buildProxyReadTask( } } - unsigned reader_num = all_reader_plans.size(); - if (reader_num == 0) + if (all_reader_plans.empty()) return tasks; - unsigned real_num_streams = std::min(num_streams, reader_num); - // Readers per RNProxyReadTask, it should be ceil of reader_num / real_num_streams. - unsigned readers_per_task = (reader_num + real_num_streams - 1) / real_num_streams; - unsigned reader_idx = 0; - std::vector readers; - for (auto & reader_plan : all_reader_plans) - { - ++reader_idx; - readers.push_back(std::move(reader_plan)); - if (reader_idx == readers_per_task) - { - reader_idx = 0; - tasks.push_back(std::make_shared(std::move(readers), shared_reader_context)); - readers.clear(); - } - } - - if (!readers.empty()) - { - tasks.push_back(std::make_shared(std::move(readers), shared_reader_context)); - } - + tasks.push_back(std::make_shared(std::move(all_reader_plans), shared_reader_context)); return tasks; } @@ -1048,7 +1174,7 @@ void RNProxyInputStream::ensureReader() { if (reader.has_value()) return; - reader.emplace(task->createColumnarReaderWithBackoff(reader_index)); + reader.emplace(task->getOrCreateReader(reader_index)); } RNProxyInputStream::~RNProxyInputStream() @@ -1192,11 +1318,28 @@ Block RNProxyInputStream::readImpl([[maybe_unused]] FilterPtr & res_filter, [[ma void 
RNProxySourceOp::operateSuffixImpl() { UNUSED(context); - LOG_INFO(log, "Finished reading proxy snapshots, rows={} cost={:.3f}s", total_rows, duration_read_sec); + const double total_cost_sec = total_cost_watch.elapsedSeconds(); + const UInt64 rows_per_sec + = total_cost_sec > 0 ? static_cast(static_cast(total_rows) / total_cost_sec) : 0; + const UInt64 bytes_per_sec + = total_cost_sec > 0 ? static_cast(static_cast(total_bytes) / total_cost_sec) : 0; + LOG_INFO( + log, + "Finished reading proxy snapshots, task_pool_worker_total_cost={:.3f}s claimed_streams={} rows={} " + "rows_per_sec={} " + "bytes={} bytes_per_sec={} read_cost={:.3f}s", + total_cost_sec, + total_streams, + total_rows, + rows_per_sec, + total_bytes, + bytes_per_sec, + duration_read_sec); } void RNProxySourceOp::operatePrefixImpl() { + total_cost_watch.restart(); LOG_INFO(log, "Begin reading proxy snapshots"); } @@ -1215,7 +1358,7 @@ OperatorStatus RNProxySourceOp::readImpl(Block & block) return OperatorStatus::HAS_OUTPUT; } - return current_reader_idx < 0 ? 
OperatorStatus::IO_IN : awaitImpl(); + return awaitImpl(); } OperatorStatus RNProxySourceOp::awaitImpl() @@ -1225,11 +1368,6 @@ OperatorStatus RNProxySourceOp::awaitImpl() return OperatorStatus::HAS_OUTPUT; } - if (unlikely(current_reader_idx < 0)) - { - current_reader_idx = 0; - } - return OperatorStatus::IO_IN; } @@ -1240,14 +1378,20 @@ OperatorStatus RNProxySourceOp::executeIOImpl() return OperatorStatus::HAS_OUTPUT; } - if (unlikely(current_reader_idx < 0)) + if (!current_input_stream) { - return awaitImpl(); + auto next_reader_idx = task->tryAcquireReaderIndex(); + if (!next_reader_idx.has_value()) + { + done = true; + return OperatorStatus::HAS_OUTPUT; + } + current_reader_idx = next_reader_idx; + current_input_stream = task->createInputStream(current_reader_idx.value()); + ++total_streams; + task->prefetchReader(current_reader_idx.value() + 1); } - if (!current_input_stream) - current_input_stream = task->createInputStream(static_cast(current_reader_idx)); - FilterPtr filter_ignored = nullptr; Stopwatch w{CLOCK_MONOTONIC_COARSE}; Block block = current_input_stream->read(filter_ignored, false); @@ -1255,21 +1399,14 @@ OperatorStatus RNProxySourceOp::executeIOImpl() if likely (block && block.rows() > 0) { total_rows += block.rows(); + total_bytes += block.bytes(); t_block.emplace(std::move(block)); return OperatorStatus::HAS_OUTPUT; } else { current_input_stream.reset(); - if (current_reader_idx == static_cast(task->getReaderCount() - 1)) - { - done = true; - } - else if (current_reader_idx < static_cast(task->getReaderCount() - 1)) - { - ++current_reader_idx; - } - // Current stream is drained, try to read from next stream. 
+ current_reader_idx.reset(); return awaitImpl(); } } diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.h b/dbms/src/Storages/StorageDisaggregatedColumnar.h index 80641a3401b..dd875031943 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.h +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.h @@ -35,12 +35,18 @@ #include #include +#include +#include +#include +#include +#include #include #pragma GCC diagnostic pop namespace DB { class DAGContext; +class ThreadManager; namespace DM { @@ -58,6 +64,26 @@ struct RNProxyReaderPlan std::vector> physical_table_ranges; }; +enum class RNProxyReaderMaterializeState +{ + NotStarted, + Creating, + Ready, + Failed, + Consumed, +}; + +struct RNProxyReaderSlot +{ + ~RNProxyReaderSlot(); + + std::mutex mutex; + std::condition_variable cv; + RNProxyReaderMaterializeState state = RNProxyReaderMaterializeState::NotStarted; + std::optional reader; + std::exception_ptr exception; +}; + class RNProxyReadTask; using RNProxyReadTaskPtr = std::shared_ptr; class RNProxyReadTask @@ -91,6 +117,12 @@ class RNProxyReadTask ColumnarReaderPtr createColumnarReaderWithBackoff(size_t reader_index) const; + ColumnarReaderPtr getOrCreateReader(size_t reader_index); + + void prefetchReader(size_t reader_index); + + std::optional tryAcquireReaderIndex(); + size_t getReaderCount() const; const Context & getContext() const; @@ -112,6 +144,10 @@ class RNProxyReadTask private: std::vector reader_plans; std::shared_ptr shared_reader_context; + std::vector> reader_slots; + std::atomic_size_t next_reader_index = 0; + std::once_flag prefetch_thread_manager_once; + std::shared_ptr prefetch_thread_manager; }; class RNProxyInputStream : public IProfilingBlockInputStream @@ -222,9 +258,11 @@ class RNProxySourceOp : public SourceOp const Context & context; const LoggerPtr log; RNProxyReadTaskPtr task; + UInt64 total_bytes = 0; size_t total_rows = 0; + size_t total_streams = 0; - Int32 current_reader_idx = -1; + std::optional current_reader_idx; 
BlockInputStreamPtr current_input_stream; // Temporarily store the block read from current_seg_task->stream and pass it to downstream operators in readImpl. @@ -233,7 +271,7 @@ class RNProxySourceOp : public SourceOp bool done = false; // Count the time spent waiting for segment tasks to be ready. //double duration_wait_ready_task_sec = 0; - Stopwatch wait_stop_watch{CLOCK_MONOTONIC_COARSE}; + Stopwatch total_cost_watch{CLOCK_MONOTONIC_COARSE}; // Count the time consumed by reading blocks in the stream of segment tasks. double duration_read_sec = 0; From edc2dcae13412ecbcdff01f2e7df21b38a6e94e0 Mon Sep 17 00:00:00 2001 From: yongman Date: Tue, 2 Jun 2026 22:27:13 +0800 Subject: [PATCH 03/19] add bucket log Signed-off-by: yongman --- dbms/src/Storages/StorageDisaggregatedColumnar.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp index 6a1a86cc1b0..31a46f49036 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp @@ -1074,6 +1074,7 @@ std::vector RNProxyReadTask::buildProxyReadTask( std::vector region_reader_plans; region_reader_plans.reserve(region_num); size_t total_max_reader_num = region_num; + size_t total_split_bucket_num = 0; for (const auto & [region_id, physical_table_ranges] : all_remote_regions_by_region) { RegionReaderPlan plan{ @@ -1088,6 +1089,7 @@ std::vector RNProxyReadTask::buildProxyReadTask( if (split_result.has_bucket_split && split_result.units.size() > 1) { total_max_reader_num += split_result.units.size() - 1; + total_split_bucket_num += split_result.units.size(); plan.bucket_units = std::move(split_result.units); } } @@ -1120,6 +1122,10 @@ std::vector RNProxyReadTask::buildProxyReadTask( size_t planned_reader_num = 0; for (auto reader_count : reader_count_per_region) planned_reader_num += reader_count; + if (enable_bucket_parallel) + { + LOG_INFO(log, "bucket 
parallel split bucket count={}", total_split_bucket_num); + } LOG_INFO( log, "region_num={}, table_num={}, num_streams={}, keep_order={}, bucket_parallel={}, planned_reader_num={}, " From e4fb670c994b7f5bba573e7ec4cc08be3ff21ee6 Mon Sep 17 00:00:00 2001 From: yongman Date: Tue, 2 Jun 2026 22:41:53 +0800 Subject: [PATCH 04/19] optimize bucket Signed-off-by: yongman --- .../Storages/StorageDisaggregatedColumnar.cpp | 91 ++++--------------- 1 file changed, 16 insertions(+), 75 deletions(-) diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp index 31a46f49036..c1ede7385f4 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp @@ -55,7 +55,6 @@ #include #include -#include #include #include @@ -146,48 +145,6 @@ BucketSplitResult splitRangesByBucketKeys( return result; } -void appendRangeToReaderRanges( - ProxyPhysicalTableRanges & reader_ranges, - TableID table_id, - pingcap::coprocessor::KeyRange range) -{ - auto it = std::find_if(reader_ranges.begin(), reader_ranges.end(), [&](const auto & entry) { - return std::get<0>(entry) == table_id; - }); - if (it == reader_ranges.end()) - { - reader_ranges.emplace_back(table_id, pingcap::coprocessor::KeyRanges{std::move(range)}); - return; - } - std::get<1>(*it).push_back(std::move(range)); -} - -std::vector packBucketUnitsIntoReaders( - const std::vector & units, - size_t reader_count) -{ - std::vector reader_groups; - if (units.empty() || reader_count == 0) - return reader_groups; - - reader_count = std::min(reader_count, units.size()); - reader_groups.resize(reader_count); - size_t base_unit_count = units.size() / reader_count; - size_t remainder = units.size() % reader_count; - size_t unit_index = 0; - for (size_t reader_index = 0; reader_index < reader_count; ++reader_index) - { - size_t current_unit_count = base_unit_count + (reader_index < remainder ? 
1 : 0); - auto & reader_ranges = reader_groups[reader_index]; - for (size_t i = 0; i < current_unit_count; ++i) - { - const auto & [table_id, range] = units[unit_index++]; - appendRangeToReaderRanges(reader_ranges, table_id, range); - } - } - return reader_groups; -} - std::vector getRegionBucketKeysFromProxy(const Context & context, RegionID region_id, UInt64 region_ver) { const Context & global_ctx = context.getGlobalContext(); @@ -1095,33 +1052,7 @@ std::vector RNProxyReadTask::buildProxyReadTask( } region_reader_plans.emplace_back(std::move(plan)); } - - std::vector reader_count_per_region(region_reader_plans.size(), 1); - if (enable_bucket_parallel) - { - size_t target_reader_num = std::min(total_max_reader_num, static_cast(num_streams)); - size_t extra_reader_budget = target_reader_num > region_num ? target_reader_num - region_num : 0; - while (extra_reader_budget > 0) - { - bool allocated = false; - for (size_t i = 0; i < region_reader_plans.size() && extra_reader_budget > 0; ++i) - { - const auto max_reader_count - = region_reader_plans[i].bucket_units.empty() ? 1 : region_reader_plans[i].bucket_units.size(); - if (reader_count_per_region[i] >= max_reader_count) - continue; - ++reader_count_per_region[i]; - --extra_reader_budget; - allocated = true; - } - if (!allocated) - break; - } - } - - size_t planned_reader_num = 0; - for (auto reader_count : reader_count_per_region) - planned_reader_num += reader_count; + const size_t planned_reader_num = total_max_reader_num; if (enable_bucket_parallel) { LOG_INFO(log, "bucket parallel split bucket count={}", total_split_bucket_num); @@ -1144,18 +1075,28 @@ std::vector RNProxyReadTask::buildProxyReadTask( for (size_t i = 0; i < region_reader_plans.size(); ++i) { const auto & plan = region_reader_plans[i]; - auto reader_groups = plan.bucket_units.empty() || reader_count_per_region[i] <= 1 - ? 
std::vector{plan.physical_table_ranges} - : packBucketUnitsIntoReaders(plan.bucket_units, reader_count_per_region[i]); - for (const auto & physical_table_ranges : reader_groups) + if (plan.bucket_units.empty()) { all_reader_plans.push_back(RNProxyReaderPlan{ .region_id = plan.region_id, .region_ver = plan.region_ver_id.ver, .region_conf_ver = plan.region_ver_id.conf_ver, - .physical_table_ranges = physical_table_ranges, + .physical_table_ranges = plan.physical_table_ranges, }); } + else + { + for (const auto & [table_id, range] : plan.bucket_units) + { + all_reader_plans.push_back(RNProxyReaderPlan{ + .region_id = plan.region_id, + .region_ver = plan.region_ver_id.ver, + .region_conf_ver = plan.region_ver_id.conf_ver, + .physical_table_ranges = ProxyPhysicalTableRanges{ + std::make_tuple(table_id, pingcap::coprocessor::KeyRanges{range})}, + }); + } + } } if (all_reader_plans.empty()) From d7fba6475b50751b0313cc0c18df6384a23d9f5c Mon Sep 17 00:00:00 2001 From: yongman Date: Tue, 2 Jun 2026 23:25:37 +0800 Subject: [PATCH 05/19] debug logs Signed-off-by: yongman --- .../Storages/StorageDisaggregatedColumnar.cpp | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp index c1ede7385f4..33ed4617726 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp @@ -16,6 +16,7 @@ #if ENABLE_NEXT_GEN_COLUMNAR #include #include +#include #include #include #include @@ -40,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -115,6 +117,8 @@ bool isBucketBoundaryInsideRange(const String & bucket_key, const pingcap::copro } BucketSplitResult splitRangesByBucketKeys( + const LoggerPtr & log, + RegionID region_id, const ProxyPhysicalTableRanges & physical_table_ranges, const std::vector & bucket_keys) { @@ -130,10 +134,22 @@ BucketSplitResult 
splitRangesByBucketKeys( bool current_range_split = false; for (const auto & bucket_key : bucket_keys) { - if (!isBucketBoundaryInsideRange(bucket_key, range)) + auto bucket_key_without_keyspace = TiKVKeyspaceID::removeKeyspaceID(bucket_key); + String normalized_bucket_key(bucket_key_without_keyspace); + LOG_INFO( + log, + "bucket split compare keys, region_id={}, range_start_key={}, range_end_key={}, " + "normalized_bucket_key={}", + region_id, + Redact::keyToHexString(range.start_key.data(), range.start_key.size()), + Redact::keyToHexString(range.end_key.data(), range.end_key.size()), + Redact::keyToHexString(normalized_bucket_key.data(), normalized_bucket_key.size())); + if (!isBucketBoundaryInsideRange(normalized_bucket_key, range)) continue; - result.units.emplace_back(table_id, pingcap::coprocessor::KeyRange{current_start, bucket_key}); - current_start = bucket_key; + result.units.emplace_back( + table_id, + pingcap::coprocessor::KeyRange{current_start, normalized_bucket_key}); + current_start = std::move(normalized_bucket_key); current_range_split = true; } if (!range.end_key.empty() && current_start >= range.end_key) @@ -1042,7 +1058,7 @@ std::vector RNProxyReadTask::buildProxyReadTask( if (enable_bucket_parallel) { auto bucket_keys = getRegionBucketKeysFromProxy(context, region_id, plan.region_ver_id.ver); - auto split_result = splitRangesByBucketKeys(physical_table_ranges, bucket_keys); + auto split_result = splitRangesByBucketKeys(log, region_id, physical_table_ranges, bucket_keys); if (split_result.has_bucket_split && split_result.units.size() > 1) { total_max_reader_num += split_result.units.size() - 1; From f72a9f3d3bd0c5b917ff91694d4bf4452adb026a Mon Sep 17 00:00:00 2001 From: yongman Date: Tue, 2 Jun 2026 23:33:28 +0800 Subject: [PATCH 06/19] decode bucket key Signed-off-by: yongman --- dbms/src/Storages/StorageDisaggregatedColumnar.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git 
a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp index 33ed4617726..f7a6538abcc 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include #include #include @@ -134,15 +134,17 @@ BucketSplitResult splitRangesByBucketKeys( bool current_range_split = false; for (const auto & bucket_key : bucket_keys) { - auto bucket_key_without_keyspace = TiKVKeyspaceID::removeKeyspaceID(bucket_key); - String normalized_bucket_key(bucket_key_without_keyspace); + const auto decoded_bucket_key + = RecordKVFormat::decodeTiKVKey(TiKVKey(bucket_key.data(), bucket_key.size())); + String normalized_bucket_key(decoded_bucket_key.data(), decoded_bucket_key.size()); LOG_INFO( log, "bucket split compare keys, region_id={}, range_start_key={}, range_end_key={}, " - "normalized_bucket_key={}", + "encoded_bucket_key={}, normalized_bucket_key={}", region_id, Redact::keyToHexString(range.start_key.data(), range.start_key.size()), Redact::keyToHexString(range.end_key.data(), range.end_key.size()), + Redact::keyToHexString(bucket_key.data(), bucket_key.size()), Redact::keyToHexString(normalized_bucket_key.data(), normalized_bucket_key.size())); if (!isBucketBoundaryInsideRange(normalized_bucket_key, range)) continue; From d346c8c3bb4e5632cab99cfff0400c8cd1bdb707 Mon Sep 17 00:00:00 2001 From: yongman Date: Tue, 2 Jun 2026 23:42:07 +0800 Subject: [PATCH 07/19] snapaccess cache Signed-off-by: yongman --- .../hub-runtime/src/cloud_helper.rs | 217 +++++++++++++++++- .../Storages/StorageDisaggregatedColumnar.cpp | 13 +- 2 files changed, 215 insertions(+), 15 deletions(-) diff --git a/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs b/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs index 03277c5227c..c79be0ceebe 100644 --- a/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs +++ 
b/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs @@ -16,7 +16,7 @@ use std::{ fs, ops::Deref, path::{Path, PathBuf}, - sync::Arc, + sync::{Arc, Weak}, time::{Duration, UNIX_EPOCH}, }; @@ -69,6 +69,7 @@ const BACKOFF_RETRY_COUNT: usize = 5; const SNAPSHOT_CACHE_SIZE: usize = 10240; // 10k shards const SNAPSHOT_CACHE_CAP: u64 = 1024 * 1024 * 1024; // 1GB snapshot size with memtable +const SHARED_SNAP_ACCESS_CACHE_SIZE: usize = 10240; // 10k request-local snapshot handles const SNAPSHOT_CACHE_CAPABILITY_HEADER: &str = "x-cse-snapshot-cache-version"; #[derive(Debug, Error)] @@ -201,6 +202,8 @@ pub struct CloudHelper { block_cache: BlockCache, snapshot_cache: SnapCache, snapshot_cache_capable_stores: Arc>, + shared_snap_access_cache: SharedSnapAccessCache, + shared_snap_access_loaders: Arc>>>, meta_file_cache: Arc, MetaFileCacheWeighter>>, schema_files: Arc>, runtime: Arc, @@ -248,6 +251,8 @@ impl CloudHelper { ); let snapshot_cache = SnapCache::new(SNAPSHOT_CACHE_SIZE, SNAPSHOT_CACHE_CAP); let snapshot_cache_capable_stores = Arc::new(DashMap::new()); + let shared_snap_access_cache = SharedSnapAccessCache::new(SHARED_SNAP_ACCESS_CACHE_SIZE); + let shared_snap_access_loaders = Arc::new(DashMap::new()); // Create a long-lived HTTP client for connection reuse let http_client = { @@ -267,6 +272,8 @@ impl CloudHelper { block_cache, snapshot_cache, snapshot_cache_capable_stores, + shared_snap_access_cache, + shared_snap_access_loaders, meta_file_cache, schema_files: Arc::new(DashMap::new()), runtime, @@ -382,6 +389,8 @@ impl CloudHelper { let vector_index_cache = self.vector_index_cache.clone(); let snap_cache = self.snapshot_cache.clone(); let snap_cache_capable_stores = self.snapshot_cache_capable_stores.clone(); + let shared_snap_access_cache = self.shared_snap_access_cache.clone(); + let shared_snap_access_loaders = self.shared_snap_access_loaders.clone(); let meta_file_cache = self.meta_file_cache.clone(); let columnar_file_cache = 
self.columnar_file_cache.clone(); let fts_cache = self.fts_cache.clone(); @@ -390,7 +399,9 @@ impl CloudHelper { let tables_clone = tables.clone(); let fts_query_info_clone = fts_query_info.clone(); self.runtime.spawn(async move { - let snap = request_snapshot_from_leader( + let snap = get_or_request_shared_snapshot( + shared_snap_access_cache, + shared_snap_access_loaders, pd_client, http_client, dfs, @@ -408,7 +419,7 @@ impl CloudHelper { shard_id, shard_ver, start_ts, - &tables_clone, + tables_clone, &master_key, fts_query_info_clone, ) @@ -880,3 +891,203 @@ impl quick_cache::Weighter for SnapWeighter { val.snap.len() as u64 } } + +#[derive(Clone)] +pub struct SharedSnapAccessCache { + core: Arc, +} + +impl Deref for SharedSnapAccessCache { + type Target = SharedSnapAccessCacheCore; + fn deref(&self) -> &Self::Target { + &self.core + } +} + +impl SharedSnapAccessCache { + pub fn new(size: usize) -> Self { + Self { + core: Arc::new(SharedSnapAccessCacheCore::new(size)), + } + } +} + +#[derive(Clone)] +pub struct SharedSnapAccessCacheCore { + cache: Arc< + Cache< + SharedSnapAccessKey, + Weak, + SharedSnapAccessWeighter, + DefaultHashBuilder, + >, + >, +} + +impl SharedSnapAccessCacheCore { + pub fn new(size: usize) -> Self { + let opts = quick_cache::OptionsBuilder::new() + .weight_capacity(size as u64) + .estimated_items_capacity(size) + .build() + .unwrap(); + + let cache = Arc::new(Cache::with_options( + opts, + SharedSnapAccessWeighter, + DefaultHashBuilder::default(), + DefaultLifecycle::default(), + )); + Self { cache } + } + + pub fn get(&self, key: &SharedSnapAccessKey) -> Option> { + self.cache.get(key) + } + + pub fn insert(&self, key: SharedSnapAccessKey, entry: Weak) { + self.cache.insert(key, entry); + } +} + +#[derive(Clone, Eq, PartialEq, Hash)] +pub struct SharedSnapAccessKey { + pub shard_id: u64, + pub shard_ver: u64, + pub start_ts: u64, + pub start_table_id: i64, + pub end_table_id: i64, + pub prepare_all: bool, +} + +impl SharedSnapAccessKey 
{ + pub fn new( + shard_id: u64, + shard_ver: u64, + start_ts: u64, + start_table_id: i64, + end_table_id: i64, + prepare_all: bool, + ) -> Self { + Self { + shard_id, + shard_ver, + start_ts, + start_table_id, + end_table_id, + prepare_all, + } + } +} + +#[derive(Clone)] +pub struct SharedSnapAccessWeighter; + +impl quick_cache::Weighter> + for SharedSnapAccessWeighter +{ + fn weight(&self, _key: &SharedSnapAccessKey, _val: &Weak) -> u64 { + 1 + } +} + +fn upgrade_shared_snap_access( + cache: &SharedSnapAccessCache, + key: &SharedSnapAccessKey, +) -> Option { + let core = cache.get(key)?.upgrade()?; + Some(SnapAccess { core }) +} + +async fn get_or_request_shared_snapshot( + shared_snap_access_cache: SharedSnapAccessCache, + shared_snap_access_loaders: Arc>>>, + pd_client: Arc, + http_client: security::HttpClient, + dfs: Arc, + ia_ctx: IaCtx, + vector_index_cache: VectorIndexCache, + columnar_file_cache: ColumnarFileCache, + snap_cache: SnapCache, + snap_cache_capable_stores: Arc>, + meta_file_cache: Arc, MetaFileCacheWeighter>>, + schema_files: Arc>, + txn_chunk_manager: TxnChunkManager, + block_cache: BlockCache, + fts_cache: FtsCache, + fts_delta_cache: FtsDeltaCache, + shard_id: u64, + shard_ver: u64, + start_ts: u64, + tables: Vec, + master_key: &MasterKey, + fts_query_info: tipb::FtsQueryInfo, +) -> Result { + let start_table_id = tables[0].table_id; + let end_table_id = tables[tables.len() - 1].table_id; + let prepare_all = fts_query_info.get_query_type() != tipb::FtsQueryType::FtsQueryTypeInvalid; + let key = SharedSnapAccessKey::new( + shard_id, + shard_ver, + start_ts, + start_table_id, + end_table_id, + prepare_all, + ); + + if let Some(snap) = upgrade_shared_snap_access(&shared_snap_access_cache, &key) { + info!( + "reuse shared snapaccess directly, shard_id: {}, shard_ver: {}, start_ts: {}, start_table_id: {}, end_table_id: {}", + shard_id, shard_ver, start_ts, start_table_id, end_table_id + ); + return Ok(snap); + } + + let loader = 
shared_snap_access_loaders + .entry(key.clone()) + .or_insert_with(|| Arc::new(tokio::sync::Mutex::new(()))) + .clone(); + let _guard = loader.lock().await; + + if let Some(snap) = upgrade_shared_snap_access(&shared_snap_access_cache, &key) { + info!( + "reuse shared snapaccess after wait, shard_id: {}, shard_ver: {}, start_ts: {}, start_table_id: {}, end_table_id: {}", + shard_id, shard_ver, start_ts, start_table_id, end_table_id + ); + return Ok(snap); + } + + info!( + "load shared snapaccess, shard_id: {}, shard_ver: {}, start_ts: {}, start_table_id: {}, end_table_id: {}", + shard_id, shard_ver, start_ts, start_table_id, end_table_id + ); + let snap = request_snapshot_from_leader( + pd_client, + http_client, + dfs, + ia_ctx, + vector_index_cache, + columnar_file_cache, + snap_cache, + snap_cache_capable_stores, + meta_file_cache, + schema_files, + txn_chunk_manager, + block_cache, + fts_cache, + fts_delta_cache, + shard_id, + shard_ver, + start_ts, + &tables, + master_key, + fts_query_info, + ) + .await; + + if let Ok(ref snap_access) = snap { + shared_snap_access_cache.insert(key.clone(), Arc::downgrade(&snap_access.core)); + } + shared_snap_access_loaders.remove(&key); + snap +} diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp index f7a6538abcc..acdff024450 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp @@ -117,8 +117,6 @@ bool isBucketBoundaryInsideRange(const String & bucket_key, const pingcap::copro } BucketSplitResult splitRangesByBucketKeys( - const LoggerPtr & log, - RegionID region_id, const ProxyPhysicalTableRanges & physical_table_ranges, const std::vector & bucket_keys) { @@ -137,15 +135,6 @@ BucketSplitResult splitRangesByBucketKeys( const auto decoded_bucket_key = RecordKVFormat::decodeTiKVKey(TiKVKey(bucket_key.data(), bucket_key.size())); String normalized_bucket_key(decoded_bucket_key.data(), 
decoded_bucket_key.size()); - LOG_INFO( - log, - "bucket split compare keys, region_id={}, range_start_key={}, range_end_key={}, " - "encoded_bucket_key={}, normalized_bucket_key={}", - region_id, - Redact::keyToHexString(range.start_key.data(), range.start_key.size()), - Redact::keyToHexString(range.end_key.data(), range.end_key.size()), - Redact::keyToHexString(bucket_key.data(), bucket_key.size()), - Redact::keyToHexString(normalized_bucket_key.data(), normalized_bucket_key.size())); if (!isBucketBoundaryInsideRange(normalized_bucket_key, range)) continue; result.units.emplace_back( @@ -1060,7 +1049,7 @@ std::vector RNProxyReadTask::buildProxyReadTask( if (enable_bucket_parallel) { auto bucket_keys = getRegionBucketKeysFromProxy(context, region_id, plan.region_ver_id.ver); - auto split_result = splitRangesByBucketKeys(log, region_id, physical_table_ranges, bucket_keys); + auto split_result = splitRangesByBucketKeys(physical_table_ranges, bucket_keys); if (split_result.has_bucket_split && split_result.units.size() > 1) { total_max_reader_num += split_result.units.size() - 1; From 794d9362af623eb6993a013e76dff2159845e9a6 Mon Sep 17 00:00:00 2001 From: yongman Date: Wed, 3 Jun 2026 15:58:33 +0800 Subject: [PATCH 08/19] avoid create segment reader thread when use columnar Signed-off-by: yongman --- dbms/src/Server/Server.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dbms/src/Server/Server.cpp b/dbms/src/Server/Server.cpp index 8d02a30d9ae..fa68182c0c9 100644 --- a/dbms/src/Server/Server.cpp +++ b/dbms/src/Server/Server.cpp @@ -1069,6 +1069,7 @@ try LOG_INFO(log, "Init S3 GC Manager"); global_context->getTMTContext().initS3GCManager(proxy_machine.getProxyHelper()); // Initialize the thread pool of storage before the storage engine is initialized. 
+ if (!disagg_opt.use_columnar) { LOG_INFO(log, "dt_enable_read_thread {}", global_context->getSettingsRef().dt_enable_read_thread); // `DMFileReaderPool` should be constructed before and destructed after `SegmentReaderPoolManager`. From 01fac267cd085f6bc212d16af75863382facc01b Mon Sep 17 00:00:00 2001 From: yongman Date: Fri, 5 Jun 2026 16:34:30 +0800 Subject: [PATCH 09/19] add region buckets cache Signed-off-by: yongman --- .../hub-runtime/src/cloud_helper.rs | 41 ++++++++++++++++--- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs b/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs index c79be0ceebe..8428aad56e7 100644 --- a/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs +++ b/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs @@ -48,7 +48,7 @@ use kvproto::{ coprocessor::DelegateResponse, metapb::{Peer, Store}, }; -use pd_client::PdClient; +use pd_client::{BucketStat, PdClient}; use protobuf::Message; use quick_cache::{ sync::{Cache, DefaultLifecycle}, @@ -115,11 +115,27 @@ pub struct CloudEngineBackends { pub txn_chunk_mgr: TxnChunkManager, } +#[derive(Clone)] +struct RegionBucketCacheEntry { + region_ver: u64, + keys: Vec>, +} + +impl From<&BucketStat> for RegionBucketCacheEntry { + fn from(bucket_stat: &BucketStat) -> Self { + Self { + region_ver: bucket_stat.meta.region_epoch.get_version(), + keys: bucket_stat.meta.keys.clone(), + } + } +} + #[derive(Clone)] pub struct PdClientWithCache { pd_client: Arc, store_cache: Arc>, // store_id -> Store region_cache: Arc>, // region_id -> Peer + region_bucket_cache: Arc>, // region_id -> bucket keys } impl PdClientWithCache { @@ -128,6 +144,7 @@ impl PdClientWithCache { pd_client, store_cache: Arc::new(DashMap::new()), region_cache: Arc::new(DashMap::new()), + region_bucket_cache: Arc::new(DashMap::new()), } } @@ -172,6 +189,7 @@ impl PdClientWithCache { pub fn evict_region_cache(&self, region_id: u64) { 
self.region_cache.remove(®ion_id); + self.region_bucket_cache.remove(®ion_id); } pub fn get_security_mgr(&self) -> Arc { @@ -179,13 +197,26 @@ impl PdClientWithCache { } pub fn get_region_bucket_keys(&self, region_id: u64, region_ver: u64) -> Vec> { + if let Some(bucket_entry) = self.region_bucket_cache.get(®ion_id) { + match bucket_entry.region_ver.cmp(®ion_ver) { + std::cmp::Ordering::Equal => return bucket_entry.keys.clone(), + std::cmp::Ordering::Greater => return Vec::new(), + std::cmp::Ordering::Less => {} + } + } + let Some(bucket_stat) = self.pd_client.get_buckets(region_id) else { + self.region_bucket_cache.remove(®ion_id); return Vec::new(); }; - if bucket_stat.meta.region_epoch.get_version() != region_ver { - return Vec::new(); - } - bucket_stat.meta.keys.clone() + let bucket_entry = RegionBucketCacheEntry::from(&bucket_stat); + let bucket_keys = if bucket_entry.region_ver == region_ver { + bucket_entry.keys.clone() + } else { + Vec::new() + }; + self.region_bucket_cache.insert(region_id, bucket_entry); + bucket_keys } } From 288dc829469e382d6ed1dbe59ba0b3a0eca107a4 Mon Sep 17 00:00:00 2001 From: yongman Date: Tue, 9 Jun 2026 11:33:32 +0800 Subject: [PATCH 10/19] optimize snapaccess cache for buckets read Signed-off-by: yongman --- .../ffi/src/RaftStoreProxyFFI/ProxyFFI.h | 2 + .../hub-runtime/src/cloud_helper.rs | 160 ++++++++++++------ .../hub-runtime/src/columnar_impls.rs | 9 + .../hub-runtime/src/interfaces.rs | 3 + .../hub-runtime/src/run.rs | 6 +- .../Storages/StorageDisaggregatedColumnar.cpp | 33 ++++ 6 files changed, 160 insertions(+), 53 deletions(-) diff --git a/contrib/tiflash-columnar-hub/hub-runtime/ffi/src/RaftStoreProxyFFI/ProxyFFI.h b/contrib/tiflash-columnar-hub/hub-runtime/ffi/src/RaftStoreProxyFFI/ProxyFFI.h index f4621ee0af6..96775020bee 100644 --- a/contrib/tiflash-columnar-hub/hub-runtime/ffi/src/RaftStoreProxyFFI/ProxyFFI.h +++ b/contrib/tiflash-columnar-hub/hub-runtime/ffi/src/RaftStoreProxyFFI/ProxyFFI.h @@ -230,6 +230,8 @@ 
struct CloudStorageEngineInterfaces { RawCppStringPtr (*fn_get_master_key)(RaftStoreProxyPtr); RustStrWithViewVec (*fn_get_region_bucket_keys)(uint64_t, uint64_t, RaftStoreProxyPtr); + void (*fn_clear_shared_snap_access_by_start_ts)(uint64_t, + RaftStoreProxyPtr); ColumnarReaderPtr (*fn_get_columnar_reader)(uint64_t, uint64_t, uint64_t, BaseBuffView, BaseBuffView, BaseBuffView, BaseBuffView, diff --git a/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs b/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs index 8428aad56e7..7dd60e9a7ca 100644 --- a/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs +++ b/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs @@ -69,7 +69,6 @@ const BACKOFF_RETRY_COUNT: usize = 5; const SNAPSHOT_CACHE_SIZE: usize = 10240; // 10k shards const SNAPSHOT_CACHE_CAP: u64 = 1024 * 1024 * 1024; // 1GB snapshot size with memtable -const SHARED_SNAP_ACCESS_CACHE_SIZE: usize = 10240; // 10k request-local snapshot handles const SNAPSHOT_CACHE_CAPABILITY_HEADER: &str = "x-cse-snapshot-cache-version"; #[derive(Debug, Error)] @@ -234,7 +233,6 @@ pub struct CloudHelper { snapshot_cache: SnapCache, snapshot_cache_capable_stores: Arc>, shared_snap_access_cache: SharedSnapAccessCache, - shared_snap_access_loaders: Arc>>>, meta_file_cache: Arc, MetaFileCacheWeighter>>, schema_files: Arc>, runtime: Arc, @@ -282,8 +280,7 @@ impl CloudHelper { ); let snapshot_cache = SnapCache::new(SNAPSHOT_CACHE_SIZE, SNAPSHOT_CACHE_CAP); let snapshot_cache_capable_stores = Arc::new(DashMap::new()); - let shared_snap_access_cache = SharedSnapAccessCache::new(SHARED_SNAP_ACCESS_CACHE_SIZE); - let shared_snap_access_loaders = Arc::new(DashMap::new()); + let shared_snap_access_cache = SharedSnapAccessCache::new(); // Create a long-lived HTTP client for connection reuse let http_client = { @@ -304,7 +301,6 @@ impl CloudHelper { snapshot_cache, snapshot_cache_capable_stores, shared_snap_access_cache, - shared_snap_access_loaders, 
meta_file_cache, schema_files: Arc::new(DashMap::new()), runtime, @@ -421,7 +417,6 @@ impl CloudHelper { let snap_cache = self.snapshot_cache.clone(); let snap_cache_capable_stores = self.snapshot_cache_capable_stores.clone(); let shared_snap_access_cache = self.shared_snap_access_cache.clone(); - let shared_snap_access_loaders = self.shared_snap_access_loaders.clone(); let meta_file_cache = self.meta_file_cache.clone(); let columnar_file_cache = self.columnar_file_cache.clone(); let fts_cache = self.fts_cache.clone(); @@ -432,7 +427,6 @@ impl CloudHelper { self.runtime.spawn(async move { let snap = get_or_request_shared_snapshot( shared_snap_access_cache, - shared_snap_access_loaders, pd_client, http_client, dfs, @@ -500,6 +494,20 @@ impl CloudHelper { pub fn get_region_bucket_keys(&self, region_id: u64, region_ver: u64) -> Vec> { self.pd_client.get_region_bucket_keys(region_id, region_ver) } + + pub fn clear_shared_snap_access_by_start_ts(&self, start_ts: u64) { + if start_ts == 0 { + return; + } + + let (removed_cache_entries, removed_loader_entries) = + self.shared_snap_access_cache.remove_by_start_ts(start_ts); + + info!( + "clear shared snapaccess by start_ts, start_ts: {}, removed_cache_entries: {}, removed_loader_entries: {}", + start_ts, removed_cache_entries, removed_loader_entries + ); + } } fn collect_ia_meta_files(meta_paths: &[PathBuf]) -> std::io::Result> { @@ -936,48 +944,104 @@ impl Deref for SharedSnapAccessCache { } impl SharedSnapAccessCache { - pub fn new(size: usize) -> Self { + pub fn new() -> Self { Self { - core: Arc::new(SharedSnapAccessCacheCore::new(size)), + core: Arc::new(SharedSnapAccessCacheCore::new()), } } } #[derive(Clone)] pub struct SharedSnapAccessCacheCore { - cache: Arc< - Cache< - SharedSnapAccessKey, - Weak, - SharedSnapAccessWeighter, - DefaultHashBuilder, - >, - >, + groups: Arc>>, } impl SharedSnapAccessCacheCore { - pub fn new(size: usize) -> Self { - let opts = quick_cache::OptionsBuilder::new() - .weight_capacity(size 
as u64) - .estimated_items_capacity(size) - .build() - .unwrap(); - - let cache = Arc::new(Cache::with_options( - opts, - SharedSnapAccessWeighter, - DefaultHashBuilder::default(), - DefaultLifecycle::default(), - )); - Self { cache } + pub fn new() -> Self { + Self { + groups: Arc::new(DashMap::new()), + } } pub fn get(&self, key: &SharedSnapAccessKey) -> Option> { - self.cache.get(key) + self.groups + .get(&key.start_ts)? + .entries + .get(key) + .map(|entry| entry.clone()) } pub fn insert(&self, key: SharedSnapAccessKey, entry: Weak) { - self.cache.insert(key, entry); + let group = self + .groups + .entry(key.start_ts) + .or_insert_with(|| Arc::new(SharedSnapAccessGroup::new())) + .clone(); + group.entries.insert(key, entry); + } + + pub fn get_loader(&self, key: &SharedSnapAccessKey) -> Arc> { + let group = self + .groups + .entry(key.start_ts) + .or_insert_with(|| Arc::new(SharedSnapAccessGroup::new())) + .clone(); + let loader = group + .loaders + .entry(key.clone()) + .or_insert_with(|| Arc::new(tokio::sync::Mutex::new(()))) + .clone(); + loader + } + + pub fn remove_loader(&self, key: &SharedSnapAccessKey) -> bool { + let Some(group) = self.groups.get(&key.start_ts).map(|entry| entry.clone()) else { + return false; + }; + let removed = group.loaders.remove(key).is_some(); + self.try_remove_empty_group(key.start_ts, &group); + removed + } + + pub fn remove_entry(&self, key: &SharedSnapAccessKey) -> bool { + let Some(group) = self.groups.get(&key.start_ts).map(|entry| entry.clone()) else { + return false; + }; + let removed = group.entries.remove(key).is_some(); + self.try_remove_empty_group(key.start_ts, &group); + removed + } + + pub fn remove_by_start_ts(&self, start_ts: u64) -> (usize, usize) { + let Some((_, group)) = self.groups.remove(&start_ts) else { + return (0, 0); + }; + (group.entries.len(), group.loaders.len()) + } + + fn try_remove_empty_group(&self, start_ts: u64, group: &Arc) { + if group.entries.is_empty() && group.loaders.is_empty() { + if 
let Some(entry) = self.groups.get(&start_ts) { + if Arc::ptr_eq(entry.value(), group) { + drop(entry); + let _ = self.groups.remove(&start_ts); + } + } + } + } +} + +pub struct SharedSnapAccessGroup { + entries: DashMap>, + loaders: DashMap>>, +} + +impl SharedSnapAccessGroup { + pub fn new() -> Self { + Self { + entries: DashMap::new(), + loaders: DashMap::new(), + } } } @@ -1011,28 +1075,25 @@ impl SharedSnapAccessKey { } } -#[derive(Clone)] -pub struct SharedSnapAccessWeighter; - -impl quick_cache::Weighter> - for SharedSnapAccessWeighter -{ - fn weight(&self, _key: &SharedSnapAccessKey, _val: &Weak) -> u64 { - 1 - } -} - fn upgrade_shared_snap_access( cache: &SharedSnapAccessCache, key: &SharedSnapAccessKey, ) -> Option { - let core = cache.get(key)?.upgrade()?; + let core = match cache.get(key) { + Some(core) => match core.upgrade() { + Some(core) => core, + None => { + cache.remove_entry(key); + return None; + } + }, + None => return None, + }; Some(SnapAccess { core }) } async fn get_or_request_shared_snapshot( shared_snap_access_cache: SharedSnapAccessCache, - shared_snap_access_loaders: Arc>>>, pd_client: Arc, http_client: security::HttpClient, dfs: Arc, @@ -1074,10 +1135,7 @@ async fn get_or_request_shared_snapshot( return Ok(snap); } - let loader = shared_snap_access_loaders - .entry(key.clone()) - .or_insert_with(|| Arc::new(tokio::sync::Mutex::new(()))) - .clone(); + let loader = shared_snap_access_cache.get_loader(&key); let _guard = loader.lock().await; if let Some(snap) = upgrade_shared_snap_access(&shared_snap_access_cache, &key) { @@ -1119,6 +1177,6 @@ async fn get_or_request_shared_snapshot( if let Ok(ref snap_access) = snap { shared_snap_access_cache.insert(key.clone(), Arc::downgrade(&snap_access.core)); } - shared_snap_access_loaders.remove(&key); + shared_snap_access_cache.remove_loader(&key); snap } diff --git a/contrib/tiflash-columnar-hub/hub-runtime/src/columnar_impls.rs b/contrib/tiflash-columnar-hub/hub-runtime/src/columnar_impls.rs 
index 3d063267c3f..6ba02daf5ad 100644 --- a/contrib/tiflash-columnar-hub/hub-runtime/src/columnar_impls.rs +++ b/contrib/tiflash-columnar-hub/hub-runtime/src/columnar_impls.rs @@ -89,6 +89,15 @@ pub unsafe extern "C" fn ffi_get_region_bucket_keys( } } +pub unsafe extern "C" fn ffi_clear_shared_snap_access_by_start_ts( + start_ts: u64, + hub_ptr: RaftStoreProxyPtr, +) { + let hub = hub_ptr.as_ref(); + hub.cloud_helper + .clear_shared_snap_access_by_start_ts(start_ts); +} + pub unsafe extern "C" fn ffi_make_columnar_reader( shard_id: u64, shard_ver: u64, diff --git a/contrib/tiflash-columnar-hub/hub-runtime/src/interfaces.rs b/contrib/tiflash-columnar-hub/hub-runtime/src/interfaces.rs index 5f9c9485b37..8c76a251986 100644 --- a/contrib/tiflash-columnar-hub/hub-runtime/src/interfaces.rs +++ b/contrib/tiflash-columnar-hub/hub-runtime/src/interfaces.rs @@ -362,6 +362,9 @@ pub mod root { arg3: root::DB::RaftStoreProxyPtr, ) -> root::DB::RustStrWithViewVec, >, + pub fn_clear_shared_snap_access_by_start_ts: ::std::option::Option< + unsafe extern "C" fn(arg1: u64, arg2: root::DB::RaftStoreProxyPtr), + >, pub fn_get_columnar_reader: ::std::option::Option< unsafe extern "C" fn( arg1: u64, diff --git a/contrib/tiflash-columnar-hub/hub-runtime/src/run.rs b/contrib/tiflash-columnar-hub/hub-runtime/src/run.rs index 22973d597b9..95b9777ae21 100644 --- a/contrib/tiflash-columnar-hub/hub-runtime/src/run.rs +++ b/contrib/tiflash-columnar-hub/hub-runtime/src/run.rs @@ -55,8 +55,9 @@ use tikv_util::{ use crate::{ cloud_helper::{CloudEngineBackends, CloudHelper}, columnar_impls::{ - ffi_get_region_bucket_keys, ffi_make_columnar_reader, ffi_physical_table_id, - ffi_read_block, ffi_read_column, ffi_read_handle, ffi_read_version, + ffi_clear_shared_snap_access_by_start_ts, ffi_get_region_bucket_keys, + ffi_make_columnar_reader, ffi_physical_table_id, ffi_read_block, ffi_read_column, + ffi_read_handle, ffi_read_version, }, domain_impls::ffi_gc_rust_ptr, engine_store_helper::{ @@ -1148,6 
+1149,7 @@ fn build_hub_ffi_helper(hub: &ColumnarHub) -> RaftStoreProxyFFIHelper { fn_get_keyspace_encryption: Some(ffi_get_keyspace_encryption), fn_get_master_key: Some(ffi_get_master_key), fn_get_region_bucket_keys: Some(ffi_get_region_bucket_keys), + fn_clear_shared_snap_access_by_start_ts: Some(ffi_clear_shared_snap_access_by_start_ts), fn_get_columnar_reader: Some(ffi_make_columnar_reader), fn_read_block: Some(ffi_read_block), fn_read_handle: Some(ffi_read_handle), diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp index acdff024450..4c74ad36639 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp @@ -69,6 +69,8 @@ extern const int COLUMNAR_SNAPSHOT_ERROR; struct RNProxyReaderSharedContext { + using ClearSharedSnapAccessByStartTsFn = void (*)(uint64_t, RaftStoreProxyPtr); + LoggerPtr log; const Context * context = nullptr; UInt64 start_ts = 0; @@ -81,7 +83,30 @@ struct RNProxyReaderSharedContext String table_info_data; String ann_query_info_data; String fts_query_info_data; + RaftStoreProxyPtr proxy_ptr{}; + ClearSharedSnapAccessByStartTsFn clear_shared_snap_access_by_start_ts = nullptr; std::shared_ptr output_lock = std::make_shared(); + + ~RNProxyReaderSharedContext() noexcept + { + if (start_ts == 0 || proxy_ptr.inner == nullptr || clear_shared_snap_access_by_start_ts == nullptr) + return; + + try + { + clear_shared_snap_access_by_start_ts(start_ts, proxy_ptr); + } + catch (...) + { + try + { + LOG_WARNING(log, "clear shared snapaccess cache failed, start_ts={}", start_ts); + } + catch (...) 
+ { + } + } + } }; namespace @@ -230,6 +255,14 @@ std::shared_ptr buildProxyReaderSharedContext( shared_context->start_ts = start_ts; shared_context->logical_table_id = table_scan.getLogicalTableID(); shared_context->executor_id = table_scan.getTableScanExecutorID(); + const TiFlashRaftProxyHelper * proxy_helper + = context.getGlobalContext().getSharedContextDisagg()->getColumnarProxyHelper(); + if (proxy_helper != nullptr) + { + shared_context->proxy_ptr = proxy_helper->proxy_ptr; + shared_context->clear_shared_snap_access_by_start_ts + = proxy_helper->cloud_storage_engine_interfaces.fn_clear_shared_snap_access_by_start_ts; + } std::tie(shared_context->column_defines, shared_context->extra_table_id_index) = genColumnDefinesForDisaggregatedReadThroughColumnar(table_scan); From ff71ab622f5472878e9ac5e6e7564ca69100cb9b Mon Sep 17 00:00:00 2001 From: yongman Date: Tue, 9 Jun 2026 15:32:10 +0800 Subject: [PATCH 11/19] fix legacy inputstream concurrency Signed-off-by: yongman --- .../Storages/StorageDisaggregatedColumnar.cpp | 228 ++++++++++++------ .../Storages/StorageDisaggregatedColumnar.h | 16 +- 2 files changed, 160 insertions(+), 84 deletions(-) diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp index 4c74ad36639..4aa4c42d005 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp @@ -109,6 +109,11 @@ struct RNProxyReaderSharedContext } }; +size_t getRNProxySourceNum(size_t num_streams, size_t reader_count) +{ + return std::min(std::max(1, num_streams), reader_count); +} + namespace { using ProxyPhysicalTableRanges = std::vector>; @@ -543,7 +548,7 @@ void StorageDisaggregated::readThroughColumnar( if (!read_proxy_tasks.empty()) { auto & task_pool = read_proxy_tasks.front(); - const size_t source_num = std::min(num_streams, task_pool->getReaderCount()); + const size_t source_num = task_pool->getSourceNum(); LOG_INFO( log, "use shared 
proxy reader task pool, reader_num={}, source_num={}", @@ -739,10 +744,14 @@ RNProxyReaderSlot::~RNProxyReaderSlot() RNProxyReadTask::RNProxyReadTask( std::vector reader_plans_, + size_t source_num_, std::shared_ptr shared_reader_context_) : reader_plans(std::move(reader_plans_)) + , source_num(source_num_) , shared_reader_context(std::move(shared_reader_context_)) { + RUNTIME_CHECK(source_num > 0); + RUNTIME_CHECK(source_num <= reader_plans.size(), source_num, reader_plans.size()); reader_slots.reserve(reader_plans.size()); for (size_t i = 0; i < reader_plans.size(); ++i) reader_slots.emplace_back(std::make_shared()); @@ -753,6 +762,11 @@ size_t RNProxyReadTask::getReaderCount() const return reader_plans.size(); } +size_t RNProxyReadTask::getSourceNum() const +{ + return source_num; +} + const Context & RNProxyReadTask::getContext() const { return *shared_reader_context->context; @@ -957,6 +971,20 @@ BlockInputStreamPtr RNProxyReadTask::createInputStream(size_t reader_index) }); } +BlockInputStreamPtr RNProxyReadTask::createSharedInputStream() +{ + return RNProxyInputStream::create({ + .context = getContext(), + .log = getLog(), + .task = shared_from_this(), + .reader_index = std::nullopt, + .columns_to_read = getColumnsToRead(), + .extra_table_id_index = getExtraTableIDIndex(), + .table_id = getLogicalTableID(), + .executor_id = getExecutorID(), + }); +} + std::vector RNProxyReadTask::buildProxyReadTaskWithBackoff( const LoggerPtr & log, const Context & context, @@ -1141,27 +1169,53 @@ std::vector RNProxyReadTask::buildProxyReadTask( if (all_reader_plans.empty()) return tasks; - tasks.push_back(std::make_shared(std::move(all_reader_plans), shared_reader_context)); + tasks.push_back(std::make_shared( + std::move(all_reader_plans), + getRNProxySourceNum(num_streams, planned_reader_num), + shared_reader_context)); return tasks; } BlockInputStreams RNProxyReadTask::getInputStreams() { BlockInputStreams streams; - streams.reserve(reader_plans.size()); - for (size_t 
reader_index = 0; reader_index < reader_plans.size(); ++reader_index) + streams.reserve(source_num); + for (size_t worker_index = 0; worker_index < source_num; ++worker_index) { - streams.push_back(createInputStream(reader_index)); + streams.push_back(createSharedInputStream()); } return streams; } // RNProxyInputStream -void RNProxyInputStream::ensureReader() +bool RNProxyInputStream::ensureReader() { if (reader.has_value()) - return; - reader.emplace(task->getOrCreateReader(reader_index)); + return true; + + if (fixed_reader_index.has_value()) + { + current_reader_index = fixed_reader_index; + reader.emplace(task->getOrCreateReader(fixed_reader_index.value())); + return true; + } + + auto next_reader_index = task->tryAcquireReaderIndex(); + if (!next_reader_index.has_value()) + return false; + + current_reader_index = next_reader_index; + reader.emplace(task->getOrCreateReader(next_reader_index.value())); + task->prefetchReader(next_reader_index.value() + 1); + return true; +} + +void RNProxyInputStream::releaseReader() +{ + if (reader.has_value() && reader->inner.ptr != nullptr) + RustGcHelper::instance().gcRustPtr(reader->inner.ptr, reader->inner.type); + reader.reset(); + current_reader_index.reset(); } RNProxyInputStream::~RNProxyInputStream() @@ -1213,92 +1267,106 @@ Block RNProxyInputStream::readImpl([[maybe_unused]] FilterPtr & res_filter, [[ma { if (done) return {}; - ensureReader(); const Context & global_ctx = context.getGlobalContext(); const TiFlashRaftProxyHelper * proxy_helper = global_ctx.getSharedContextDisagg()->getColumnarProxyHelper(); RUNTIME_CHECK_MSG(proxy_helper != nullptr, "columnar proxy helper is not initialized"); - Stopwatch w{CLOCK_MONOTONIC_COARSE}; - UInt64 rows = proxy_helper->cloud_storage_engine_interfaces.fn_read_block(reader.value(), batch_size); - duration_read_sec += w.elapsedSecondsFromLastTime(); - LOG_DEBUG(log, "Read {} rows from proxy", rows); - if (rows == std::numeric_limits::max()) - { - LOG_WARNING(log, "Read block 
from proxy failed"); - throw Exception("read_block failed in tiflash-proxy", ErrorCodes::LOGICAL_ERROR); - } - if (rows == 0) - { - done = true; - return {}; - } - TableID physical_table_id = -1; - Block header = getHeader(); - const ColumnsWithTypeAndName & col_type_and_name = header.getColumnsWithTypeAndName(); - // Construct block from proxy column data. - MutableColumns columns = header.cloneEmptyColumns(); - for (UInt32 i = 0; i < col_type_and_name.size(); ++i) + while (true) { - LOG_DEBUG( - log, - "Read column id={} name={} type={}", - col_type_and_name[i].column_id, - col_type_and_name[i].name, - col_type_and_name[i].type->getName()); - // Read column data from proxy - Int64 col_id = col_type_and_name[i].column_id; - if (col_id == MutSup::extra_handle_id) + if (!ensureReader()) { - RustStrWithView col_data = proxy_helper->cloud_storage_engine_interfaces.fn_read_handle(reader.value()); - SCOPE_EXIT({ RustGcHelper::instance().gcRustPtr(col_data.inner.ptr, col_data.inner.type); }); - physical_table_id = proxy_helper->cloud_storage_engine_interfaces.fn_physical_table_id(reader.value()); - ReadBufferFromMemory buf(col_data.buff.data, static_cast(col_data.buff.len)); - auto & col = *columns[i]; - col_type_and_name[i].type->deserializeBinaryBulkWithMultipleStreams( - col, - [&](const IDataType::SubstreamPath &) { return &buf; }, - rows, - -1.0, // avg_value_size_hint set to -1 to indicate Decimal format from proxy - true, - {}); + done = true; + return {}; } - else if (col_id == MutSup::extra_table_id_col_id) + + Stopwatch w{CLOCK_MONOTONIC_COARSE}; + UInt64 rows = proxy_helper->cloud_storage_engine_interfaces.fn_read_block(reader.value(), batch_size); + duration_read_sec += w.elapsedSecondsFromLastTime(); + LOG_DEBUG(log, "Read {} rows from proxy", rows); + if (rows == std::numeric_limits::max()) + { + LOG_WARNING(log, "Read block from proxy failed"); + throw Exception("read_block failed in tiflash-proxy", ErrorCodes::LOGICAL_ERROR); + } + if (rows == 0) { + 
releaseReader(); + if (fixed_reader_index.has_value()) + { + done = true; + return {}; + } continue; } - else + + TableID physical_table_id = -1; + Block header = getHeader(); + const ColumnsWithTypeAndName & col_type_and_name = header.getColumnsWithTypeAndName(); + // Construct block from proxy column data. + MutableColumns columns = header.cloneEmptyColumns(); + for (UInt32 i = 0; i < col_type_and_name.size(); ++i) { - RustStrWithView col_data - = proxy_helper->cloud_storage_engine_interfaces.fn_read_column(reader.value(), col_id); - SCOPE_EXIT({ RustGcHelper::instance().gcRustPtr(col_data.inner.ptr, col_data.inner.type); }); - physical_table_id = proxy_helper->cloud_storage_engine_interfaces.fn_physical_table_id(reader.value()); - ReadBufferFromMemory buf(col_data.buff.data, static_cast(col_data.buff.len)); - auto & col = *columns[i]; - col_type_and_name[i].type->deserializeBinaryBulkWithMultipleStreams( - col, - [&](const IDataType::SubstreamPath &) { return &buf; }, - rows, - -1.0, // avg_value_size_hint set to -1 to indicate Decimal format from proxy - true, - {}); - LOG_DEBUG(log, "Read column data done, col size={}", col.size()); + LOG_DEBUG( + log, + "Read column id={} name={} type={}", + col_type_and_name[i].column_id, + col_type_and_name[i].name, + col_type_and_name[i].type->getName()); + // Read column data from proxy + Int64 col_id = col_type_and_name[i].column_id; + if (col_id == MutSup::extra_handle_id) + { + RustStrWithView col_data = proxy_helper->cloud_storage_engine_interfaces.fn_read_handle(reader.value()); + SCOPE_EXIT({ RustGcHelper::instance().gcRustPtr(col_data.inner.ptr, col_data.inner.type); }); + physical_table_id = proxy_helper->cloud_storage_engine_interfaces.fn_physical_table_id(reader.value()); + ReadBufferFromMemory buf(col_data.buff.data, static_cast(col_data.buff.len)); + auto & col = *columns[i]; + col_type_and_name[i].type->deserializeBinaryBulkWithMultipleStreams( + col, + [&](const IDataType::SubstreamPath &) { return &buf; }, 
+ rows, + -1.0, // avg_value_size_hint set to -1 to indicate Decimal format from proxy + true, + {}); + } + else if (col_id == MutSup::extra_table_id_col_id) + { + continue; + } + else + { + RustStrWithView col_data + = proxy_helper->cloud_storage_engine_interfaces.fn_read_column(reader.value(), col_id); + SCOPE_EXIT({ RustGcHelper::instance().gcRustPtr(col_data.inner.ptr, col_data.inner.type); }); + physical_table_id = proxy_helper->cloud_storage_engine_interfaces.fn_physical_table_id(reader.value()); + ReadBufferFromMemory buf(col_data.buff.data, static_cast(col_data.buff.len)); + auto & col = *columns[i]; + col_type_and_name[i].type->deserializeBinaryBulkWithMultipleStreams( + col, + [&](const IDataType::SubstreamPath &) { return &buf; }, + rows, + -1.0, // avg_value_size_hint set to -1 to indicate Decimal format from proxy + true, + {}); + LOG_DEBUG(log, "Read column data done, col size={}", col.size()); + } } - } - duration_deserialize_sec += w.elapsedSecondsFromLastTime(); + duration_deserialize_sec += w.elapsedSecondsFromLastTime(); - Block block = header.cloneWithColumns(std::move(columns)); - LOG_DEBUG(log, "Read block rows={}, structure={}", block.rows(), block.dumpStructure()); - if (physical_table_id == -1) - { - LOG_WARNING(log, "physical_table_id is not set, use table_id {} instead", table_id); - physical_table_id = table_id; - } - // Fill extra table id column. - action.fill(block, physical_table_id); - block.checkNumberOfRows(); + Block block = header.cloneWithColumns(std::move(columns)); + LOG_DEBUG(log, "Read block rows={}, structure={}", block.rows(), block.dumpStructure()); + if (physical_table_id == -1) + { + LOG_WARNING(log, "physical_table_id is not set, use table_id {} instead", table_id); + physical_table_id = table_id; + } + // Fill extra table id column. 
+ action.fill(block, physical_table_id); + block.checkNumberOfRows(); - total_bytes += block.bytes(); - return block; + total_bytes += block.bytes(); + return block; + } } // RNProxySourceOp diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.h b/dbms/src/Storages/StorageDisaggregatedColumnar.h index dd875031943..ae645d1cb4c 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.h +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.h @@ -113,6 +113,8 @@ class RNProxyReadTask BlockInputStreams getInputStreams(); + BlockInputStreamPtr createSharedInputStream(); + BlockInputStreamPtr createInputStream(size_t reader_index); ColumnarReaderPtr createColumnarReaderWithBackoff(size_t reader_index) const; @@ -125,6 +127,8 @@ class RNProxyReadTask size_t getReaderCount() const; + size_t getSourceNum() const; + const Context & getContext() const; const LoggerPtr & getLog() const; @@ -139,10 +143,12 @@ class RNProxyReadTask RNProxyReadTask( std::vector reader_plans, + size_t source_num, std::shared_ptr shared_reader_context); private: std::vector reader_plans; + size_t source_num; std::shared_ptr shared_reader_context; std::vector> reader_slots; std::atomic_size_t next_reader_index = 0; @@ -172,7 +178,7 @@ class RNProxyInputStream : public IProfilingBlockInputStream const Context & context; LoggerPtr log; RNProxyReadTaskPtr task; - size_t reader_index; + std::optional reader_index; const DM::ColumnDefines & columns_to_read; int extra_table_id_index; TableID table_id; @@ -183,7 +189,7 @@ class RNProxyInputStream : public IProfilingBlockInputStream : context(options.context) , log(options.log) , task(options.task) - , reader_index(options.reader_index) + , fixed_reader_index(options.reader_index) , action(options.columns_to_read, options.extra_table_id_index) , table_id(options.table_id) , executor_id(options.executor_id) @@ -195,12 +201,14 @@ class RNProxyInputStream : public IProfilingBlockInputStream static BlockInputStreamPtr create(const Options & options) { 
return std::make_shared(options); } private: - void ensureReader(); + bool ensureReader(); + void releaseReader(); const Context & context; const LoggerPtr log; RNProxyReadTaskPtr task; - size_t reader_index; + const std::optional fixed_reader_index; + std::optional current_reader_index; std::optional reader; AddExtraTableIDColumnTransformAction action; TableID table_id; From e4699f7d794e5a5ea28243de3b928058bd35bfb7 Mon Sep 17 00:00:00 2001 From: yongman Date: Tue, 9 Jun 2026 16:03:13 +0800 Subject: [PATCH 12/19] avoid inflight request leak snapaccess Signed-off-by: yongman --- .../hub-runtime/src/cloud_helper.rs | 121 ++++++++++++++---- 1 file changed, 97 insertions(+), 24 deletions(-) diff --git a/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs b/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs index 7dd60e9a7ca..a8d3f3c7fc7 100644 --- a/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs +++ b/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs @@ -16,7 +16,10 @@ use std::{ fs, ops::Deref, path::{Path, PathBuf}, - sync::{Arc, Weak}, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, Mutex, Weak, + }, time::{Duration, UNIX_EPOCH}, }; @@ -500,12 +503,12 @@ impl CloudHelper { return; } - let (removed_cache_entries, removed_loader_entries) = + let (cleared_cache_entries, in_flight_loader_entries) = self.shared_snap_access_cache.remove_by_start_ts(start_ts); info!( - "clear shared snapaccess by start_ts, start_ts: {}, removed_cache_entries: {}, removed_loader_entries: {}", - start_ts, removed_cache_entries, removed_loader_entries + "clear shared snapaccess by start_ts, start_ts: {}, cleared_cache_entries: {}, in_flight_loader_entries: {}", + start_ts, cleared_cache_entries, in_flight_loader_entries ); } } @@ -964,41 +967,51 @@ impl SharedSnapAccessCacheCore { } pub fn get(&self, key: &SharedSnapAccessKey) -> Option> { - self.groups - .get(&key.start_ts)? 
- .entries - .get(key) - .map(|entry| entry.clone()) + let group = self.groups.get(&key.start_ts).map(|entry| entry.clone())?; + let _state_guard = group.state_lock.lock().unwrap(); + if group.is_terminal() { + return None; + } + group.entries.get(key).map(|entry| entry.clone()) } pub fn insert(&self, key: SharedSnapAccessKey, entry: Weak) { - let group = self - .groups - .entry(key.start_ts) - .or_insert_with(|| Arc::new(SharedSnapAccessGroup::new())) - .clone(); + let Some(group) = self.groups.get(&key.start_ts).map(|entry| entry.clone()) else { + return; + }; + let _state_guard = group.state_lock.lock().unwrap(); + if group.is_terminal() { + return; + } group.entries.insert(key, entry); } - pub fn get_loader(&self, key: &SharedSnapAccessKey) -> Arc> { - let group = self - .groups - .entry(key.start_ts) - .or_insert_with(|| Arc::new(SharedSnapAccessGroup::new())) - .clone(); + pub fn get_loader(&self, key: &SharedSnapAccessKey) -> Option>> { + let group = match self.groups.entry(key.start_ts) { + dashmap::mapref::entry::Entry::Occupied(entry) => entry.get().clone(), + dashmap::mapref::entry::Entry::Vacant(entry) => { + entry.insert(Arc::new(SharedSnapAccessGroup::new())).clone() + } + }; + let _state_guard = group.state_lock.lock().unwrap(); + if group.is_terminal() { + return None; + } let loader = group .loaders .entry(key.clone()) .or_insert_with(|| Arc::new(tokio::sync::Mutex::new(()))) .clone(); - loader + Some(loader) } pub fn remove_loader(&self, key: &SharedSnapAccessKey) -> bool { let Some(group) = self.groups.get(&key.start_ts).map(|entry| entry.clone()) else { return false; }; + let _state_guard = group.state_lock.lock().unwrap(); let removed = group.loaders.remove(key).is_some(); + drop(_state_guard); self.try_remove_empty_group(key.start_ts, &group); removed } @@ -1007,19 +1020,29 @@ impl SharedSnapAccessCacheCore { let Some(group) = self.groups.get(&key.start_ts).map(|entry| entry.clone()) else { return false; }; + let _state_guard = 
group.state_lock.lock().unwrap(); let removed = group.entries.remove(key).is_some(); + drop(_state_guard); self.try_remove_empty_group(key.start_ts, &group); removed } pub fn remove_by_start_ts(&self, start_ts: u64) -> (usize, usize) { - let Some((_, group)) = self.groups.remove(&start_ts) else { + let Some(group) = self.groups.get(&start_ts).map(|entry| entry.clone()) else { return (0, 0); }; - (group.entries.len(), group.loaders.len()) + let _state_guard = group.state_lock.lock().unwrap(); + group.mark_terminal(); + let removed_entries = group.entries.len(); + let in_flight_loaders = group.loaders.len(); + group.entries.clear(); + drop(_state_guard); + self.try_remove_empty_group(start_ts, &group); + (removed_entries, in_flight_loaders) } fn try_remove_empty_group(&self, start_ts: u64, group: &Arc) { + let _state_guard = group.state_lock.lock().unwrap(); if group.entries.is_empty() && group.loaders.is_empty() { if let Some(entry) = self.groups.get(&start_ts) { if Arc::ptr_eq(entry.value(), group) { @@ -1034,6 +1057,8 @@ impl SharedSnapAccessCacheCore { pub struct SharedSnapAccessGroup { entries: DashMap>, loaders: DashMap>>, + terminal: AtomicBool, + state_lock: Mutex<()>, } impl SharedSnapAccessGroup { @@ -1041,8 +1066,18 @@ impl SharedSnapAccessGroup { Self { entries: DashMap::new(), loaders: DashMap::new(), + terminal: AtomicBool::new(false), + state_lock: Mutex::new(()), } } + + fn is_terminal(&self) -> bool { + self.terminal.load(Ordering::Acquire) + } + + fn mark_terminal(&self) { + self.terminal.store(true, Ordering::Release); + } } #[derive(Clone, Eq, PartialEq, Hash)] @@ -1135,7 +1170,9 @@ async fn get_or_request_shared_snapshot( return Ok(snap); } - let loader = shared_snap_access_cache.get_loader(&key); + let Some(loader) = shared_snap_access_cache.get_loader(&key) else { + return Err(format!("shared snapaccess evicted, start_ts: {}", start_ts).into()); + }; let _guard = loader.lock().await; if let Some(snap) = 
upgrade_shared_snap_access(&shared_snap_access_cache, &key) { @@ -1180,3 +1217,39 @@ async fn get_or_request_shared_snapshot( shared_snap_access_cache.remove_loader(&key); snap } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn shared_snap_access_eviction_is_sticky_for_in_flight_loader() { + let cache = SharedSnapAccessCache::new(); + let key = SharedSnapAccessKey::new(1, 2, 3, 4, 5, false); + + let loader = cache + .get_loader(&key) + .expect("active group should create loader"); + cache.insert(key.clone(), Weak::new()); + assert!(cache + .groups + .get(&key.start_ts) + .is_some_and(|group| group.entries.contains_key(&key))); + + let (removed_entries, in_flight_loaders) = cache.remove_by_start_ts(key.start_ts); + assert_eq!(removed_entries, 1); + assert_eq!(in_flight_loaders, 1); + assert!(cache.get(&key).is_none()); + assert!(cache.get_loader(&key).is_none()); + + cache.insert(key.clone(), Weak::new()); + assert!(cache + .groups + .get(&key.start_ts) + .is_some_and(|group| group.entries.is_empty())); + + drop(loader); + assert!(cache.remove_loader(&key)); + assert!(cache.groups.get(&key.start_ts).is_none()); + } +} From 6bfcdb984e8711cff4c071214736c7d9902724e5 Mon Sep 17 00:00:00 2001 From: yongman Date: Tue, 9 Jun 2026 17:30:10 +0800 Subject: [PATCH 13/19] clear snap access in last owner drop Signed-off-by: yongman --- .../Storages/StorageDisaggregatedColumnar.cpp | 80 +++++++++++++++---- 1 file changed, 65 insertions(+), 15 deletions(-) diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp index 4aa4c42d005..278ba86377a 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp @@ -59,6 +59,7 @@ #include #include +#include namespace DB { @@ -71,6 +72,50 @@ struct RNProxyReaderSharedContext { using ClearSharedSnapAccessByStartTsFn = void (*)(uint64_t, RaftStoreProxyPtr); + struct StartTsClearRegistry + { + enum class 
UnregisterResult + { + NotRegistered, + NotLastOwner, + LastOwner, + }; + + std::mutex mutex; + std::unordered_map ref_counts; + + void registerStartTs(UInt64 start_ts) + { + if (start_ts == 0) + return; + auto guard = std::lock_guard(mutex); + ++ref_counts[start_ts]; + } + + UnregisterResult unregisterStartTs(UInt64 start_ts) + { + if (start_ts == 0) + return UnregisterResult::NotRegistered; + + auto guard = std::lock_guard(mutex); + auto it = ref_counts.find(start_ts); + if (it == ref_counts.end() || it->second == 0) + return UnregisterResult::NotRegistered; + --it->second; + if (it->second != 0) + return UnregisterResult::NotLastOwner; + + ref_counts.erase(it); + return UnregisterResult::LastOwner; + } + }; + + static StartTsClearRegistry & getStartTsClearRegistry() + { + static StartTsClearRegistry registry; + return registry; + } + LoggerPtr log; const Context * context = nullptr; UInt64 start_ts = 0; @@ -86,10 +131,18 @@ struct RNProxyReaderSharedContext RaftStoreProxyPtr proxy_ptr{}; ClearSharedSnapAccessByStartTsFn clear_shared_snap_access_by_start_ts = nullptr; std::shared_ptr output_lock = std::make_shared(); + bool registered_for_start_ts = false; ~RNProxyReaderSharedContext() noexcept { - if (start_ts == 0 || proxy_ptr.inner == nullptr || clear_shared_snap_access_by_start_ts == nullptr) + if (!registered_for_start_ts) + return; + + auto unregister_result = getStartTsClearRegistry().unregisterStartTs(start_ts); + if (unregister_result != StartTsClearRegistry::UnregisterResult::LastOwner) + return; + + if (proxy_ptr.inner == nullptr || clear_shared_snap_access_by_start_ts == nullptr) return; try @@ -98,13 +151,7 @@ struct RNProxyReaderSharedContext } catch (...) { - try - { - LOG_WARNING(log, "clear shared snapaccess cache failed, start_ts={}", start_ts); - } - catch (...) 
- { - } + LOG_WARNING(log, "clear shared snapaccess cache failed, start_ts={}", start_ts); } } }; @@ -258,6 +305,8 @@ std::shared_ptr buildProxyReaderSharedContext( shared_context->log = log; shared_context->context = &context; shared_context->start_ts = start_ts; + RNProxyReaderSharedContext::getStartTsClearRegistry().registerStartTs(start_ts); + shared_context->registered_for_start_ts = true; shared_context->logical_table_id = table_scan.getLogicalTableID(); shared_context->executor_id = table_scan.getTableScanExecutorID(); const TiFlashRaftProxyHelper * proxy_helper @@ -1156,13 +1205,14 @@ std::vector RNProxyReadTask::buildProxyReadTask( { for (const auto & [table_id, range] : plan.bucket_units) { - all_reader_plans.push_back(RNProxyReaderPlan{ - .region_id = plan.region_id, - .region_ver = plan.region_ver_id.ver, - .region_conf_ver = plan.region_ver_id.conf_ver, - .physical_table_ranges = ProxyPhysicalTableRanges{ - std::make_tuple(table_id, pingcap::coprocessor::KeyRanges{range})}, - }); + all_reader_plans.push_back( + RNProxyReaderPlan{ + .region_id = plan.region_id, + .region_ver = plan.region_ver_id.ver, + .region_conf_ver = plan.region_ver_id.conf_ver, + .physical_table_ranges + = ProxyPhysicalTableRanges{std::make_tuple(table_id, pingcap::coprocessor::KeyRanges{range})}, + }); } } } From 44b0350ba2bb9279b62e78574da4961a0f7e9e71 Mon Sep 17 00:00:00 2001 From: yongman Date: Tue, 9 Jun 2026 17:54:05 +0800 Subject: [PATCH 14/19] format Signed-off-by: yongman --- .../src/Storages/StorageDisaggregatedColumnar.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp index 278ba86377a..34d88a20356 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp @@ -1205,14 +1205,13 @@ std::vector RNProxyReadTask::buildProxyReadTask( { for (const auto & [table_id, range] : 
plan.bucket_units) { - all_reader_plans.push_back( - RNProxyReaderPlan{ - .region_id = plan.region_id, - .region_ver = plan.region_ver_id.ver, - .region_conf_ver = plan.region_ver_id.conf_ver, - .physical_table_ranges - = ProxyPhysicalTableRanges{std::make_tuple(table_id, pingcap::coprocessor::KeyRanges{range})}, - }); + all_reader_plans.push_back(RNProxyReaderPlan{ + .region_id = plan.region_id, + .region_ver = plan.region_ver_id.ver, + .region_conf_ver = plan.region_ver_id.conf_ver, + .physical_table_ranges + = ProxyPhysicalTableRanges{std::make_tuple(table_id, pingcap::coprocessor::KeyRanges{range})}, + }); } } } From 18bc6212b1f6ec55e3d76ee1942f7bf8b5ee4a4a Mon Sep 17 00:00:00 2001 From: yongman Date: Wed, 10 Jun 2026 16:12:37 +0800 Subject: [PATCH 15/19] fix epoch not match retry Signed-off-by: yongman --- .../hub-runtime/src/cloud_helper.rs | 2 +- .../Storages/StorageDisaggregatedColumnar.cpp | 416 +++++++++++------- .../Storages/StorageDisaggregatedColumnar.h | 69 +-- 3 files changed, 289 insertions(+), 198 deletions(-) diff --git a/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs b/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs index a8d3f3c7fc7..babece96d2b 100644 --- a/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs +++ b/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs @@ -713,7 +713,7 @@ async fn request_snapshot_from_leader( continue; } if delegate_resp.get_region_error().has_epoch_not_match() { - // Return epoch not match error to TiDB to retry. + // Return epoch not match error to caller to retry new plan. 
error!( "{} request_snapshot_from_leader failed, epoch not match, {:?}", tag, diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp index 34d88a20356..263b7ce2d09 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -252,6 +251,79 @@ std::vector getRegionBucketKeysFromProxy(const Context & context, Region return res; } +std::vector buildRegionReaderPlansFromPhysicalTableRanges( + const LoggerPtr & log, + const Context & context, + const ProxyPhysicalTableRanges & physical_table_ranges) +{ + std::vector region_reader_plans; + if (physical_table_ranges.empty()) + return region_reader_plans; + + pingcap::kv::Cluster * cluster = context.getTMTContext().getKVCluster(); + pingcap::kv::Backoffer bo(pingcap::kv::copBuildTaskMaxBackoff); + auto & region_cache = cluster->region_cache; + + std::unordered_map plan_index_by_region_id; + region_reader_plans.reserve(physical_table_ranges.size()); + + for (const auto & [physical_table_id, ranges] : physical_table_ranges) + { + const auto locations = pingcap::coprocessor::details::splitKeyRangesByLocations(region_cache, bo, ranges); + for (const auto & location : locations) + { + const auto & region = location.location.region; + auto it = plan_index_by_region_id.find(region.id); + if (it == plan_index_by_region_id.end()) + { + plan_index_by_region_id.emplace(region.id, region_reader_plans.size()); + region_reader_plans.push_back(RegionReaderPlan{ + .region_id = region.id, + .region_ver_id = region, + .physical_table_ranges + = ProxyPhysicalTableRanges{std::make_tuple(physical_table_id, location.ranges)}, + }); + continue; + } + + auto & plan = region_reader_plans[it->second]; + if (plan.region_ver_id != region) + { + region_cache->dropRegion(plan.region_ver_id); + region_cache->dropRegion(region); + LOG_WARNING( + log, 
+ "buildProxyReadTask failed region_id={}, epoch not match {}", + region.id, + region.toString()); + throw RegionException( + RegionException::UnavailableRegions{region.id}, + RegionException::RegionReadStatus::EPOCH_NOT_MATCH, + region.toString().c_str()); + } + plan.physical_table_ranges.push_back(std::make_tuple(physical_table_id, location.ranges)); + } + } + + return region_reader_plans; +} + +std::vector buildReaderPlansFromRegionReaderPlans(const std::vector & region_reader_plans) +{ + std::vector reader_plans; + reader_plans.reserve(region_reader_plans.size()); + for (const auto & plan : region_reader_plans) + { + reader_plans.push_back(RNProxyReaderPlan{ + .region_id = plan.region_id, + .region_ver = plan.region_ver_id.ver, + .region_conf_ver = plan.region_ver_id.conf_ver, + .physical_table_ranges = plan.physical_table_ranges, + }); + } + return reader_plans; +} + std::vector> genGeneratedColumnInfosForDisaggregatedRead( const TiDBTableScan & table_scan) { @@ -785,30 +857,29 @@ ColumnarReaderPtr createProxyColumnarReader( } // RNProxyReadTask -RNProxyReaderSlot::~RNProxyReaderSlot() +RNProxyReaderWork::~RNProxyReaderWork() { if (reader.has_value() && reader->inner.ptr != nullptr) RustGcHelper::instance().gcRustPtr(reader->inner.ptr, reader->inner.type); } RNProxyReadTask::RNProxyReadTask( - std::vector reader_plans_, + std::vector reader_plans, size_t source_num_, std::shared_ptr shared_reader_context_) - : reader_plans(std::move(reader_plans_)) + : reader_count(reader_plans.size()) , source_num(source_num_) , shared_reader_context(std::move(shared_reader_context_)) { RUNTIME_CHECK(source_num > 0); - RUNTIME_CHECK(source_num <= reader_plans.size(), source_num, reader_plans.size()); - reader_slots.reserve(reader_plans.size()); - for (size_t i = 0; i < reader_plans.size(); ++i) - reader_slots.emplace_back(std::make_shared()); + RUNTIME_CHECK(source_num <= reader_count, source_num, reader_count); + for (auto & reader_plan : reader_plans) + 
pending_reader_works.push_back(std::make_shared(std::move(reader_plan))); } size_t RNProxyReadTask::getReaderCount() const { - return reader_plans.size(); + return reader_count; } size_t RNProxyReadTask::getSourceNum() const @@ -846,15 +917,40 @@ const String & RNProxyReadTask::getExecutorID() const return shared_reader_context->executor_id; } -ColumnarReaderPtr RNProxyReadTask::createColumnarReaderWithBackoff(size_t reader_index) const +void RNProxyReadTask::replaceReaderWork( + const RNProxyReaderWorkPtr & reader_work, + std::vector replanned_reader_plans) +{ + RUNTIME_CHECK(reader_work != nullptr); + RUNTIME_CHECK(!replanned_reader_plans.empty()); + + reader_work->plan = std::move(replanned_reader_plans.front()); + if (replanned_reader_plans.size() == 1) + return; + + auto queue_guard = std::lock_guard(pending_reader_works_mutex); + for (auto it = replanned_reader_plans.rbegin(); it != replanned_reader_plans.rend() - 1; ++it) + pending_reader_works.push_front(std::make_shared(*it)); +} + +#ifdef DBMS_PUBLIC_GTEST +void RNProxyReadTask::replaceReaderWorkForTest( + const RNProxyReaderWorkPtr & reader_work, + std::vector replanned_reader_plans) { - RUNTIME_CHECK(reader_index < reader_plans.size()); - const auto & reader_plan = reader_plans[reader_index]; + replaceReaderWork(reader_work, std::move(replanned_reader_plans)); +} +#endif + +ColumnarReaderPtr RNProxyReadTask::createColumnarReaderWithBackoff(const RNProxyReaderWorkPtr & reader_work) +{ + RUNTIME_CHECK(reader_work != nullptr); pingcap::kv::Backoffer bo(pingcap::kv::copNextMaxBackoff); while (true) { try { + const auto & reader_plan = reader_work->plan; LOG_INFO( getLog(), "materialize proxy reader for tables in region, region_id={}, table_num={}", @@ -864,6 +960,30 @@ ColumnarReaderPtr RNProxyReadTask::createColumnarReaderWithBackoff(size_t reader } catch (RegionException & e) { + if (e.status == RegionException::RegionReadStatus::EPOCH_NOT_MATCH + || e.status == 
RegionException::RegionReadStatus::NOT_FOUND) + { + try + { + auto replanned_region_reader_plans = buildRegionReaderPlansFromPhysicalTableRanges( + getLog(), + getContext(), + reader_work->plan.physical_table_ranges); + auto replanned_reader_plans = buildReaderPlansFromRegionReaderPlans(replanned_region_reader_plans); + const auto replanned_reader_plan_count = replanned_reader_plans.size(); + replaceReaderWork(reader_work, std::move(replanned_reader_plans)); + LOG_WARNING( + getLog(), + "replanned proxy reader work after region error, old_error={}, new_region_id={}, split_count={}", + e.message(), + reader_work->plan.region_id, + replanned_reader_plan_count); + } + catch (const std::exception & replan_e) + { + LOG_WARNING(getLog(), "replan proxy reader work failed, {}", replan_e.what()); + } + } LOG_WARNING(getLog(), "create proxy reader failed, backoff and retry, {}", e.message()); bo.backoff(pingcap::kv::boRegionMiss, pingcap::Exception(e.message(), e.code())); } @@ -877,142 +997,153 @@ ColumnarReaderPtr RNProxyReadTask::createColumnarReaderWithBackoff(size_t reader } } -ColumnarReaderPtr RNProxyReadTask::getOrCreateReader(size_t reader_index) +ColumnarReaderPtr RNProxyReadTask::getOrCreateReader(const RNProxyReaderWorkPtr & reader_work) { - RUNTIME_CHECK(reader_index < reader_slots.size()); - auto slot = reader_slots[reader_index]; + RUNTIME_CHECK(reader_work != nullptr); + bool should_create_inline = false; + while (true) { - std::unique_lock lock(slot->mutex); - switch (slot->state) - { - case RNProxyReaderMaterializeState::Ready: { - auto reader = std::move(slot->reader); - slot->reader.reset(); - slot->state = RNProxyReaderMaterializeState::Consumed; - return reader.value(); - } - case RNProxyReaderMaterializeState::Failed: - std::rethrow_exception(slot->exception); - case RNProxyReaderMaterializeState::Consumed: - throw Exception(ErrorCodes::LOGICAL_ERROR, "proxy reader {} is already consumed", reader_index); - case RNProxyReaderMaterializeState::Creating: 
- slot->cv.wait(lock, [&] { return slot->state != RNProxyReaderMaterializeState::Creating; }); - if (slot->state == RNProxyReaderMaterializeState::Ready) + std::unique_lock lock(reader_work->mutex); + switch (reader_work->state) { - auto reader = std::move(slot->reader); - slot->reader.reset(); - slot->state = RNProxyReaderMaterializeState::Consumed; + case RNProxyReaderMaterializeState::Ready: + { + auto reader = std::move(reader_work->reader); + reader_work->reader.reset(); + reader_work->exception = nullptr; + reader_work->state = RNProxyReaderMaterializeState::Consumed; return reader.value(); } - if (slot->state == RNProxyReaderMaterializeState::Failed) - std::rethrow_exception(slot->exception); - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "proxy reader {} becomes invalid after wait, state={}", - reader_index, - static_cast(slot->state)); - case RNProxyReaderMaterializeState::NotStarted: - slot->state = RNProxyReaderMaterializeState::Creating; - should_create_inline = true; - break; + case RNProxyReaderMaterializeState::Failed: + std::rethrow_exception(reader_work->exception); + case RNProxyReaderMaterializeState::Consumed: + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "proxy reader work for region {} is already consumed", + reader_work->plan.region_id); + case RNProxyReaderMaterializeState::Creating: + reader_work->cv.wait(lock, [&] { + return reader_work->state != RNProxyReaderMaterializeState::Creating; + }); + continue; + case RNProxyReaderMaterializeState::NotStarted: + reader_work->state = RNProxyReaderMaterializeState::Creating; + should_create_inline = true; + break; + } } + break; } RUNTIME_CHECK(should_create_inline); - LOG_INFO( - getLog(), - "materialize proxy reader synchronously, reader_index={}, region_id={}", - reader_index, - reader_plans[reader_index].region_id); try { - auto reader = createColumnarReaderWithBackoff(reader_index); + auto reader = createColumnarReaderWithBackoff(reader_work); { - auto guard = 
std::lock_guard(slot->mutex); - slot->state = RNProxyReaderMaterializeState::Consumed; + auto guard = std::lock_guard(reader_work->mutex); + reader_work->reader.reset(); + reader_work->exception = nullptr; + reader_work->state = RNProxyReaderMaterializeState::Consumed; } - slot->cv.notify_all(); + reader_work->cv.notify_all(); return reader; } catch (...) { { - auto guard = std::lock_guard(slot->mutex); - slot->exception = std::current_exception(); - slot->state = RNProxyReaderMaterializeState::Failed; + auto guard = std::lock_guard(reader_work->mutex); + reader_work->reader.reset(); + reader_work->exception = std::current_exception(); + reader_work->state = RNProxyReaderMaterializeState::Failed; } - slot->cv.notify_all(); + reader_work->cv.notify_all(); throw; } } -void RNProxyReadTask::prefetchReader(size_t reader_index) +void RNProxyReadTask::prefetchPendingWork() { - if (reader_index >= reader_slots.size()) - return; + RNProxyReaderWorkPtr reader_work; + { + auto guard = std::lock_guard(pending_reader_works_mutex); + if (pending_reader_works.empty()) + return; + reader_work = pending_reader_works.front(); + } - std::call_once(prefetch_thread_manager_once, [&] { prefetch_thread_manager = newThreadManager(); }); + prefetchReaderWork(reader_work); +} + +void RNProxyReadTask::prefetchReaderWork(const RNProxyReaderWorkPtr & reader_work) +{ + RUNTIME_CHECK(reader_work != nullptr); - auto slot = reader_slots[reader_index]; { - auto guard = std::lock_guard(slot->mutex); - if (slot->state != RNProxyReaderMaterializeState::NotStarted) + auto guard = std::lock_guard(reader_work->mutex); + if (reader_work->state != RNProxyReaderMaterializeState::NotStarted) return; - slot->state = RNProxyReaderMaterializeState::Creating; + reader_work->state = RNProxyReaderMaterializeState::Creating; } - prefetch_thread_manager->scheduleThenDetach( + LOG_INFO( + getLog(), + "materialize proxy reader asynchronously, region_id={}", + reader_work->plan.region_id); + 
newThreadManager()->scheduleThenDetach( true, "PrefetchRNProxyReader", - [self = shared_from_this(), slot, reader_index] { - LOG_INFO( - self->getLog(), - "materialize proxy reader asynchronously, reader_index={}, region_id={}", - reader_index, - self->reader_plans[reader_index].region_id); + [self = shared_from_this(), reader_work] { try { - auto reader = self->createColumnarReaderWithBackoff(reader_index); + auto reader = self->createColumnarReaderWithBackoff(reader_work); { - auto guard = std::lock_guard(slot->mutex); - if (slot->state == RNProxyReaderMaterializeState::Consumed) + auto guard = std::lock_guard(reader_work->mutex); + if (reader_work->state == RNProxyReaderMaterializeState::Consumed) return; - slot->reader.emplace(std::move(reader)); - slot->state = RNProxyReaderMaterializeState::Ready; + reader_work->reader.emplace(std::move(reader)); + reader_work->exception = nullptr; + reader_work->state = RNProxyReaderMaterializeState::Ready; } } catch (...) { { - auto guard = std::lock_guard(slot->mutex); - if (slot->state == RNProxyReaderMaterializeState::Consumed) + auto guard = std::lock_guard(reader_work->mutex); + if (reader_work->state == RNProxyReaderMaterializeState::Consumed) return; - slot->exception = std::current_exception(); - slot->state = RNProxyReaderMaterializeState::Failed; + reader_work->reader.reset(); + reader_work->exception = std::current_exception(); + reader_work->state = RNProxyReaderMaterializeState::Failed; } } - slot->cv.notify_all(); + reader_work->cv.notify_all(); }); } -std::optional RNProxyReadTask::tryAcquireReaderIndex() +std::optional RNProxyReadTask::tryAcquireReaderWork() { - const size_t reader_index = next_reader_index.fetch_add(1, std::memory_order_relaxed); - if (reader_index >= reader_plans.size()) - return std::nullopt; - return reader_index; + RNProxyReaderWorkPtr reader_work; + { + auto guard = std::lock_guard(pending_reader_works_mutex); + if (pending_reader_works.empty()) + return std::nullopt; + reader_work = 
pending_reader_works.front(); + pending_reader_works.pop_front(); + } + prefetchPendingWork(); + return reader_work; } -BlockInputStreamPtr RNProxyReadTask::createInputStream(size_t reader_index) +BlockInputStreamPtr RNProxyReadTask::createInputStream(const RNProxyReaderWorkPtr & reader_work) { - RUNTIME_CHECK(reader_index < reader_plans.size()); + RUNTIME_CHECK(reader_work != nullptr); return RNProxyInputStream::create({ .context = getContext(), .log = getLog(), .task = shared_from_this(), - .reader_index = reader_index, + .reader_work = reader_work, .columns_to_read = getColumnsToRead(), .extra_table_id_index = getExtraTableIDIndex(), .table_id = getLogicalTableID(), @@ -1026,7 +1157,7 @@ BlockInputStreamPtr RNProxyReadTask::createSharedInputStream() .context = getContext(), .log = getLog(), .task = shared_from_this(), - .reader_index = std::nullopt, + .reader_work = nullptr, .columns_to_read = getColumnsToRead(), .extra_table_id_index = getExtraTableIDIndex(), .table_id = getLogicalTableID(), @@ -1091,75 +1222,23 @@ std::vector RNProxyReadTask::buildProxyReadTask( auto shared_reader_context = buildProxyReaderSharedContext(log, context, start_ts, table_scan, filter_conditions); std::vector tasks; - // Collect all regions in the table scan. 
- std::unordered_map>> - all_remote_regions_by_region; - std::unordered_map region_ver_ids; - - std::vector physical_table_ids; - std::vector ranges_for_each_physical_table; - physical_table_ids.reserve(remote_table_ranges.size()); - ranges_for_each_physical_table.reserve(remote_table_ranges.size()); + ProxyPhysicalTableRanges physical_table_ranges; + physical_table_ranges.reserve(remote_table_ranges.size()); for (const auto & remote_table_range : remote_table_ranges) - { - physical_table_ids.emplace_back(remote_table_range.first); - ranges_for_each_physical_table.emplace_back(remote_table_range.second); - } - pingcap::kv::Cluster * cluster = context.getTMTContext().getKVCluster(); - pingcap::kv::Backoffer bo(pingcap::kv::copBuildTaskMaxBackoff); - auto & region_cache = cluster->region_cache; - for (auto idx = 0; idx < static_cast(ranges_for_each_physical_table.size()); idx++) - { - const auto physical_table_id = physical_table_ids[idx]; - const auto ranges = ranges_for_each_physical_table[idx]; - const auto locations = pingcap::coprocessor::details::splitKeyRangesByLocations(region_cache, bo, ranges); - for (const auto & location : locations) - { - // If the region_ver_ids already exists, compare the value with location.location.region. - // If the value is not equal, drop cache and retry. 
- const auto & region = location.location.region; - if (auto it = region_ver_ids.find(region.id); it != region_ver_ids.end() && it->second != region) - { - region_cache->dropRegion(it->second); - region_cache->dropRegion(region); - region_ver_ids.erase(it); - LOG_WARNING( - log, - "buildProxyReadTask failed region_id={}, epoch not match {}", - region.id, - region.toString()); - throw RegionException( - RegionException::UnavailableRegions{region.id}, - RegionException::RegionReadStatus::EPOCH_NOT_MATCH, - region.toString().c_str()); - } - all_remote_regions_by_region[region.id].push_back(std::make_tuple(physical_table_id, location.ranges)); - region_ver_ids[region.id] = region; - LOG_DEBUG( - log, - "buildProxyReadTask, physical_table_id={}, region_ver_id={}", - physical_table_id, - region.toString()); - } - } - unsigned region_num = all_remote_regions_by_region.size(); - unsigned physical_table_num = physical_table_ids.size(); + physical_table_ranges.emplace_back(remote_table_range.first, remote_table_range.second); + + auto region_reader_plans = buildRegionReaderPlansFromPhysicalTableRanges(log, context, physical_table_ranges); + const auto region_num = static_cast(region_reader_plans.size()); + const auto physical_table_num = static_cast(physical_table_ranges.size()); const bool enable_bucket_parallel = !table_scan.keepOrder() && num_streams > region_num; - std::vector region_reader_plans; - region_reader_plans.reserve(region_num); size_t total_max_reader_num = region_num; size_t total_split_bucket_num = 0; - for (const auto & [region_id, physical_table_ranges] : all_remote_regions_by_region) + for (auto & plan : region_reader_plans) { - RegionReaderPlan plan{ - .region_id = region_id, - .region_ver_id = region_ver_ids[region_id], - .physical_table_ranges = physical_table_ranges, - }; if (enable_bucket_parallel) { - auto bucket_keys = getRegionBucketKeysFromProxy(context, region_id, plan.region_ver_id.ver); - auto split_result = 
splitRangesByBucketKeys(physical_table_ranges, bucket_keys); + auto bucket_keys = getRegionBucketKeysFromProxy(context, plan.region_id, plan.region_ver_id.ver); + auto split_result = splitRangesByBucketKeys(plan.physical_table_ranges, bucket_keys); if (split_result.has_bucket_split && split_result.units.size() > 1) { total_max_reader_num += split_result.units.size() - 1; @@ -1167,7 +1246,6 @@ std::vector RNProxyReadTask::buildProxyReadTask( plan.bucket_units = std::move(split_result.units); } } - region_reader_plans.emplace_back(std::move(plan)); } const size_t planned_reader_num = total_max_reader_num; if (enable_bucket_parallel) @@ -1242,20 +1320,19 @@ bool RNProxyInputStream::ensureReader() if (reader.has_value()) return true; - if (fixed_reader_index.has_value()) + if (fixed_reader_work != nullptr) { - current_reader_index = fixed_reader_index; - reader.emplace(task->getOrCreateReader(fixed_reader_index.value())); + current_reader_work = fixed_reader_work; + reader.emplace(task->getOrCreateReader(fixed_reader_work)); return true; } - auto next_reader_index = task->tryAcquireReaderIndex(); - if (!next_reader_index.has_value()) + auto next_reader_work = task->tryAcquireReaderWork(); + if (!next_reader_work.has_value()) return false; - current_reader_index = next_reader_index; - reader.emplace(task->getOrCreateReader(next_reader_index.value())); - task->prefetchReader(next_reader_index.value() + 1); + current_reader_work = next_reader_work.value(); + reader.emplace(task->getOrCreateReader(next_reader_work.value())); return true; } @@ -1264,7 +1341,7 @@ void RNProxyInputStream::releaseReader() if (reader.has_value() && reader->inner.ptr != nullptr) RustGcHelper::instance().gcRustPtr(reader->inner.ptr, reader->inner.type); reader.reset(); - current_reader_index.reset(); + current_reader_work.reset(); } RNProxyInputStream::~RNProxyInputStream() @@ -1340,7 +1417,7 @@ Block RNProxyInputStream::readImpl([[maybe_unused]] FilterPtr & res_filter, [[ma if (rows == 0) { 
releaseReader(); - if (fixed_reader_index.has_value()) + if (fixed_reader_work != nullptr) { done = true; return {}; @@ -1484,16 +1561,14 @@ OperatorStatus RNProxySourceOp::executeIOImpl() if (!current_input_stream) { - auto next_reader_idx = task->tryAcquireReaderIndex(); - if (!next_reader_idx.has_value()) + auto next_reader_work = task->tryAcquireReaderWork(); + if (!next_reader_work.has_value()) { done = true; return OperatorStatus::HAS_OUTPUT; } - current_reader_idx = next_reader_idx; - current_input_stream = task->createInputStream(current_reader_idx.value()); + current_input_stream = task->createInputStream(next_reader_work.value()); ++total_streams; - task->prefetchReader(current_reader_idx.value() + 1); } FilterPtr filter_ignored = nullptr; @@ -1510,7 +1585,6 @@ OperatorStatus RNProxySourceOp::executeIOImpl() else { current_input_stream.reset(); - current_reader_idx.reset(); return awaitImpl(); } } diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.h b/dbms/src/Storages/StorageDisaggregatedColumnar.h index ae645d1cb4c..7e02509a529 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.h +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.h @@ -18,6 +18,7 @@ #if ENABLE_NEXT_GEN_COLUMNAR #include #include +#include #include #include #include @@ -37,6 +38,7 @@ #include #include +#include #include #include #include @@ -46,7 +48,6 @@ namespace DB { class DAGContext; -class ThreadManager; namespace DM { @@ -54,6 +55,15 @@ class RSOperator; using RSOperatorPtr = std::shared_ptr; } // namespace DM +enum class RNProxyReaderMaterializeState +{ + NotStarted, + Creating, + Ready, + Failed, + Consumed, +}; + struct RNProxyReaderSharedContext; struct RNProxyReaderPlan @@ -64,19 +74,15 @@ struct RNProxyReaderPlan std::vector> physical_table_ranges; }; -enum class RNProxyReaderMaterializeState +struct RNProxyReaderWork { - NotStarted, - Creating, - Ready, - Failed, - Consumed, -}; + explicit RNProxyReaderWork(RNProxyReaderPlan plan_) + : 
plan(std::move(plan_)) + {} -struct RNProxyReaderSlot -{ - ~RNProxyReaderSlot(); + ~RNProxyReaderWork(); + RNProxyReaderPlan plan; std::mutex mutex; std::condition_variable cv; RNProxyReaderMaterializeState state = RNProxyReaderMaterializeState::NotStarted; @@ -84,6 +90,8 @@ struct RNProxyReaderSlot std::exception_ptr exception; }; +using RNProxyReaderWorkPtr = std::shared_ptr; + class RNProxyReadTask; using RNProxyReadTaskPtr = std::shared_ptr; class RNProxyReadTask @@ -115,15 +123,19 @@ class RNProxyReadTask BlockInputStreamPtr createSharedInputStream(); - BlockInputStreamPtr createInputStream(size_t reader_index); + BlockInputStreamPtr createInputStream(const RNProxyReaderWorkPtr & reader_work); - ColumnarReaderPtr createColumnarReaderWithBackoff(size_t reader_index) const; + ColumnarReaderPtr createColumnarReaderWithBackoff(const RNProxyReaderWorkPtr & reader_work); - ColumnarReaderPtr getOrCreateReader(size_t reader_index); + ColumnarReaderPtr getOrCreateReader(const RNProxyReaderWorkPtr & reader_work); - void prefetchReader(size_t reader_index); + std::optional tryAcquireReaderWork(); - std::optional tryAcquireReaderIndex(); +#ifdef DBMS_PUBLIC_GTEST + void replaceReaderWorkForTest( + const RNProxyReaderWorkPtr & reader_work, + std::vector replanned_reader_plans); +#endif size_t getReaderCount() const; @@ -147,13 +159,19 @@ class RNProxyReadTask std::shared_ptr shared_reader_context); private: - std::vector reader_plans; + void prefetchPendingWork(); + + void prefetchReaderWork(const RNProxyReaderWorkPtr & reader_work); + + void replaceReaderWork( + const RNProxyReaderWorkPtr & reader_work, + std::vector replanned_reader_plans); + + size_t reader_count; size_t source_num; std::shared_ptr shared_reader_context; - std::vector> reader_slots; - std::atomic_size_t next_reader_index = 0; - std::once_flag prefetch_thread_manager_once; - std::shared_ptr prefetch_thread_manager; + mutable std::mutex pending_reader_works_mutex; + std::deque pending_reader_works; }; 
class RNProxyInputStream : public IProfilingBlockInputStream @@ -178,7 +196,7 @@ class RNProxyInputStream : public IProfilingBlockInputStream const Context & context; LoggerPtr log; RNProxyReadTaskPtr task; - std::optional reader_index; + RNProxyReaderWorkPtr reader_work; const DM::ColumnDefines & columns_to_read; int extra_table_id_index; TableID table_id; @@ -189,7 +207,7 @@ class RNProxyInputStream : public IProfilingBlockInputStream : context(options.context) , log(options.log) , task(options.task) - , fixed_reader_index(options.reader_index) + , fixed_reader_work(options.reader_work) , action(options.columns_to_read, options.extra_table_id_index) , table_id(options.table_id) , executor_id(options.executor_id) @@ -207,8 +225,8 @@ class RNProxyInputStream : public IProfilingBlockInputStream const Context & context; const LoggerPtr log; RNProxyReadTaskPtr task; - const std::optional fixed_reader_index; - std::optional current_reader_index; + const RNProxyReaderWorkPtr fixed_reader_work; + RNProxyReaderWorkPtr current_reader_work; std::optional reader; AddExtraTableIDColumnTransformAction action; TableID table_id; @@ -270,7 +288,6 @@ class RNProxySourceOp : public SourceOp size_t total_rows = 0; size_t total_streams = 0; - std::optional current_reader_idx; BlockInputStreamPtr current_input_stream; // Temporarily store the block read from current_seg_task->stream and pass it to downstream operators in readImpl. 
From 1a52fdfc9edfad0cae151f80a68716a3cafef60d Mon Sep 17 00:00:00 2001 From: yongman Date: Wed, 10 Jun 2026 18:47:03 +0800 Subject: [PATCH 16/19] format Signed-off-by: yongman --- .../Storages/StorageDisaggregatedColumnar.cpp | 58 +++++++++---------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp index 263b7ce2d09..f1b00a13c6e 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp @@ -308,7 +308,8 @@ std::vector buildRegionReaderPlansFromPhysicalTableRanges( return region_reader_plans; } -std::vector buildReaderPlansFromRegionReaderPlans(const std::vector & region_reader_plans) +std::vector buildReaderPlansFromRegionReaderPlans( + const std::vector & region_reader_plans) { std::vector reader_plans; reader_plans.reserve(region_reader_plans.size()); @@ -974,7 +975,8 @@ ColumnarReaderPtr RNProxyReadTask::createColumnarReaderWithBackoff(const RNProxy replaceReaderWork(reader_work, std::move(replanned_reader_plans)); LOG_WARNING( getLog(), - "replanned proxy reader work after region error, old_error={}, new_region_id={}, split_count={}", + "replanned proxy reader work after region error, old_error={}, new_region_id={}, " + "split_count={}", e.message(), reader_work->plan.region_id, replanned_reader_plan_count); @@ -1087,39 +1089,33 @@ void RNProxyReadTask::prefetchReaderWork(const RNProxyReaderWorkPtr & reader_wor reader_work->state = RNProxyReaderMaterializeState::Creating; } - LOG_INFO( - getLog(), - "materialize proxy reader asynchronously, region_id={}", - reader_work->plan.region_id); - newThreadManager()->scheduleThenDetach( - true, - "PrefetchRNProxyReader", - [self = shared_from_this(), reader_work] { - try + LOG_INFO(getLog(), "materialize proxy reader asynchronously, region_id={}", reader_work->plan.region_id); + newThreadManager()->scheduleThenDetach(true, 
"PrefetchRNProxyReader", [self = shared_from_this(), reader_work] { + try + { + auto reader = self->createColumnarReaderWithBackoff(reader_work); { - auto reader = self->createColumnarReaderWithBackoff(reader_work); - { - auto guard = std::lock_guard(reader_work->mutex); - if (reader_work->state == RNProxyReaderMaterializeState::Consumed) - return; - reader_work->reader.emplace(std::move(reader)); - reader_work->exception = nullptr; - reader_work->state = RNProxyReaderMaterializeState::Ready; - } + auto guard = std::lock_guard(reader_work->mutex); + if (reader_work->state == RNProxyReaderMaterializeState::Consumed) + return; + reader_work->reader.emplace(std::move(reader)); + reader_work->exception = nullptr; + reader_work->state = RNProxyReaderMaterializeState::Ready; } - catch (...) + } + catch (...) + { { - { - auto guard = std::lock_guard(reader_work->mutex); - if (reader_work->state == RNProxyReaderMaterializeState::Consumed) - return; - reader_work->reader.reset(); - reader_work->exception = std::current_exception(); - reader_work->state = RNProxyReaderMaterializeState::Failed; - } + auto guard = std::lock_guard(reader_work->mutex); + if (reader_work->state == RNProxyReaderMaterializeState::Consumed) + return; + reader_work->reader.reset(); + reader_work->exception = std::current_exception(); + reader_work->state = RNProxyReaderMaterializeState::Failed; } - reader_work->cv.notify_all(); - }); + } + reader_work->cv.notify_all(); + }); } std::optional RNProxyReadTask::tryAcquireReaderWork() From 32a2c4cd47fb27ede5b2723884d4960926e4070b Mon Sep 17 00:00:00 2001 From: yongman Date: Thu, 11 Jun 2026 09:12:55 +0800 Subject: [PATCH 17/19] polish logs and rename proxy to columnar Signed-off-by: yongman --- .../Storages/StorageDisaggregatedColumnar.cpp | 292 +++++++++--------- .../Storages/StorageDisaggregatedColumnar.h | 87 +++--- 2 files changed, 189 insertions(+), 190 deletions(-) diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp 
b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp index f1b00a13c6e..29586f0c166 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp @@ -67,7 +67,7 @@ namespace ErrorCodes extern const int COLUMNAR_SNAPSHOT_ERROR; } // namespace ErrorCodes -struct RNProxyReaderSharedContext +struct RNColumnarReaderSharedContext { using ClearSharedSnapAccessByStartTsFn = void (*)(uint64_t, RaftStoreProxyPtr); @@ -132,7 +132,7 @@ struct RNProxyReaderSharedContext std::shared_ptr output_lock = std::make_shared(); bool registered_for_start_ts = false; - ~RNProxyReaderSharedContext() noexcept + ~RNColumnarReaderSharedContext() noexcept { if (!registered_for_start_ts) return; @@ -155,14 +155,14 @@ struct RNProxyReaderSharedContext } }; -size_t getRNProxySourceNum(size_t num_streams, size_t reader_count) +size_t getRNColumnarSourceNum(size_t num_streams, size_t reader_count) { return std::min(std::max(1, num_streams), reader_count); } namespace { -using ProxyPhysicalTableRanges = std::vector>; +using ColumnarPhysicalTableRanges = std::vector>; using BucketSplitUnit = std::pair; void normalizeTimestampCompareDateTimeLiteralToUTC(tipb::Expr & expr, const TimezoneInfo & timezone_info); @@ -177,7 +177,7 @@ struct RegionReaderPlan { RegionID region_id; pingcap::kv::RegionVerID region_ver_id; - ProxyPhysicalTableRanges physical_table_ranges; + ColumnarPhysicalTableRanges physical_table_ranges; std::vector bucket_units; }; @@ -193,7 +193,7 @@ bool isBucketBoundaryInsideRange(const String & bucket_key, const pingcap::copro } BucketSplitResult splitRangesByBucketKeys( - const ProxyPhysicalTableRanges & physical_table_ranges, + const ColumnarPhysicalTableRanges & physical_table_ranges, const std::vector & bucket_keys) { BucketSplitResult result; @@ -228,7 +228,7 @@ BucketSplitResult splitRangesByBucketKeys( return result; } -std::vector getRegionBucketKeysFromProxy(const Context & context, RegionID region_id, UInt64 region_ver) 
+std::vector getRegionBucketKeysFromColumnar(const Context & context, RegionID region_id, UInt64 region_ver) { const Context & global_ctx = context.getGlobalContext(); const TiFlashRaftProxyHelper * proxy_helper = global_ctx.getSharedContextDisagg()->getColumnarProxyHelper(); @@ -254,7 +254,7 @@ std::vector getRegionBucketKeysFromProxy(const Context & context, Region std::vector buildRegionReaderPlansFromPhysicalTableRanges( const LoggerPtr & log, const Context & context, - const ProxyPhysicalTableRanges & physical_table_ranges) + const ColumnarPhysicalTableRanges & physical_table_ranges) { std::vector region_reader_plans; if (physical_table_ranges.empty()) @@ -281,7 +281,7 @@ std::vector buildRegionReaderPlansFromPhysicalTableRanges( .region_id = region.id, .region_ver_id = region, .physical_table_ranges - = ProxyPhysicalTableRanges{std::make_tuple(physical_table_id, location.ranges)}, + = ColumnarPhysicalTableRanges{std::make_tuple(physical_table_id, location.ranges)}, }); continue; } @@ -293,7 +293,7 @@ std::vector buildRegionReaderPlansFromPhysicalTableRanges( region_cache->dropRegion(region); LOG_WARNING( log, - "buildProxyReadTask failed region_id={}, epoch not match {}", + "build RegionReaderPlan failed region_id={}, epoch not match {}", region.id, region.toString()); throw RegionException( @@ -308,14 +308,14 @@ std::vector buildRegionReaderPlansFromPhysicalTableRanges( return region_reader_plans; } -std::vector buildReaderPlansFromRegionReaderPlans( +std::vector buildReaderPlansFromRegionReaderPlans( const std::vector & region_reader_plans) { - std::vector reader_plans; + std::vector reader_plans; reader_plans.reserve(region_reader_plans.size()); for (const auto & plan : region_reader_plans) { - reader_plans.push_back(RNProxyReaderPlan{ + reader_plans.push_back(RNColumnarReaderPlan{ .region_id = plan.region_id, .region_ver = plan.region_ver_id.ver, .region_conf_ver = plan.region_ver_id.conf_ver, @@ -367,18 +367,18 @@ std::tuple 
genColumnDefinesForDisaggregatedReadThroug return {std::move(column_defines), extra_table_id_index}; } -std::shared_ptr buildProxyReaderSharedContext( +std::shared_ptr buildColumnarReaderSharedContext( const LoggerPtr & log, const Context & context, UInt64 start_ts, const TiDBTableScan & table_scan, const FilterConditions & filter_conditions) { - auto shared_context = std::make_shared(); + auto shared_context = std::make_shared(); shared_context->log = log; shared_context->context = &context; shared_context->start_ts = start_ts; - RNProxyReaderSharedContext::getStartTsClearRegistry().registerStartTs(start_ts); + RNColumnarReaderSharedContext::getStartTsClearRegistry().registerStartTs(start_ts); shared_context->registered_for_start_ts = true; shared_context->logical_table_id = table_scan.getLogicalTableID(); shared_context->executor_id = table_scan.getTableScanExecutorID(); @@ -458,10 +458,10 @@ std::shared_ptr buildProxyReaderSharedContext( return shared_context; } -bool isProxyFilterComparableExpr(tipb::ScalarFuncSig sig) +bool isColumnarFilterComparableExpr(tipb::ScalarFuncSig sig) { - // Keep this aligned with proxy columnar filter supported signatures: - // `contrib/tiflash-proxy/components/kvengine/src/table/columnar/filter.rs`. + // Keep this aligned with kvengine columnar filter supported signatures: + // `components/kvengine/src/table/columnar/filter.rs`. switch (sig) { case tipb::ScalarFuncSig::LTInt: @@ -525,9 +525,9 @@ void normalizeTimestampCompareDateTimeLiteralToUTC(tipb::Expr & expr, const Time if (!isFunctionExpr(expr)) return; - // Only normalize for comparison expressions that proxy filter supports. + // Only normalize for comparison expressions that columnar filter supports. // Keep recursion so nested comparisons under AND/OR/NOT still work. 
- if (isScalarFunctionExpr(expr) && isProxyFilterComparableExpr(expr.sig())) + if (isScalarFunctionExpr(expr) && isColumnarFilterComparableExpr(expr.sig())) { bool has_timestamp_column = false; bool only_column_or_literal = true; @@ -546,9 +546,9 @@ void normalizeTimestampCompareDateTimeLiteralToUTC(tipb::Expr & expr, const Time } } - // Proxy filter parser only supports simple column-literal expressions. + // Columnar filter parser only supports simple column-literal expressions. // If a timestamp column is compared with a datetime literal, normalize the - // datetime literal from session timezone to UTC before passing to proxy. + // datetime literal from session timezone to UTC before passing to columnar. if (has_timestamp_column && only_column_or_literal && column_ref_count == 1) { static const auto & time_zone_utc = DateLUT::instance("UTC"); @@ -580,7 +580,7 @@ void StorageDisaggregated::filterConditionsWithPushedDownFilters( DAGExpressionAnalyzer & analyzer, DAGPipeline & pipeline) { - // Proxy columnar reader uses late-materialization filters only to reduce packs loaded from disk. + // Columnar reader uses late-materialization filters only to reduce packs loaded from disk. // It does not guarantee that all rows failing those filters are removed, so merge them into // FilterConditions and re-apply them in the TiFlash pipeline for correctness. FilterConditions conditions(filter_conditions.executor_id, filter_conditions.conditions); @@ -598,7 +598,7 @@ void StorageDisaggregated::filterConditionsWithPushedDownFilters( PipelineExecGroupBuilder & group_builder, DAGExpressionAnalyzer & analyzer) { - // Proxy columnar reader uses late-materialization filters only to reduce packs loaded from disk. + // Columnar reader uses late-materialization filters only to reduce packs loaded from disk. // It does not guarantee that all rows failing those filters are removed, so merge them into // FilterConditions and re-apply them in the TiFlash pipeline for correctness. 
FilterConditions conditions(filter_conditions.executor_id, filter_conditions.conditions); @@ -616,7 +616,7 @@ BlockInputStreams StorageDisaggregated::readThroughColumnar(const Context & cont const UInt64 start_ts = sender_target_mpp_task_id.gather_id.query_id.start_ts; auto [remote_table_ranges, region_num] = buildRemoteTableRanges(); const auto generated_column_infos = genGeneratedColumnInfosForDisaggregatedRead(table_scan); - auto read_proxy_tasks = RNProxyReadTask::buildProxyReadTaskWithBackoff( + auto read_columnar_tasks = RNColumnarReadTask::buildColumnarReadTaskWithBackoff( log, context, start_ts, @@ -624,12 +624,12 @@ BlockInputStreams StorageDisaggregated::readThroughColumnar(const Context & cont filter_conditions, remote_table_ranges, num_streams); - for (auto & task : read_proxy_tasks) + for (auto & task : read_columnar_tasks) { auto streams = task->getInputStreams(); pipeline.streams.insert(pipeline.streams.end(), streams.begin(), streams.end()); } - // Avoid reading generated columns from proxy, generate placeholders locally. + // Avoid reading generated columns from columnar, generate placeholders locally. executeGeneratedColumnPlaceholder(generated_column_infos, log, pipeline); NamesAndTypes source_columns; source_columns.reserve(table_scan.getColumnSize()); @@ -640,7 +640,7 @@ BlockInputStreams StorageDisaggregated::readThroughColumnar(const Context & cont } analyzer = std::make_unique(std::move(source_columns), context); - // Handle duration/timestamp cast for proxy path. + // Handle duration/timestamp cast for columnar path. // We still execute pushed-down filters on RN side, so timestamp columns in those filters // must also be converted from UTC to session timezone. 
extraCast(*analyzer, pipeline, /*include_pushed_down_filter_columns=*/true); @@ -658,7 +658,7 @@ void StorageDisaggregated::readThroughColumnar( { const UInt64 start_ts = sender_target_mpp_task_id.gather_id.query_id.start_ts; auto [remote_table_ranges, region_num] = buildRemoteTableRanges(); - auto read_proxy_tasks = RNProxyReadTask::buildProxyReadTaskWithBackoff( + auto read_columnar_tasks = RNColumnarReadTask::buildColumnarReadTaskWithBackoff( log, context, start_ts, @@ -667,18 +667,18 @@ void StorageDisaggregated::readThroughColumnar( remote_table_ranges, num_streams); const auto generated_column_infos = genGeneratedColumnInfosForDisaggregatedRead(table_scan); - if (!read_proxy_tasks.empty()) + if (!read_columnar_tasks.empty()) { - auto & task_pool = read_proxy_tasks.front(); + auto & task_pool = read_columnar_tasks.front(); const size_t source_num = task_pool->getSourceNum(); LOG_INFO( log, - "use shared proxy reader task pool, reader_num={}, source_num={}", + "use shared columnar reader task pool, reader_num={}, source_num={}", task_pool->getReaderCount(), source_num); for (size_t i = 0; i < source_num; ++i) { - group_builder.addConcurrency(RNProxySourceOp::create({ + group_builder.addConcurrency(RNColumnarSourceOp::create({ .exec_context = exec_context, .task = task_pool, })); @@ -694,15 +694,15 @@ void StorageDisaggregated::readThroughColumnar( source_columns.emplace_back(col.name, col.type); analyzer = std::make_unique(std::move(source_columns), context); - // Handle duration/timestamp cast for proxy path. + // Handle duration/timestamp cast for columnar path. 
extraCast(exec_context, group_builder, *analyzer, /*include_pushed_down_filter_columns=*/true); // Handle filter filterConditionsWithPushedDownFilters(exec_context, group_builder, *analyzer); } -ColumnarReaderPtr createProxyColumnarReader( - const RNProxyReaderSharedContext & shared_context, - const RNProxyReaderPlan & reader_plan) +ColumnarReaderPtr createColumnarReader( + const RNColumnarReaderSharedContext & shared_context, + const RNColumnarReaderPlan & reader_plan) { const auto & log = shared_context.log; const auto & context = *shared_context.context; @@ -857,17 +857,17 @@ ColumnarReaderPtr createProxyColumnarReader( return columnar_reader; } -// RNProxyReadTask -RNProxyReaderWork::~RNProxyReaderWork() +// RNColumnarReadTask +RNColumnarReaderWork::~RNColumnarReaderWork() { if (reader.has_value() && reader->inner.ptr != nullptr) RustGcHelper::instance().gcRustPtr(reader->inner.ptr, reader->inner.type); } -RNProxyReadTask::RNProxyReadTask( - std::vector reader_plans, +RNColumnarReadTask::RNColumnarReadTask( + std::vector reader_plans, size_t source_num_, - std::shared_ptr shared_reader_context_) + std::shared_ptr shared_reader_context_) : reader_count(reader_plans.size()) , source_num(source_num_) , shared_reader_context(std::move(shared_reader_context_)) @@ -875,52 +875,52 @@ RNProxyReadTask::RNProxyReadTask( RUNTIME_CHECK(source_num > 0); RUNTIME_CHECK(source_num <= reader_count, source_num, reader_count); for (auto & reader_plan : reader_plans) - pending_reader_works.push_back(std::make_shared(std::move(reader_plan))); + pending_reader_works.push_back(std::make_shared(std::move(reader_plan))); } -size_t RNProxyReadTask::getReaderCount() const +size_t RNColumnarReadTask::getReaderCount() const { return reader_count; } -size_t RNProxyReadTask::getSourceNum() const +size_t RNColumnarReadTask::getSourceNum() const { return source_num; } -const Context & RNProxyReadTask::getContext() const +const Context & RNColumnarReadTask::getContext() const { return 
*shared_reader_context->context; } -const LoggerPtr & RNProxyReadTask::getLog() const +const LoggerPtr & RNColumnarReadTask::getLog() const { return shared_reader_context->log; } -const DM::ColumnDefines & RNProxyReadTask::getColumnsToRead() const +const DM::ColumnDefines & RNColumnarReadTask::getColumnsToRead() const { return *shared_reader_context->column_defines; } -int RNProxyReadTask::getExtraTableIDIndex() const +int RNColumnarReadTask::getExtraTableIDIndex() const { return shared_reader_context->extra_table_id_index; } -TableID RNProxyReadTask::getLogicalTableID() const +TableID RNColumnarReadTask::getLogicalTableID() const { return shared_reader_context->logical_table_id; } -const String & RNProxyReadTask::getExecutorID() const +const String & RNColumnarReadTask::getExecutorID() const { return shared_reader_context->executor_id; } -void RNProxyReadTask::replaceReaderWork( - const RNProxyReaderWorkPtr & reader_work, - std::vector replanned_reader_plans) +void RNColumnarReadTask::replaceReaderWork( + const RNColumnarReaderWorkPtr & reader_work, + std::vector replanned_reader_plans) { RUNTIME_CHECK(reader_work != nullptr); RUNTIME_CHECK(!replanned_reader_plans.empty()); @@ -931,19 +931,19 @@ void RNProxyReadTask::replaceReaderWork( auto queue_guard = std::lock_guard(pending_reader_works_mutex); for (auto it = replanned_reader_plans.rbegin(); it != replanned_reader_plans.rend() - 1; ++it) - pending_reader_works.push_front(std::make_shared(*it)); + pending_reader_works.push_front(std::make_shared(*it)); } #ifdef DBMS_PUBLIC_GTEST -void RNProxyReadTask::replaceReaderWorkForTest( - const RNProxyReaderWorkPtr & reader_work, - std::vector replanned_reader_plans) +void RNColumnarReadTask::replaceReaderWorkForTest( + const RNColumnarReaderWorkPtr & reader_work, + std::vector replanned_reader_plans) { replaceReaderWork(reader_work, std::move(replanned_reader_plans)); } #endif -ColumnarReaderPtr RNProxyReadTask::createColumnarReaderWithBackoff(const RNProxyReaderWorkPtr 
& reader_work) +ColumnarReaderPtr RNColumnarReadTask::createColumnarReaderWithBackoff(const RNColumnarReaderWorkPtr & reader_work) { RUNTIME_CHECK(reader_work != nullptr); pingcap::kv::Backoffer bo(pingcap::kv::copNextMaxBackoff); @@ -954,10 +954,10 @@ ColumnarReaderPtr RNProxyReadTask::createColumnarReaderWithBackoff(const RNProxy const auto & reader_plan = reader_work->plan; LOG_INFO( getLog(), - "materialize proxy reader for tables in region, region_id={}, table_num={}", + "materialize columnar reader for tables in region, region_id={}, table_num={}", reader_plan.region_id, reader_plan.physical_table_ranges.size()); - return createProxyColumnarReader(*shared_reader_context, reader_plan); + return createColumnarReader(*shared_reader_context, reader_plan); } catch (RegionException & e) { @@ -975,7 +975,7 @@ ColumnarReaderPtr RNProxyReadTask::createColumnarReaderWithBackoff(const RNProxy replaceReaderWork(reader_work, std::move(replanned_reader_plans)); LOG_WARNING( getLog(), - "replanned proxy reader work after region error, old_error={}, new_region_id={}, " + "replanned columnar reader work after region error, old_error={}, new_region_id={}, " "split_count={}", e.message(), reader_work->plan.region_id, @@ -983,10 +983,10 @@ ColumnarReaderPtr RNProxyReadTask::createColumnarReaderWithBackoff(const RNProxy } catch (const std::exception & replan_e) { - LOG_WARNING(getLog(), "replan proxy reader work failed, {}", replan_e.what()); + LOG_WARNING(getLog(), "replan columnar reader work failed, {}", replan_e.what()); } } - LOG_WARNING(getLog(), "create proxy reader failed, backoff and retry, {}", e.message()); + LOG_WARNING(getLog(), "create columnar reader failed, backoff and retry, {}", e.message()); bo.backoff(pingcap::kv::boRegionMiss, pingcap::Exception(e.message(), e.code())); } catch (Exception & e) @@ -999,7 +999,7 @@ ColumnarReaderPtr RNProxyReadTask::createColumnarReaderWithBackoff(const RNProxy } } -ColumnarReaderPtr RNProxyReadTask::getOrCreateReader(const 
RNProxyReaderWorkPtr & reader_work) +ColumnarReaderPtr RNColumnarReadTask::getOrCreateReader(const RNColumnarReaderWorkPtr & reader_work) { RUNTIME_CHECK(reader_work != nullptr); @@ -1010,28 +1010,28 @@ ColumnarReaderPtr RNProxyReadTask::getOrCreateReader(const RNProxyReaderWorkPtr std::unique_lock lock(reader_work->mutex); switch (reader_work->state) { - case RNProxyReaderMaterializeState::Ready: + case RNColumnarReaderMaterializeState::Ready: { auto reader = std::move(reader_work->reader); reader_work->reader.reset(); reader_work->exception = nullptr; - reader_work->state = RNProxyReaderMaterializeState::Consumed; + reader_work->state = RNColumnarReaderMaterializeState::Consumed; return reader.value(); } - case RNProxyReaderMaterializeState::Failed: + case RNColumnarReaderMaterializeState::Failed: std::rethrow_exception(reader_work->exception); - case RNProxyReaderMaterializeState::Consumed: + case RNColumnarReaderMaterializeState::Consumed: throw Exception( ErrorCodes::LOGICAL_ERROR, - "proxy reader work for region {} is already consumed", + "columnar reader work for region {} is already consumed", reader_work->plan.region_id); - case RNProxyReaderMaterializeState::Creating: + case RNColumnarReaderMaterializeState::Creating: reader_work->cv.wait(lock, [&] { - return reader_work->state != RNProxyReaderMaterializeState::Creating; + return reader_work->state != RNColumnarReaderMaterializeState::Creating; }); continue; - case RNProxyReaderMaterializeState::NotStarted: - reader_work->state = RNProxyReaderMaterializeState::Creating; + case RNColumnarReaderMaterializeState::NotStarted: + reader_work->state = RNColumnarReaderMaterializeState::Creating; should_create_inline = true; break; } @@ -1047,7 +1047,7 @@ ColumnarReaderPtr RNProxyReadTask::getOrCreateReader(const RNProxyReaderWorkPtr auto guard = std::lock_guard(reader_work->mutex); reader_work->reader.reset(); reader_work->exception = nullptr; - reader_work->state = RNProxyReaderMaterializeState::Consumed; + 
reader_work->state = RNColumnarReaderMaterializeState::Consumed; } reader_work->cv.notify_all(); return reader; @@ -1058,16 +1058,16 @@ ColumnarReaderPtr RNProxyReadTask::getOrCreateReader(const RNProxyReaderWorkPtr auto guard = std::lock_guard(reader_work->mutex); reader_work->reader.reset(); reader_work->exception = std::current_exception(); - reader_work->state = RNProxyReaderMaterializeState::Failed; + reader_work->state = RNColumnarReaderMaterializeState::Failed; } reader_work->cv.notify_all(); throw; } } -void RNProxyReadTask::prefetchPendingWork() +void RNColumnarReadTask::prefetchPendingWork() { - RNProxyReaderWorkPtr reader_work; + RNColumnarReaderWorkPtr reader_work; { auto guard = std::lock_guard(pending_reader_works_mutex); if (pending_reader_works.empty()) @@ -1078,49 +1078,49 @@ void RNProxyReadTask::prefetchPendingWork() prefetchReaderWork(reader_work); } -void RNProxyReadTask::prefetchReaderWork(const RNProxyReaderWorkPtr & reader_work) +void RNColumnarReadTask::prefetchReaderWork(const RNColumnarReaderWorkPtr & reader_work) { RUNTIME_CHECK(reader_work != nullptr); { auto guard = std::lock_guard(reader_work->mutex); - if (reader_work->state != RNProxyReaderMaterializeState::NotStarted) + if (reader_work->state != RNColumnarReaderMaterializeState::NotStarted) return; - reader_work->state = RNProxyReaderMaterializeState::Creating; + reader_work->state = RNColumnarReaderMaterializeState::Creating; } - LOG_INFO(getLog(), "materialize proxy reader asynchronously, region_id={}", reader_work->plan.region_id); - newThreadManager()->scheduleThenDetach(true, "PrefetchRNProxyReader", [self = shared_from_this(), reader_work] { + LOG_INFO(getLog(), "materialize columnar reader asynchronously, region_id={}", reader_work->plan.region_id); + newThreadManager()->scheduleThenDetach(true, "PrefetchRNColumnarReader", [self = shared_from_this(), reader_work] { try { auto reader = self->createColumnarReaderWithBackoff(reader_work); { auto guard = 
std::lock_guard(reader_work->mutex); - if (reader_work->state == RNProxyReaderMaterializeState::Consumed) + if (reader_work->state == RNColumnarReaderMaterializeState::Consumed) return; reader_work->reader.emplace(std::move(reader)); reader_work->exception = nullptr; - reader_work->state = RNProxyReaderMaterializeState::Ready; + reader_work->state = RNColumnarReaderMaterializeState::Ready; } } catch (...) { { auto guard = std::lock_guard(reader_work->mutex); - if (reader_work->state == RNProxyReaderMaterializeState::Consumed) + if (reader_work->state == RNColumnarReaderMaterializeState::Consumed) return; reader_work->reader.reset(); reader_work->exception = std::current_exception(); - reader_work->state = RNProxyReaderMaterializeState::Failed; + reader_work->state = RNColumnarReaderMaterializeState::Failed; } } reader_work->cv.notify_all(); }); } -std::optional RNProxyReadTask::tryAcquireReaderWork() +std::optional RNColumnarReadTask::tryAcquireReaderWork() { - RNProxyReaderWorkPtr reader_work; + RNColumnarReaderWorkPtr reader_work; { auto guard = std::lock_guard(pending_reader_works_mutex); if (pending_reader_works.empty()) @@ -1132,10 +1132,10 @@ std::optional RNProxyReadTask::tryAcquireReaderWork() return reader_work; } -BlockInputStreamPtr RNProxyReadTask::createInputStream(const RNProxyReaderWorkPtr & reader_work) +BlockInputStreamPtr RNColumnarReadTask::createInputStream(const RNColumnarReaderWorkPtr & reader_work) { RUNTIME_CHECK(reader_work != nullptr); - return RNProxyInputStream::create({ + return RNColumnarInputStream::create({ .context = getContext(), .log = getLog(), .task = shared_from_this(), @@ -1147,9 +1147,9 @@ BlockInputStreamPtr RNProxyReadTask::createInputStream(const RNProxyReaderWorkPt }); } -BlockInputStreamPtr RNProxyReadTask::createSharedInputStream() +BlockInputStreamPtr RNColumnarReadTask::createSharedInputStream() { - return RNProxyInputStream::create({ + return RNColumnarInputStream::create({ .context = getContext(), .log = getLog(), 
.task = shared_from_this(), @@ -1161,7 +1161,7 @@ BlockInputStreamPtr RNProxyReadTask::createSharedInputStream() }); } -std::vector RNProxyReadTask::buildProxyReadTaskWithBackoff( +std::vector RNColumnarReadTask::buildColumnarReadTaskWithBackoff( const LoggerPtr & log, const Context & context, UInt64 start_ts, @@ -1170,13 +1170,13 @@ std::vector RNProxyReadTask::buildProxyReadTaskWithBackoff( const std::vector & remote_table_ranges, unsigned num_streams) { - std::vector tasks; + std::vector tasks; pingcap::kv::Backoffer bo(pingcap::kv::copNextMaxBackoff); while (true) { try { - tasks = RNProxyReadTask::buildProxyReadTask( + tasks = RNColumnarReadTask::buildColumnarReadTask( log, context, start_ts, @@ -1188,21 +1188,21 @@ std::vector RNProxyReadTask::buildProxyReadTaskWithBackoff( } catch (RegionException & e) { - LOG_WARNING(log, "buildProxyReadTask failed, backoff and retry, {}", e.message()); + LOG_WARNING(log, "buildColumnarReadTask failed, backoff and retry, {}", e.message()); bo.backoff(pingcap::kv::boRegionMiss, pingcap::Exception(e.message(), e.code())); } catch (Exception & e) { if (e.code() != ErrorCodes::COLUMNAR_SNAPSHOT_ERROR) throw; - LOG_WARNING(log, "buildProxyReadTask failed, backoff and retry, {}", e.message()); + LOG_WARNING(log, "buildColumnarReadTask failed, backoff and retry, {}", e.message()); bo.backoff(pingcap::kv::boRegionMiss, pingcap::Exception(e.message(), e.code())); } } return tasks; } -std::vector RNProxyReadTask::buildProxyReadTask( +std::vector RNColumnarReadTask::buildColumnarReadTask( const LoggerPtr & log, const Context & context, UInt64 start_ts, @@ -1215,10 +1215,11 @@ std::vector RNProxyReadTask::buildProxyReadTask( auto scan_context = std::make_shared(dag_context->getKeyspaceID(), dag_context->getResourceGroupName()); dag_context->scan_context_map[table_scan.getTableScanExecutorID()] = scan_context; - auto shared_reader_context = buildProxyReaderSharedContext(log, context, start_ts, table_scan, filter_conditions); + auto 
shared_reader_context + = buildColumnarReaderSharedContext(log, context, start_ts, table_scan, filter_conditions); - std::vector tasks; - ProxyPhysicalTableRanges physical_table_ranges; + std::vector tasks; + ColumnarPhysicalTableRanges physical_table_ranges; physical_table_ranges.reserve(remote_table_ranges.size()); for (const auto & remote_table_range : remote_table_ranges) physical_table_ranges.emplace_back(remote_table_range.first, remote_table_range.second); @@ -1228,47 +1229,38 @@ std::vector RNProxyReadTask::buildProxyReadTask( const auto physical_table_num = static_cast(physical_table_ranges.size()); const bool enable_bucket_parallel = !table_scan.keepOrder() && num_streams > region_num; size_t total_max_reader_num = region_num; - size_t total_split_bucket_num = 0; for (auto & plan : region_reader_plans) { if (enable_bucket_parallel) { - auto bucket_keys = getRegionBucketKeysFromProxy(context, plan.region_id, plan.region_ver_id.ver); + auto bucket_keys = getRegionBucketKeysFromColumnar(context, plan.region_id, plan.region_ver_id.ver); auto split_result = splitRangesByBucketKeys(plan.physical_table_ranges, bucket_keys); if (split_result.has_bucket_split && split_result.units.size() > 1) { total_max_reader_num += split_result.units.size() - 1; - total_split_bucket_num += split_result.units.size(); plan.bucket_units = std::move(split_result.units); } } } - const size_t planned_reader_num = total_max_reader_num; - if (enable_bucket_parallel) - { - LOG_INFO(log, "bucket parallel split bucket count={}", total_split_bucket_num); - } LOG_INFO( log, - "region_num={}, table_num={}, num_streams={}, keep_order={}, bucket_parallel={}, planned_reader_num={}, " - "max_reader_num={}", + "region_num={}, table_num={}, num_streams={}, keep_order={}, bucket_parallel={}, planned_reader_num={}", region_num, physical_table_num, num_streams, table_scan.keepOrder(), enable_bucket_parallel, - planned_reader_num, total_max_reader_num); - std::vector all_reader_plans; - 
all_reader_plans.reserve(planned_reader_num); + std::vector all_reader_plans; + all_reader_plans.reserve(total_max_reader_num); for (size_t i = 0; i < region_reader_plans.size(); ++i) { const auto & plan = region_reader_plans[i]; if (plan.bucket_units.empty()) { - all_reader_plans.push_back(RNProxyReaderPlan{ + all_reader_plans.push_back(RNColumnarReaderPlan{ .region_id = plan.region_id, .region_ver = plan.region_ver_id.ver, .region_conf_ver = plan.region_ver_id.conf_ver, @@ -1279,12 +1271,12 @@ std::vector RNProxyReadTask::buildProxyReadTask( { for (const auto & [table_id, range] : plan.bucket_units) { - all_reader_plans.push_back(RNProxyReaderPlan{ + all_reader_plans.push_back(RNColumnarReaderPlan{ .region_id = plan.region_id, .region_ver = plan.region_ver_id.ver, .region_conf_ver = plan.region_ver_id.conf_ver, .physical_table_ranges - = ProxyPhysicalTableRanges{std::make_tuple(table_id, pingcap::coprocessor::KeyRanges{range})}, + = ColumnarPhysicalTableRanges{std::make_tuple(table_id, pingcap::coprocessor::KeyRanges{range})}, }); } } @@ -1292,14 +1284,14 @@ std::vector RNProxyReadTask::buildProxyReadTask( if (all_reader_plans.empty()) return tasks; - tasks.push_back(std::make_shared( + tasks.push_back(std::make_shared( std::move(all_reader_plans), - getRNProxySourceNum(num_streams, planned_reader_num), + getRNColumnarSourceNum(num_streams, total_max_reader_num), shared_reader_context)); return tasks; } -BlockInputStreams RNProxyReadTask::getInputStreams() +BlockInputStreams RNColumnarReadTask::getInputStreams() { BlockInputStreams streams; streams.reserve(source_num); @@ -1310,8 +1302,8 @@ BlockInputStreams RNProxyReadTask::getInputStreams() return streams; } -// RNProxyInputStream -bool RNProxyInputStream::ensureReader() +// RNColumnarInputStream +bool RNColumnarInputStream::ensureReader() { if (reader.has_value()) return true; @@ -1332,7 +1324,7 @@ bool RNProxyInputStream::ensureReader() return true; } -void RNProxyInputStream::releaseReader() +void 
RNColumnarInputStream::releaseReader() { if (reader.has_value() && reader->inner.ptr != nullptr) RustGcHelper::instance().gcRustPtr(reader->inner.ptr, reader->inner.type); @@ -1340,7 +1332,7 @@ void RNProxyInputStream::releaseReader() current_reader_work.reset(); } -RNProxyInputStream::~RNProxyInputStream() +RNColumnarInputStream::~RNColumnarInputStream() { SCOPE_EXIT({ if (reader.has_value() && reader->inner.ptr != nullptr) @@ -1348,15 +1340,18 @@ RNProxyInputStream::~RNProxyInputStream() }); try { + const auto * dag_context = context.getDAGContext(); + const auto keyspace_id = dag_context != nullptr ? dag_context->getKeyspaceID() : NullspaceID; LOG_INFO( log, - "Finished reading remote snapshot through proxy, rows={} bytes={} read_cost={:.3f}s " + "Finished reading remote snapshot through columnar, keyspace_id={} rows={} bytes={} read_cost={:.3f}s " "deserialize_cost={:.3f}s", + keyspace_id, action.totalRows(), total_bytes, duration_read_sec, duration_deserialize_sec); - if (auto * dag_context = context.getDAGContext(); dag_context != nullptr) + if (dag_context != nullptr) { if (auto it = dag_context->scan_context_map.find(executor_id); it != dag_context->scan_context_map.end()) { @@ -1374,24 +1369,24 @@ RNProxyInputStream::~RNProxyInputStream() } } -Block RNProxyInputStream::read(FilterPtr & res_filter, bool return_filter) +Block RNColumnarInputStream::read(FilterPtr & res_filter, bool return_filter) { return readImpl(res_filter, return_filter); } -Block RNProxyInputStream::readImpl() +Block RNColumnarInputStream::readImpl() { FilterPtr filter_ignored; return readImpl(filter_ignored, false); } -Block RNProxyInputStream::readImpl([[maybe_unused]] FilterPtr & res_filter, [[maybe_unused]] bool return_filter) +Block RNColumnarInputStream::readImpl([[maybe_unused]] FilterPtr & res_filter, [[maybe_unused]] bool return_filter) { if (done) return {}; const Context & global_ctx = context.getGlobalContext(); const TiFlashRaftProxyHelper * proxy_helper = 
global_ctx.getSharedContextDisagg()->getColumnarProxyHelper(); - RUNTIME_CHECK_MSG(proxy_helper != nullptr, "columnar proxy helper is not initialized"); + RUNTIME_CHECK_MSG(proxy_helper != nullptr, "columnar helper is not initialized"); while (true) { @@ -1404,11 +1399,11 @@ Block RNProxyInputStream::readImpl([[maybe_unused]] FilterPtr & res_filter, [[ma Stopwatch w{CLOCK_MONOTONIC_COARSE}; UInt64 rows = proxy_helper->cloud_storage_engine_interfaces.fn_read_block(reader.value(), batch_size); duration_read_sec += w.elapsedSecondsFromLastTime(); - LOG_DEBUG(log, "Read {} rows from proxy", rows); + LOG_DEBUG(log, "Read {} rows from columnar", rows); if (rows == std::numeric_limits::max()) { - LOG_WARNING(log, "Read block from proxy failed"); - throw Exception("read_block failed in tiflash-proxy", ErrorCodes::LOGICAL_ERROR); + LOG_WARNING(log, "Read block from columnar failed"); + throw Exception("read_block failed in columnar", ErrorCodes::LOGICAL_ERROR); } if (rows == 0) { @@ -1424,7 +1419,7 @@ Block RNProxyInputStream::readImpl([[maybe_unused]] FilterPtr & res_filter, [[ma TableID physical_table_id = -1; Block header = getHeader(); const ColumnsWithTypeAndName & col_type_and_name = header.getColumnsWithTypeAndName(); - // Construct block from proxy column data. + // Construct block from columnar column data. 
MutableColumns columns = header.cloneEmptyColumns(); for (UInt32 i = 0; i < col_type_and_name.size(); ++i) { @@ -1434,7 +1429,7 @@ Block RNProxyInputStream::readImpl([[maybe_unused]] FilterPtr & res_filter, [[ma col_type_and_name[i].column_id, col_type_and_name[i].name, col_type_and_name[i].type->getName()); - // Read column data from proxy + // Read column data from columnar Int64 col_id = col_type_and_name[i].column_id; if (col_id == MutSup::extra_handle_id) { @@ -1447,7 +1442,7 @@ Block RNProxyInputStream::readImpl([[maybe_unused]] FilterPtr & res_filter, [[ma col, [&](const IDataType::SubstreamPath &) { return &buf; }, rows, - -1.0, // avg_value_size_hint set to -1 to indicate Decimal format from proxy + -1.0, // avg_value_size_hint set to -1 to indicate Decimal format from columnar true, {}); } @@ -1467,7 +1462,7 @@ Block RNProxyInputStream::readImpl([[maybe_unused]] FilterPtr & res_filter, [[ma col, [&](const IDataType::SubstreamPath &) { return &buf; }, rows, - -1.0, // avg_value_size_hint set to -1 to indicate Decimal format from proxy + -1.0, // avg_value_size_hint set to -1 to indicate Decimal format from columnar true, {}); LOG_DEBUG(log, "Read column data done, col size={}", col.size()); @@ -1491,10 +1486,11 @@ Block RNProxyInputStream::readImpl([[maybe_unused]] FilterPtr & res_filter, [[ma } } -// RNProxySourceOp -void RNProxySourceOp::operateSuffixImpl() +// RNColumnarSourceOp +void RNColumnarSourceOp::operateSuffixImpl() { UNUSED(context); + const auto keyspace_id = exec_context.getKeyspaceID(); const double total_cost_sec = total_cost_watch.elapsedSeconds(); const UInt64 rows_per_sec = total_cost_sec > 0 ? static_cast(static_cast(total_rows) / total_cost_sec) : 0; @@ -1502,9 +1498,11 @@ void RNProxySourceOp::operateSuffixImpl() = total_cost_sec > 0 ? 
static_cast(static_cast(total_bytes) / total_cost_sec) : 0; LOG_INFO( log, - "Finished reading proxy snapshots, task_pool_worker_total_cost={:.3f}s claimed_streams={} rows={} " + "Finished reading columnar snapshots, keyspace_id={} task_pool_worker_total_cost={:.3f}s claimed_streams={} " + "rows={} " "rows_per_sec={} " "bytes={} bytes_per_sec={} read_cost={:.3f}s", + keyspace_id, total_cost_sec, total_streams, total_rows, @@ -1514,13 +1512,13 @@ void RNProxySourceOp::operateSuffixImpl() duration_read_sec); } -void RNProxySourceOp::operatePrefixImpl() +void RNColumnarSourceOp::operatePrefixImpl() { total_cost_watch.restart(); - LOG_INFO(log, "Begin reading proxy snapshots"); + LOG_INFO(log, "Begin reading columnar snapshots, keyspace_id={}", exec_context.getKeyspaceID()); } -OperatorStatus RNProxySourceOp::readImpl(Block & block) +OperatorStatus RNColumnarSourceOp::readImpl(Block & block) { if (unlikely(done)) { @@ -1538,7 +1536,7 @@ OperatorStatus RNProxySourceOp::readImpl(Block & block) return awaitImpl(); } -OperatorStatus RNProxySourceOp::awaitImpl() +OperatorStatus RNColumnarSourceOp::awaitImpl() { if (unlikely(done || t_block.has_value())) { @@ -1548,7 +1546,7 @@ OperatorStatus RNProxySourceOp::awaitImpl() return OperatorStatus::IO_IN; } -OperatorStatus RNProxySourceOp::executeIOImpl() +OperatorStatus RNColumnarSourceOp::executeIOImpl() { if (unlikely(done || t_block.has_value())) { diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.h b/dbms/src/Storages/StorageDisaggregatedColumnar.h index 7e02509a529..5c75cc674f9 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.h +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.h @@ -55,7 +55,7 @@ class RSOperator; using RSOperatorPtr = std::shared_ptr; } // namespace DM -enum class RNProxyReaderMaterializeState +enum class RNColumnarReaderMaterializeState { NotStarted, Creating, @@ -64,9 +64,9 @@ enum class RNProxyReaderMaterializeState Consumed, }; -struct RNProxyReaderSharedContext; +struct 
RNColumnarReaderSharedContext; -struct RNProxyReaderPlan +struct RNColumnarReaderPlan { RegionID region_id; RegionVersion region_ver; @@ -74,34 +74,34 @@ struct RNProxyReaderPlan std::vector> physical_table_ranges; }; -struct RNProxyReaderWork +struct RNColumnarReaderWork { - explicit RNProxyReaderWork(RNProxyReaderPlan plan_) + explicit RNColumnarReaderWork(RNColumnarReaderPlan plan_) : plan(std::move(plan_)) {} - ~RNProxyReaderWork(); + ~RNColumnarReaderWork(); - RNProxyReaderPlan plan; + RNColumnarReaderPlan plan; std::mutex mutex; std::condition_variable cv; - RNProxyReaderMaterializeState state = RNProxyReaderMaterializeState::NotStarted; + RNColumnarReaderMaterializeState state = RNColumnarReaderMaterializeState::NotStarted; std::optional reader; std::exception_ptr exception; }; -using RNProxyReaderWorkPtr = std::shared_ptr; +using RNColumnarReaderWorkPtr = std::shared_ptr; -class RNProxyReadTask; -using RNProxyReadTaskPtr = std::shared_ptr; -class RNProxyReadTask +class RNColumnarReadTask; +using RNColumnarReadTaskPtr = std::shared_ptr; +class RNColumnarReadTask : public boost::noncopyable - , public std::enable_shared_from_this + , public std::enable_shared_from_this { public: using RemoteTableRange = std::pair; - static std::vector buildProxyReadTaskWithBackoff( + static std::vector buildColumnarReadTaskWithBackoff( const LoggerPtr & log, const Context & context, UInt64 start_ts, @@ -110,7 +110,7 @@ class RNProxyReadTask const std::vector & remote_table_ranges, unsigned num_streams); - static std::vector buildProxyReadTask( + static std::vector buildColumnarReadTask( const LoggerPtr & log, const Context & context, UInt64 start_ts, @@ -123,18 +123,18 @@ class RNProxyReadTask BlockInputStreamPtr createSharedInputStream(); - BlockInputStreamPtr createInputStream(const RNProxyReaderWorkPtr & reader_work); + BlockInputStreamPtr createInputStream(const RNColumnarReaderWorkPtr & reader_work); - ColumnarReaderPtr createColumnarReaderWithBackoff(const 
RNProxyReaderWorkPtr & reader_work); + ColumnarReaderPtr createColumnarReaderWithBackoff(const RNColumnarReaderWorkPtr & reader_work); - ColumnarReaderPtr getOrCreateReader(const RNProxyReaderWorkPtr & reader_work); + ColumnarReaderPtr getOrCreateReader(const RNColumnarReaderWorkPtr & reader_work); - std::optional tryAcquireReaderWork(); + std::optional tryAcquireReaderWork(); #ifdef DBMS_PUBLIC_GTEST void replaceReaderWorkForTest( - const RNProxyReaderWorkPtr & reader_work, - std::vector replanned_reader_plans); + const RNColumnarReaderWorkPtr & reader_work, + std::vector replanned_reader_plans); #endif size_t getReaderCount() const; @@ -153,33 +153,33 @@ class RNProxyReadTask const String & getExecutorID() const; - RNProxyReadTask( - std::vector reader_plans, + RNColumnarReadTask( + std::vector reader_plans, size_t source_num, - std::shared_ptr shared_reader_context); + std::shared_ptr shared_reader_context); private: void prefetchPendingWork(); - void prefetchReaderWork(const RNProxyReaderWorkPtr & reader_work); + void prefetchReaderWork(const RNColumnarReaderWorkPtr & reader_work); void replaceReaderWork( - const RNProxyReaderWorkPtr & reader_work, - std::vector replanned_reader_plans); + const RNColumnarReaderWorkPtr & reader_work, + std::vector replanned_reader_plans); size_t reader_count; size_t source_num; - std::shared_ptr shared_reader_context; + std::shared_ptr shared_reader_context; mutable std::mutex pending_reader_works_mutex; - std::deque pending_reader_works; + std::deque pending_reader_works; }; -class RNProxyInputStream : public IProfilingBlockInputStream +class RNColumnarInputStream : public IProfilingBlockInputStream { static constexpr auto NAME = "RNProxy"; public: - ~RNProxyInputStream(); + ~RNColumnarInputStream(); String getName() const { return NAME; } Block getHeader() const { return header; } @@ -195,15 +195,15 @@ class RNProxyInputStream : public IProfilingBlockInputStream { const Context & context; LoggerPtr log; - RNProxyReadTaskPtr 
task; - RNProxyReaderWorkPtr reader_work; + RNColumnarReadTaskPtr task; + RNColumnarReaderWorkPtr reader_work; const DM::ColumnDefines & columns_to_read; int extra_table_id_index; TableID table_id; const String & executor_id; }; - explicit RNProxyInputStream(const Options & options) + explicit RNColumnarInputStream(const Options & options) : context(options.context) , log(options.log) , task(options.task) @@ -216,7 +216,8 @@ class RNProxyInputStream : public IProfilingBlockInputStream setHeader(action.getHeader()); } - static BlockInputStreamPtr create(const Options & options) { return std::make_shared(options); } + static BlockInputStreamPtr create(const Options & options) + { return std::make_shared(options); } private: bool ensureReader(); @@ -224,9 +225,9 @@ class RNProxyInputStream : public IProfilingBlockInputStream const Context & context; const LoggerPtr log; - RNProxyReadTaskPtr task; - const RNProxyReaderWorkPtr fixed_reader_work; - RNProxyReaderWorkPtr current_reader_work; + RNColumnarReadTaskPtr task; + const RNColumnarReaderWorkPtr fixed_reader_work; + RNColumnarReaderWorkPtr current_reader_work; std::optional reader; AddExtraTableIDColumnTransformAction action; TableID table_id; @@ -241,7 +242,7 @@ class RNProxyInputStream : public IProfilingBlockInputStream UInt64 total_bytes = 0; }; -class RNProxySourceOp : public SourceOp +class RNColumnarSourceOp : public SourceOp { static constexpr auto NAME = "RNProxy"; @@ -249,10 +250,10 @@ class RNProxySourceOp : public SourceOp struct Options { PipelineExecutorContext & exec_context; - RNProxyReadTaskPtr task; + RNColumnarReadTaskPtr task; }; - explicit RNProxySourceOp(const Options & options) + explicit RNColumnarSourceOp(const Options & options) : SourceOp(options.exec_context, options.task->getLog()->identifier()) , context(options.task->getContext()) , log(options.task->getLog()) @@ -263,7 +264,7 @@ class RNProxySourceOp : public SourceOp options.task->getExtraTableIDIndex())); } - static SourceOpPtr 
create(const Options & options) { return std::make_unique(options); } + static SourceOpPtr create(const Options & options) { return std::make_unique(options); } String getName() const override { return NAME; } @@ -283,7 +284,7 @@ class RNProxySourceOp : public SourceOp private: const Context & context; const LoggerPtr log; - RNProxyReadTaskPtr task; + RNColumnarReadTaskPtr task; UInt64 total_bytes = 0; size_t total_rows = 0; size_t total_streams = 0; From 38054304685dfa619150afc59efd01ced4895bbb Mon Sep 17 00:00:00 2001 From: yongman Date: Thu, 11 Jun 2026 09:55:03 +0800 Subject: [PATCH 18/19] address comments Signed-off-by: yongman --- .../hub-runtime/src/cloud_helper.rs | 4 +++- dbms/src/Storages/StorageDisaggregatedColumnar.cpp | 14 +++++++++++--- dbms/src/Storages/StorageDisaggregatedColumnar.h | 4 +++- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs b/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs index babece96d2b..dc924d8948c 100644 --- a/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs +++ b/contrib/tiflash-columnar-hub/hub-runtime/src/cloud_helper.rs @@ -207,7 +207,9 @@ impl PdClientWithCache { } } - let Some(bucket_stat) = self.pd_client.get_buckets(region_id) else { + let Ok(Some(bucket_stat)) = + futures::executor::block_on(self.pd_client.get_buckets_async(region_id)) + else { self.region_bucket_cache.remove(&region_id); return Vec::new(); }; diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp index 29586f0c166..c77f23b1c8b 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp @@ -208,9 +208,17 @@ BucketSplitResult splitRangesByBucketKeys( bool current_range_split = false; for (const auto & bucket_key : bucket_keys) { - const auto decoded_bucket_key - = RecordKVFormat::decodeTiKVKey(TiKVKey(bucket_key.data(), 
bucket_key.size())); - String normalized_bucket_key(decoded_bucket_key.data(), decoded_bucket_key.size()); + String normalized_bucket_key; + try + { + const auto decoded_bucket_key + = RecordKVFormat::decodeTiKVKey(TiKVKey(bucket_key.data(), bucket_key.size())); + normalized_bucket_key.assign(decoded_bucket_key.data(), decoded_bucket_key.size()); + } + catch (...) + { + continue; + } if (!isBucketBoundaryInsideRange(normalized_bucket_key, range)) continue; result.units.emplace_back( diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.h b/dbms/src/Storages/StorageDisaggregatedColumnar.h index 5c75cc674f9..113193510b7 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.h +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.h @@ -217,7 +217,9 @@ class RNColumnarInputStream : public IProfilingBlockInputStream } static BlockInputStreamPtr create(const Options & options) - { return std::make_shared(options); } + { + return std::make_shared(options); + } private: bool ensureReader(); From 1a7dbdefa9a1977fa3c9c23807b76ba5560e49a5 Mon Sep 17 00:00:00 2001 From: JaySon-Huang Date: Thu, 11 Jun 2026 16:38:48 +0800 Subject: [PATCH 19/19] add comments and fix lint Signed-off-by: JaySon-Huang --- .../Storages/StorageDisaggregatedColumnar.cpp | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp index c77f23b1c8b..fbc93f9d195 100644 --- a/dbms/src/Storages/StorageDisaggregatedColumnar.cpp +++ b/dbms/src/Storages/StorageDisaggregatedColumnar.cpp @@ -211,6 +211,11 @@ BucketSplitResult splitRangesByBucketKeys( String normalized_bucket_key; try { + // Bucket boundaries from PD are TiKV encoded keys. Empty region boundaries and + // malformed non-empty keys are both possible invalid split points, and length + // checks alone cannot validate TiKV memcomparable encoding markers/padding. 
+ // Skip only the bad boundary so the original range is still covered by a + // coarser reader plan. const auto decoded_bucket_key = RecordKVFormat::decodeTiKVKey(TiKVKey(bucket_key.data(), bucket_key.size())); normalized_bucket_key.assign(decoded_bucket_key.data(), decoded_bucket_key.size()); @@ -734,15 +739,14 @@ ColumnarReaderPtr createColumnarReader( tables_range_data.append(reinterpret_cast(&ranges_data_size), sizeof(ranges_data_size)); tables_range_data.append(ranges_data.data(), ranges_data.size()); } - BaseBuffView tables_range_view = BaseBuffView{tables_range_data.data(), tables_range_data.size()}; - BaseBuffView columns = BaseBuffView{shared_context.table_info_data.data(), shared_context.table_info_data.size()}; - BaseBuffView filter_conditions_view + auto tables_range_view = BaseBuffView{tables_range_data.data(), tables_range_data.size()}; + auto columns = BaseBuffView{shared_context.table_info_data.data(), shared_context.table_info_data.size()}; + auto filter_conditions_view = BaseBuffView{shared_context.filter_conditions_data.data(), shared_context.filter_conditions_data.size()}; - BaseBuffView table_scan_view - = BaseBuffView{shared_context.table_scan_data.data(), shared_context.table_scan_data.size()}; - BaseBuffView ann_query_info_view + auto table_scan_view = BaseBuffView{shared_context.table_scan_data.data(), shared_context.table_scan_data.size()}; + auto ann_query_info_view = BaseBuffView{shared_context.ann_query_info_data.data(), shared_context.ann_query_info_data.size()}; - BaseBuffView fts_query_info_view + auto fts_query_info_view = BaseBuffView{shared_context.fts_query_info_data.data(), shared_context.fts_query_info_data.size()}; const Context & global_ctx = context.getGlobalContext(); auto * cluster = global_ctx.getTMTContext().getKVCluster(); @@ -787,7 +791,7 @@ ColumnarReaderPtr createColumnarReader( region_id_ver = std::to_string(region.id()) + ":" + std::to_string(reader_plan.region_ver) + ":" + 
std::to_string(region.region_epoch().conf_ver()); } - auto _guard = std::lock_guard(*shared_context.output_lock); + auto guard = std::lock_guard(*shared_context.output_lock); cluster->region_cache->dropRegion(region_ver_id); LOG_WARNING( log, @@ -821,7 +825,7 @@ ColumnarReaderPtr createColumnarReader( std::to_string(reader_plan.region_id), region_error.ShortDebugString()); } - auto _guard = std::lock_guard(*shared_context.output_lock); + auto guard = std::lock_guard(*shared_context.output_lock); cluster->region_cache->dropRegion(region_ver_id); throw RegionException( std::move(unavailable_regions), @@ -838,7 +842,7 @@ ColumnarReaderPtr createColumnarReader( pingcap::kv::Backoffer bo(pingcap::kv::copNextMaxBackoff); std::vector pushed; std::vector locks{std::make_shared(lock_info)}; - auto _guard = std::lock_guard(*shared_context.output_lock); + auto guard = std::lock_guard(*shared_context.output_lock); auto before_expired = cluster->lock_resolver->resolveLocks(bo, shared_context.start_ts, locks, pushed); LOG_WARNING(log, "Finished resolve locks, before_expired={}", before_expired); throw Exception("lock error", ErrorCodes::COLUMNAR_SNAPSHOT_ERROR); @@ -937,6 +941,8 @@ void RNColumnarReadTask::replaceReaderWork( if (replanned_reader_plans.size() == 1) return; + // If the original range now spans multiple regions, enqueue the remaining partitions for + // other sources. These ranges are produced by re-splitting the failed work's own key ranges. auto queue_guard = std::lock_guard(pending_reader_works_mutex); for (auto it = replanned_reader_plans.rbegin(); it != replanned_reader_plans.rend() - 1; ++it) pending_reader_works.push_front(std::make_shared(*it)); @@ -974,6 +980,9 @@ ColumnarReaderPtr RNColumnarReadTask::createColumnarReaderWithBackoff(const RNCo { try { + // Replan only the key ranges owned by this failed work. Dropping the stale + // region cache happens before this exception, so this locate pass can pick up + // the latest region epoch and split layout. 
auto replanned_region_reader_plans = buildRegionReaderPlansFromPhysicalTableRanges( getLog(), getContext(), @@ -1263,9 +1272,8 @@ std::vector RNColumnarReadTask::buildColumnarReadTask( std::vector all_reader_plans; all_reader_plans.reserve(total_max_reader_num); - for (size_t i = 0; i < region_reader_plans.size(); ++i) + for (const auto & plan : region_reader_plans) { - const auto & plan = region_reader_plans[i]; if (plan.bucket_units.empty()) { all_reader_plans.push_back(RNColumnarReaderPlan{