From 32bde44b5cf5a1af441334a74a087bd7559abadc Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Wed, 20 May 2026 14:24:28 +0800 Subject: [PATCH 1/2] forbid reserve() to prevent panic in clippy --- clippy.toml | 1 + datafusion/functions-aggregate/src/median.rs | 9 +++++++-- .../functions-aggregate/src/percentile_cont.rs | 10 ++++++++-- datafusion/functions/src/string/common.rs | 4 ++++ datafusion/functions/src/strings.rs | 8 ++++++++ datafusion/physical-plan/src/recursive_query.rs | 13 +++++++++++-- datafusion/spark/src/function/math/hex.rs | 12 ++++++++++-- datafusion/spark/src/function/math/unhex.rs | 11 +++++++++-- 8 files changed, 58 insertions(+), 10 deletions(-) diff --git a/clippy.toml b/clippy.toml index ea3609b574c06..7b781d9b6605f 100644 --- a/clippy.toml +++ b/clippy.toml @@ -1,6 +1,7 @@ disallowed-methods = [ { path = "tokio::task::spawn", reason = "To provide cancel-safety, use `SpawnedTask::spawn` instead (https://github.com/apache/datafusion/issues/6513)" }, { path = "tokio::task::spawn_blocking", reason = "To provide cancel-safety, use `SpawnedTask::spawn_blocking` instead (https://github.com/apache/datafusion/issues/6513)" }, + { path = "std::vec::Vec::reserve", reason = "Use `Vec::try_reserve` so allocation failures can be reported instead of panicking", replacement = "try_reserve" }, ] disallowed-types = [ diff --git a/datafusion/functions-aggregate/src/median.rs b/datafusion/functions-aggregate/src/median.rs index 02a49ab6dcca0..e7e7d03937f12 100644 --- a/datafusion/functions-aggregate/src/median.rs +++ b/datafusion/functions-aggregate/src/median.rs @@ -41,7 +41,7 @@ use arrow::datatypes::{ use datafusion_common::types::{NativeType, logical_float64}; use datafusion_common::{ - DataFusionError, Result, ScalarValue, assert_eq_or_internal_err, + DataFusionError, Result, ScalarValue, assert_eq_or_internal_err, exec_datafusion_err, internal_datafusion_err, }; use datafusion_expr::function::StateFieldsArgs; @@ -282,7 +282,12 @@ impl Accumulator for MedianAccumulator { fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { let values = values[0].as_primitive::(); - self.all_values.reserve(values.len() - values.null_count()); + let additional = values.len() - values.null_count(); + self.all_values.try_reserve(additional).map_err(|e| { + exec_datafusion_err!( + "failed to reserve {additional} values for median accumulator: {e}" + ) + })?; self.all_values.extend(values.iter().flatten()); Ok(()) } diff --git a/datafusion/functions-aggregate/src/percentile_cont.rs b/datafusion/functions-aggregate/src/percentile_cont.rs index 256388c216f00..714988bde2acf 100644 --- a/datafusion/functions-aggregate/src/percentile_cont.rs +++ b/datafusion/functions-aggregate/src/percentile_cont.rs @@ -38,7 +38,8 @@ use datafusion_functions_aggregate_common::noop_accumulator::NoopAccumulator; use crate::min_max::{max_udaf, min_udaf}; use datafusion_common::{ - Result, ScalarValue, internal_datafusion_err, utils::take_function_args, + Result, ScalarValue, exec_datafusion_err, internal_datafusion_err, + utils::take_function_args, }; use datafusion_expr::utils::format_state_name; use datafusion_expr::{ @@ -420,7 +421,12 @@ where fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { let values = values[0].as_primitive::(); - self.all_values.reserve(values.len() - values.null_count()); + let additional = values.len() - values.null_count(); + self.all_values.try_reserve(additional).map_err(|e| { + exec_datafusion_err!( + "failed to reserve {additional} values for percentile_cont accumulator: {e}" + ) + })?; self.all_values.extend(values.iter().flatten()); Ok(()) } diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 2732ba4f86ef2..355546ae7f925 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -520,6 +520,10 @@ fn case_conversion_utf8view_ascii_inner u8>( block_size = block_size.saturating_mul(2); } let to_reserve = len.max(block_size as usize); + #[expect( + clippy::disallowed_methods, + reason = "StringView block_size bounds growth, so reserve cannot overflow capacity" + )] in_progress.reserve(to_reserve); } diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index 1d02def4765cc..b524f4485c6aa 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -703,6 +703,10 @@ impl StringViewArrayBuilder { if self.in_progress.capacity() < required_cap { self.flush_in_progress(); let to_reserve = (length as usize).max(self.next_block_size() as usize); + #[expect( + clippy::disallowed_methods, + reason = "StringView block_size bounds growth, so reserve cannot overflow capacity" + )] self.in_progress.reserve(to_reserve); } @@ -730,6 +734,10 @@ impl StringViewArrayBuilder { if self.in_progress.capacity() < required_cap { self.flush_in_progress(); let to_reserve = (length as usize).max(self.next_block_size() as usize); + #[expect( + clippy::disallowed_methods, + reason = "StringView block_size bounds growth, so reserve cannot overflow capacity" + )] self.in_progress.reserve(to_reserve); } } diff --git a/datafusion/physical-plan/src/recursive_query.rs b/datafusion/physical-plan/src/recursive_query.rs index c160f9a0dc763..15a84ba527c52 100644 --- a/datafusion/physical-plan/src/recursive_query.rs +++ b/datafusion/physical-plan/src/recursive_query.rs @@ -39,7 +39,9 @@ use arrow::datatypes::{Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use datafusion_common::tree_node::TreeNodeRecursion; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; -use datafusion_common::{Result, internal_datafusion_err, not_impl_err}; +use datafusion_common::{ + Result, exec_datafusion_err, internal_datafusion_err, not_impl_err, +}; use datafusion_execution::TaskContext; use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation}; use datafusion_physical_expr::PhysicalExpr; @@ -489,7 +491,14 @@ impl DistinctDeduplicator { /// We also detect duplicates by enforcing that group ids are increasing. fn deduplicate(&mut self, batch: &RecordBatch) -> Result { let size_before = self.group_values.len(); - self.intern_output_buffer.reserve(batch.num_rows()); + let additional = batch.num_rows(); + self.intern_output_buffer + .try_reserve(additional) + .map_err(|e| { + exec_datafusion_err!( + "failed to reserve {additional} recursive query group ids: {e}" + ) + })?; self.group_values .intern(batch.columns(), &mut self.intern_output_buffer)?; let mask = new_groups_mask(&self.intern_output_buffer, size_before); diff --git a/datafusion/spark/src/function/math/hex.rs b/datafusion/spark/src/function/math/hex.rs index 22e0b5b0786ea..55c9cda63c888 100644 --- a/datafusion/spark/src/function/math/hex.rs +++ b/datafusion/spark/src/function/math/hex.rs @@ -31,7 +31,7 @@ use datafusion_common::utils::take_function_args; use datafusion_common::{ DataFusionError, cast::{as_binary_array, as_fixed_size_binary_array, as_int64_array}, - exec_err, + exec_datafusion_err, exec_err, }; use datafusion_expr::{ Coercion, ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, @@ -178,7 +178,15 @@ where if let Some(b) = v { let bytes = b.as_ref(); buffer.clear(); - buffer.reserve(bytes.len() * 2); + let additional = bytes + .len() + .checked_mul(2) + .ok_or_else(|| exec_datafusion_err!("hex output size overflow"))?; + buffer.try_reserve(additional).map_err(|e| { + exec_datafusion_err!( + "failed to reserve {additional} bytes for hex output: {e}" + ) + })?; for &byte in bytes { buffer.extend_from_slice(&lookup[byte as usize]); } diff --git a/datafusion/spark/src/function/math/unhex.rs b/datafusion/spark/src/function/math/unhex.rs index f6c9e2fa27a67..6739e6a15c582 100644 --- a/datafusion/spark/src/function/math/unhex.rs +++ b/datafusion/spark/src/function/math/unhex.rs @@ -22,7 +22,9 @@ use datafusion_common::cast::{ }; use datafusion_common::types::logical_string; use datafusion_common::utils::take_function_args; -use datafusion_common::{DataFusionError, Result, ScalarValue, exec_err}; +use datafusion_common::{ + DataFusionError, Result, ScalarValue, exec_datafusion_err, exec_err, +}; use datafusion_expr::{ Coercion, ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignatureClass, Volatility, @@ -125,7 +127,12 @@ where for v in iter { if let Some(s) = v { buffer.clear(); - buffer.reserve(s.as_ref().len().div_ceil(2)); + let additional = s.as_ref().len().div_ceil(2); + buffer.try_reserve(additional).map_err(|e| { + exec_datafusion_err!( + "failed to reserve {additional} bytes for unhex output: {e}" + ) + })?; if unhex_common(s.as_ref().as_bytes(), &mut buffer) { builder.append_value(&buffer); } else { From be0576aab78652c0b511e764768bbd786e287870 Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Sat, 23 May 2026 09:57:31 +0800 Subject: [PATCH 2/2] review: better doc --- datafusion/functions/src/string/common.rs | 2 +- datafusion/functions/src/strings.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 355546ae7f925..6ecd41b0b9a5c 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -522,7 +522,7 @@ fn case_conversion_utf8view_ascii_inner u8>( let to_reserve = len.max(block_size as usize); #[expect( clippy::disallowed_methods, - reason = "StringView block_size bounds growth, so reserve cannot overflow capacity" + reason = "StringView's block size bounds growth, so reserve cannot overflow capacity arithmetically. This hot loop intentionally avoids the extra `try_reserve` checks. It remains subject to allocator failure/OOM, which must be managed externally." )] in_progress.reserve(to_reserve); } diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index b524f4485c6aa..144d567f5be0a 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -705,7 +705,7 @@ impl StringViewArrayBuilder { let to_reserve = (length as usize).max(self.next_block_size() as usize); #[expect( clippy::disallowed_methods, - reason = "StringView block_size bounds growth, so reserve cannot overflow capacity" + reason = "StringView's block size bounds growth, so reserve cannot overflow capacity arithmetically. This hot loop intentionally avoids the extra `try_reserve` checks. It remains subject to allocator failure/OOM, which must be managed externally." )] self.in_progress.reserve(to_reserve); } @@ -736,7 +736,7 @@ impl StringViewArrayBuilder { let to_reserve = (length as usize).max(self.next_block_size() as usize); #[expect( clippy::disallowed_methods, - reason = "StringView block_size bounds growth, so reserve cannot overflow capacity" + reason = "StringView's block size bounds growth, so reserve cannot overflow capacity arithmetically. This hot loop intentionally avoids the extra `try_reserve` checks. It remains subject to allocator failure/OOM, which must be managed externally." )] self.in_progress.reserve(to_reserve); }