From 37768b8aadd0e9de543029778814d585588ff64f Mon Sep 17 00:00:00 2001
From: pchintar <89355405+pchintar@users.noreply.github.com>
Date: Sat, 13 Jun 2026 07:04:43 +0530
Subject: [PATCH 01/39] fix: Enable sliding window execution for covar_pop,
covar_samp, and corr (#22764)
## Which issue does this PR close?
- Closes #22763 .
## Rationale for this change
Bounded sliding window queries using `covar_pop`, `covar_samp`, and
`corr` currently fail with a `retract_batch is not implemented` error,
preventing these aggregates from being used with sliding window frames.
## What changes are included in this PR?
* Included `supports_retract_batch()` for the covariance and correlation
accumulators.
* Added SQL logic tests covering bounded sliding window execution for
covariance and correlation aggregates.
## Are these changes tested?
Yes.
Added SQL logic tests covering:
* Single-row bounded sliding frames
* Multi-row bounded sliding frames
for `covar_pop`, `covar_samp`, and `corr`.
## Are there any user-facing changes?
Yes.
`covar_pop`, `covar_samp`, and `corr` can now be used with bounded
sliding window frames that previously failed. Also, no changes were made
to any public APIs.
---
.../functions-aggregate/src/correlation.rs | 4 +
.../functions-aggregate/src/covariance.rs | 12 ++
.../functions-aggregate/src/variance.rs | 7 +
datafusion/sqllogictest/test_files/window.slt | 136 ++++++++++++++++++
4 files changed, 159 insertions(+)
diff --git a/datafusion/functions-aggregate/src/correlation.rs b/datafusion/functions-aggregate/src/correlation.rs
index 2621fcf0bf3c7..5a95cfe8320fc 100644
--- a/datafusion/functions-aggregate/src/correlation.rs
+++ b/datafusion/functions-aggregate/src/correlation.rs
@@ -281,6 +281,10 @@ impl Accumulator for CorrelationAccumulator {
self.stddev2.retract_batch(&values[1..2])?;
Ok(())
}
+
+ fn supports_retract_batch(&self) -> bool {
+ true
+ }
}
#[derive(Default)]
diff --git a/datafusion/functions-aggregate/src/covariance.rs b/datafusion/functions-aggregate/src/covariance.rs
index 18d602ab33940..bd7c8a039076a 100644
--- a/datafusion/functions-aggregate/src/covariance.rs
+++ b/datafusion/functions-aggregate/src/covariance.rs
@@ -305,6 +305,14 @@ impl Accumulator for CovarianceAccumulator {
_ => continue,
};
+ if self.count <= 1 {
+ self.count = 0;
+ self.mean1 = 0.0;
+ self.mean2 = 0.0;
+ self.algo_const = 0.0;
+ continue;
+ }
+
let new_count = self.count - 1;
let delta1 = self.mean1 - value1;
let new_mean1 = delta1 / new_count as f64 + self.mean1;
@@ -373,4 +381,8 @@ impl Accumulator for CovarianceAccumulator {
fn size(&self) -> usize {
size_of_val(self)
}
+
+ fn supports_retract_batch(&self) -> bool {
+ true
+ }
}
diff --git a/datafusion/functions-aggregate/src/variance.rs b/datafusion/functions-aggregate/src/variance.rs
index ce3e00b9ffd91..d5fddf01f2d52 100644
--- a/datafusion/functions-aggregate/src/variance.rs
+++ b/datafusion/functions-aggregate/src/variance.rs
@@ -348,6 +348,13 @@ impl Accumulator for VarianceAccumulator {
fn retract_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
let arr = as_float64_array(&values[0])?;
for value in arr.iter().flatten() {
+ if self.count <= 1 {
+ self.count = 0;
+ self.mean = 0.0;
+ self.m2 = 0.0;
+ continue;
+ }
+
let new_count = self.count - 1;
let delta1 = self.mean - value;
let new_mean = delta1 / new_count as f64 + self.mean;
diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt
index 1b51950a70e1b..59b43d6476571 100644
--- a/datafusion/sqllogictest/test_files/window.slt
+++ b/datafusion/sqllogictest/test_files/window.slt
@@ -6626,6 +6626,142 @@ ORDER BY i;
3 1
4 NULL
+# Covariance/correlation sliding-window regression test. Verifies correct
+# results across row removals and a NULL-gap empty-frame transition.
+query IRRR
+SELECT
+ column1,
+ covar_pop(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ ),
+ covar_samp(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ ),
+ corr(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ )
+FROM (
+ VALUES
+ (1, 10.0, 5.0),
+ (2, NULL, NULL),
+ (3, NULL, NULL),
+ (4, 30.0, 10.0),
+ (5, 40.0, 20.0),
+ (6, 50.0, 10.0)
+);
+----
+1 0 NULL NULL
+2 0 NULL NULL
+3 NULL NULL NULL
+4 0 NULL NULL
+5 25 50 1
+6 -25 -50 -1
+
+# Multi-row covariance/correlation sliding-window regression test. Verifies
+# correct accumulation when valid rows enter the frame after a reset.
+query IRRR
+SELECT
+ column1,
+ covar_pop(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
+ ),
+ covar_samp(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
+ ),
+ corr(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
+ )
+FROM (
+ VALUES
+ (1, 10.0, 5.0),
+ (2, NULL, NULL),
+ (3, NULL, NULL),
+ (4, 30.0, 10.0),
+ (5, 40.0, 20.0),
+ (6, 50.0, 10.0)
+);
+----
+1 0 NULL NULL
+2 0 NULL NULL
+3 0 NULL NULL
+4 0 NULL NULL
+5 25 50 1
+6 0 0 0
+
+# Covariance/correlation sliding-window regression test. Rows with NULL in
+# either input column must not contribute to the aggregate state.
+query IRRR
+SELECT
+ column1,
+ covar_pop(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
+ ),
+ covar_samp(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
+ ),
+ corr(column2, column3) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 3 PRECEDING AND CURRENT ROW
+ )
+FROM (
+ VALUES
+ (1, 10.0, 5.0),
+ (2, 20.0, NULL),
+ (3, NULL, 15.0),
+ (4, 30.0, 10.0),
+ (5, 40.0, 20.0)
+);
+----
+1 0 NULL NULL
+2 0 NULL NULL
+3 0 NULL NULL
+4 25 50 1
+5 25 50 1
+
+# Variance/stddev sliding-window regression test. Verifies that retracting
+# the last valid row resets the aggregate state.
+query IRRRR
+SELECT
+ column1,
+ var_pop(column2) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ ),
+ var_samp(column2) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ ),
+ stddev_pop(column2) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ ),
+ stddev_samp(column2) OVER (
+ ORDER BY column1
+ ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+ )
+FROM (
+ VALUES
+ (1, 10.0),
+ (2, NULL),
+ (3, NULL),
+ (4, 30.0),
+ (5, 40.0)
+);
+----
+1 0 NULL 0 NULL
+2 0 NULL 0 NULL
+3 NULL NULL NULL NULL
+4 0 NULL 0 NULL
+5 25 50 5 7.071067811865
+
# Decimal variant — the integer-division path would otherwise panic on an
# empty frame.
query IR
From f9317284cf56c517742c212fe893af05ff955a23 Mon Sep 17 00:00:00 2001
From: Kumar Ujjawal
Date: Sat, 13 Jun 2026 07:06:39 +0530
Subject: [PATCH 02/39] fix: handle `date_bin` negative subsecond and overflow
cases (#22610)
## Which issue does this PR close?
- Closes #22528
## Rationale for this change
`date_bin` had a few edge cases that could return the wrong result,
return an error only on array inputs, or panic/wrap when scaling
timestamp and time values to nanoseconds.
## What changes are included in this PR?
- Fix negative sub-second timestamp conversion before the epoch.
- Make scalar and array paths return `NULL` consistently for per-row
binning errors.
- Use checked scaling when converting timestamp and time values to
nanoseconds.
- Return an error for invalid shared origin values that overflow during
scaling.
- Simplify duplicated stride and scale handling.
## Are these changes tested?
Yes
## Are there any user-facing changes?
No public API changes.
---
datafusion/functions/src/datetime/date_bin.rs | 439 +++++++++---------
.../test_files/date_bin_errors.slt | 28 +-
2 files changed, 250 insertions(+), 217 deletions(-)
diff --git a/datafusion/functions/src/datetime/date_bin.rs b/datafusion/functions/src/datetime/date_bin.rs
index 38b491e42bcbd..06ffd8ba5b3c6 100644
--- a/datafusion/functions/src/datetime/date_bin.rs
+++ b/datafusion/functions/src/datetime/date_bin.rs
@@ -34,9 +34,7 @@ use arrow::datatypes::{
use arrow::error::ArrowError;
use arrow::temporal_conversions::NANOSECONDS_IN_DAY;
use datafusion_common::cast::as_primitive_array;
-use datafusion_common::{
- DataFusionError, Result, ScalarValue, exec_err, not_impl_err, plan_err,
-};
+use datafusion_common::{Result, ScalarValue, exec_err, not_impl_err, plan_err};
use datafusion_expr::TypeSignature::Exact;
use datafusion_expr::sort_properties::{ExprProperties, SortProperties};
use datafusion_expr::{
@@ -420,14 +418,44 @@ fn date_bin_months_interval(stride_months: i64, source: i64, origin: i64) -> Res
}
fn to_utc_date_time(nanos: i64) -> Result> {
- let secs = nanos / NANOS_PER_SEC;
- let nsec = (nanos % NANOS_PER_SEC) as u32;
+ // Keep negative sub-second values normalized as seconds + non-negative nanos.
+ let secs = nanos.div_euclid(NANOS_PER_SEC);
+ let nsec = nanos.rem_euclid(NANOS_PER_SEC) as u32;
match DateTime::from_timestamp(secs, nsec) {
Some(dt) => Ok(dt),
None => exec_err!("Invalid timestamp value"),
}
}
+fn timestamp_scale() -> i64 {
+ match T::UNIT {
+ Nanosecond => 1,
+ Microsecond => NANOS_PER_MICRO,
+ Millisecond => NANOS_PER_MILLI,
+ Second => NANOSECONDS,
+ }
+}
+
+// Scale to nanoseconds and report overflow as a normal error.
+fn checked_scale_to_nanos(x: i64, scale: i64) -> Result {
+ match x.checked_mul(scale) {
+ Some(scaled) => Ok(scaled),
+ None => exec_err!("date_bin timestamp value {x} * scale {scale} overflows i64"),
+ }
+}
+
+fn validate_time_stride(stride: &Interval) -> Result<()> {
+ match stride {
+ Interval::Months(m) if *m > 0 => {
+ exec_err!("DATE_BIN stride for TIME input must be less than 1 day")
+ }
+ Interval::Nanoseconds(ns) if *ns >= NANOSECONDS_IN_DAY => {
+ exec_err!("DATE_BIN stride for TIME input must be less than 1 day")
+ }
+ _ => Ok(()),
+ }
+}
+
// Supported intervals:
// 1. IntervalDayTime: this means that the stride is in days, hours, minutes, seconds and milliseconds
// We will assume month interval won't be converted into this type
@@ -498,83 +526,20 @@ fn date_bin_impl(
(*v, false)
}
ColumnarValue::Scalar(ScalarValue::Time32Millisecond(Some(v))) => {
- match stride {
- Interval::Months(m) => {
- if m > 0 {
- return exec_err!(
- "DATE_BIN stride for TIME input must be less than 1 day"
- );
- }
- }
- Interval::Nanoseconds(ns) => {
- if ns >= NANOSECONDS_IN_DAY {
- return exec_err!(
- "DATE_BIN stride for TIME input must be less than 1 day"
- );
- }
- }
- }
-
- (*v as i64 * NANOS_PER_MILLI, true)
+ validate_time_stride(&stride)?;
+ // TIME origins can come from reinterpret casts, so scale defensively.
+ (checked_scale_to_nanos(*v as i64, NANOS_PER_MILLI)?, true)
}
ColumnarValue::Scalar(ScalarValue::Time32Second(Some(v))) => {
- match stride {
- Interval::Months(m) => {
- if m > 0 {
- return exec_err!(
- "DATE_BIN stride for TIME input must be less than 1 day"
- );
- }
- }
- Interval::Nanoseconds(ns) => {
- if ns >= NANOSECONDS_IN_DAY {
- return exec_err!(
- "DATE_BIN stride for TIME input must be less than 1 day"
- );
- }
- }
- }
-
- (*v as i64 * NANOS_PER_SEC, true)
+ validate_time_stride(&stride)?;
+ (checked_scale_to_nanos(*v as i64, NANOS_PER_SEC)?, true)
}
ColumnarValue::Scalar(ScalarValue::Time64Microsecond(Some(v))) => {
- match stride {
- Interval::Months(m) => {
- if m > 0 {
- return exec_err!(
- "DATE_BIN stride for TIME input must be less than 1 day"
- );
- }
- }
- Interval::Nanoseconds(ns) => {
- if ns >= NANOSECONDS_IN_DAY {
- return exec_err!(
- "DATE_BIN stride for TIME input must be less than 1 day"
- );
- }
- }
- }
-
- (*v * NANOS_PER_MICRO, true)
+ validate_time_stride(&stride)?;
+ (checked_scale_to_nanos(*v, NANOS_PER_MICRO)?, true)
}
ColumnarValue::Scalar(ScalarValue::Time64Nanosecond(Some(v))) => {
- match stride {
- Interval::Months(m) => {
- if m > 0 {
- return exec_err!(
- "DATE_BIN stride for TIME input must be less than 1 day"
- );
- }
- }
- Interval::Nanoseconds(ns) => {
- if ns >= NANOSECONDS_IN_DAY {
- return exec_err!(
- "DATE_BIN stride for TIME input must be less than 1 day"
- );
- }
- }
- }
-
+ validate_time_stride(&stride)?;
(*v, true)
}
ColumnarValue::Scalar(v) => {
@@ -597,91 +562,49 @@ fn date_bin_impl(
return exec_err!("DATE_BIN stride must be non-zero");
}
- fn timestamp_scale() -> i64 {
- match T::UNIT {
- Nanosecond => 1,
- Microsecond => NANOS_PER_MICRO,
- Millisecond => NANOS_PER_MILLI,
- Second => NANOSECONDS,
- }
- }
-
- fn timestamp_scale_overflow_error(x: i64) -> DataFusionError {
- DataFusionError::Execution(format!(
- "DATE_BIN source timestamp {x} cannot be represented in nanoseconds"
- ))
+ fn transform_scalar_with_stride(
+ value: Option,
+ origin: i64,
+ stride: i64,
+ stride_fn: BinFunction,
+ ) -> Option {
+ let scale = timestamp_scale::();
+ value
+ .and_then(|val| val.checked_mul(scale))
+ .and_then(|scaled| stride_fn(stride, scaled, origin).ok())
+ .map(|binned| binned / scale)
}
Ok(match array {
ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(v, tz_opt)) => {
- let scale = timestamp_scale::();
ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(
- match *v {
- Some(val) => {
- let scaled = val
- .checked_mul(scale)
- .ok_or_else(|| timestamp_scale_overflow_error(val))?;
- match stride_fn(stride, scaled, origin) {
- Ok(result) => Some(result / scale),
- Err(_) => None,
- }
- }
- None => None,
- },
+ transform_scalar_with_stride::(
+ *v, origin, stride, stride_fn,
+ ),
tz_opt.clone(),
))
}
ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(v, tz_opt)) => {
- let scale = timestamp_scale::();
ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(
- match *v {
- Some(val) => {
- let scaled = val
- .checked_mul(scale)
- .ok_or_else(|| timestamp_scale_overflow_error(val))?;
- match stride_fn(stride, scaled, origin) {
- Ok(result) => Some(result / scale),
- Err(_) => None,
- }
- }
- None => None,
- },
+ transform_scalar_with_stride::(
+ *v, origin, stride, stride_fn,
+ ),
tz_opt.clone(),
))
}
ColumnarValue::Scalar(ScalarValue::TimestampMillisecond(v, tz_opt)) => {
- let scale = timestamp_scale::();
ColumnarValue::Scalar(ScalarValue::TimestampMillisecond(
- match *v {
- Some(val) => {
- let scaled = val
- .checked_mul(scale)
- .ok_or_else(|| timestamp_scale_overflow_error(val))?;
- match stride_fn(stride, scaled, origin) {
- Ok(result) => Some(result / scale),
- Err(_) => None,
- }
- }
- None => None,
- },
+ transform_scalar_with_stride::(
+ *v, origin, stride, stride_fn,
+ ),
tz_opt.clone(),
))
}
ColumnarValue::Scalar(ScalarValue::TimestampSecond(v, tz_opt)) => {
- let scale = timestamp_scale::();
ColumnarValue::Scalar(ScalarValue::TimestampSecond(
- match *v {
- Some(val) => {
- let scaled = val
- .checked_mul(scale)
- .ok_or_else(|| timestamp_scale_overflow_error(val))?;
- match stride_fn(stride, scaled, origin) {
- Ok(result) => Some(result / scale),
- Err(_) => None,
- }
- }
- None => None,
- },
+ transform_scalar_with_stride::(
+ *v, origin, stride, stride_fn,
+ ),
tz_opt.clone(),
))
}
@@ -689,39 +612,30 @@ fn date_bin_impl(
if !is_time {
return exec_err!("DATE_BIN with Time32 source requires Time32 origin");
}
- let result = v.and_then(|x| {
- match stride_fn(stride, x as i64 * NANOS_PER_MILLI, origin) {
- Ok(binned_nanos) => {
- let nanos = binned_nanos % (NANOSECONDS_IN_DAY);
- Some((nanos / NANOS_PER_MILLI) as i32)
- }
- Err(_) => None,
- }
- });
+ let result = v
+ .and_then(|x| (x as i64).checked_mul(NANOS_PER_MILLI))
+ .and_then(|scaled| stride_fn(stride, scaled, origin).ok())
+ .map(|binned| ((binned % NANOSECONDS_IN_DAY) / NANOS_PER_MILLI) as i32);
ColumnarValue::Scalar(ScalarValue::Time32Millisecond(result))
}
ColumnarValue::Scalar(ScalarValue::Time32Second(v)) => {
if !is_time {
return exec_err!("DATE_BIN with Time32 source requires Time32 origin");
}
- let result = v.and_then(|x| {
- match stride_fn(stride, x as i64 * NANOS_PER_SEC, origin) {
- Ok(binned_nanos) => {
- let nanos = binned_nanos % (NANOSECONDS_IN_DAY);
- Some((nanos / NANOS_PER_SEC) as i32)
- }
- Err(_) => None,
- }
- });
+ let result = v
+ .and_then(|x| (x as i64).checked_mul(NANOS_PER_SEC))
+ .and_then(|scaled| stride_fn(stride, scaled, origin).ok())
+ .map(|binned| ((binned % NANOSECONDS_IN_DAY) / NANOS_PER_SEC) as i32);
ColumnarValue::Scalar(ScalarValue::Time32Second(result))
}
ColumnarValue::Scalar(ScalarValue::Time64Nanosecond(v)) => {
if !is_time {
return exec_err!("DATE_BIN with Time64 source requires Time64 origin");
}
- let result = v.and_then(|x| match stride_fn(stride, x, origin) {
- Ok(binned_nanos) => Some(binned_nanos % (NANOSECONDS_IN_DAY)),
- Err(_) => None,
+ let result = v.and_then(|x| {
+ stride_fn(stride, x, origin)
+ .map(|binned| binned % NANOSECONDS_IN_DAY)
+ .ok()
});
ColumnarValue::Scalar(ScalarValue::Time64Nanosecond(result))
}
@@ -729,14 +643,10 @@ fn date_bin_impl(
if !is_time {
return exec_err!("DATE_BIN with Time64 source requires Time64 origin");
}
- let result =
- v.and_then(|x| match stride_fn(stride, x * NANOS_PER_MICRO, origin) {
- Ok(binned_nanos) => {
- let nanos = binned_nanos % (NANOSECONDS_IN_DAY);
- Some(nanos / NANOS_PER_MICRO)
- }
- Err(_) => None,
- });
+ let result = v
+ .and_then(|x| x.checked_mul(NANOS_PER_MICRO))
+ .and_then(|scaled| stride_fn(stride, scaled, origin).ok())
+ .map(|binned| (binned % NANOSECONDS_IN_DAY) / NANOS_PER_MICRO);
ColumnarValue::Scalar(ScalarValue::Time64Microsecond(result))
}
ColumnarValue::Array(array) => {
@@ -753,22 +663,12 @@ fn date_bin_impl(
let array = as_primitive_array::(array)?;
let scale = timestamp_scale::();
- let values = array
- .iter()
- .map(|val| match val {
- Some(val) => {
- let scaled = val
- .checked_mul(scale)
- .ok_or_else(|| timestamp_scale_overflow_error(val))?;
- Ok(stride_fn(stride, scaled, origin)
- .ok()
- .map(|binned| binned / scale))
- }
- None => Ok(None),
- })
- .collect::>>()?;
-
- let result = PrimitiveArray::::from_iter(values);
+ // Per-row errors become NULL, matching scalar behavior.
+ let result: PrimitiveArray = array.unary_opt(|val| {
+ val.checked_mul(scale)
+ .and_then(|scaled| stride_fn(stride, scaled, origin).ok())
+ .map(|binned| binned / scale)
+ });
let array = result.with_timezone_opt(tz_opt.clone());
Ok(ColumnarValue::Array(Arc::new(array)))
@@ -803,14 +703,15 @@ fn date_bin_impl(
}
let array = array.as_primitive::();
let result: PrimitiveArray =
- array.try_unary(|x| {
- stride_fn(stride, x as i64 * NANOS_PER_MILLI, origin)
- .map(|binned_nanos| {
- let nanos = binned_nanos % (NANOSECONDS_IN_DAY);
- (nanos / NANOS_PER_MILLI) as i32
+ array.unary_opt(|x| {
+ (x as i64)
+ .checked_mul(NANOS_PER_MILLI)
+ .and_then(|scaled| stride_fn(stride, scaled, origin).ok())
+ .map(|binned| {
+ ((binned % NANOSECONDS_IN_DAY) / NANOS_PER_MILLI)
+ as i32
})
- .map_err(|e| ArrowError::ComputeError(e.to_string()))
- })?;
+ });
ColumnarValue::Array(Arc::new(result))
}
Time32(Second) => {
@@ -820,15 +721,14 @@ fn date_bin_impl(
);
}
let array = array.as_primitive::();
- let result: PrimitiveArray =
- array.try_unary(|x| {
- stride_fn(stride, x as i64 * NANOS_PER_SEC, origin)
- .map(|binned_nanos| {
- let nanos = binned_nanos % (NANOSECONDS_IN_DAY);
- (nanos / NANOS_PER_SEC) as i32
- })
- .map_err(|e| ArrowError::ComputeError(e.to_string()))
- })?;
+ let result: PrimitiveArray = array.unary_opt(|x| {
+ (x as i64)
+ .checked_mul(NANOS_PER_SEC)
+ .and_then(|scaled| stride_fn(stride, scaled, origin).ok())
+ .map(|binned| {
+ ((binned % NANOSECONDS_IN_DAY) / NANOS_PER_SEC) as i32
+ })
+ });
ColumnarValue::Array(Arc::new(result))
}
Time64(Microsecond) => {
@@ -839,14 +739,13 @@ fn date_bin_impl(
}
let array = array.as_primitive::();
let result: PrimitiveArray =
- array.try_unary(|x| {
- stride_fn(stride, x * NANOS_PER_MICRO, origin)
- .map(|binned_nanos| {
- let nanos = binned_nanos % (NANOSECONDS_IN_DAY);
- nanos / NANOS_PER_MICRO
+ array.unary_opt(|x| {
+ x.checked_mul(NANOS_PER_MICRO)
+ .and_then(|scaled| stride_fn(stride, scaled, origin).ok())
+ .map(|binned| {
+ (binned % NANOSECONDS_IN_DAY) / NANOS_PER_MICRO
})
- .map_err(|e| ArrowError::ComputeError(e.to_string()))
- })?;
+ });
ColumnarValue::Array(Arc::new(result))
}
Time64(Nanosecond) => {
@@ -857,11 +756,11 @@ fn date_bin_impl(
}
let array = array.as_primitive::();
let result: PrimitiveArray =
- array.try_unary(|x| {
+ array.unary_opt(|x| {
stride_fn(stride, x, origin)
.map(|binned_nanos| binned_nanos % (NANOSECONDS_IN_DAY))
- .map_err(|e| ArrowError::ComputeError(e.to_string()))
- })?;
+ .ok()
+ });
ColumnarValue::Array(Arc::new(result))
}
_ => {
@@ -917,6 +816,31 @@ mod tests {
DateBinFunc::new().invoke_with_args(args)
}
+ fn assert_null_scalar(value: ColumnarValue, expected_type: DataType) {
+ let ColumnarValue::Scalar(value) = value else {
+ panic!("expected scalar, got {value:?}");
+ };
+ assert_eq!(value.data_type(), expected_type);
+ assert!(value.is_null(), "expected NULL, got {value:?}");
+ }
+
+ fn assert_array_null_then_valid(value: ColumnarValue, expected_type: DataType) {
+ let ColumnarValue::Array(array) = value else {
+ panic!("expected array, got {value:?}");
+ };
+ assert_eq!(array.data_type(), &expected_type);
+ assert!(array.is_null(0), "expected NULL at row 0");
+ assert!(array.is_valid(1), "expected valid value at row 1");
+ }
+
+ fn assert_overflow_error(result: Result) {
+ let err = result.expect_err("expected overflow error");
+ assert!(
+ err.strip_backtrace().contains("overflows i64"),
+ "unexpected error: {err}"
+ );
+ }
+
#[test]
fn test_date_bin() {
let return_field = &Arc::new(Field::new(
@@ -1433,6 +1357,97 @@ mod tests {
}
}
+ #[test]
+ fn test_date_bin_scale_overflow_returns_null() {
+ // Scaling non-nanosecond timestamps to nanoseconds can overflow.
+ use arrow::array::{
+ ArrayRef, TimestampMicrosecondArray, TimestampMillisecondArray,
+ TimestampSecondArray,
+ };
+
+ let scalar_cases = [
+ ScalarValue::TimestampSecond(Some(i64::MAX), None),
+ ScalarValue::TimestampMillisecond(Some(i64::MAX), None),
+ ScalarValue::TimestampMicrosecond(Some(i64::MAX), None),
+ ];
+ for source in scalar_cases {
+ let expected_type = source.data_type();
+ let return_field = Arc::new(Field::new("f", expected_type.clone(), true));
+ let args = vec![
+ ColumnarValue::Scalar(ScalarValue::new_interval_dt(1, 0)),
+ ColumnarValue::Scalar(source),
+ ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(0), None)),
+ ];
+ let result = invoke_date_bin_with_args(args, 1, &return_field)
+ .unwrap_or_else(|e| panic!("expected Ok for {expected_type}, got {e:?}"));
+ assert_null_scalar(result, expected_type);
+ }
+
+ let array_cases: Vec = vec![
+ Arc::new(TimestampSecondArray::from(vec![Some(i64::MAX), Some(0)])),
+ Arc::new(TimestampMillisecondArray::from(vec![
+ Some(i64::MAX),
+ Some(0),
+ ])),
+ Arc::new(TimestampMicrosecondArray::from(vec![
+ Some(i64::MAX),
+ Some(0),
+ ])),
+ ];
+ for array in array_cases {
+ let dt = array.data_type().clone();
+ let return_field = Arc::new(Field::new("f", dt.clone(), true));
+ let args = vec![
+ ColumnarValue::Scalar(ScalarValue::new_interval_dt(1, 0)),
+ ColumnarValue::Array(array),
+ ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(Some(0), None)),
+ ];
+ let result = invoke_date_bin_with_args(args, 2, &return_field)
+ .unwrap_or_else(|e| panic!("expected Ok for {dt:?}, got {e:?}"));
+ assert_array_null_then_valid(result, dt);
+ }
+ }
+
+ #[test]
+ fn test_date_bin_time64_micro_overflow_handling() {
+ // Time64(Microsecond) can hold out-of-range values after reinterpret casts.
+ use arrow::array::Time64MicrosecondArray;
+
+ let data_type = DataType::Time64(TimeUnit::Microsecond);
+ let return_field = &Arc::new(Field::new("f", data_type.clone(), true));
+ let stride = || ColumnarValue::Scalar(ScalarValue::new_interval_dt(0, 1000));
+ let origin = || ColumnarValue::Scalar(ScalarValue::Time64Microsecond(Some(0)));
+
+ // Out-of-range source values are per-row data, so they become NULL.
+ let args = vec![
+ stride(),
+ ColumnarValue::Scalar(ScalarValue::Time64Microsecond(Some(i64::MAX))),
+ origin(),
+ ];
+ let result = invoke_date_bin_with_args(args, 1, return_field).unwrap();
+ assert_null_scalar(result, data_type.clone());
+
+ let array = Arc::new(Time64MicrosecondArray::from(vec![Some(i64::MAX), Some(0)]));
+ let args = vec![stride(), ColumnarValue::Array(array), origin()];
+ let result = invoke_date_bin_with_args(args, 2, return_field).unwrap();
+ assert_array_null_then_valid(result, data_type);
+
+ let bad_origin =
+ || ColumnarValue::Scalar(ScalarValue::Time64Microsecond(Some(i64::MAX)));
+
+ // Out-of-range origins are shared inputs, so they return an error.
+ let args = vec![
+ stride(),
+ ColumnarValue::Scalar(ScalarValue::Time64Microsecond(Some(0))),
+ bad_origin(),
+ ];
+ assert_overflow_error(invoke_date_bin_with_args(args, 1, return_field));
+
+ let array = Arc::new(Time64MicrosecondArray::from(vec![Some(0), Some(1)]));
+ let args = vec![stride(), ColumnarValue::Array(array), bad_origin()];
+ assert_overflow_error(invoke_date_bin_with_args(args, 2, return_field));
+ }
+
#[test]
fn test_date_bin_compute_distance_rem_overflow() {
// Regression for #22215: `time_diff % stride` panics with "attempt to
diff --git a/datafusion/sqllogictest/test_files/date_bin_errors.slt b/datafusion/sqllogictest/test_files/date_bin_errors.slt
index 20408c84ef79a..53cba506defd6 100644
--- a/datafusion/sqllogictest/test_files/date_bin_errors.slt
+++ b/datafusion/sqllogictest/test_files/date_bin_errors.slt
@@ -23,10 +23,24 @@ select date_bin(interval '1637426858 months', to_timestamp_millis(1040292460), t
----
NULL
-# Negative timestamp with month interval - should return NULL instead of panicking
+# Issue #22528: negative sub-second source with month interval.
query P
select date_bin(interval '1 month', to_timestamp_millis(-1040292460), timestamp '1984-01-07 00:00:00');
----
+1969-12-07T00:00:00
+
+# Array path should match the scalar path above.
+query P
+select date_bin(interval '1 month', c, timestamp '1984-01-07 00:00:00')
+from values (to_timestamp_millis(-1040292460)) t(c);
+----
+1969-12-07T00:00:00
+
+# Array path should return NULL for per-row overflow.
+query P
+select date_bin(interval '1637426858 months', c, timestamp '1984-01-07 00:00:00')
+from values (to_timestamp_millis(1040292460)) t(c);
+----
NULL
# Large stride causing overflow - should return NULL
@@ -79,16 +93,18 @@ select date_bin(
----
NULL
-# Source timestamp scaling to nanoseconds overflows: should return an error, not panic
-query error DataFusion error: Execution error: DATE_BIN source timestamp 9223372036854775807 cannot be represented in nanoseconds
+# Source timestamp scaling to nanoseconds overflows: should return NULL, not panic
+query P
select date_bin(
interval '1 nanosecond',
arrow_cast(9223372036854775807, 'Timestamp(Second, None)'),
timestamp '1970-01-01 00:00:00'
);
+----
+NULL
-# Source timestamp scaling to nanoseconds overflows in array path: should return an error, not panic
-query error DataFusion error: Execution error: DATE_BIN source timestamp 9223372036854775807 cannot be represented in nanoseconds
+# Source timestamp scaling to nanoseconds overflows in array path: should return NULL, not panic
+query P
select date_bin(
interval '1 nanosecond',
ts,
@@ -97,3 +113,5 @@ select date_bin(
from (
values (arrow_cast(9223372036854775807, 'Timestamp(Second, None)'))
) as t(ts);
+----
+NULL
From e5f7af15a6dafe9e2847a81bae5fddfda5a60c0a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?David=20L=C3=B3pez?=
Date: Sat, 13 Jun 2026 03:37:17 +0200
Subject: [PATCH 03/39] feat(spark): add `concat_ws` with array support
(#20928)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
## Which issue does this PR close?
- Part of #15914
## Rationale for this change
DataFusion core's `concat_ws` does not support array arguments. Spark's
`concat_ws(sep, ...)` accepts both scalar strings and arrays, expanding
array elements and skipping nulls. This is needed for Spark
compatibility in the `datafusion-spark` crate.
## What changes are included in this PR?
- New `SparkConcatWs` UDF in
`datafusion/spark/src/function/string/concat_ws.rs`
- Supports `concat_ws(sep, str1, str2, ...)` with scalar strings
- Supports array arguments: `concat_ws(',', array('a', 'b'), 'c')` →
`"a,b,c"`
- Null scalars and null array elements are skipped (Spark behavior)
- Null separator returns NULL
- Zero value arguments (`concat_ws(',')`) returns empty string
- Supports Utf8, LargeUtf8, Utf8View, List, and LargeList types
- Registered the function in `mod.rs` (`make_udf_function!`,
`export_functions!`, `functions()`)
- Replaced commented-out SLT tests with 14 working test cases covering
basic usage, arrays, mixed arguments, nulls, column expressions, and
edge cases
## Are these changes tested?
Yes.
- 7 unit tests in `concat_ws.rs` (basic, null values skipped, null
separator, list arrays, list with nulls, mixed scalar+list, multiple
rows)
- 14 SLT tests in `spark/string/concat_ws.slt` covering scalars, arrays,
nulls, column expressions, and edge cases
## Are there any user-facing changes?
No. This is a new function in the `datafusion-spark` crate only.
---
.../spark/src/function/string/concat_ws.rs | 297 ++++++++++++++
datafusion/spark/src/function/string/mod.rs | 8 +
.../test_files/spark/string/concat_ws.slt | 383 +++++++++++++++++-
3 files changed, 669 insertions(+), 19 deletions(-)
create mode 100644 datafusion/spark/src/function/string/concat_ws.rs
diff --git a/datafusion/spark/src/function/string/concat_ws.rs b/datafusion/spark/src/function/string/concat_ws.rs
new file mode 100644
index 0000000000000..c9ed1369a51a7
--- /dev/null
+++ b/datafusion/spark/src/function/string/concat_ws.rs
@@ -0,0 +1,297 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Spark-compatible `concat_ws`: joins strings (and array elements) with a separator.
+//!
+//! Null scalar args and null array elements are skipped; a null separator yields a
+//! null row. Non-string args are coerced to STRING; list args (`List`, `LargeList`,
+//! `ListView`, `LargeListView`, `FixedSizeList`) expand their elements.
+//!
+//! Differences with DataFusion core `concat_ws`:
+//! - Accepts list arguments and expands their elements
+//! - Always returns Utf8 (Spark's `STRING` type)
+//! - Coerces non-string scalars (numbers, booleans, dates, ...) to Utf8
+
+use std::fmt::Write as _;
+use std::sync::Arc;
+
+use arrow::array::{
+ Array, ArrayRef, AsArray, GenericListArray, LargeStringArray, OffsetSizeTrait,
+ StringArray, StringBuilder, StringViewArray,
+};
+use arrow::datatypes::{DataType, Field};
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr::{
+ ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+
+use crate::function::error_utils::{
+ invalid_arg_count_exec_err, unsupported_data_type_exec_err,
+};
+
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkConcatWs {
+ signature: Signature,
+}
+
+impl Default for SparkConcatWs {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl SparkConcatWs {
+ pub fn new() -> Self {
+ Self {
+ signature: Signature::user_defined(Volatility::Immutable),
+ }
+ }
+}
+
+impl ScalarUDFImpl for SparkConcatWs {
+ fn name(&self) -> &str {
+ "concat_ws"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, _arg_types: &[DataType]) -> Result {
+ Ok(DataType::Utf8)
+ }
+
+ fn coerce_types(&self, arg_types: &[DataType]) -> Result> {
+ if arg_types.is_empty() {
+ return Err(invalid_arg_count_exec_err("concat_ws", (1, i32::MAX), 0));
+ }
+ Ok(arg_types
+ .iter()
+ .enumerate()
+ .map(|(i, dt)| match dt {
+ DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => dt.clone(),
+ // Non-separator list args expand their elements at runtime.
+ // Normalize the list variant so the kernel only sees
+ // List/LargeList, AND force the element type to Utf8 so the
+ // planner inserts a cast for non-string children (Spark
+ // coerces them to STRING the same way it does for scalars).
+ DataType::List(f)
+ | DataType::ListView(f)
+ | DataType::FixedSizeList(f, _)
+ if i > 0 =>
+ {
+ DataType::List(Arc::new(Field::new(
+ f.name(),
+ DataType::Utf8,
+ f.is_nullable(),
+ )))
+ }
+ DataType::LargeList(f) | DataType::LargeListView(f) if i > 0 => {
+ DataType::LargeList(Arc::new(Field::new(
+ f.name(),
+ DataType::Utf8,
+ f.is_nullable(),
+ )))
+ }
+ // Spark casts everything else (numbers, booleans, dates,
+ // binary, null...) to STRING.
+ _ => DataType::Utf8,
+ })
+ .collect())
+ }
+
+ fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result {
+ // Only separator provided → empty string (or NULL if separator is null).
+ // Arg-count validation happens in coerce_types at planning time.
+ if args.args.len() == 1 {
+ return only_separator(&args.args[0]);
+ }
+
+ spark_concat_ws(&args.args, args.number_rows)
+ }
+}
+
+fn only_separator(sep: &ColumnarValue) -> Result {
+ match sep {
+ ColumnarValue::Scalar(s) if s.is_null() => {
+ Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
+ }
+ ColumnarValue::Scalar(_) => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+ String::new(),
+ )))),
+ ColumnarValue::Array(arr) => {
+ let mut builder = StringBuilder::with_capacity(arr.len(), 0);
+ for row_idx in 0..arr.len() {
+ if arr.is_null(row_idx) {
+ builder.append_null();
+ } else {
+ builder.append_value("");
+ }
+ }
+ Ok(ColumnarValue::Array(Arc::new(builder.finish()) as ArrayRef))
+ }
+ }
+}
+
+fn spark_concat_ws(args: &[ColumnarValue], num_rows: usize) -> Result {
+ let arrays = ColumnarValue::values_to_arrays(args)?;
+ let sep_view = StringView::try_new(&arrays[0])?;
+ let arg_views: Vec = arrays[1..]
+ .iter()
+ .map(ArgView::try_new)
+ .collect::>()?;
+
+ let mut builder = StringBuilder::with_capacity(num_rows, num_rows * 16);
+
+ for row_idx in 0..num_rows {
+ if sep_view.is_null(row_idx) {
+ builder.append_null();
+ continue;
+ }
+
+ // Write parts directly into the builder via its `fmt::Write` impl;
+ // `append_value("")` then finalises the row (offset + validity) with
+ // no extra copy from an intermediate `String`.
+ let separator = sep_view.value(row_idx);
+ let mut first = true;
+ for view in &arg_views {
+ view.write_row(row_idx, separator, &mut builder, &mut first)?;
+ }
+ builder.append_value("");
+ }
+
+ Ok(ColumnarValue::Array(Arc::new(builder.finish()) as ArrayRef))
+}
+
+/// Typed view over a string array that downcasts once and exposes
+/// per-row access without further dispatch.
+enum StringView<'a> {
+ Utf8(&'a StringArray),
+ LargeUtf8(&'a LargeStringArray),
+ Utf8View(&'a StringViewArray),
+}
+
+impl<'a> StringView<'a> {
+ fn try_new(arr: &'a ArrayRef) -> Result {
+ match arr.data_type() {
+ DataType::Utf8 => Ok(Self::Utf8(arr.as_string::())),
+ DataType::LargeUtf8 => Ok(Self::LargeUtf8(arr.as_string::())),
+ DataType::Utf8View => Ok(Self::Utf8View(arr.as_string_view())),
+ other => Err(unsupported_data_type_exec_err("concat_ws", "STRING", other)),
+ }
+ }
+
+ fn value(&self, idx: usize) -> &str {
+ match self {
+ Self::Utf8(a) => a.value(idx),
+ Self::LargeUtf8(a) => a.value(idx),
+ Self::Utf8View(a) => a.value(idx),
+ }
+ }
+
+ fn is_null(&self, idx: usize) -> bool {
+ match self {
+ Self::Utf8(a) => a.is_null(idx),
+ Self::LargeUtf8(a) => a.is_null(idx),
+ Self::Utf8View(a) => a.is_null(idx),
+ }
+ }
+}
+
+/// Per-argument view: a string array or a list of strings. The downcast
+/// happens once at construction time. `DataType::Null` cannot appear here —
+/// `coerce_types` rewrites it to `Utf8` before invocation.
+enum ArgView<'a> {
+ Str(StringView<'a>),
+ List(&'a GenericListArray),
+ LargeList(&'a GenericListArray),
+}
+
+impl<'a> ArgView<'a> {
+ fn try_new(arr: &'a ArrayRef) -> Result {
+ match arr.data_type() {
+ DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => {
+ Ok(Self::Str(StringView::try_new(arr)?))
+ }
+ DataType::List(_) => Ok(Self::List(arr.as_list::())),
+ DataType::LargeList(_) => Ok(Self::LargeList(arr.as_list::())),
+ other => Err(unsupported_data_type_exec_err(
+ "concat_ws",
+ "STRING or ARRAY",
+ other,
+ )),
+ }
+ }
+
+ fn write_row(
+ &self,
+ row_idx: usize,
+ sep: &str,
+ builder: &mut StringBuilder,
+ first: &mut bool,
+ ) -> Result<()> {
+ match self {
+ Self::Str(view) => {
+ if !view.is_null(row_idx) {
+ push_part(builder, view.value(row_idx), sep, first);
+ }
+ }
+ Self::List(list) => write_list_row(*list, row_idx, sep, builder, first)?,
+ Self::LargeList(list) => write_list_row(*list, row_idx, sep, builder, first)?,
+ }
+ Ok(())
+ }
+}
+
+fn write_list_row(
+ list: &GenericListArray,
+ row_idx: usize,
+ sep: &str,
+ builder: &mut StringBuilder,
+ first: &mut bool,
+) -> Result<()> {
+ if list.is_null(row_idx) {
+ return Ok(());
+ }
+ let values = list.value(row_idx);
+ // An empty array (e.g. `array()`) contributes nothing — Spark renders it
+ // as the empty string, not an error.
+ if values.is_empty() {
+ return Ok(());
+ }
+ let view = StringView::try_new(&values)?;
+ for i in 0..values.len() {
+ if !view.is_null(i) {
+ push_part(builder, view.value(i), sep, first);
+ }
+ }
+ Ok(())
+}
+
+// `StringBuilder::write_str` only does `extend_from_slice` and never errors;
+// the `.expect(..)` is a documentation hint, not a real failure path.
+fn push_part(builder: &mut StringBuilder, part: &str, sep: &str, first: &mut bool) {
+ if !*first {
+ builder
+ .write_str(sep)
+ .expect("StringBuilder::write_str is infallible");
+ }
+ *first = false;
+ builder
+ .write_str(part)
+ .expect("StringBuilder::write_str is infallible");
+}
diff --git a/datafusion/spark/src/function/string/mod.rs b/datafusion/spark/src/function/string/mod.rs
index 9c90ded5f7e1b..bc94c27732c91 100644
--- a/datafusion/spark/src/function/string/mod.rs
+++ b/datafusion/spark/src/function/string/mod.rs
@@ -19,6 +19,7 @@ pub mod ascii;
pub mod base64;
pub mod char;
pub mod concat;
+pub mod concat_ws;
pub mod elt;
pub mod format_string;
pub mod ilike;
@@ -40,6 +41,7 @@ make_udf_function!(ascii::SparkAscii, ascii);
make_udf_function!(base64::SparkBase64, base64);
make_udf_function!(char::CharFunc, char);
make_udf_function!(concat::SparkConcat, concat);
+make_udf_function!(concat_ws::SparkConcatWs, concat_ws);
make_udf_function!(ilike::SparkILike, ilike);
make_udf_function!(length::SparkLengthFunc, length);
make_udf_function!(elt::SparkElt, elt);
@@ -77,6 +79,11 @@ pub mod expr_fn {
"Concatenates multiple input strings into a single string. Returns NULL if any input is NULL.",
args
));
+ export_functions!((
+ concat_ws,
+ "Concatenates strings with separator. Supports arrays. Null values are skipped.",
+ sep args
+ ));
export_functions!((
elt,
"Returns the n-th input (1-indexed), e.g. returns 2nd input when n is 2. The function returns NULL if the index is 0 or exceeds the length of the array.",
@@ -142,6 +149,7 @@ pub fn functions() -> Vec> {
base64(),
char(),
concat(),
+ concat_ws(),
elt(),
ilike(),
length(),
diff --git a/datafusion/sqllogictest/test_files/spark/string/concat_ws.slt b/datafusion/sqllogictest/test_files/spark/string/concat_ws.slt
index 62df636bba9ce..f6404cab8f3cd 100644
--- a/datafusion/sqllogictest/test_files/spark/string/concat_ws.slt
+++ b/datafusion/sqllogictest/test_files/spark/string/concat_ws.slt
@@ -21,22 +21,367 @@
# For more information, please see:
# https://github.com/apache/datafusion/issues/15914
-## Original Query: SELECT concat_ws(' ', 'Spark', 'SQL');
-## PySpark 3.5.5 Result: {'concat_ws( , Spark, SQL)': 'Spark SQL', 'typeof(concat_ws( , Spark, SQL))': 'string', 'typeof( )': 'string', 'typeof(Spark)': 'string', 'typeof(SQL)': 'string'}
-#query
-#SELECT concat_ws(' '::string, 'Spark'::string, 'SQL'::string);
-
-## Original Query: SELECT concat_ws('/', 'foo', null, 'bar');
-## PySpark 3.5.5 Result: {'concat_ws(/, foo, NULL, bar)': 'foo/bar', 'typeof(concat_ws(/, foo, NULL, bar))': 'string', 'typeof(/)': 'string', 'typeof(foo)': 'string', 'typeof(NULL)': 'void', 'typeof(bar)': 'string'}
-#query
-#SELECT concat_ws('/'::string, 'foo'::string, NULL::void, 'bar'::string);
-
-## Original Query: SELECT concat_ws('s');
-## PySpark 3.5.5 Result: {'concat_ws(s)': '', 'typeof(concat_ws(s))': 'string', 'typeof(s)': 'string'}
-#query
-#SELECT concat_ws('s'::string);
-
-## Original Query: SELECT concat_ws(null, 'Spark', 'SQL');
-## PySpark 3.5.5 Result: {'concat_ws(NULL, Spark, SQL)': None, 'typeof(concat_ws(NULL, Spark, SQL))': 'string', 'typeof(NULL)': 'void', 'typeof(Spark)': 'string', 'typeof(SQL)': 'string'}
-#query
-#SELECT concat_ws(NULL::void, 'Spark'::string, 'SQL'::string);
+## ── Basic scalar usage ──────────────────────────────────────
+
+## Multiple string arguments
+query T
+SELECT concat_ws(',', 'a', 'b', 'c');
+----
+a,b,c
+
+## Space separator
+query T
+SELECT concat_ws(' ', 'Spark', 'SQL');
+----
+Spark SQL
+
+## Slash separator with null skipped
+query T
+SELECT concat_ws('/', 'foo', NULL, 'bar');
+----
+foo/bar
+
+## Single argument after separator
+query T
+SELECT concat_ws(',', 'a');
+----
+a
+
+## No arguments after separator → empty string
+query T
+SELECT concat_ws(',');
+----
+(empty)
+
+## Null separator returns null
+query T
+SELECT concat_ws(NULL, 'a', 'b', 'c');
+----
+NULL
+
+## All null arguments → empty string
+query T
+SELECT concat_ws(',', CAST(NULL AS STRING), CAST(NULL AS STRING));
+----
+(empty)
+
+## ── Array arguments ─────────────────────────────────────────
+
+## Array argument
+query T
+SELECT concat_ws(',', array('a', 'b', 'c'));
+----
+a,b,c
+
+## Array with nulls skipped
+query T
+SELECT concat_ws(',', array('a', NULL, 'c'));
+----
+a,c
+
+## Multiple arrays
+query T
+SELECT concat_ws(',', array('a', 'b'), array('c', 'd'));
+----
+a,b,c,d
+
+## Mixed scalar and array arguments
+query T
+SELECT concat_ws(',', 'x', array('a', 'b'), 'y');
+----
+x,a,b,y
+
+## Null array is skipped
+query T
+SELECT concat_ws(',', 'x', CAST(NULL AS ARRAY), 'y');
+----
+x,y
+
+## ── Edge cases ───────────────────────────────────────────────
+
+## Separator column with no value arguments
+query T
+SELECT concat_ws(sep) AS result FROM VALUES (','), ('-') AS t(sep);
+----
+(empty)
+(empty)
+
+## Null separator in column with no value arguments
+query T
+SELECT concat_ws(sep) AS result FROM VALUES (CAST(NULL AS STRING)), (',') AS t(sep);
+----
+NULL
+(empty)
+
+## ── Column expressions ──────────────────────────────────────
+
+## concat_ws on columns
+query T
+SELECT concat_ws('-', a, b) AS result FROM VALUES ('hello', 'world'), ('foo', 'bar') AS t(a, b);
+----
+hello-world
+foo-bar
+
+## concat_ws with null in columns
+query T
+SELECT concat_ws(',', a, b) AS result FROM VALUES ('a', 'b'), ('c', CAST(NULL AS STRING)), (CAST(NULL AS STRING), 'd') AS t(a, b);
+----
+a,b
+c
+d
+
+## Scalar-only arguments over multiple rows (broadcast test)
+query T
+SELECT concat_ws(',', 'a', 'b') AS result FROM VALUES (1), (2), (3) AS t(x);
+----
+a,b
+a,b
+a,b
+
+## ── Additional edge cases ───────────────────────────────────
+
+## Empty separator — values concatenated with nothing between
+query T
+SELECT concat_ws('', 'a', 'b', 'c');
+----
+abc
+
+## Empty-string values are NOT skipped (only NULLs are)
+query T
+SELECT concat_ws(',', '', 'a', '', 'b');
+----
+,a,,b
+
+## Multi-character separator
+query T
+SELECT concat_ws(' - ', 'a', 'b', 'c');
+----
+a - b - c
+
+## Utf8View separator
+query TT
+SELECT concat_ws(arrow_cast(',', 'Utf8View'), 'a', 'b'), arrow_typeof(concat_ws(arrow_cast(',', 'Utf8View'), 'a', 'b'));
+----
+a,b Utf8
+
+## LargeUtf8 separator
+query TT
+SELECT concat_ws(arrow_cast(',', 'LargeUtf8'), 'a', 'b'), arrow_typeof(concat_ws(arrow_cast(',', 'LargeUtf8'), 'a', 'b'));
+----
+a,b Utf8
+
+## Empty array → empty string
+query T
+SELECT concat_ws(',', array());
+----
+(empty)
+
+## Scalar + array + array mix
+query T
+SELECT concat_ws(',', array('a', 'b'), 'c', array('d', 'e'));
+----
+a,b,c,d,e
+
+## All-NULL row mixed with non-NULL rows
+query T
+SELECT concat_ws(',', a, b, c) AS result FROM VALUES
+ ('a', 'b', 'c'),
+ (CAST(NULL AS STRING), 'b', 'c'),
+ ('a', CAST(NULL AS STRING), CAST(NULL AS STRING)),
+ (CAST(NULL AS STRING), CAST(NULL AS STRING), CAST(NULL AS STRING))
+ AS t(a, b, c);
+----
+a,b,c
+b,c
+a
+(empty)
+
+## Separator from column (per-row separator), with NULL rows
+query T
+SELECT concat_ws(sep, a, b) AS result FROM VALUES
+ (',', 'a', 'b'),
+ (CAST(NULL AS STRING), 'a', 'b'),
+ ('|', 'x', 'y')
+ AS t(sep, a, b);
+----
+a,b
+NULL
+x|y
+
+## ── Spark cross-checked extras ──────────────────────────────
+
+## Zero arguments → error (Spark: WRONG_NUM_ARGS)
+query error
+SELECT concat_ws();
+
+## Numeric separator coerced to string
+query T
+SELECT concat_ws(1, 'a', 'b');
+----
+a1b
+
+## Only numeric separator, no values → empty string
+query T
+SELECT concat_ws(123);
+----
+(empty)
+
+## Numeric values coerced to string
+query T
+SELECT concat_ws(',', 1, 2, 3);
+----
+1,2,3
+
+## Float values
+query T
+SELECT concat_ws(',', 1.5, 2.5);
+----
+1.5,2.5
+
+## Boolean values
+query T
+SELECT concat_ws(',', true, false);
+----
+true,false
+
+## Mixed numeric and string
+query T
+SELECT concat_ws(',', CAST(1 AS BIGINT), 'a');
+----
+1,a
+
+## Date values
+query T
+SELECT concat_ws(',', DATE '2024-01-01', 'x');
+----
+2024-01-01,x
+
+## Multi-byte UTF-8 separator
+query T
+SELECT concat_ws('é', 'a', 'b');
+----
+aéb
+
+## Nested concat_ws
+query T
+SELECT concat_ws('|', concat_ws(',', 'a', 'b'), concat_ws(',', 'c', 'd'));
+----
+a,b|c,d
+
+## All-NULL elements in array → empty string
+query T
+SELECT concat_ws(',', array(CAST(NULL AS STRING), CAST(NULL AS STRING), CAST(NULL AS STRING)));
+----
+(empty)
+
+## Empty-string elements in arrays are NOT skipped
+query T
+SELECT concat_ws(',', array(''), array(''));
+----
+,
+
+## Multiple arrays interleaved with scalars and NULLs
+query T
+SELECT concat_ws(',', array('a', 'b'), 'c', CAST(NULL AS STRING), array('d'), '', 'e');
+----
+a,b,c,d,,e
+
+## Long argument list (variadic)
+query T
+SELECT concat_ws('-', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j');
+----
+a-b-c-d-e-f-g-h-i-j
+
+## Long array
+query T
+SELECT concat_ws(',', array('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'));
+----
+a,b,c,d,e,f,g,h,i,j
+
+## All-empty arguments
+query T
+SELECT concat_ws('', '', '', '');
+----
+(empty)
+
+## Empty separator with all NULLs
+query T
+SELECT concat_ws('', CAST(NULL AS STRING), CAST(NULL AS STRING));
+----
+(empty)
+
+## Long string preserved (length sanity)
+query I
+SELECT length(concat_ws(',', repeat('x', 1000), repeat('y', 1000)));
+----
+2001
+
+## ── List variants (FixedSizeList / ListView) ────────────────
+
+## FixedSizeList argument is expanded element-by-element
+query T
+SELECT concat_ws(',', arrow_cast(make_array('a', 'b', 'c'), 'FixedSizeList(3, Utf8)'));
+----
+a,b,c
+
+## ListView argument is expanded element-by-element
+query T
+SELECT concat_ws(',', arrow_cast(make_array('a', 'b'), 'ListView(Utf8)'));
+----
+a,b
+
+## LargeListView argument is expanded element-by-element
+query T
+SELECT concat_ws(',', arrow_cast(make_array('a', 'b'), 'LargeListView(Utf8)'));
+----
+a,b
+
+## ── Binary coercion (Spark casts binary to its UTF-8 view) ──
+
+## Binary argument is coerced to its string representation
+query T
+SELECT concat_ws(',', X'4869');
+----
+Hi
+
+## ── Null-separator-over-rows shape ──────────────────────────
+
+## Null separator on a multi-row column yields NULL on every row
+query T
+SELECT concat_ws(NULL, v) AS result FROM VALUES ('a'), ('b'), ('c') AS t(v) ORDER BY v;
+----
+NULL
+NULL
+NULL
+
+## ── Non-string list elements (planner-inserted element cast) ─
+
+## Array of integers — elements must be cast to STRING
+query T
+SELECT concat_ws(',', array(1, 2, 3));
+----
+1,2,3
+
+## Array of doubles
+query T
+SELECT concat_ws('-', array(1.5, 2.5, 3.5));
+----
+1.5-2.5-3.5
+
+## Array of booleans
+query T
+SELECT concat_ws(',', array(true, false, true));
+----
+true,false,true
+
+## Mixed: string scalar + int array + string scalar
+query T
+SELECT concat_ws(',', 'x', array(1, 2), 'y');
+----
+x,1,2,y
+
+## ── Struct rejection ────────────────────────────────────────
+
+## Struct argument is rejected (not coerced to string)
+query error
+SELECT concat_ws(',', named_struct('a', 1));
From 574a1e6b39acd5b2c521d554863ac50fc0855654 Mon Sep 17 00:00:00 2001
From: "Ahmed EL."
Date: Sat, 13 Jun 2026 02:40:47 +0100
Subject: [PATCH 04/39] fix: preserve Spark next_day whitespace validation
(#22720)
## Which issue does this PR close?
- Closes #22717.
## Rationale for this change
Spark does not trim `dayOfWeek` before matching it in `next_day`, but
`datafusion-spark` currently does. That makes values like `' MO '`
succeed in DataFusion even though Spark treats them as invalid.
## What changes are included in this PR?
- remove the `.trim()` call from `spark_next_day`
- add a regression test proving whitespace-padded day names are rejected
## Are these changes tested?
- `cargo test -p datafusion-spark
next_day_rejects_whitespace_padded_day_names -- --nocapture`
- `cargo test -p datafusion-spark`
- `cargo fmt --all --check`
- `cargo clippy -p datafusion-spark --all-targets --all-features
--no-deps -- -D warnings`
Note: the broader package clippy invocation still reports an existing
unused import warning in untouched
`datafusion/core/src/execution/session_state.rs` on current main.
## Are there any user-facing changes?
Behavior now matches Spark for whitespace-padded `dayOfWeek` inputs in
`next_day`.
---------
Signed-off-by: xfocus3
Co-authored-by: xfocus3
Co-authored-by: Ahmed El amraouiyine
---
datafusion/spark/src/function/datetime/next_day.rs | 8 +++++++-
.../sqllogictest/test_files/spark/datetime/next_day.slt | 6 ++++++
2 files changed, 13 insertions(+), 1 deletion(-)
diff --git a/datafusion/spark/src/function/datetime/next_day.rs b/datafusion/spark/src/function/datetime/next_day.rs
index 2241043d44cd7..2ef222526f387 100644
--- a/datafusion/spark/src/function/datetime/next_day.rs
+++ b/datafusion/spark/src/function/datetime/next_day.rs
@@ -210,7 +210,7 @@ where
fn spark_next_day(days: i32, day_of_week: &str) -> Option {
let date = Date32Type::to_naive_date_opt(days)?;
- let day_of_week = day_of_week.trim().to_uppercase();
+ let day_of_week = day_of_week.to_uppercase();
let day_of_week = match day_of_week.as_str() {
"MO" | "MON" | "MONDAY" => Some("MONDAY"),
"TU" | "TUE" | "TUESDAY" => Some("TUESDAY"),
@@ -279,4 +279,10 @@ mod tests {
assert_eq!(field.data_type(), &DataType::Date32);
assert!(field.is_nullable());
}
+
+ #[test]
+ fn next_day_rejects_whitespace_padded_day_names() {
+ let monday = 19723; // 2024-01-01
+ assert_eq!(spark_next_day(monday, " MO "), None);
+ }
}
diff --git a/datafusion/sqllogictest/test_files/spark/datetime/next_day.slt b/datafusion/sqllogictest/test_files/spark/datetime/next_day.slt
index 872d1f2b58eb6..b0ffd7d0e412f 100644
--- a/datafusion/sqllogictest/test_files/spark/datetime/next_day.slt
+++ b/datafusion/sqllogictest/test_files/spark/datetime/next_day.slt
@@ -36,6 +36,12 @@ SELECT next_day('2015-07-27'::DATE, 'Sat'::string);
----
2015-08-01
+# Whitespace-padded day names should be rejected (return NULL) per Spark behavior
+query D
+SELECT next_day('2015-01-14'::DATE, ' MO '::string);
+----
+NULL
+
query error Failed to coerce arguments to satisfy a call to 'next_day' function
SELECT next_day('2015-07-27'::DATE);
From 3bece3dde20cf8973ded8a8dc44978bda64cf640 Mon Sep 17 00:00:00 2001
From: Adam Gutglick
Date: Sat, 13 Jun 2026 13:17:11 +0100
Subject: [PATCH 05/39] Upgrade minimal tokio-postgres version to address
security advisory (#22937)
## Which issue does this PR close?
- Closes #.
## Rationale for this change
`cargo audit` currently reports the following vulnerabilities:
```
Crate: postgres-protocol
Version: 0.6.11
Title: Unbounded SCRAM iteration count allows a malicious server to cause CPU-exhaustion denial of service
Date: 2026-06-12
ID: RUSTSEC-2026-0179
URL: https://rustsec.org/advisories/RUSTSEC-2026-0179
Severity: 8.7 (high)
Solution: Upgrade to >=0.6.12
Crate: postgres-protocol
Version: 0.6.11
Title: Panic decoding a malformed `hstore` value allows denial of service
Date: 2026-06-12
ID: RUSTSEC-2026-0180
URL: https://rustsec.org/advisories/RUSTSEC-2026-0180
Severity: 6.9 (medium)
Solution: Upgrade to >=0.6.12
Crate: tokio-postgres
Version: 0.7.17
Title: Panic on a `DataRow` with fewer fields than columns allows denial of service
Date: 2026-06-12
ID: RUSTSEC-2026-0178
URL: https://rustsec.org/advisories/RUSTSEC-2026-0178
Severity: 6.9 (medium)
Solution: Upgrade to >=0.7.18
```
## What changes are included in this PR?
Upgrade the minimal version of the `tokio-postgres` dependency
## Are these changes tested?
Existing tests
## Are there any user-facing changes?
None
Signed-off-by: Adam Gutglick
---
Cargo.lock | 20 ++++++++++----------
datafusion/sqllogictest/Cargo.toml | 2 +-
2 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index ed90dd25bda7b..df6b263adb009 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4698,9 +4698,9 @@ dependencies = [
[[package]]
name = "postgres-derive"
-version = "0.4.8"
+version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ca1dad89d9ffdbf78502fde418eeede499b87772d88be780478f7f76dc8d471f"
+checksum = "4d9d9089bb0ce62f4b5d52a0be0f4acfb35738b979380670d3dea85fe38d6ddd"
dependencies = [
"heck",
"proc-macro2",
@@ -4710,9 +4710,9 @@ dependencies = [
[[package]]
name = "postgres-protocol"
-version = "0.6.11"
+version = "0.6.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "56201207dac53e2f38e848e31b4b91616a6bb6e0c7205b77718994a7f49e70fc"
+checksum = "08808e3c483c46e999108051c78334f473d5adb59d78bb80a1268c7e6aa6c514"
dependencies = [
"base64 0.22.1",
"byteorder",
@@ -4728,9 +4728,9 @@ dependencies = [
[[package]]
name = "postgres-types"
-version = "0.2.13"
+version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8dc729a129e682e8d24170cd30ae1aa01b336b096cbb56df6d534ffec133d186"
+checksum = "851ca9db4932932d69f3ea811b1abe63087a0f740a47692619dd40d4899b68be"
dependencies = [
"bytes",
"chrono",
@@ -4818,7 +4818,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7"
dependencies = [
"heck",
- "itertools 0.13.0",
+ "itertools 0.14.0",
"log",
"multimap",
"petgraph",
@@ -4837,7 +4837,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b"
dependencies = [
"anyhow",
- "itertools 0.13.0",
+ "itertools 0.14.0",
"proc-macro2",
"quote",
"syn 2.0.117",
@@ -6236,9 +6236,9 @@ dependencies = [
[[package]]
name = "tokio-postgres"
-version = "0.7.17"
+version = "0.7.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4dd8df5ef180f6364759a6f00f7aadda4fbbac86cdee37480826a6ff9f3574ce"
+checksum = "a528f7d280f6d5b9cd149635c8705b0dd049754bc67d81d31fa25169a93809d3"
dependencies = [
"async-trait",
"byteorder",
diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml
index a642fbe22a6e3..a0c18c90867c7 100644
--- a/datafusion/sqllogictest/Cargo.toml
+++ b/datafusion/sqllogictest/Cargo.toml
@@ -65,7 +65,7 @@ tempfile = { workspace = true }
testcontainers-modules = { workspace = true, features = ["postgres"], optional = true }
thiserror = "2.0.18"
tokio = { workspace = true }
-tokio-postgres = { version = "0.7.17", optional = true }
+tokio-postgres = { version = "0.7.18", optional = true }
[features]
avro = ["datafusion/avro"]
From 58e37a0b3af584761164edb831522e3cc9aee8d0 Mon Sep 17 00:00:00 2001
From: Kumar Ujjawal
Date: Sat, 13 Jun 2026 19:42:24 +0530
Subject: [PATCH 06/39] Clearly gate sliding SUM(DISTINCT) type support
(#22866)
## Which issue does this PR close?
- Closes #22820.
## Rationale for this change
Sliding `SUM(DISTINCT)` only supports `Int64`, but it was routed through
the wider `SUM` type dispatch path. This made unsupported types fail
with a less clear accumulator error.
## What changes are included in this PR?
This PR adds an explicit `Int64` gate for sliding `SUM(DISTINCT)`.
Unsupported types now return a clear feature error that names the
operation and type. The existing `Int64` path is unchanged.
## Are these changes tested?
Yes
## Are there any user-facing changes?
No public API change
---
datafusion/functions-aggregate/src/sum.rs | 21 +++++++----
datafusion/sqllogictest/test_files/window.slt | 35 +++++++++++++++++++
2 files changed, 49 insertions(+), 7 deletions(-)
diff --git a/datafusion/functions-aggregate/src/sum.rs b/datafusion/functions-aggregate/src/sum.rs
index c3c2e5e0b9677..1a1f9c59a2964 100644
--- a/datafusion/functions-aggregate/src/sum.rs
+++ b/datafusion/functions-aggregate/src/sum.rs
@@ -303,13 +303,18 @@ impl AggregateUDFImpl for Sum {
args: AccumulatorArgs,
) -> Result> {
if args.is_distinct {
- // distinct path: use our sliding‐window distinct‐sum
- macro_rules! helper_distinct {
- ($t:ty, $dt:expr) => {
- Ok(Box::new(SlidingDistinctSumAccumulator::try_new(&$dt)?))
- };
+ // distinct path: [`SlidingDistinctSumAccumulator`] only implements
+ // Int64, so gate the supported type here rather than dispatching
+ // through `downcast_sum!`, which accepts every SUM type
+ match args.return_field.data_type() {
+ DataType::Int64 => Ok(Box::new(SlidingDistinctSumAccumulator::try_new(
+ &DataType::Int64,
+ )?)),
+ _ => not_impl_err!(
+ "SUM(DISTINCT) over sliding window frames is only supported for Int64, got {}",
+ args.expr_fields[0].data_type()
+ ),
}
- downcast_sum!(args, helper_distinct)
} else {
// non‐distinct path: existing sliding sum
macro_rules! helper {
@@ -525,7 +530,9 @@ impl SlidingDistinctSumAccumulator {
pub fn try_new(data_type: &DataType) -> Result {
// TODO support other numeric types
if *data_type != DataType::Int64 {
- return exec_err!("SlidingDistinctSumAccumulator only supports Int64");
+ return exec_err!(
+ "SlidingDistinctSumAccumulator only supports Int64, got {data_type}"
+ );
}
Ok(Self {
counts: HashMap::default(),
diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt
index 59b43d6476571..090f44f5628f7 100644
--- a/datafusion/sqllogictest/test_files/window.slt
+++ b/datafusion/sqllogictest/test_files/window.slt
@@ -5981,6 +5981,41 @@ FROM table_distinct_sum_nulls;
5 5
+# SUM(DISTINCT) over sliding (bounded) window frames is only implemented
+# for Int64. Other SUM-supported input types must fail with a clear
+# capability error instead of an accumulator-internal one.
+statement ok
+CREATE TABLE table_distinct_sum_types(ts INT, f DOUBLE, d DECIMAL(10, 2)) AS VALUES
+ (1, 1.5, 1.50), (2, 2.5, 2.50), (3, 1.5, 1.50);
+
+query error DataFusion error: This feature is not implemented: SUM\(DISTINCT\) over sliding window frames is only supported for Int64, got Float64
+SELECT SUM(DISTINCT f) OVER (
+ ORDER BY ts ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+) FROM table_distinct_sum_types;
+
+query error DataFusion error: This feature is not implemented: SUM\(DISTINCT\) over sliding window frames is only supported for Int64, got Decimal128\(10, 2\)
+SELECT SUM(DISTINCT d) OVER (
+ ORDER BY ts ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+) FROM table_distinct_sum_types;
+
+query error DataFusion error: This feature is not implemented: SUM\(DISTINCT\) over sliding window frames is only supported for Int64, got UInt64
+SELECT SUM(DISTINCT arrow_cast(ts, 'UInt64')) OVER (
+ ORDER BY ts ROWS BETWEEN 1 PRECEDING AND CURRENT ROW
+) FROM table_distinct_sum_types;
+
+# Unbounded frames take the regular distinct-sum path and keep
+# supporting all SUM input types.
+query R
+SELECT SUM(DISTINCT f) OVER (ORDER BY ts) FROM table_distinct_sum_types;
+----
+1.5
+4
+4
+
+statement ok
+DROP TABLE table_distinct_sum_types;
+
+
# FILTER clause with window functions
# Verify FILTER clause with non-aggregate window functions fails with a clear message
From a7280b87f541f0ed65674be377ac4998b7069a6d Mon Sep 17 00:00:00 2001
From: Amogh Ramesh
Date: Sat, 13 Jun 2026 20:10:01 +0530
Subject: [PATCH 07/39] FFI: plumb `placement` for `FFI_ScalarUDF` (#22608)
## Which issue does this PR close?
- Part of #22330.
This is the first of the per-method PRs that issue describes. It plumbs
`placement` only; the remaining defaulted methods follow separately, so
the umbrella issue stays open.
## Rationale for this change
`FFI_ScalarUDF` (`datafusion/ffi/src/udf/mod.rs`) carried no function
pointer for `placement`, and `ForeignScalarUDF` did not override it, so
a producer's override of `ScalarUDFImpl::placement` (default body at
`datafusion/expr/src/udf.rs:1028`) was dropped on the consumer side and
every foreign UDF fell back to `KeepInPlace`. A UDF loaded over FFI
never delivered its leaf-pushdown hint to the optimizer.
## What changes are included in this PR?
- New `FFI_ExpressionPlacement` enum bridge in
`datafusion/ffi/src/placement.rs`, in the shape of `FFI_Volatility`:
`#[repr(u8)]` with `From` impls both ways and a round-trip test over
every variant.
- A `placement` function pointer on `FFI_ScalarUDF`, populated in the
`From>` constructor, with `placement_fn_wrapper` on the
producer side and a forwarding `ForeignScalarUDF::placement` on the
consumer side. `placement` is infallible, so the pointer returns the
enum directly rather than `FFI_Result`.
Adding a field to the `#[repr(C)]` struct changes its layout, so this is
an API change and should carry the `api change` label (I can't add it
myself). It targets `main` and should not be back-ported to a release
branch.
`display_name` is also on the issue's list, but it has been deprecated
since 50.0.0, so it should be dropped from the gap list rather than
plumbed. I have left it and the remaining methods to follow-up PRs.
## Are these changes tested?
Yes.
- Unit: a round-trip test over all four `ExpressionPlacement` variants,
plus a forced-foreign test (`mock_foreign_marker_id`) using a UDF whose
`placement` override depends on its arguments. The assertions cover
ordered, reordered, and empty argument slices, so argument marshalling
is checked, not just the return value.
- Integration: `tests/ffi_udf.rs` loads the UDF from the real cdylib and
asserts the override survives the boundary, which is the surface a
layout change needs.
Run with `cargo test -p datafusion-ffi` and `cargo test -p
datafusion-ffi --features integration-tests`.
## Are there any user-facing changes?
A `placement` override on a `ScalarUDFImpl` is now preserved across the
FFI boundary instead of being silently replaced by the default. This is
an ABI change to `FFI_ScalarUDF`; consumers must be recompiled against
the new layout.
---------
Signed-off-by: Amogh Ramesh
---
datafusion/ffi/src/lib.rs | 1 +
datafusion/ffi/src/placement.rs | 72 ++++++++++++++
datafusion/ffi/src/tests/mod.rs | 3 +
datafusion/ffi/src/tests/udf_udaf_udwf.rs | 54 ++++++++++-
datafusion/ffi/src/udf/mod.rs | 111 +++++++++++++++++++++-
datafusion/ffi/tests/ffi_udf.rs | 27 +++++-
6 files changed, 263 insertions(+), 5 deletions(-)
create mode 100644 datafusion/ffi/src/placement.rs
diff --git a/datafusion/ffi/src/lib.rs b/datafusion/ffi/src/lib.rs
index 4df6c4b570f34..fd2ac58576b09 100644
--- a/datafusion/ffi/src/lib.rs
+++ b/datafusion/ffi/src/lib.rs
@@ -36,6 +36,7 @@ pub mod ffi_option;
pub mod insert_op;
pub mod physical_expr;
pub mod physical_optimizer;
+pub mod placement;
pub mod plan_properties;
pub mod proto;
pub mod record_batch_stream;
diff --git a/datafusion/ffi/src/placement.rs b/datafusion/ffi/src/placement.rs
new file mode 100644
index 0000000000000..837f0e3aad647
--- /dev/null
+++ b/datafusion/ffi/src/placement.rs
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datafusion_expr::ExpressionPlacement;
+
+#[expect(non_camel_case_types)]
+#[repr(u8)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum FFI_ExpressionPlacement {
+ Literal,
+ Column,
+ MoveTowardsLeafNodes,
+ KeepInPlace,
+}
+
+impl From for FFI_ExpressionPlacement {
+ fn from(value: ExpressionPlacement) -> Self {
+ match value {
+ ExpressionPlacement::Literal => Self::Literal,
+ ExpressionPlacement::Column => Self::Column,
+ ExpressionPlacement::MoveTowardsLeafNodes => Self::MoveTowardsLeafNodes,
+ ExpressionPlacement::KeepInPlace => Self::KeepInPlace,
+ }
+ }
+}
+
+impl From for ExpressionPlacement {
+ fn from(value: FFI_ExpressionPlacement) -> Self {
+ match value {
+ FFI_ExpressionPlacement::Literal => Self::Literal,
+ FFI_ExpressionPlacement::Column => Self::Column,
+ FFI_ExpressionPlacement::MoveTowardsLeafNodes => Self::MoveTowardsLeafNodes,
+ FFI_ExpressionPlacement::KeepInPlace => Self::KeepInPlace,
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use datafusion::logical_expr::ExpressionPlacement;
+
+ use super::FFI_ExpressionPlacement;
+
+ fn test_round_trip_placement(placement: ExpressionPlacement) {
+ let ffi_placement: FFI_ExpressionPlacement = placement.into();
+ let round_trip: ExpressionPlacement = ffi_placement.into();
+
+ assert_eq!(placement, round_trip);
+ }
+
+ #[test]
+ fn test_all_round_trip_placement() {
+ test_round_trip_placement(ExpressionPlacement::Literal);
+ test_round_trip_placement(ExpressionPlacement::Column);
+ test_round_trip_placement(ExpressionPlacement::MoveTowardsLeafNodes);
+ test_round_trip_placement(ExpressionPlacement::KeepInPlace);
+ }
+}
diff --git a/datafusion/ffi/src/tests/mod.rs b/datafusion/ffi/src/tests/mod.rs
index 03b3a7ab246c7..dcd0910ecb4e9 100644
--- a/datafusion/ffi/src/tests/mod.rs
+++ b/datafusion/ffi/src/tests/mod.rs
@@ -90,6 +90,8 @@ pub struct ForeignLibraryModule {
pub create_timezone_udf: extern "C" fn() -> FFI_ScalarUDF,
+ pub create_placement_udf: extern "C" fn() -> FFI_ScalarUDF,
+
pub create_table_function:
extern "C" fn(FFI_LogicalExtensionCodec) -> FFI_TableFunction,
@@ -251,6 +253,7 @@ pub extern "C" fn datafusion_ffi_get_module() -> ForeignLibraryModule {
create_scalar_udf: create_ffi_abs_func,
create_nullary_udf: create_ffi_random_func,
create_timezone_udf: udf_udaf_udwf::create_timezone_func,
+ create_placement_udf: udf_udaf_udwf::create_placement_func,
create_table_function: create_ffi_table_func,
create_sum_udaf: create_ffi_sum_func,
create_stddev_udaf: create_ffi_stddev_func,
diff --git a/datafusion/ffi/src/tests/udf_udaf_udwf.rs b/datafusion/ffi/src/tests/udf_udaf_udwf.rs
index 399a2cc6be5cd..b393f5db3a506 100644
--- a/datafusion/ffi/src/tests/udf_udaf_udwf.rs
+++ b/datafusion/ffi/src/tests/udf_udaf_udwf.rs
@@ -21,8 +21,8 @@ use arrow_schema::DataType;
use datafusion_catalog::TableFunctionImpl;
use datafusion_common::ScalarValue;
use datafusion_expr::{
- AggregateUDF, ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature,
- Volatility, WindowUDF,
+ AggregateUDF, ColumnarValue, ExpressionPlacement, ScalarFunctionArgs, ScalarUDF,
+ ScalarUDFImpl, Signature, Volatility, WindowUDF,
};
use datafusion_functions::math::abs::AbsFunc;
use datafusion_functions::math::random::RandomFunc;
@@ -112,6 +112,56 @@ pub(crate) extern "C" fn create_timezone_func() -> FFI_ScalarUDF {
udf.into()
}
+#[derive(Debug, PartialEq, Eq, Hash)]
+struct PlacementUDF {
+ signature: Signature,
+}
+
+impl ScalarUDFImpl for PlacementUDF {
+ fn name(&self) -> &str {
+ "placement_udf"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(
+ &self,
+ _arg_types: &[DataType],
+ ) -> datafusion_common::Result {
+ Ok(DataType::Int64)
+ }
+
+ fn invoke_with_args(
+ &self,
+ _args: ScalarFunctionArgs,
+ ) -> datafusion_common::Result {
+ datafusion_common::internal_err!("placement_udf is not meant to be invoked")
+ }
+
+ fn placement(&self, args: &[ExpressionPlacement]) -> ExpressionPlacement {
+ // Push to the leaves only for a (Column, Literal) pairing, so the
+ // test catches dropped, reordered, or truncated arguments.
+ if matches!(
+ args,
+ [ExpressionPlacement::Column, ExpressionPlacement::Literal]
+ ) {
+ ExpressionPlacement::MoveTowardsLeafNodes
+ } else {
+ ExpressionPlacement::KeepInPlace
+ }
+ }
+}
+
+pub(crate) extern "C" fn create_placement_func() -> FFI_ScalarUDF {
+ let udf: Arc = Arc::new(ScalarUDF::from(PlacementUDF {
+ signature: Signature::uniform(1, vec![DataType::Int64], Volatility::Immutable),
+ }));
+
+ udf.into()
+}
+
pub(crate) extern "C" fn create_ffi_table_func(
codec: FFI_LogicalExtensionCodec,
) -> FFI_TableFunction {
diff --git a/datafusion/ffi/src/udf/mod.rs b/datafusion/ffi/src/udf/mod.rs
index ff18a30e4ba19..4fc22e859f9fb 100644
--- a/datafusion/ffi/src/udf/mod.rs
+++ b/datafusion/ffi/src/udf/mod.rs
@@ -28,8 +28,8 @@ use datafusion_common::config::ConfigOptions;
use datafusion_common::{DataFusionError, Result, internal_err};
use datafusion_expr::type_coercion::functions::fields_with_udf;
use datafusion_expr::{
- ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl,
- Signature,
+ ColumnarValue, ExpressionPlacement, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDF,
+ ScalarUDFImpl, Signature,
};
use return_type_args::{
FFI_ReturnFieldArgs, ForeignReturnFieldArgs, ForeignReturnFieldArgsOwned,
@@ -41,6 +41,7 @@ use stabby::vec::Vec as SVec;
use crate::arrow_wrappers::{WrappedArray, WrappedSchema};
use crate::config::FFI_ConfigOptions;
use crate::expr::columnar_value::FFI_ColumnarValue;
+use crate::placement::FFI_ExpressionPlacement;
use crate::util::{
FFI_Result, rvec_wrapped_to_vec_datatype, vec_datatype_to_rvec_wrapped,
};
@@ -91,6 +92,14 @@ pub struct FFI_ScalarUDF {
arg_types: SVec,
) -> FFI_Result>,
+ /// FFI equivalent to the `placement` of a [`ScalarUDFImpl`]. Returns the
+ /// placement hint for the underlying [`ScalarUDF`] given each argument's
+ /// placement. Infallible, so it returns the value directly, not an `FFI_Result`.
+ pub placement: unsafe extern "C" fn(
+ udf: &Self,
+ args: SVec,
+ ) -> FFI_ExpressionPlacement,
+
/// Used to create a clone on the provider of the udf. This should
/// only need to be called by the receiver of the udf.
pub clone: unsafe extern "C" fn(udf: &Self) -> Self,
@@ -157,6 +166,18 @@ unsafe extern "C" fn coerce_types_fn_wrapper(
sresult!(vec_datatype_to_rvec_wrapped(&return_types))
}
+unsafe extern "C" fn placement_fn_wrapper(
+ udf: &FFI_ScalarUDF,
+ args: SVec,
+) -> FFI_ExpressionPlacement {
+ let args = args
+ .into_iter()
+ .map(ExpressionPlacement::from)
+ .collect::>();
+
+ udf.inner().placement(&args).into()
+}
+
unsafe extern "C" fn invoke_with_args_fn_wrapper(
udf: &FFI_ScalarUDF,
args: SVec,
@@ -250,6 +271,7 @@ impl From> for FFI_ScalarUDF {
invoke_with_args: invoke_with_args_fn_wrapper,
return_field_from_args: return_field_from_args_fn_wrapper,
coerce_types: coerce_types_fn_wrapper,
+ placement: placement_fn_wrapper,
clone: clone_fn_wrapper,
release: release_fn_wrapper,
private_data: Box::into_raw(private_data) as *mut c_void,
@@ -427,12 +449,59 @@ impl ScalarUDFImpl for ForeignScalarUDF {
Ok(rvec_wrapped_to_vec_datatype(&result_types)?)
}
}
+
+ fn placement(&self, args: &[ExpressionPlacement]) -> ExpressionPlacement {
+ let args = args
+ .iter()
+ .map(|p| FFI_ExpressionPlacement::from(*p))
+ .collect::>();
+
+ let result = unsafe { (self.udf.placement)(&self.udf, args) };
+
+ result.into()
+ }
}
#[cfg(test)]
mod tests {
use super::*;
+ #[derive(Debug, PartialEq, Eq, Hash)]
+ struct PlacementUDF {
+ signature: Signature,
+ }
+
+ impl ScalarUDFImpl for PlacementUDF {
+ fn name(&self) -> &str {
+ "placement_udf"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, _arg_types: &[DataType]) -> Result {
+ Ok(DataType::Int64)
+ }
+
+ fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result {
+ internal_err!("placement_udf is not meant to be invoked")
+ }
+
+ fn placement(&self, args: &[ExpressionPlacement]) -> ExpressionPlacement {
+ // Push to the leaves only for a (Column, Literal) pairing, so the
+ // test catches dropped, reordered, or truncated arguments.
+ if matches!(
+ args,
+ [ExpressionPlacement::Column, ExpressionPlacement::Literal]
+ ) {
+ ExpressionPlacement::MoveTowardsLeafNodes
+ } else {
+ ExpressionPlacement::KeepInPlace
+ }
+ }
+ }
+
#[test]
fn test_round_trip_scalar_udf() -> Result<()> {
let original_udf = datafusion::functions::math::abs::AbsFunc::new();
@@ -467,4 +536,42 @@ mod tests {
Ok(())
}
+
+ #[test]
+ fn test_ffi_udf_placement_round_trip() -> Result<()> {
+ use datafusion_expr::Volatility;
+
+ let original_udf = Arc::new(ScalarUDF::from(PlacementUDF {
+ signature: Signature::uniform(
+ 1,
+ vec![DataType::Int64],
+ Volatility::Immutable,
+ ),
+ }));
+
+ let mut ffi_udf = FFI_ScalarUDF::from(original_udf);
+
+ // Force the foreign path so the call travels through the FFI vtable
+ // rather than downcasting back to the original local type.
+ ffi_udf.library_marker_id = crate::mock_foreign_marker_id;
+ let foreign_udf: Arc = (&ffi_udf).into();
+ assert!(foreign_udf.is::());
+
+ // Without the plumbing the override is dropped and every call is
+ // KeepInPlace. The three cases also check the arguments survive the
+ // round trip in order.
+ assert_eq!(
+ foreign_udf
+ .placement(&[ExpressionPlacement::Column, ExpressionPlacement::Literal]),
+ ExpressionPlacement::MoveTowardsLeafNodes
+ );
+ assert_eq!(
+ foreign_udf
+ .placement(&[ExpressionPlacement::Literal, ExpressionPlacement::Column]),
+ ExpressionPlacement::KeepInPlace
+ );
+ assert_eq!(foreign_udf.placement(&[]), ExpressionPlacement::KeepInPlace);
+
+ Ok(())
+ }
}
diff --git a/datafusion/ffi/tests/ffi_udf.rs b/datafusion/ffi/tests/ffi_udf.rs
index 6e6cb31f53133..dffaf83c479b1 100644
--- a/datafusion/ffi/tests/ffi_udf.rs
+++ b/datafusion/ffi/tests/ffi_udf.rs
@@ -23,7 +23,7 @@ mod tests {
use arrow::datatypes::DataType;
use datafusion::common::record_batch;
use datafusion::error::Result;
- use datafusion::logical_expr::{ScalarUDF, ScalarUDFImpl};
+ use datafusion::logical_expr::{ExpressionPlacement, ScalarUDF, ScalarUDFImpl};
use datafusion::prelude::{SessionContext, col};
use datafusion_execution::config::SessionConfig;
use datafusion_expr::lit;
@@ -91,6 +91,31 @@ mod tests {
Ok(())
}
+ /// This test validates that a producer's `placement` override survives the
+ /// FFI boundary instead of collapsing to the default `KeepInPlace`.
+ #[tokio::test]
+ async fn test_scalar_udf_placement() -> Result<()> {
+ let module = get_module()?;
+
+ let ffi_placement_func = (module.create_placement_udf)();
+ let foreign_func: Arc = (&ffi_placement_func).into();
+
+ // The override pushes to the leaves only for (Column, Literal), so these
+ // also check the arguments cross the boundary in order.
+ assert_eq!(
+ foreign_func
+ .placement(&[ExpressionPlacement::Column, ExpressionPlacement::Literal]),
+ ExpressionPlacement::MoveTowardsLeafNodes
+ );
+ assert_eq!(
+ foreign_func
+ .placement(&[ExpressionPlacement::Literal, ExpressionPlacement::Column]),
+ ExpressionPlacement::KeepInPlace
+ );
+
+ Ok(())
+ }
+
#[tokio::test]
async fn test_config_on_scalar_udf() -> Result<()> {
let module = get_module()?;
From 78033fa679c6031927e6b25154e664fc5cefcaec Mon Sep 17 00:00:00 2001
From: Nathan <56370526+nathanb9@users.noreply.github.com>
Date: Sat, 13 Jun 2026 11:41:33 -0400
Subject: [PATCH 08/39] refactor: introduce ProbeEnd state in
NestedLoopJoinExec (#22865)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
## Which issue does this PR close?
- Closes #22808.
## Rationale for this change
Follow-up to #22791, as suggested in review by @2010YOUY01.
That PR fixed a double-decrement bug where `EmitLeftUnmatched` did two
jobs at once — deciding whether a partition emits unmatched-left rows
(which decrements the shared `probe_threads_counter`) and performing the
emit. Because the state is re-enterable (a ready batch can be flushed
before the state advances to `Done`), the counter could be decremented
twice, driving it to zero before all partitions finished probing and
emitting spurious NULL-padded rows. #22791 patched this with a
`probe_completed_reported` guard flag.
This refactor makes "decrement exactly once per probe stream" a
structural property of the state graph rather than a runtime guard, so
the inner logic is easier to follow and the bug is harder to
reintroduce.
## What changes are included in this PR?
Restructures the state machine from `FetchingRight → EmitLeftUnmatched`
to `FetchingRight → ProbeEnd → EmitLeftUnmatched`:
- Adds a dedicated `ProbeEnd` state, entered exactly once per left chunk
when the right side is exhausted. It owns the single
`report_probe_completed()` call and records whether this stream is the
unmatched-left emitter.
- Replaces the `probe_completed_reported` guard flag with an
`is_unmatched_left_emitter` field that `EmitLeftUnmatched` only reads.
- Removes the per-chunk flag reset in the memory-limited path (the
decision is recomputed in `ProbeEnd` for each chunk) and reverts the
`Arc::clone` workaround #22791 needed in `process_left_unmatched`.
- Updates the state-transition doc graph and arm comments.
No behavior change is expected.
## Are these changes tested?
Yes — covered by existing tests:
- All 42 `nested_loop_join` unit tests and the full
`datafusion-physical-plan` suite pass.
- `joins.slt` sqllogictests pass (including the multi-partition LEFT
JOIN regression test added in #22791).
- 41 `join_fuzz` tests (`cargo test --features extended_tests`)
comparing `NestedLoopJoinExec` against `HashJoinExec` across every join
type, filtered and unfiltered, with a multi-partition probe side — the
exact scenario class of the original bug — pass.
- `cargo fmt` and `cargo clippy --all-targets --all-features -- -D
warnings` are clean.
## Are there any user-facing changes?
No.
---
.../src/joins/nested_loop_join.rs | 141 ++++++++++++------
1 file changed, 96 insertions(+), 45 deletions(-)
diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs
index 0bd053a9db12c..a4cea2c0ccc44 100644
--- a/datafusion/physical-plan/src/joins/nested_loop_join.rs
+++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs
@@ -874,6 +874,16 @@ enum NLJState {
FetchingRight,
ProbeRight,
EmitRightUnmatched,
+ /// Entered exactly once per left chunk, when the probe (right) side is
+ /// exhausted and probing for the current chunk is finished. This state
+ /// owns the single [`JoinLeftData::report_probe_completed`] call that
+ /// decrements the shared probe-threads counter, and records in
+ /// `is_unmatched_left_emitter` whether this stream is the one responsible
+ /// for emitting unmatched-left rows. Splitting this decision out of
+ /// `EmitLeftUnmatched` makes "decrement exactly once" a structural
+ /// property of the state graph, so the (re-enterable) emit state no longer
+ /// has to guard against decrementing twice.
+ ProbeEnd,
EmitLeftUnmatched,
/// Emit unmatched right rows using the global bitmap accumulated across
/// all left chunks. Only used in memory-limited mode for join types that
@@ -1065,16 +1075,17 @@ pub(crate) struct NestedLoopJoinStream {
/// Memory-limited spill fallback state. See [`SpillState`] for details.
spill_state: SpillState,
- /// Whether this stream has already reported probe completion for the current
- /// left chunk via [`JoinLeftData::report_probe_completed`]. The shared
- /// probe-threads counter must be decremented exactly once per probe stream;
- /// without this guard a stream that yields a ready batch while finishing the
- /// `EmitLeftUnmatched` state (and is then re-polled with `left_emit_idx`
- /// still 0) would decrement the counter twice, driving it to zero
- /// prematurely and causing a sibling partition to emit unmatched-left rows
- /// before all partitions finished probing (spurious NULL-padded rows).
- /// Reset to `false` when starting a new left chunk in memory-limited mode.
- probe_completed_reported: bool,
+ /// Whether this stream is the one responsible for emitting unmatched-left
+ /// rows for the current left chunk. Set in the [`NLJState::ProbeEnd`] state,
+ /// which is entered exactly once per chunk and owns the single
+ /// [`JoinLeftData::report_probe_completed`] call: the stream that drives the
+ /// shared probe-threads counter to zero (the last to finish probing) becomes
+ /// the emitter. Because the decrement happens once in `ProbeEnd` rather than
+ /// in the re-enterable `EmitLeftUnmatched` state, the counter can never be
+ /// decremented twice, so it cannot reach zero before all partitions finish
+ /// probing (which would otherwise let a partition emit spurious NULL-padded
+ /// unmatched-left rows early).
+ is_unmatched_left_emitter: bool,
}
pub(crate) struct NestedLoopJoinMetrics {
@@ -1118,7 +1129,7 @@ impl Stream for NestedLoopJoinStream {
/// BufferingLeft → FetchingRight
///
/// FetchingRight → ProbeRight (if right batch available)
- /// FetchingRight → EmitLeftUnmatched (if right exhausted)
+ /// FetchingRight → ProbeEnd (if right exhausted)
///
/// ProbeRight → ProbeRight (next left row or after yielding output)
/// ProbeRight → EmitRightUnmatched (for special join types like right join)
@@ -1126,6 +1137,9 @@ impl Stream for NestedLoopJoinStream {
///
/// EmitRightUnmatched → FetchingRight
///
+ /// ProbeEnd → EmitLeftUnmatched (records whether this stream is the
+ /// unmatched-left emitter, then always continues to EmitLeftUnmatched)
+ ///
/// EmitLeftUnmatched → EmitLeftUnmatched (only process 1 chunk for each
/// iteration)
/// EmitLeftUnmatched → Done (if finished)
@@ -1161,8 +1175,8 @@ impl Stream for NestedLoopJoinStream {
// 1. --> ProbeRight
// Start processing the join for the newly fetched right
// batch.
- // 2. --> EmitLeftUnmatched: When the right side input is exhausted, (maybe) emit
- // unmatched left side rows.
+ // 2. --> ProbeEnd: When the right side input is exhausted,
+ // probing for the current left chunk is finished.
//
// After fetching a new batch from the right side, it will
// process all rows from the buffered left data:
@@ -1176,9 +1190,10 @@ impl Stream for NestedLoopJoinStream {
// at once in memory.
//
// So after the right side input is exhausted, the join phase
- // for the current buffered left data is finished. We can go to
- // the next `EmitLeftUnmatched` phase to check if there is any
- // special handling (e.g., in cases like left join).
+ // for the current buffered left data is finished. We go to the
+ // `ProbeEnd` state, which records probe completion before the
+ // `EmitLeftUnmatched` phase checks if there is any special
+ // handling (e.g., in cases like left join).
NLJState::FetchingRight => {
debug!("[NLJState] Entering: {:?}", self.state);
// stop on drop
@@ -1241,6 +1256,28 @@ impl Stream for NestedLoopJoinStream {
}
}
+ // NLJState transitions:
+ // 1. --> EmitLeftUnmatched
+ // Probing for the current left chunk is finished. Report
+ // probe completion exactly once (decrementing the shared
+ // probe-threads counter) and record whether this stream is
+ // the unmatched-left emitter, then always advance to
+ // `EmitLeftUnmatched`.
+ NLJState::ProbeEnd => {
+ debug!("[NLJState] Entering: {:?}", self.state);
+
+ // stop on drop
+ let join_metric = self.metrics.join_metrics.join_time.clone();
+ let _join_timer = join_metric.timer();
+
+ match self.handle_probe_end() {
+ ControlFlow::Continue(()) => continue,
+ ControlFlow::Break(poll) => {
+ return self.metrics.join_metrics.baseline.record_poll(poll);
+ }
+ }
+ }
+
// NLJState transitions:
// 1. --> EmitLeftUnmatched(1)
// If we have already buffered enough output to yield, it
@@ -1348,7 +1385,7 @@ impl NestedLoopJoinStream {
handled_empty_output: false,
should_track_unmatched_right: need_produce_right_in_final(join_type),
spill_state,
- probe_completed_reported: false,
+ is_unmatched_left_emitter: false,
}
}
@@ -1724,7 +1761,10 @@ impl NestedLoopJoinStream {
}
Some(Err(e)) => ControlFlow::Break(Poll::Ready(Some(Err(e)))),
None => {
- self.state = NLJState::EmitLeftUnmatched;
+ // Right side exhausted: probing for the current left chunk
+ // is finished. `ProbeEnd` reports probe completion before
+ // emitting unmatched-left rows.
+ self.state = NLJState::ProbeEnd;
ControlFlow::Continue(())
}
},
@@ -1837,6 +1877,34 @@ impl NestedLoopJoinStream {
}
}
+ /// Handle ProbeEnd state - record probe completion for the current chunk.
+ ///
+ /// Entered exactly once per left chunk, when the right side is exhausted.
+ /// This is the single place that decrements the shared probe-threads counter
+ /// via [`JoinLeftData::report_probe_completed`]: the stream that drives the
+ /// counter to zero (the last to finish probing) is the one responsible for
+ /// emitting unmatched-left rows, recorded in `is_unmatched_left_emitter`.
+ ///
+ /// Owning the decrement here — rather than in the re-enterable
+ /// `EmitLeftUnmatched` state — makes "decrement exactly once per stream" a
+ /// structural property of the state graph, so the counter cannot reach zero
+ /// before all partitions finish probing (which would let a partition emit
+ /// spurious NULL-padded unmatched-left rows early).
+ ///
+ /// Always transitions to `EmitLeftUnmatched`.
+ fn handle_probe_end(&mut self) -> ControlFlow>>> {
+ // Decrement the shared counter exactly once for this stream/chunk. The
+ // last stream to finish probing (the one that drives the counter to
+ // zero) becomes the unmatched-left emitter.
+ let is_emitter = match self.get_left_data() {
+ Ok(left_data) => left_data.report_probe_completed(),
+ Err(e) => return ControlFlow::Break(Poll::Ready(Some(Err(e)))),
+ };
+ self.is_unmatched_left_emitter = is_emitter;
+ self.state = NLJState::EmitLeftUnmatched;
+ ControlFlow::Continue(())
+ }
+
/// Handle EmitLeftUnmatched state - emit unmatched left rows.
///
/// In memory-limited mode, after processing all unmatched rows for the
@@ -1876,9 +1944,9 @@ impl NestedLoopJoinStream {
self.left_probe_idx = 0;
self.left_emit_idx = 0;
// Each memory-limited chunk gets a fresh per-chunk
- // `JoinLeftData`/counter, so allow this stream to report
- // completion again for the next chunk.
- self.probe_completed_reported = false;
+ // `JoinLeftData`/counter; `is_unmatched_left_emitter` is
+ // recomputed when `ProbeEnd` is re-entered for the next
+ // chunk, so it does not need to be reset here.
self.state = NLJState::BufferingLeft;
} else if self.is_memory_limited()
&& self.should_track_unmatched_right
@@ -2357,9 +2425,7 @@ impl NestedLoopJoinStream {
/// true -> continue in the same EmitLeftUnmatched state
/// false -> next state (Done)
fn process_left_unmatched(&mut self) -> Result {
- // Clone the shared `Arc` so the immutable borrow of `self`
- // ends here and we can update `self.probe_completed_reported` below.
- let left_data = Arc::clone(self.get_left_data()?);
+ let left_data = self.get_left_data()?;
let left_batch = left_data.batch();
// ========
@@ -2368,29 +2434,14 @@ impl NestedLoopJoinStream {
// Early return if join type can't have unmatched rows
let join_type_no_produce_left = !need_produce_result_in_final(self.join_type);
- // Early return if another thread is already processing unmatched rows.
- //
- // The shared probe-threads counter must be decremented exactly once per
- // probe stream. This function can be re-entered with `left_emit_idx`
- // still 0 (e.g. when a ready batch was flushed via an early return in
- // `handle_emit_left_unmatched` before the state advanced), so guard the
- // decrement with `probe_completed_reported` instead of relying solely on
- // `left_emit_idx == 0`. Decrementing twice would drive the counter to
- // zero prematurely and let a partition emit unmatched-left rows before
- // all partitions finished probing, producing spurious NULL-padded rows.
- let handled_by_other_partition = if self.probe_completed_reported {
- // Already counted this stream's completion; if we're the designated
- // emitter we have `left_emit_idx > 0` (or are mid-emit) and continue,
- // otherwise another partition is handling emission.
- self.left_emit_idx == 0
- } else {
- self.probe_completed_reported = true;
- self.left_emit_idx == 0 && !left_data.report_probe_completed()
- };
// Stop processing unmatched rows, the caller will go to the next state
let finished = self.left_emit_idx >= left_batch.num_rows();
- if join_type_no_produce_left || handled_by_other_partition || finished {
+ // `ProbeEnd` already recorded whether this stream emits unmatched-left
+ // rows. Every probe partition passes through this state, but only the
+ // one that finished probing last is the emitter, so this flag is false
+ // for the others.
+ if join_type_no_produce_left || !self.is_unmatched_left_emitter || finished {
return Ok(false);
}
@@ -2402,7 +2453,7 @@ impl NestedLoopJoinStream {
let end_idx = std::cmp::min(start_idx + self.batch_size, left_batch.num_rows());
if let Some(batch) =
- self.process_left_unmatched_range(&left_data, start_idx, end_idx)?
+ self.process_left_unmatched_range(left_data, start_idx, end_idx)?
{
self.output_buffer.push_batch(batch)?;
}
From cb2542c5bebacb75d014b2138daef24af371663e Mon Sep 17 00:00:00 2001
From: fys <40801205+fengys1996@users.noreply.github.com>
Date: Sun, 14 Jun 2026 11:27:07 +0800
Subject: [PATCH 09/39] fix: TRY_CAST returns NULL for timestamp/date overflow
(#22897)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
## Which issue does this PR close?
- Closes #22896.
## Rationale for this change
`TRY_CAST` should return NULL on cast failure, but overflowing
date/timestamp casts returned errors.
## What changes are included in this PR?
- Make scalar temporal overflow checks respect CastOptions.safe.
- Skip DataFusion’s array pre-check for safe casts so Arrow can return
NULLs.
- Add regression tests.
## Are these changes tested?
Yes:
```bash
cargo test -p datafusion-common timestamp_overflow_returns
cargo test -p datafusion-expr-common timestamp_array_to_timestamp_overflow
cargo test --test sqllogictests -- datetime/timestamps.slt
```
## Are there any user-facing changes?
Yes. TRY_CAST for overflowing date/timestamp casts now returns NULL;
regular CAST still errors.
## Known Limitation
This PR does not add Date array-path coverage yet. For example:
```sql
SELECT TRY_CAST(d AS TIMESTAMP(9))
FROM (VALUES (DATE '3000-01-01')) t(d);
```
This depends on the upstream Arrow fix in apache/arrow-rs#9825. Once
DataFusion updates to an Arrow version containing that fix, we can add
this regression test.
---
datafusion/common/src/scalar/mod.rs | 45 ++++++++++++++++++-
datafusion/expr-common/src/columnar_value.rs | 32 ++++++++++++-
.../test_files/datetime/timestamps.slt | 19 ++++++++
3 files changed, 94 insertions(+), 2 deletions(-)
diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs
index c9013af72619c..8a8a47b3bb50b 100644
--- a/datafusion/common/src/scalar/mod.rs
+++ b/datafusion/common/src/scalar/mod.rs
@@ -4292,7 +4292,14 @@ impl ScalarValue {
.or_else(|| timestamp_to_timestamp_multiplier(&source_type, target_type))
&& let Some(value) = self.temporal_scalar_value_as_i64()
{
- ensure_timestamp_in_bounds(value, multiplier, &source_type, target_type)?;
+ match ensure_timestamp_in_bounds(value, multiplier, &source_type, target_type)
+ {
+ Ok(()) => {}
+ Err(_) if cast_options.safe => {
+ return ScalarValue::try_new_null(target_type);
+ }
+ Err(e) => return Err(e),
+ }
}
let scalar_array = self.to_array()?;
@@ -10190,6 +10197,24 @@ mod tests {
);
}
+ #[test]
+ fn safe_cast_date_to_timestamp_overflow_returns_null() {
+ let scalar = ScalarValue::Date32(Some(i32::MAX));
+ let safe_options = CastOptions {
+ safe: true,
+ ..DEFAULT_CAST_OPTIONS
+ };
+
+ let casted = scalar
+ .cast_to_with_options(
+ &DataType::Timestamp(TimeUnit::Nanosecond, None),
+ &safe_options,
+ )
+ .expect("expected safe cast to return null");
+
+ assert_eq!(casted, ScalarValue::TimestampNanosecond(None, None));
+ }
+
#[test]
fn cast_timestamp_to_timestamp_overflow_returns_error() {
let scalar = ScalarValue::TimestampSecond(Some(i64::MAX), None);
@@ -10203,6 +10228,24 @@ mod tests {
);
}
+ #[test]
+ fn safe_cast_timestamp_to_timestamp_overflow_returns_null() {
+ let scalar = ScalarValue::TimestampSecond(Some(i64::MAX), None);
+ let safe_options = CastOptions {
+ safe: true,
+ ..DEFAULT_CAST_OPTIONS
+ };
+
+ let casted = scalar
+ .cast_to_with_options(
+ &DataType::Timestamp(TimeUnit::Nanosecond, None),
+ &safe_options,
+ )
+ .expect("expected safe cast to return null");
+
+ assert_eq!(casted, ScalarValue::TimestampNanosecond(None, None));
+ }
+
#[test]
fn null_dictionary_scalar_produces_null_dictionary_array() {
let dictionary_scalar = ScalarValue::Dictionary(
diff --git a/datafusion/expr-common/src/columnar_value.rs b/datafusion/expr-common/src/columnar_value.rs
index caeb3f10da752..ef9192c3569d9 100644
--- a/datafusion/expr-common/src/columnar_value.rs
+++ b/datafusion/expr-common/src/columnar_value.rs
@@ -325,7 +325,9 @@ fn cast_array_by_name(
) {
datafusion_common::nested_struct::cast_column(array, cast_type, cast_options)
} else {
- ensure_temporal_array_timestamp_bounds(array, cast_type)?;
+ if !cast_options.safe {
+ ensure_temporal_array_timestamp_bounds(array, cast_type)?;
+ }
Ok(kernels::cast::cast_with_options(
array,
cast_type,
@@ -766,4 +768,32 @@ mod tests {
"unexpected error: {err}"
);
}
+
+ #[test]
+ fn safe_cast_timestamp_array_to_timestamp_overflow_returns_null() {
+ let overflow_value = i64::MAX / 1_000_000_000 + 1;
+ let array: ArrayRef =
+ Arc::new(TimestampSecondArray::from(vec![Some(overflow_value)]));
+ let value = ColumnarValue::Array(array);
+ let safe_options = CastOptions {
+ safe: true,
+ ..DEFAULT_CAST_OPTIONS
+ };
+
+ let casted = value
+ .cast_to(
+ &DataType::Timestamp(TimeUnit::Nanosecond, None),
+ Some(&safe_options),
+ )
+ .expect("expected safe cast to return null");
+
+ let ColumnarValue::Array(array) = casted else {
+ panic!("expected array after cast");
+ };
+ let array = array
+ .as_any()
+ .downcast_ref::()
+ .expect("expected TimestampNanosecondArray");
+ assert!(array.is_null(0));
+ }
}
diff --git a/datafusion/sqllogictest/test_files/datetime/timestamps.slt b/datafusion/sqllogictest/test_files/datetime/timestamps.slt
index 89c6f0a12139e..06740fa0f5439 100644
--- a/datafusion/sqllogictest/test_files/datetime/timestamps.slt
+++ b/datafusion/sqllogictest/test_files/datetime/timestamps.slt
@@ -5379,6 +5379,25 @@ SELECT to_timestamp(arrow_cast(-9223372036, 'Int64'));
query error converted value exceeds the representable i64 range
SELECT to_timestamp(arrow_cast(9223372037, 'Int64'));
+# TRY_CAST returns NULL for timestamp/date casts that overflow
+query P
+SELECT TRY_CAST(arrow_cast(9223372037, 'Timestamp(s)') AS TIMESTAMP(9));
+----
+NULL
+
+query P
+SELECT TRY_CAST(DATE '3000-01-01' AS TIMESTAMP(9));
+----
+NULL
+
+query P
+SELECT TRY_CAST(ts AS TIMESTAMP(9)) AS ts
+FROM (
+ VALUES (arrow_cast(9223372037, 'Timestamp(s)'))
+) t(ts);
+----
+NULL
+
# Float truncation behavior
query P
SELECT to_timestamp_seconds(arrow_cast(-1.9, 'Float64'));
From d428760d709a375f3d997c84e9c4748a22584149 Mon Sep 17 00:00:00 2001
From: Jordan Epstein <32082339+jordepic@users.noreply.github.com>
Date: Sun, 14 Jun 2026 07:37:35 -0500
Subject: [PATCH 10/39] fix: count shared buffers once in hash join build-side
memory accounting (#22862)
## Which issue does this PR close?
- Closes #22861.
## Rationale for this change
When using DataFusion comet I noticed that my hash join operator was
failing with the following error: `Failed to acquire 142606336 bytes
where 17142251456 bytes already reserved and the fair limit is
17179869184 bytes, 4 registered`. Looking into this more, DataFusion
asks to reserve memory for each batch (by default 8192 rows) of the
build side of a hash join - and tries to reserve (without actually
allocating it) num_batches * batch_size. This is problematic when these
are batches are zero-copy slices of a larger batch (e.g.
GroupedHashAggregateStream), since the slice size is evaluated to be the
size of the larger buffer. This is because the reference to the slice
actually keeps the entire buffer from being freed. DataFusion doesn't
overallocate memory (the underlying data is the same), but it does
over-request it (in the centralized accounting system), which can lead
to these "ResourcesExhausted" exceptions.
## What changes are included in this PR?
In this change, we keep track of all of the buffers that we've already
counted via a set of pointers. This way, we don't redundantly request
memory for the whole arrow buffer for each sub-slice of it. We choose
this approach as opposed to just requesting a smaller amount of memory
per batch, because as mentioned before, the pointer to each batch
technically keeps the entire arrow-buffer from being freed.
## Are these changes tested?
The new hash join test fails on main with ResourcesExhausted and passes
with this change.
## Are there any user-facing changes?
No breaking changes. Adds a new public helper
count_record_batch_memory_size to datafusion-common.
Co-authored-by: Jordan Epstein
---
datafusion/common/src/utils/memory.rs | 90 ++++++++++++++++---
.../physical-plan/src/joins/hash_join/exec.rs | 66 +++++++++++++-
2 files changed, 140 insertions(+), 16 deletions(-)
diff --git a/datafusion/common/src/utils/memory.rs b/datafusion/common/src/utils/memory.rs
index 78ec434d2b577..21c084119e120 100644
--- a/datafusion/common/src/utils/memory.rs
+++ b/datafusion/common/src/utils/memory.rs
@@ -21,7 +21,8 @@ use crate::error::_exec_datafusion_err;
use crate::{HashSet, Result};
use arrow::array::ArrayData;
use arrow::record_batch::RecordBatch;
-use std::{mem::size_of, ptr::NonNull};
+use std::mem::size_of;
+use std::num::NonZero;
/// Estimates the memory size required for a hash table prior to allocation.
///
@@ -131,34 +132,74 @@ pub fn estimate_memory_size(num_elements: usize, fixed_size: usize) -> Result
/// `Buffer`. This method provides temporary fix until the issue is resolved:
///
pub fn get_record_batch_memory_size(batch: &RecordBatch) -> usize {
- // Store pointers to `Buffer`'s start memory address (instead of actual
- // used data region's pointer represented by current `Array`)
- let mut counted_buffers: HashSet> = HashSet::new();
- let mut total_size = 0;
-
- for array in batch.columns() {
- let array_data = array.to_data();
- count_array_data_memory_size(&array_data, &mut counted_buffers, &mut total_size);
+ RecordBatchMemoryCounter::new().count_batch(batch)
+}
+
+/// Tracks the memory used by a sequence of [`RecordBatch`]es that may share
+/// underlying buffers, counting each buffer exactly once.
+///
+/// Use this instead of [`get_record_batch_memory_size`] to account for the
+/// total memory of a sequence of batches, e.g. when buffering the batches of
+/// an input stream. Such batches can share buffers (for example, operators
+/// like aggregates emit one large batch as multiple zero-copy slices), and
+/// calling [`get_record_batch_memory_size`] per batch counts the shared
+/// buffers once per batch, while this counter counts them exactly once. A
+/// batch's buffers are kept alive by the batch even when only a sub-range is
+/// referenced, so counting unique buffers in full reflects the memory the
+/// batches actually retain.
+#[derive(Debug, Default)]
+pub struct RecordBatchMemoryCounter {
+ /// Start addresses of `Buffer`s that have already been counted (instead of
+ /// actual used data region's pointer represented by current `Array`)
+ counted_buffers: HashSet>,
+ /// Total memory of all unique buffers counted so far
+ memory_usage: usize,
+}
+
+impl RecordBatchMemoryCounter {
+ pub fn new() -> Self {
+ Self::default()
}
- total_size
+ /// Count `batch`, returning the memory used by its buffers that have not
+ /// been counted before.
+ pub fn count_batch(&mut self, batch: &RecordBatch) -> usize {
+ let mut total_size = 0;
+
+ for array in batch.columns() {
+ let array_data = array.to_data();
+ count_array_data_memory_size(
+ &array_data,
+ &mut self.counted_buffers,
+ &mut total_size,
+ );
+ }
+
+ self.memory_usage += total_size;
+ total_size
+ }
+
+ /// Total memory of the unique buffers of all batches counted so far.
+ pub fn memory_usage(&self) -> usize {
+ self.memory_usage
+ }
}
/// Count the memory usage of `array_data` and its children recursively.
fn count_array_data_memory_size(
array_data: &ArrayData,
- counted_buffers: &mut HashSet>,
+ counted_buffers: &mut HashSet>,
total_size: &mut usize,
) {
// Count memory usage for `array_data`
for buffer in array_data.buffers() {
- if counted_buffers.insert(buffer.data_ptr()) {
+ if counted_buffers.insert(buffer.data_ptr().addr()) {
*total_size += buffer.capacity();
} // Otherwise the buffer's memory is already counted
}
if let Some(null_buffer) = array_data.nulls()
- && counted_buffers.insert(null_buffer.inner().inner().data_ptr())
+ && counted_buffers.insert(null_buffer.inner().inner().data_ptr().addr())
{
*total_size += null_buffer.inner().inner().capacity();
}
@@ -295,6 +336,29 @@ mod record_batch_tests {
assert_eq!(size_origin, size_sliced);
}
+ #[test]
+ fn test_record_batch_memory_counter_buffer_shared_across_batches() {
+ let schema = Arc::new(Schema::new(vec![Field::new(
+ "ints",
+ DataType::Int32,
+ false,
+ )]));
+
+ let int_array = Int32Array::from(vec![1, 2, 3, 4, 5, 6]);
+ let batch = RecordBatch::try_new(schema, vec![Arc::new(int_array)]).unwrap();
+ let slices = [batch.slice(0, 2), batch.slice(2, 2), batch.slice(4, 2)];
+
+ // Counting each slice individually counts the shared buffer once per slice
+ let summed: usize = slices.iter().map(get_record_batch_memory_size).sum();
+ assert_eq!(summed, 3 * get_record_batch_memory_size(&batch));
+
+ // A counter shared across the batches counts it exactly once
+ let mut counter = RecordBatchMemoryCounter::new();
+ let deduped: usize = slices.iter().map(|slice| counter.count_batch(slice)).sum();
+ assert_eq!(deduped, get_record_batch_memory_size(&batch));
+ assert_eq!(counter.memory_usage(), get_record_batch_memory_size(&batch));
+ }
+
#[test]
fn test_get_record_batch_memory_size_nested_array() {
let schema = Arc::new(Schema::new(vec![
diff --git a/datafusion/physical-plan/src/joins/hash_join/exec.rs b/datafusion/physical-plan/src/joins/hash_join/exec.rs
index 3774a300209d0..7cddae276f5fa 100644
--- a/datafusion/physical-plan/src/joins/hash_join/exec.rs
+++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs
@@ -52,7 +52,6 @@ use crate::projection::{
try_pushdown_through_join,
};
use crate::repartition::REPARTITION_RANDOM_STATE;
-use crate::spill::get_record_batch_memory_size;
use crate::{
DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning,
PlanProperties, SendableRecordBatchStream, Statistics,
@@ -72,7 +71,7 @@ use arrow::record_batch::RecordBatch;
use arrow::util::bit_util;
use arrow_schema::{DataType, Schema};
use datafusion_common::config::ConfigOptions;
-use datafusion_common::utils::memory::estimate_memory_size;
+use datafusion_common::utils::memory::{RecordBatchMemoryCounter, estimate_memory_size};
use datafusion_common::{
JoinSide, JoinType, NullEquality, Result, assert_or_internal_err, internal_err,
plan_err, project_schema,
@@ -1817,6 +1816,10 @@ struct BuildSideState {
metrics: BuildProbeJoinMetrics,
reservation: MemoryReservation,
bounds_accumulators: Option>,
+ /// Counts the memory of `batches` for `reservation`. Batches can share
+ /// underlying buffers (e.g. when the input emits zero-copy slices of one
+ /// larger batch), so each buffer must be reserved only once.
+ memory_counter: RecordBatchMemoryCounter,
}
impl BuildSideState {
@@ -1833,6 +1836,7 @@ impl BuildSideState {
num_rows: 0,
metrics,
reservation,
+ memory_counter: RecordBatchMemoryCounter::new(),
bounds_accumulators: should_compute_dynamic_filters
.then(|| {
on_left
@@ -1923,7 +1927,7 @@ async fn collect_left_input(
}
// Decide if we spill or not
- let batch_size = get_record_batch_memory_size(&batch);
+ let batch_size = state.memory_counter.count_batch(&batch);
// Reserve memory for incoming batch
state.reservation.try_grow(batch_size)?;
// Update metrics
@@ -1945,6 +1949,7 @@ async fn collect_left_input(
metrics,
mut reservation,
bounds_accumulators,
+ memory_counter: _,
} = state;
// Compute bounds
@@ -5369,6 +5374,61 @@ mod tests {
Ok(())
}
+ #[tokio::test]
+ async fn build_side_sliced_batches_memory_accounting() -> Result<()> {
+ // The build side emits zero-copy slices of one large batch, as e.g. an
+ // aggregate emitting its output in batch_size chunks does. The buffers
+ // shared by the slices must be reserved once in total, not once per
+ // slice: per-slice accounting reserves number_of_slices x parent size
+ // and aborts queries that fit in memory with room to spare.
+ let n = 4096;
+ let v: Vec = (0..n).collect();
+ let parent = build_table_i32(("a1", &v), ("b1", &v), ("c1", &v));
+ let slices: Vec =
+ (0..16).map(|i| parent.slice(i * 256, 256)).collect();
+ let left =
+ TestMemoryExec::try_new_exec(&[slices], parent.schema(), None).unwrap();
+
+ let right_batch = build_table_i32(
+ ("a2", &vec![10, 11]),
+ ("b2", &vec![0, 1]),
+ ("c2", &vec![14, 15]),
+ );
+ let right = TestMemoryExec::try_new_exec(
+ &[vec![right_batch.clone()]],
+ right_batch.schema(),
+ None,
+ )
+ .unwrap();
+ let on = vec![(
+ Arc::new(Column::new_with_schema("b1", &parent.schema())?) as _,
+ Arc::new(Column::new_with_schema("b2", &right_batch.schema())?) as _,
+ )];
+
+ // Enough for the parent batch (~48KB) plus the join hash table, but far
+ // below the ~768KB that per-slice accounting would reserve
+ let runtime = RuntimeEnvBuilder::new()
+ .with_memory_limit(400_000, 1.0)
+ .build_arc()?;
+ let task_ctx = TaskContext::default().with_runtime(runtime);
+ let task_ctx = Arc::new(task_ctx);
+
+ let join = join(
+ left,
+ right,
+ on,
+ &JoinType::Inner,
+ NullEquality::NullEqualsNothing,
+ )?;
+
+ let stream = join.execute(0, task_ctx)?;
+ let batches = common::collect(stream).await?;
+ let num_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
+ assert_eq!(num_rows, 2);
+
+ Ok(())
+ }
+
#[tokio::test]
async fn partitioned_join_overallocation() -> Result<()> {
// Prepare partitioned inputs for HashJoinExec
From 6520315d41851d1fb31da0ae1b4f22e48a6b2705 Mon Sep 17 00:00:00 2001
From: ajegou
Date: Mon, 15 Jun 2026 10:04:34 +0200
Subject: [PATCH 11/39] fix(topk): call attempt_early_completion when filter
rejects entire batch (#22852)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
## Which issue does this PR close?
- Closes #22849
- A related cross-partition starvation case is tracked separately in
#22874 and addressed by an upcoming follow-up PR — see
[discussion](https://github.com/apache/datafusion/pull/22852#issuecomment-4670382915)
for details
## Rationale for this change
`TopK::insert_batch` short-circuits when the heap's dynamic filter
rejects every row in a batch:
```rust
if !filter.has_true() {
// nothing to filter, so no need to update
return Ok(());
}
```
The early-exit check `attempt_early_completion(&batch)` lives later in
the same function, gated on `replacements > 0`. So a batch that the
filter rejects entirely bypasses the check.
The heap's dynamic filter is derived from the heap's worst row (via
`update_filter`). A batch whose rows all come from a strictly worse sort
prefix is exactly the batch the filter rejects entirely — i.e. the very
signal `attempt_early_completion` is designed to detect ("the next batch
is past the heap's boundary, we can stop") is what causes the function
to short-circuit *before* the check runs.
This is a feature-interaction regression between two PRs that were both
correct in isolation. The `attempt_early_completion` mechanism was added
by #15563 (closing #15529). At the time, there was no heap-derived
dynamic filter on TopK, so the only sensible call site was right after a
successful heap insertion. Two months later, #15770 added the
dynamic-filter pushdown for TopK sorts, introducing the
`!filter.has_true()` short-circuit. The two features address different
problems and the new short-circuit didn't connect to the existing
prefix-completion check — which is how this gap opened up.
**Consequence**: on a TopK over an input ordered on the sort prefix,
`finished = true` is never set once the heap stabilizes. Since
`finished` is the signal `SortExec` uses to stop pulling from its input
(via `Poll::Ready(None)` from the TopK stream, which cascades into
dropping the source stream), the source keeps being polled long past the
point where no further row can improve the heap. The LIMIT optimization
effectively degrades to "heap saves memory but reads everything";
sources with cancellable streams (e.g. networked sources) never receive
the cancellation signal.
## What changes are included in this PR?
Single behavioral change in `datafusion/physical-plan/src/topk/mod.rs`:
call `attempt_early_completion(&batch)` immediately before the `return
Ok(())` in the `!filter.has_true()` branch.
Why this scope, not a broader restructuring:
- The existing `attempt_early_completion` call inside `if replacements >
0` is load-bearing for a related case: a batch containing a mix of
"still valuable" rows and "past the boundary" rows. The existing
`test_try_finish_marks_finished_with_prefix` test covers this case —
Batch 2 with `a=[2,3], b=[10,20]` against a heap where `heap.max.a = 2`;
the `(2, 10)` row must be inserted before the check on the `(3, 20)`
last row triggers. Moving the call earlier would skip the insertion of
valuable rows and break that test.
- The bug is specifically that the *short-circuit* path doesn't call the
check. The fix targets exactly that path.
- A related but separate gap is not addressed here: when
`filter.has_true() == true` but `replacements == 0` (the filter accepts
some rows but `find_new_topk_items` ends up inserting none of them), the
existing call inside `if replacements > 0` is also skipped. This
requires a divergence between the heap's filter predicate and the
row-byte comparison used inside `find_new_topk_items`, which shouldn't
normally happen (the filter is derived from the heap's worst row using
the same comparator). A deterministic synthetic repro would likely
require concurrent heap updates from sibling partitions or
boundary-value edge cases (NaN/NULL semantics, type coercion). Happy to
send a follow-up if reviewers want it covered; the workload that
motivated this fix was the filter-rejection case empirically.
## Are these changes tested?
Yes. Added a regression test
`test_try_finish_fires_when_filter_rejects_entire_batch`. The assertion
target is `topk.finished` — the flag that signals "stop pulling from the
source" to upstream consumers (read by `TopKExec::poll_next` to emit
`Poll::Ready(None)`). Asserting that the flag transitions on the
fully-filter-rejected batch is equivalent to asserting that the
source-stopping mechanism activates.
- Builds a TopK over a `(a, b)` sort with prefix `a`, k=3.
- Inserts a batch that fills the heap with rows from `a ∈ {1, 2}`;
`update_filter` tightens the filter to `a < 2 OR (a = 2 AND b < 30)`.
- Inserts a second batch with all rows at `a = 3` — filter rejects every
row.
- Without the fix: `insert_batch` short-circuits, `topk.finished` stays
`false`. Test fails.
- With the fix: `attempt_early_completion` fires (last-row prefix `a =
3` > heap.max prefix `a = 2`), `topk.finished` becomes `true`. Test
passes.
The test also asserts the emitted top-K is unchanged from after batch 1,
confirming no candidate row was incorrectly excluded by the early bail.
All 28 existing `topk::` tests continue to pass (including
`test_try_finish_marks_finished_with_prefix`, which exercises the
mixed-prefix case).
## Are there any user-facing changes?
No public API or output changes. The fix only changes when TopK marks
itself `finished = true` — specifically, it now fires
`attempt_early_completion` for batches that are entirely rejected by the
heap's dynamic filter, where previously it would silently skip the
check. Output of TopK is unchanged; only the early-exit behavior
improves.
---------
Co-authored-by: Gabriel <45515538+gabotechs@users.noreply.github.com>
---
datafusion/physical-plan/src/topk/mod.rs | 101 ++++++++++++++++++-----
1 file changed, 82 insertions(+), 19 deletions(-)
diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs
index 9da606dc90db2..8a8bfd204ecb6 100644
--- a/datafusion/physical-plan/src/topk/mod.rs
+++ b/datafusion/physical-plan/src/topk/mod.rs
@@ -255,7 +255,9 @@ impl TopK {
let array = filtered.into_array(num_rows)?;
let mut filter = array.as_boolean().clone();
if !filter.has_true() {
- // nothing to filter, so no need to update
+ // The heap is unchanged, but a fully rejected batch can still prove
+ // that the shared sort prefix has passed the heap boundary.
+ self.attempt_early_completion(&batch)?;
return Ok(());
}
// only update the keys / rows if the filter does not match all rows
@@ -1099,20 +1101,15 @@ mod tests {
assert_eq!(record_batch_store.batches_size, 0);
}
- /// This test validates that the `try_finish` method marks the TopK operator as finished
- /// when the prefix (on column "a") of the last row in the current batch is strictly greater
- /// than the max top‑k row.
- /// The full sort expression is defined on both columns ("a", "b"), but the input ordering is only on "a".
- #[tokio::test]
- async fn test_try_finish_marks_finished_with_prefix() -> Result<()> {
- // Create a schema with two columns.
+ /// Builds an `(a Int32, b Float64)` schema and a `TopK` with full sort
+ /// `(a ASC, b ASC)`, input prefix `[a]`, `k = 3`, `batch_size = 2`. Used by
+ /// the prefix-completion tests below to keep their per-scenario logic in focus.
+ fn build_ab_prefix_topk() -> Result<(Arc, TopK)> {
let schema = Arc::new(Schema::new(vec![
Field::new("a", DataType::Int32, false),
Field::new("b", DataType::Float64, false),
]));
- // Create sort expressions.
- // Full sort: first by "a", then by "b".
let sort_expr_a = PhysicalSortExpr {
expr: col("a", schema.as_ref())?,
options: SortOptions::default(),
@@ -1122,28 +1119,33 @@ mod tests {
options: SortOptions::default(),
};
- // Input ordering uses only column "a" (a prefix of the full sort).
+ // Input ordering uses only column "a" (a prefix of the full sort on (a, b)).
let prefix = vec![sort_expr_a.clone()];
let full_expr = LexOrdering::from([sort_expr_a, sort_expr_b]);
- // Create a dummy runtime environment and metrics.
- let runtime = Arc::new(RuntimeEnv::default());
- let metrics = ExecutionPlanMetricsSet::new();
-
- // Create a TopK instance with k = 3 and batch_size = 2.
- let mut topk = TopK::try_new(
+ let topk = TopK::try_new(
0,
Arc::clone(&schema),
prefix,
full_expr,
3,
2,
- runtime,
- &metrics,
+ Arc::new(RuntimeEnv::default()),
+ &ExecutionPlanMetricsSet::new(),
Arc::new(RwLock::new(TopKDynamicFilters::new(Arc::new(
DynamicFilterPhysicalExpr::new(vec![], lit(true)),
)))),
)?;
+ Ok((schema, topk))
+ }
+
+ /// This test validates that the `try_finish` method marks the TopK operator as finished
+ /// when the prefix (on column "a") of the last row in the current batch is strictly greater
+ /// than the max top‑k row.
+ /// The full sort expression is defined on both columns ("a", "b"), but the input ordering is only on "a".
+ #[tokio::test]
+ async fn test_try_finish_marks_finished_with_prefix() -> Result<()> {
+ let (schema, mut topk) = build_ab_prefix_topk()?;
// Create the first batch with two columns:
// Column "a": [1, 1, 2], Column "b": [20.0, 15.0, 30.0].
@@ -1196,6 +1198,67 @@ mod tests {
Ok(())
}
+ /// Regression test for #22849: a batch whose rows are entirely rejected by the
+ /// heap's dynamic filter must still trigger `attempt_early_completion` when its
+ /// last row's prefix is worse than the heap's worst.
+ ///
+ /// Before the fix, the `!filter.has_true()` short-circuit returned without calling
+ /// `attempt_early_completion`. Because the heap's filter is itself derived from the
+ /// heap's worst row, a batch from a strictly-worse prefix is exactly the case the
+ /// filter rejects entirely — i.e. the very signal the early-exit was designed to
+ /// detect was being silently dropped.
+ #[tokio::test]
+ async fn test_try_finish_fires_when_filter_rejects_entire_batch() -> Result<()> {
+ let (schema, mut topk) = build_ab_prefix_topk()?;
+
+ // Batch 1 fills the heap with (1, 20.0), (1, 15.0), (2, 30.0).
+ // heap.max becomes (a=2, b=30.0); update_filter tightens the heap filter to
+ // a < 2 OR (a = 2 AND b < 30.0).
+ let array_a1: ArrayRef =
+ Arc::new(Int32Array::from(vec![Some(1), Some(1), Some(2)]));
+ let array_b1: ArrayRef = Arc::new(Float64Array::from(vec![20.0, 15.0, 30.0]));
+ let batch1 = RecordBatch::try_new(Arc::clone(&schema), vec![array_a1, array_b1])?;
+ topk.insert_batch(batch1)?;
+ assert!(
+ !topk.finished,
+ "Expected 'finished' to be false after batch 1 \
+ (last row prefix a=2 equals heap.max prefix a=2, not strictly greater)."
+ );
+
+ // Batch 2: every row has a=3, so the heap's filter (a < 2 OR (a = 2 AND b < 30))
+ // rejects every row. Before the fix, `insert_batch` would short-circuit on
+ // `!filter.has_true()` and return without checking the prefix; `finished`
+ // would stay false even though no future batch could improve the heap.
+ let array_a2: ArrayRef = Arc::new(Int32Array::from(vec![Some(3), Some(3)]));
+ let array_b2: ArrayRef = Arc::new(Float64Array::from(vec![10.0, 20.0]));
+ let batch2 = RecordBatch::try_new(Arc::clone(&schema), vec![array_a2, array_b2])?;
+ topk.insert_batch(batch2)?;
+ assert!(
+ topk.finished,
+ "Expected 'finished' to be true after batch 2 \
+ (filter rejected every row, but the batch's last row prefix a=3 \
+ is strictly greater than heap.max prefix a=2)."
+ );
+
+ // The emitted top-k is unchanged from after batch 1 since none of batch 2's
+ // rows could improve the heap.
+ let results: Vec<_> = topk.emit()?.try_collect().await?;
+ assert_batches_eq!(
+ &[
+ "+---+------+",
+ "| a | b |",
+ "+---+------+",
+ "| 1 | 15.0 |",
+ "| 1 | 20.0 |",
+ "| 2 | 30.0 |",
+ "+---+------+",
+ ],
+ &results
+ );
+
+ Ok(())
+ }
+
/// This test verifies that the dynamic filter is marked as complete after TopK processing finishes.
#[tokio::test]
async fn test_topk_marks_filter_complete() -> Result<()> {
From 99895e686ddafaef8bccf8600688a4c7b1a2b994 Mon Sep 17 00:00:00 2001
From: Michael Kleen
Date: Mon, 15 Jun 2026 10:30:53 +0200
Subject: [PATCH 12/39] refactor: Simplify heap size estimation for types that
own no heap allocations (#22918)
## Which issue does this PR close?
- Closes None.
## Rationale for this change
This pr simplifies heap size estimation by using a macro for types that
own no heap allocations.
This removes a lot of redundant code.
## What changes are included in this PR?
See above.
## Are these changes tested?
Yes, previous tests are passing and more tests are added.
## Are there any user-facing changes?
No.
---
datafusion/common/src/heap_size.rs | 158 +++++++++--------------------
1 file changed, 48 insertions(+), 110 deletions(-)
diff --git a/datafusion/common/src/heap_size.rs b/datafusion/common/src/heap_size.rs
index 494ad35e1eeb4..802f9d3883222 100644
--- a/datafusion/common/src/heap_size.rs
+++ b/datafusion/common/src/heap_size.rs
@@ -410,24 +410,6 @@ impl DFHeapSize for UnionFields {
}
}
-impl DFHeapSize for UnionMode {
- fn heap_size(&self, _: &mut DFHeapSizeCtx) -> usize {
- 0 // no heap allocations
- }
-}
-
-impl DFHeapSize for TimeUnit {
- fn heap_size(&self, _: &mut DFHeapSizeCtx) -> usize {
- 0 // no heap allocations
- }
-}
-
-impl DFHeapSize for IntervalUnit {
- fn heap_size(&self, _: &mut DFHeapSizeCtx) -> usize {
- 0 // no heap allocations
- }
-}
-
impl DFHeapSize for Field {
fn heap_size(&self, ctx: &mut DFHeapSizeCtx) -> usize {
self.name().heap_size(ctx)
@@ -452,98 +434,40 @@ impl DFHeapSize for IntervalDayTime {
}
}
-impl DFHeapSize for DateTime {
- fn heap_size(&self, _: &mut DFHeapSizeCtx) -> usize {
- 0 // no heap allocations
- }
-}
-
-impl DFHeapSize for bool {
- fn heap_size(&self, _: &mut DFHeapSizeCtx) -> usize {
- 0 // no heap allocations
- }
-}
-impl DFHeapSize for u8 {
- fn heap_size(&self, _: &mut DFHeapSizeCtx) -> usize {
- 0 // no heap allocations
- }
-}
-
-impl DFHeapSize for u16 {
- fn heap_size(&self, _: &mut DFHeapSizeCtx) -> usize {
- 0 // no heap allocations
- }
-}
-
-impl DFHeapSize for u32 {
- fn heap_size(&self, _: &mut DFHeapSizeCtx) -> usize {
- 0 // no heap allocations
- }
-}
-
-impl DFHeapSize for u64 {
- fn heap_size(&self, _: &mut DFHeapSizeCtx) -> usize {
- 0 // no heap allocations
- }
-}
-
-impl DFHeapSize for i8 {
- fn heap_size(&self, _: &mut DFHeapSizeCtx) -> usize {
- 0 // no heap allocations
- }
-}
-
-impl DFHeapSize for i16 {
- fn heap_size(&self, _: &mut DFHeapSizeCtx) -> usize {
- 0 // no heap allocations
- }
-}
-
-impl DFHeapSize for i32 {
- fn heap_size(&self, _: &mut DFHeapSizeCtx) -> usize {
- 0 // no heap allocations
- }
-}
-impl DFHeapSize for i64 {
- fn heap_size(&self, _: &mut DFHeapSizeCtx) -> usize {
- 0 // no heap allocations
- }
-}
-
-impl DFHeapSize for i128 {
- fn heap_size(&self, _: &mut DFHeapSizeCtx) -> usize {
- 0 // no heap allocations
- }
-}
-
-impl DFHeapSize for i256 {
- fn heap_size(&self, _: &mut DFHeapSizeCtx) -> usize {
- 0 // no heap allocations
- }
-}
-
-impl DFHeapSize for f16 {
- fn heap_size(&self, _: &mut DFHeapSizeCtx) -> usize {
- 0 // no heap allocations
- }
-}
-
-impl DFHeapSize for f32 {
- fn heap_size(&self, _: &mut DFHeapSizeCtx) -> usize {
- 0 // no heap allocations
- }
-}
-impl DFHeapSize for f64 {
- fn heap_size(&self, _: &mut DFHeapSizeCtx) -> usize {
- 0 // no heap allocations
- }
-}
-
-impl DFHeapSize for usize {
- fn heap_size(&self, _: &mut DFHeapSizeCtx) -> usize {
- 0 // no heap allocations
- }
-}
+/// Implement [`DFHeapSize`] for types that own no heap allocations.
+macro_rules! impl_zero_heap_size {
+ ($($t:ty),+ $(,)?) => {
+ $(
+ impl DFHeapSize for $t {
+ fn heap_size(&self, _: &mut DFHeapSizeCtx) -> usize {
+ 0 // no heap allocations
+ }
+ }
+ )+
+ };
+}
+
+impl_zero_heap_size!(
+ bool,
+ u8,
+ u16,
+ u32,
+ u64,
+ usize,
+ i8,
+ i16,
+ i32,
+ i64,
+ i128,
+ i256,
+ f16,
+ f32,
+ f64,
+ UnionMode,
+ TimeUnit,
+ IntervalUnit,
+ DateTime,
+);
#[cfg(test)]
mod tests {
@@ -621,6 +545,20 @@ mod tests {
assert_eq!(size(&f16::from_f32(0.0)), 0);
}
+ #[test]
+ fn test_heap_size_union_mode() {
+ assert_eq!(size(&UnionMode::Sparse), 0);
+ assert_eq!(size(&UnionMode::Dense), 0);
+ }
+
+ #[test]
+ fn test_heap_size_time_units() {
+ assert_eq!(size(&TimeUnit::Second), 0);
+ assert_eq!(size(&IntervalUnit::YearMonth), 0);
+ assert_eq!(size(&DateTime::::UNIX_EPOCH), 0);
+ assert_eq!(size(&Utc::now()), 0);
+ }
+
#[test]
fn test_string() {
let mut s = String::with_capacity(32);
From e20763ce773c31ab67ec448e9d64d773a8df8435 Mon Sep 17 00:00:00 2001
From: Yongting You <2010youy01@gmail.com>
Date: Mon, 15 Jun 2026 17:29:56 +0800
Subject: [PATCH 13/39] refactor(hash-aggr): Migrate the partial aggregation
skip optimization to the new hash aggregation impl (#22899)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
## Which issue does this PR close?
Part of https://github.com/apache/datafusion/issues/22710
## Rationale for this change
See issue for the background, this PR forward ports below optimization
to the rewritten hash aggregation
- https://github.com/apache/datafusion/pull/11627
After this migration, the performance is back, so this PR also changes
the temporary configuration
`datafusion.execution.enable_migration_aggregate` default to `true` --
the new path will be used by default.
Local Clickbench_partitioned result (see `benchmarks/` for details), on
M4 Pro MacBook
```
--------------------
Benchmark clickbench_partitioned.json
--------------------
┏━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓
┃ Query ┃ main ┃ split-aggr-skip-partial ┃ Change ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩
│ QQuery 0 │ 0.68 ms │ 0.70 ms │ no change │
│ QQuery 1 │ 7.66 ms │ 7.48 ms │ no change │
│ QQuery 2 │ 25.79 ms │ 25.72 ms │ no change │
│ QQuery 3 │ 22.25 ms │ 22.10 ms │ no change │
│ QQuery 4 │ 182.82 ms │ 188.57 ms │ no change │
│ QQuery 5 │ 213.57 ms │ 212.69 ms │ no change │
│ QQuery 6 │ 0.66 ms │ 0.69 ms │ no change │
│ QQuery 7 │ 8.54 ms │ 8.49 ms │ no change │
│ QQuery 8 │ 245.27 ms │ 246.04 ms │ no change │
│ QQuery 9 │ 323.81 ms │ 323.68 ms │ no change │
│ QQuery 10 │ 48.95 ms │ 48.70 ms │ no change │
│ QQuery 11 │ 57.73 ms │ 57.05 ms │ no change │
│ QQuery 12 │ 211.82 ms │ 210.91 ms │ no change │
│ QQuery 13 │ 298.06 ms │ 302.46 ms │ no change │
│ QQuery 14 │ 219.03 ms │ 217.94 ms │ no change │
│ QQuery 15 │ 219.24 ms │ 216.60 ms │ no change │
│ QQuery 16 │ 485.78 ms │ 493.53 ms │ no change │
│ QQuery 17 │ 500.92 ms │ 487.31 ms │ no change │
│ QQuery 18 │ 1087.29 ms │ 1051.08 ms │ no change │
│ QQuery 19 │ 19.10 ms │ 19.45 ms │ no change │
│ QQuery 20 │ 453.62 ms │ 458.61 ms │ no change │
│ QQuery 21 │ 454.90 ms │ 459.08 ms │ no change │
│ QQuery 22 │ 829.91 ms │ 847.96 ms │ no change │
│ QQuery 23 │ 2561.67 ms │ 2619.03 ms │ no change │
│ QQuery 24 │ 31.76 ms │ 31.78 ms │ no change │
│ QQuery 25 │ 86.63 ms │ 89.67 ms │ no change │
│ QQuery 26 │ 31.37 ms │ 32.67 ms │ no change │
│ QQuery 27 │ 544.97 ms │ 553.90 ms │ no change │
│ QQuery 28 │ 1822.22 ms │ 1877.44 ms │ no change │
│ QQuery 29 │ 27.76 ms │ 29.00 ms │ no change │
│ QQuery 30 │ 211.00 ms │ 217.48 ms │ no change │
│ QQuery 31 │ 206.02 ms │ 211.34 ms │ no change │
│ QQuery 32 │ 676.20 ms │ 724.32 ms │ 1.07x slower │
│ QQuery 33 │ 1144.96 ms │ 1161.21 ms │ no change │
│ QQuery 34 │ 1141.98 ms │ 1147.83 ms │ no change │
│ QQuery 35 │ 209.49 ms │ 217.06 ms │ no change │
│ QQuery 36 │ 44.38 ms │ 44.10 ms │ no change │
│ QQuery 37 │ 24.15 ms │ 24.57 ms │ no change │
│ QQuery 38 │ 29.67 ms │ 30.00 ms │ no change │
│ QQuery 39 │ 87.68 ms │ 88.80 ms │ no change │
│ QQuery 40 │ 8.57 ms │ 8.95 ms │ no change │
│ QQuery 41 │ 8.62 ms │ 8.38 ms │ no change │
│ QQuery 42 │ 7.42 ms │ 7.20 ms │ no change │
└───────────┴────────────┴─────────────────────────┴──────────────┘
```
## What changes are included in this PR?
This PR is easier to read commit-by-commit.
1. Cleanup the state machine in hash aggregation with typestate pattern
2. Move common util for partial hash aggregation skip from
`aggregates/row_hash.rs` -> `aggregates/utils.rs`
3. Implement the same optimization to the migrated aggregation
4. Set configuration `enable_migration_aggregate` default to true
## Are these changes tested?
Existing tests + new UT
## Are there any user-facing changes?
No
---
datafusion/common/src/config.rs | 2 +-
.../src/aggregates/group_values/metrics.rs | 1 +
.../src/aggregates/hash_aggregate.rs | 874 +++++++++++++++---
.../src/aggregates/hash_table.rs | 109 ++-
.../physical-plan/src/aggregates/mod.rs | 184 ++++
.../physical-plan/src/aggregates/row_hash.rs | 263 +-----
.../src/aggregates/skip_partial.rs | 303 ++++++
.../test_files/information_schema.slt | 4 +-
docs/source/user-guide/configs.md | 2 +-
uv.lock | 28 +-
10 files changed, 1326 insertions(+), 444 deletions(-)
create mode 100644 datafusion/physical-plan/src/aggregates/skip_partial.rs
diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index 0c26bd0841883..07196d009c54c 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -722,7 +722,7 @@ config_namespace! {
/// will be removed after the migration is finished.
///
/// See for details.
- pub enable_migration_aggregate: bool, default = false
+ pub enable_migration_aggregate: bool, default = true
/// Sets the compression codec used when spilling data to disk.
///
diff --git a/datafusion/physical-plan/src/aggregates/group_values/metrics.rs b/datafusion/physical-plan/src/aggregates/group_values/metrics.rs
index a0934b976ea79..1c6285d793b88 100644
--- a/datafusion/physical-plan/src/aggregates/group_values/metrics.rs
+++ b/datafusion/physical-plan/src/aggregates/group_values/metrics.rs
@@ -19,6 +19,7 @@
use crate::metrics::{ExecutionPlanMetricsSet, MetricBuilder, Time};
+#[derive(Clone)]
pub(crate) struct GroupByMetrics {
/// Time spent calculating the group IDs from the evaluated grouping columns.
pub(crate) time_calculating_group_ids: Time,
diff --git a/datafusion/physical-plan/src/aggregates/hash_aggregate.rs b/datafusion/physical-plan/src/aggregates/hash_aggregate.rs
index 0c8593efd05bb..29d292b215d16 100644
--- a/datafusion/physical-plan/src/aggregates/hash_aggregate.rs
+++ b/datafusion/physical-plan/src/aggregates/hash_aggregate.rs
@@ -25,6 +25,7 @@
//!
//! See issue for details:
+use std::ops::ControlFlow;
use std::sync::Arc;
use std::task::{Context, Poll};
@@ -36,17 +37,20 @@ use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation};
use futures::stream::{Stream, StreamExt};
use super::AggregateExec;
-use super::hash_table::{AggregateHashTable, Final, Partial};
-use crate::metrics::{BaselineMetrics, MetricBuilder, RecordOutput, SpillMetrics};
+use super::hash_table::{AggregateHashTable, Final, Partial, PartialSkip};
+use super::skip_partial::SkipAggregationProbe;
+use crate::metrics::{
+ BaselineMetrics, MetricBuilder, MetricCategory, RecordOutput, SpillMetrics,
+};
use crate::stream::EmptyRecordBatchStream;
use crate::{InputOrderMode, RecordBatchStream, SendableRecordBatchStream, metrics};
-/// Hash aggregation uses a 2-stage (partial and final) hash aggregation, this stream
-/// is for the partial stage.
+/// Hash aggregation is implemented in two stages: partial and final. This
+/// stream implements the partial stage.
///
/// # Example
///
-/// select k, avg(v) from t group by k;
+/// SELECT k, AVG(v) FROM t GROUP BY k;
///
/// ## Plan
/// AggregateExec(stage=final)
@@ -55,15 +59,18 @@ use crate::{InputOrderMode, RecordBatchStream, SendableRecordBatchStream, metric
///
/// ## Partial Stage Behavior
/// Input: raw rows
-/// Output: partial states for all groups (e.g. for avg(x), it's sum(x), count(x))
+/// Output: partial states for all groups (for example, `AVG(x)` emits `SUM(x)`
+/// and `COUNT(x)`)
///
/// ## Final Stage Behavior
/// Input: partial states
-/// Output: results for all groups (e.g. for avg(x), it's avg(x) calculated from the state)
+/// Output: results for all groups (for example, `AVG(x)` calculated from the
+/// state)
///
/// # Optimization: DISTINCT LIMIT Soft Limit
///
-/// This optimization applies to both [`PartialHashAggregateStream`] and [`FinalHashAggregateStream`]
+/// This optimization applies to both [`PartialHashAggregateStream`] and
+/// [`FinalHashAggregateStream`].
///
/// Unordered distinct queries such as:
///
@@ -87,6 +94,17 @@ use crate::{InputOrderMode, RecordBatchStream, SendableRecordBatchStream, metric
/// This operator does not guarantee an exact limit because a single batch can
/// cross the threshold. The downstream limit operator enforces the exact result
/// size.
+///
+/// # Optimization: Partial Aggregation Skip
+///
+/// Partial aggregation can be counterproductive for high-cardinality inputs,
+/// where most rows create distinct groups. The stream probes the ratio of
+/// accumulated groups to input rows while it is still aggregating. If the ratio
+/// crosses the configured threshold and all aggregate accumulators can convert
+/// raw inputs directly to partial state, the stream emits any already
+/// accumulated groups, then switches to a skip state. In that state, each
+/// remaining input batch is converted directly to partial aggregate state rows
+/// without inserting the rows into the grouped hash table.
pub(crate) struct PartialHashAggregateStream {
/// Output schema: group columns followed by partial aggregate state columns.
schema: SchemaRef,
@@ -94,9 +112,6 @@ pub(crate) struct PartialHashAggregateStream {
/// Input batches containing raw rows, not partial aggregate state.
input: SendableRecordBatchStream,
- /// Hash table state for this aggregate stream.
- hash_table: AggregateHashTable,
-
/// Memory reservation for group keys and accumulators.
reservation: MemoryReservation,
@@ -106,15 +121,69 @@ pub(crate) struct PartialHashAggregateStream {
/// Tracks partial aggregation row reduction, matching `GroupedHashAggregateStream`.
reduction_factor: metrics::RatioMetrics,
+ /// Tracks whether partial aggregation should switch to direct state conversion.
+ skip_aggregation_probe: Option,
+
/// Optional soft limit on the number of groups to accumulate before output.
///
/// Invariant: when this is `Some(..)`, the accumulators inside `hash_table` must
/// be empty. See struct comments for details.
group_values_soft_limit: Option,
+
+ /// Tracks the high-level stream lifecycle. The hash table owns the lower-level
+ /// state for materializing and slicing output batches.
+ state: Option,
+}
+
+/// States for partial hash aggregation processing.
+enum PartialHashAggregateState {
+ ReadingInput {
+ hash_table: AggregateHashTable,
+ },
+ ProducingOutput {
+ hash_table: AggregateHashTable,
+ /// If `None`, partial skip was never triggered and this state will
+ /// finish in `Done`. If `Some`, partial skip has triggered and the
+ /// stream will move to `SkippingAggregation` after these accumulated
+ /// groups are emitted.
+ skip_hash_table: Option>,
+ },
+ SkippingAggregation {
+ hash_table: AggregateHashTable,
+ },
+ Done,
+}
+
+type PartialHashAggregatePoll = Poll
>> {
let mut source = self.clone();
- source.projection = self.projection.try_merge(projection)?;
+
+ // If there's no reference to `FileRowIndexFunc` in the projection, we can just merge
+ // both projections as-is, there's no need to modify the projection first.
+ if !projection.iter().any(|projection_expr| {
+ expr_references_scalar_udf::(&projection_expr.expr)
+ }) {
+ source.projection = self.projection.try_merge(projection)?;
+ return Ok(Some(Arc::new(source)));
+ }
+
+ // If we can find a reference to `FileRowIndexFunc`, we add it as a virtual column
+ // or re-use an existing one in the table's schema.
+ let (table_schema, row_index_col) =
+ table_schema_with_row_index_col(self.table_schema());
+
+ source.table_schema = table_schema;
+ source.projection = rewrite_file_row_index_projection(
+ &self.projection,
+ projection,
+ &row_index_col,
+ )?;
+
Ok(Some(Arc::new(source)))
}
@@ -952,15 +982,12 @@ impl FileSource for ParquetSource {
reversed_eq_properties.ordering_satisfy(order.iter().cloned())?;
let sort_order = LexOrdering::new(order.iter().cloned());
let column_in_file_schema = sort_order.as_ref().is_some_and(|s| {
- s.first()
- .expr
- .downcast_ref::()
- .is_some_and(|col| {
- self.table_schema
- .file_schema()
- .field_with_name(col.name())
- .is_ok()
- })
+ s.first().expr.downcast_ref::().is_some_and(|col| {
+ self.table_schema
+ .file_schema()
+ .field_with_name(col.name())
+ .is_ok()
+ })
});
if !column_in_file_schema && !reversed_satisfies {
@@ -989,6 +1016,69 @@ impl FileSource for ParquetSource {
}
}
+/// Returns the a [`TableSchema`] containing a [`RowNumber`] virtual column and a [`Column`] expression referencing its row index column.
+/// The expression is then merged into a projection.
+///
+/// - If the schema already has a virtual column with the [`RowNumber`] type, it returns the schema unchanged.
+/// - If the schema doesn't have the appropriate virtual column, it returns a modified schema with the virtual column appended to it.
+fn table_schema_with_row_index_col(table_schema: &TableSchema) -> (TableSchema, Column) {
+ // If we can find a virtual column with the `RowNumber` type, we just return the schema
+ // and create the appropriate `column` we're going to use
+ if let Some((idx, field)) =
+ table_schema
+ .virtual_columns()
+ .iter()
+ .enumerate()
+ .find(|(_, field)| {
+ field
+ .extension_type_name()
+ .is_some_and(|name| name == RowNumber::NAME)
+ })
+ {
+ let virtual_offset = table_schema.file_schema().fields().len()
+ + table_schema.table_partition_cols().len();
+
+ return (
+ table_schema.clone(),
+ Column::new(field.name(), virtual_offset + idx),
+ );
+ }
+
+ // The hidden field is shared across all files in this scan, but it must
+ // have a unique table-schema name because later rewrites resolve it by
+ // column name and index.
+ let base_row_index_name = "__datafusion_file_row_index";
+ let mut row_index_name = base_row_index_name.to_string();
+ let mut suffix = 0;
+ while table_schema
+ .table_schema()
+ .field_with_name(&row_index_name)
+ .is_ok()
+ {
+ suffix += 1;
+ row_index_name = format!("{base_row_index_name}_{suffix}");
+ }
+
+ let row_index_table_idx = table_schema.table_schema().fields().len();
+ let row_index_field = Arc::new(
+ Field::new(&row_index_name, DataType::Int64, true).with_extension_type(RowNumber),
+ );
+ (
+ TableSchema::builder(Arc::clone(table_schema.file_schema()))
+ .with_table_partition_cols(table_schema.table_partition_cols().clone())
+ .with_virtual_columns(
+ table_schema
+ .virtual_columns()
+ .iter()
+ .cloned()
+ .chain([row_index_field])
+ .collect::(),
+ )
+ .build(),
+ Column::new(&row_index_name, row_index_table_idx),
+ )
+}
+
#[cfg(test)]
mod tests {
use super::*;
@@ -1622,7 +1712,9 @@ mod tests {
use datafusion_common::config::ConfigOptions;
use datafusion_datasource::TableSchema;
use datafusion_expr::{col, lit as logical_lit};
+ use datafusion_functions::core::expr_fn::file_row_index;
use datafusion_physical_expr::planner::logical2physical;
+ use datafusion_physical_expr_adapter::rewrite_file_row_index_expr;
use datafusion_physical_plan::filter_pushdown::PushedDown;
use parquet::arrow::RowNumber;
@@ -1652,13 +1744,20 @@ mod tests {
.or(col("value").eq(logical_lit(4i64))),
full_schema,
);
+ let (_, row_index_col) = table_schema_with_row_index_col(source.table_schema());
+ let row_index = rewrite_file_row_index_expr(
+ logical2physical(&file_row_index().gt(logical_lit(2i64)), full_schema),
+ row_index_col.name(),
+ row_index_col.index(),
+ )
+ .expect("file_row_index should rewrite to the row_number virtual column");
let config = ConfigOptions::default();
let prop = source
- .try_pushdown_filters(vec![pushable, virtual_only, mixed], &config)
+ .try_pushdown_filters(vec![pushable, virtual_only, mixed, row_index], &config)
.expect("try_pushdown_filters must not error");
- assert_eq!(prop.filters.len(), 3);
+ assert_eq!(prop.filters.len(), 4);
assert!(
matches!(prop.filters[0], PushedDown::Yes),
"file-column filter should be pushable"
@@ -1672,5 +1771,10 @@ mod tests {
"filter mixing a virtual column with a file column must not be \
pushed down (row filter would silently drop it)"
);
+ assert!(
+ matches!(prop.filters[3], PushedDown::No),
+ "file_row_index() rewrites to a virtual column and must not be \
+ pushed down"
+ );
}
}
diff --git a/datafusion/functions/src/core/file_row_index.rs b/datafusion/functions/src/core/file_row_index.rs
new file mode 100644
index 0000000000000..7b2667a8b8768
--- /dev/null
+++ b/datafusion/functions/src/core/file_row_index.rs
@@ -0,0 +1,96 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Implementation of the `file_row_index` scalar function.
+
+use arrow::datatypes::DataType;
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, exec_err};
+use datafusion_doc::Documentation;
+use datafusion_expr::{
+ ColumnarValue, ExpressionPlacement, ScalarFunctionArgs, ScalarUDFImpl, Signature,
+ Volatility,
+};
+use datafusion_macros::user_doc;
+
+/// Scalar UDF implementation for `file_row_index()`.
+///
+/// File sources that can expose per-file row indexes rewrite this placeholder
+/// function into a source-provided physical expression. Direct evaluation
+/// returns an error because there is no file context outside a scan.
+#[user_doc(
+ doc_section(label = "Other Functions"),
+ description = r#"Returns the zero-based row offset within the source file
+that produced the current row.
+
+The value is scoped to one file, so rows from different files in the same scan
+can have the same row index. This function is intended to be rewritten at
+file-scan time. If the input file is not known (for example, if this function
+is evaluated outside a file scan, or was not pushed down into one), direct
+evaluation returns an error.
+"#,
+ syntax_example = "file_row_index()",
+ sql_example = r#"```sql
+SELECT file_row_index() FROM t;
+```"#
+)]
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct FileRowIndexFunc {
+ signature: Signature,
+}
+
+impl Default for FileRowIndexFunc {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl FileRowIndexFunc {
+ pub fn new() -> Self {
+ Self {
+ signature: Signature::nullary(Volatility::Volatile),
+ }
+ }
+}
+
+impl ScalarUDFImpl for FileRowIndexFunc {
+ fn name(&self) -> &str {
+ "file_row_index"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, args: &[DataType]) -> Result {
+ let [] = take_function_args(self.name(), args)?;
+ Ok(DataType::Int64)
+ }
+
+ fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result {
+ let [] = take_function_args(self.name(), args.args)?;
+ exec_err!("file_row_index() is source dependent and cannot be evaluated directly")
+ }
+
+ fn placement(&self, _args: &[ExpressionPlacement]) -> ExpressionPlacement {
+ ExpressionPlacement::MoveTowardsLeafNodes
+ }
+
+ fn documentation(&self) -> Option<&Documentation> {
+ self.doc()
+ }
+}
diff --git a/datafusion/functions/src/core/mod.rs b/datafusion/functions/src/core/mod.rs
index 5657f9d88810c..4665eca99ebef 100644
--- a/datafusion/functions/src/core/mod.rs
+++ b/datafusion/functions/src/core/mod.rs
@@ -28,6 +28,7 @@ pub mod arrowtypeof;
pub mod cast_to_type;
pub mod coalesce;
pub mod expr_ext;
+pub mod file_row_index;
pub mod getfield;
pub mod greatest;
mod greatest_least_utils;
@@ -67,6 +68,7 @@ make_udf_function!(version::VersionFunc, version);
make_udf_function!(arrow_metadata::ArrowMetadataFunc, arrow_metadata);
make_udf_function!(with_metadata::WithMetadataFunc, with_metadata);
make_udf_function!(arrow_field::ArrowFieldFunc, arrow_field);
+make_udf_function!(file_row_index::FileRowIndexFunc, file_row_index);
pub mod expr_fn {
use datafusion_expr::{Expr, Literal};
@@ -143,6 +145,9 @@ pub mod expr_fn {
union_tag,
"Returns the name of the currently selected field in the union",
arg1
+ ),(
+ file_row_index,
+ "Returns the offset of the row within its source file",
));
#[doc = "Returns the value of the field with the given name from the struct"]
@@ -196,5 +201,6 @@ pub fn functions() -> Vec> {
union_tag(),
version(),
r#struct(),
+ file_row_index(),
]
}
diff --git a/datafusion/physical-expr-adapter/src/lib.rs b/datafusion/physical-expr-adapter/src/lib.rs
index ea4db19ee110e..fa14bc8b4d150 100644
--- a/datafusion/physical-expr-adapter/src/lib.rs
+++ b/datafusion/physical-expr-adapter/src/lib.rs
@@ -29,5 +29,6 @@ pub mod schema_rewriter;
pub use schema_rewriter::{
BatchAdapter, BatchAdapterFactory, DefaultPhysicalExprAdapter,
DefaultPhysicalExprAdapterFactory, PhysicalExprAdapter, PhysicalExprAdapterFactory,
- replace_columns_with_literals,
+ expr_references_scalar_udf, replace_columns_with_literals,
+ rewrite_file_row_index_expr, rewrite_file_row_index_projection,
};
diff --git a/datafusion/physical-expr-adapter/src/schema_rewriter.rs b/datafusion/physical-expr-adapter/src/schema_rewriter.rs
index 56502ab8731a7..f287caf32ecda 100644
--- a/datafusion/physical-expr-adapter/src/schema_rewriter.rs
+++ b/datafusion/physical-expr-adapter/src/schema_rewriter.rs
@@ -25,16 +25,19 @@ use std::hash::Hash;
use std::sync::Arc;
use arrow::array::RecordBatch;
-use arrow::datatypes::{DataType, FieldRef, SchemaRef};
+use arrow::datatypes::{DataType, Field, FieldRef, SchemaRef};
use datafusion_common::{
DataFusionError, Result, ScalarValue, exec_err,
metadata::FieldMetadata,
nested_struct::validate_data_type_compatibility,
- tree_node::{Transformed, TransformedResult, TreeNode},
+ tree_node::{Transformed, TransformedResult, TreeNode, TreeNodeRecursion},
+};
+use datafusion_expr::ScalarUDFImpl;
+use datafusion_functions::core::{
+ file_row_index::FileRowIndexFunc, getfield::GetFieldFunc,
};
-use datafusion_functions::core::getfield::GetFieldFunc;
use datafusion_physical_expr::PhysicalExprSimplifier;
-use datafusion_physical_expr::projection::{ProjectionExprs, Projector};
+use datafusion_physical_expr::projection::{ProjectionExpr, ProjectionExprs, Projector};
use datafusion_physical_expr::{
ScalarFunctionExpr,
expressions::{self, CastExpr, Column},
@@ -81,6 +84,114 @@ where
.data()
}
+/// Return true if `expr` references scalar UDF `T`.
+///
+/// This matches the concrete [`ScalarUDFImpl`] type rather than the function
+/// name, so unrelated UDFs with the same name are not treated as matches.
+pub fn expr_references_scalar_udf(
+ expr: &Arc,
+) -> bool {
+ let mut found = false;
+
+ expr.apply(|node| {
+ if ScalarFunctionExpr::try_downcast_func::(node.as_ref()).is_some() {
+ found = true;
+ return Ok(TreeNodeRecursion::Stop);
+ }
+ Ok(TreeNodeRecursion::Continue)
+ })
+ .expect("Infallible traversal of PhysicalExpr tree failed");
+
+ found
+}
+
+/// Rewrite occurrences of scalar UDF `T` in `expr` using `replacement`.
+///
+/// The rewrite matches the concrete [`ScalarUDFImpl`] type rather than the
+/// function name. `replacement` is called with each matching
+/// [`ScalarFunctionExpr`] after its children have been rewritten.
+fn rewrite_scalar_udf(
+ expr: Arc,
+ mut replacement: F,
+) -> Result>
+where
+ T: ScalarUDFImpl,
+ F: FnMut(&ScalarFunctionExpr) -> Result>,
+{
+ expr.transform_up(|node| {
+ if let Some(scalar_fn) = ScalarFunctionExpr::try_downcast_func::(node.as_ref())
+ {
+ Ok(Transformed::yes(replacement(scalar_fn)?))
+ } else {
+ Ok(Transformed::no(node))
+ }
+ })
+ .map(|transformed| transformed.data)
+}
+
+/// Rewrite `file_row_index()` in `expr` to read from a source-provided
+/// row-index column.
+///
+/// `row_index_idx` is the index of `row_index_name` in the schema that the
+/// rewritten expression will be evaluated against. The rewrite uses ordinary
+/// physical expressions: a [`Column`] that reads the source row-index values
+/// wrapped in a [`CastExpr`] that exposes the public `file_row_index: Int64`
+/// return field without source-specific extension metadata.
+pub fn rewrite_file_row_index_expr(
+ expr: Arc,
+ row_index_name: &str,
+ row_index_idx: usize,
+) -> Result> {
+ rewrite_scalar_udf::(expr, |_| {
+ let source = Arc::new(Column::new(row_index_name, row_index_idx));
+ let target_field = Arc::new(Field::new("file_row_index", DataType::Int64, true));
+ Ok(Arc::new(CastExpr::new_with_target_field(
+ source,
+ target_field,
+ None,
+ )))
+ })
+}
+
+/// Rewrite `file_row_index()` in a pushed projection to read from a
+/// source-provided row-index column.
+///
+///
+/// For example if `row_index_column` is `__datafusion_row_idx` this function rewrites all
+/// instances of `file_row_index()` to `__datafusion_row_index` column references.
+///
+/// `base_projection` is the current projection already pushed into a source.
+/// The row-index source column is appended to that base projection if it is not
+/// already present. `projection` is rewritten to read from the projected
+/// row-index column and then merged on top of the extended base projection.
+pub fn rewrite_file_row_index_projection(
+ base_projection: &ProjectionExprs,
+ projection: &ProjectionExprs,
+ row_index_col: &Column,
+) -> Result {
+ let mut base_exprs = base_projection.as_ref().to_vec();
+ let row_index_projection_idx =
+ base_projection.projected_column_position(row_index_col);
+
+ // If the column doesn't exist in the projection yet
+ if row_index_projection_idx.is_none() {
+ base_exprs.push(ProjectionExpr {
+ expr: Arc::new(row_index_col.clone()),
+ alias: row_index_col.name().to_owned(),
+ });
+ }
+
+ let rewritten_projection = projection.clone().try_map_exprs(|expr| {
+ rewrite_file_row_index_expr(
+ expr,
+ row_index_col.name(),
+ row_index_projection_idx.unwrap_or(base_exprs.len() - 1),
+ )
+ })?;
+
+ ProjectionExprs::new(base_exprs).try_merge(&rewritten_projection)
+}
+
/// Trait for adapting [`PhysicalExpr`] expressions to match a target schema.
///
/// This is used in file scans to rewrite expressions so that they can be
@@ -631,8 +742,8 @@ mod tests {
RecordBatchOptions, StringArray, StringViewArray, StructArray,
};
use arrow::datatypes::{Field, Fields, Schema};
- use datafusion_common::{assert_contains, record_batch};
- use datafusion_expr::Operator;
+ use datafusion_common::{assert_contains, config::ConfigOptions, record_batch};
+ use datafusion_expr::{Operator, ScalarUDF};
use datafusion_physical_expr::expressions::{Column, Literal, col};
fn assert_cast_expr(expr: &Arc) -> &CastExpr {
@@ -648,6 +759,88 @@ mod tests {
assert_eq!(inner_col.index(), index);
}
+ fn file_row_index_expr() -> Arc {
+ Arc::new(ScalarFunctionExpr::new(
+ "file_row_index",
+ Arc::new(ScalarUDF::from(FileRowIndexFunc::new())),
+ vec![],
+ Arc::new(Field::new("file_row_index", DataType::Int64, true)),
+ Arc::new(ConfigOptions::default()),
+ ))
+ }
+
+ #[test]
+ fn test_rewrite_scalar_udf_replaces_nested_typed_udf() -> Result<()> {
+ let expr = Arc::new(expressions::BinaryExpr::new(
+ file_row_index_expr(),
+ Operator::Plus,
+ expressions::lit(ScalarValue::Int64(Some(1))),
+ )) as Arc;
+
+ let rewritten = rewrite_scalar_udf::(expr, |_| {
+ Ok(expressions::lit(ScalarValue::Int64(Some(7))))
+ })?;
+
+ let binary = rewritten
+ .downcast_ref::()
+ .expect("rewritten expression should remain binary");
+ assert_eq!(binary.op(), &Operator::Plus);
+
+ let left = binary
+ .left()
+ .downcast_ref::()
+ .expect("left side should be rewritten to a literal");
+ assert_eq!(left.value(), &ScalarValue::Int64(Some(7)));
+
+ let right = binary
+ .right()
+ .downcast_ref::()
+ .expect("right side should remain the original literal");
+ assert_eq!(right.value(), &ScalarValue::Int64(Some(1)));
+ Ok(())
+ }
+
+ #[test]
+ fn test_rewrite_file_row_index_expr_to_source_column() -> Result<()> {
+ let expr = rewrite_file_row_index_expr(
+ file_row_index_expr(),
+ "__datafusion_file_row_index",
+ 2,
+ )?;
+
+ let cast_expr = expr
+ .downcast_ref::()
+ .expect("file row index expression should be a cast");
+ assert_eq!(cast_expr.cast_type(), &DataType::Int64);
+ let target_field = cast_expr.target_field();
+ assert_eq!(target_field.name(), "file_row_index");
+ assert_eq!(target_field.data_type(), &DataType::Int64);
+ assert!(target_field.is_nullable());
+ assert!(target_field.metadata().is_empty());
+
+ let source = cast_expr
+ .expr()
+ .downcast_ref::()
+ .expect("source column");
+ assert_eq!(source.name(), "__datafusion_file_row_index");
+ assert_eq!(source.index(), 2);
+
+ let input_schema = Schema::new(vec![
+ Field::new("value", DataType::Int64, true),
+ Field::new("__datafusion_file_row_index", DataType::Int64, false)
+ .with_metadata(HashMap::from([(
+ "source".to_string(),
+ "virtual".to_string(),
+ )])),
+ ]);
+ let return_field = expr.return_field(&input_schema)?;
+ assert_eq!(return_field.name(), "file_row_index");
+ assert_eq!(return_field.data_type(), &DataType::Int64);
+ assert!(return_field.is_nullable());
+ assert!(return_field.metadata().is_empty());
+ Ok(())
+ }
+
fn stale_index_cast_schemas() -> (SchemaRef, SchemaRef) {
let physical_schema = Arc::new(Schema::new(vec![
Field::new("b", DataType::Binary, true),
diff --git a/datafusion/physical-expr/src/projection.rs b/datafusion/physical-expr/src/projection.rs
index cee95685e8440..1f6a6eb08fb78 100644
--- a/datafusion/physical-expr/src/projection.rs
+++ b/datafusion/physical-expr/src/projection.rs
@@ -661,7 +661,7 @@ impl ProjectionExprs {
for proj_expr in self.exprs.iter() {
let expr = &proj_expr.expr;
let col_stats = if let Some(col) = expr.downcast_ref::() {
- stats.column_statistics[col.index()].clone()
+ column_statistics_at(&stats.column_statistics, col.index())
} else if let Some(literal) = expr.downcast_ref::() {
// Handle literal expressions (constants) by calculating proper statistics
let data_type = expr.data_type(output_schema)?;
@@ -725,6 +725,60 @@ impl ProjectionExprs {
stats.column_statistics = column_statistics;
Ok(stats)
}
+
+ /// Returns the output position of `column` if this projection contains it.
+ ///
+ /// This only matches projection expressions that are exactly [`Column`] expressions.
+ /// Computed expressions, even if they reference `column`, do not match. The
+ /// comparison uses [`Column`] equality, so both the name and index must match.
+ /// If the same column appears more than once, this returns the first matching
+ /// position.
+ ///
+ /// # Example
+ ///
+ /// ```rust
+ /// use datafusion_common::ScalarValue;
+ /// use datafusion_physical_expr::expressions::{Column, Literal};
+ /// use datafusion_physical_expr::projection::{ProjectionExpr, ProjectionExprs};
+ /// use std::sync::Arc;
+ ///
+ /// let projection = ProjectionExprs::new([
+ /// ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"),
+ /// ProjectionExpr::new(
+ /// Arc::new(Literal::new(ScalarValue::Int32(Some(42)))),
+ /// "answer",
+ /// ),
+ /// ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a"),
+ /// ]);
+ ///
+ /// assert_eq!(
+ /// projection.projected_column_position(&Column::new("b", 1)),
+ /// Some(0)
+ /// );
+ /// assert_eq!(
+ /// projection.projected_column_position(&Column::new("a", 0)),
+ /// Some(2)
+ /// );
+ ///
+ /// // The literal projection is not a Column expression.
+ /// assert_eq!(
+ /// projection.projected_column_position(&Column::new("answer", 1)),
+ /// None
+ /// );
+ ///
+ /// // Columns not present in the projection also return None.
+ /// assert_eq!(
+ /// projection.projected_column_position(&Column::new("c", 2)),
+ /// None
+ /// );
+ /// ```
+ pub fn projected_column_position(&self, column: &Column) -> Option {
+ self.iter().position(|expr| {
+ expr.expr
+ .downcast_ref::()
+ .is_some_and(|projected| projected == column)
+ })
+ }
}
/// Propagate column statistics through CAST projections. Other expressions
@@ -736,7 +790,7 @@ fn project_column_statistics_through_expr(
column_stats: &[ColumnStatistics],
) -> ColumnStatistics {
if let Some(col) = expr.downcast_ref::() {
- return column_stats[col.index()].clone();
+ return column_statistics_at(column_stats, col.index());
}
let Some(cast_expr) = expr.downcast_ref::() else {
return ColumnStatistics::new_unknown();
@@ -760,6 +814,16 @@ fn project_column_statistics_through_expr(
}
}
+fn column_statistics_at(
+ column_stats: &[ColumnStatistics],
+ index: usize,
+) -> ColumnStatistics {
+ column_stats
+ .get(index)
+ .cloned()
+ .unwrap_or_else(ColumnStatistics::new_unknown)
+}
+
impl<'a> IntoIterator for &'a ProjectionExprs {
type Item = &'a ProjectionExpr;
type IntoIter = std::slice::Iter<'a, ProjectionExpr>;
@@ -2202,6 +2266,43 @@ pub(crate) mod tests {
Schema::new(vec![field_0, field_1, field_2])
}
+ #[test]
+ fn test_projected_column_position_returns_output_position() {
+ let projection = ProjectionExprs::new([
+ ProjectionExpr::new(Arc::new(Column::new("col2", 2)), "col2"),
+ ProjectionExpr::new(Arc::new(Column::new("col0", 0)), "col0"),
+ ]);
+
+ assert_eq!(
+ projection.projected_column_position(&Column::new("col2", 2)),
+ Some(0)
+ );
+ assert_eq!(
+ projection.projected_column_position(&Column::new("col0", 0)),
+ Some(1)
+ );
+ }
+
+ #[test]
+ fn test_projected_column_position_returns_none_for_non_column_or_missing() {
+ let projection = ProjectionExprs::new([
+ ProjectionExpr::new(
+ Arc::new(Literal::new(ScalarValue::Int64(Some(42)))),
+ "col1",
+ ),
+ ProjectionExpr::new(Arc::new(Column::new("col0", 0)), "col0"),
+ ]);
+
+ assert_eq!(
+ projection.projected_column_position(&Column::new("col1", 1)),
+ None
+ );
+ assert_eq!(
+ projection.projected_column_position(&Column::new("col2", 2)),
+ None
+ );
+ }
+
#[test]
fn test_stats_projection_columns_only() {
let source = get_stats();
@@ -2913,6 +3014,57 @@ pub(crate) mod tests {
byte_size: Precision::Absent,
}
);
+
+ Ok(())
+ }
+
+ #[test]
+ fn test_project_statistics_missing_column_stats_are_unknown() -> Result<()> {
+ let mut input_stats = get_stats();
+ let input_schema = get_schema();
+ input_stats.column_statistics.truncate(2);
+
+ // The schema has col2, but the statistics do not. This can happen for
+ // source-provided virtual columns that are available at execution time
+ // but not represented in file-level statistics.
+ let projection = ProjectionExprs::new(vec![
+ ProjectionExpr {
+ expr: Arc::new(Column::new("col2", 2)),
+ alias: "virtual_col".to_string(),
+ },
+ ProjectionExpr {
+ expr: Arc::new(CastExpr::new(
+ Arc::new(Column::new("col2", 2)),
+ DataType::Float64,
+ None,
+ )),
+ alias: "casted_virtual_col".to_string(),
+ },
+ ProjectionExpr {
+ expr: Arc::new(Column::new("col0", 0)),
+ alias: "physical_col".to_string(),
+ },
+ ]);
+
+ let output_stats = projection.project_statistics(
+ input_stats,
+ &projection.project_schema(&input_schema)?,
+ )?;
+
+ assert_eq!(output_stats.column_statistics.len(), 3);
+ assert_eq!(
+ output_stats.column_statistics[0],
+ ColumnStatistics::new_unknown()
+ );
+ assert_eq!(
+ output_stats.column_statistics[1],
+ ColumnStatistics::new_unknown()
+ );
+ assert_eq!(
+ output_stats.column_statistics[2].max_value,
+ Precision::Exact(ScalarValue::Int64(Some(21)))
+ );
+
Ok(())
}
diff --git a/datafusion/sqllogictest/test_files/file_row_index.slt b/datafusion/sqllogictest/test_files/file_row_index.slt
new file mode 100644
index 0000000000000..38822bebfdfd3
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/file_row_index.slt
@@ -0,0 +1,171 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+statement ok
+COPY (VALUES (10), (20), (30), (40), (50))
+TO 'test_files/scratch/file_row_index/parquet_table/data.parquet'
+STORED AS PARQUET;
+
+statement ok
+CREATE EXTERNAL TABLE parquet_table(column1 int)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/file_row_index/parquet_table/';
+
+query TT
+EXPLAIN SELECT file_row_index(), column1 FROM parquet_table
+----
+logical_plan
+01)Projection: file_row_index(), parquet_table.column1
+02)--TableScan: parquet_table projection=[column1]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/file_row_index/parquet_table/data.parquet]]}, projection=[CAST(__datafusion_file_row_index@1 AS Int64) as file_row_index(), column1], file_type=parquet
+
+
+query II
+SELECT file_row_index(), column1 FROM parquet_table ORDER BY column1
+----
+0 10
+1 20
+2 30
+3 40
+4 50
+
+query III
+SELECT file_row_index(), file_row_index() + 1, column1
+FROM parquet_table
+ORDER BY column1
+----
+0 1 10
+1 2 20
+2 3 30
+3 4 40
+4 5 50
+
+
+query II
+SELECT file_row_index(), column1
+FROM parquet_table
+WHERE file_row_index() > 2
+ORDER BY column1
+----
+3 40
+4 50
+
+# Filter on file_row_index without having it in projection
+
+query TT
+EXPLAIN SELECT column1 FROM parquet_table WHERE file_row_index() > 2 ORDER BY column1
+----
+logical_plan
+01)Sort: parquet_table.column1 ASC NULLS LAST
+02)--Projection: parquet_table.column1
+03)----Filter: __datafusion_extracted_1 > Int64(2)
+04)------Projection: file_row_index() AS __datafusion_extracted_1, parquet_table.column1
+05)--------TableScan: parquet_table projection=[column1]
+physical_plan
+01)SortExec: expr=[column1@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--FilterExec: __datafusion_extracted_1@0 > 2, projection=[column1@1]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/file_row_index/parquet_table/data.parquet]]}, projection=[CAST(__datafusion_file_row_index@1 AS Int64) as __datafusion_extracted_1, column1], file_type=parquet
+
+query I
+SELECT column1 FROM parquet_table WHERE file_row_index() > 2 ORDER BY column1
+----
+40
+50
+
+# Filter on file_row_index without projecting it, while enabling filter pushdown
+
+statement ok
+SET datafusion.execution.parquet.pushdown_filters = true;
+
+query TT
+EXPLAIN SELECT column1 FROM parquet_table WHERE file_row_index() > 2 ORDER BY column1
+----
+logical_plan
+01)Sort: parquet_table.column1 ASC NULLS LAST
+02)--Projection: parquet_table.column1
+03)----Filter: __datafusion_extracted_1 > Int64(2)
+04)------Projection: file_row_index() AS __datafusion_extracted_1, parquet_table.column1
+05)--------TableScan: parquet_table projection=[column1]
+physical_plan
+01)SortExec: expr=[column1@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--FilterExec: __datafusion_extracted_1@0 > 2, projection=[column1@1]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/file_row_index/parquet_table/data.parquet]]}, projection=[CAST(__datafusion_file_row_index@1 AS Int64) as __datafusion_extracted_1, column1], file_type=parquet
+
+query I
+SELECT column1 FROM parquet_table WHERE file_row_index() > 2 ORDER BY column1
+----
+40
+50
+
+statement ok
+RESET datafusion.execution.parquet.pushdown_filters;
+
+# Without the rewrite in ParquetSource, `file_row_index()` errors because it
+# depends on file-source context
+
+query error file_row_index\(\) is source dependent and cannot be evaluated directly
+SELECT file_row_index()
+
+# Testing pushdown over a source that doesn't support `file_row_index()`.
+
+statement ok
+COPY (VALUES (10), (20), (30), (40), (50))
+TO 'test_files/scratch/file_row_index/csv_table/data.csv'
+STORED AS CSV;
+
+statement ok
+CREATE EXTERNAL TABLE csv_table(column1 int)
+STORED AS CSV
+LOCATION 'test_files/scratch/file_row_index/csv_table/data.csv';
+
+query error file_row_index\(\) is source dependent and cannot be evaluated directly
+SELECT *, file_row_index() FROM csv_table;
+
+# Testing a table with two files.
+
+statement ok
+COPY (VALUES (10), (20))
+TO 'test_files/scratch/file_row_index/parquet_two_files/part-1.parquet'
+STORED AS PARQUET;
+
+statement ok
+COPY (VALUES (30), (40))
+TO 'test_files/scratch/file_row_index/parquet_two_files/part-2.parquet'
+STORED AS PARQUET;
+
+statement ok
+CREATE EXTERNAL TABLE parquet_two_files(column1 int)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/file_row_index/parquet_two_files/';
+
+query II
+SELECT file_row_index(), column1
+FROM parquet_two_files
+WHERE file_row_index() = 1
+ORDER BY column1
+----
+1 20
+1 40
+
+statement ok
+DROP TABLE parquet_two_files;
+
+statement ok
+DROP TABLE parquet_table;
+
+statement ok
+DROP TABLE csv_table;
diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md
index e5cd6f3d99711..83df7b06fd224 100644
--- a/docs/source/user-guide/sql/scalar_functions.md
+++ b/docs/source/user-guide/sql/scalar_functions.md
@@ -5702,6 +5702,7 @@ union_tag(union_expression)
- [arrow_try_cast](#arrow_try_cast)
- [arrow_typeof](#arrow_typeof)
- [cast_to_type](#cast_to_type)
+- [file_row_index](#file_row_index)
- [get_field](#get_field)
- [try_cast_to_type](#try_cast_to_type)
- [version](#version)
@@ -5885,6 +5886,27 @@ cast_to_type(expression, reference)
+-----+
```
+### `file_row_index`
+
+Returns the zero-based row offset within the source file
+that produced the current row.
+
+The value is scoped to one file, so rows from different files in the same scan
+can have the same row index. This function is intended to be rewritten at
+file-scan time. If the input file is not known (for example, if this function
+is evaluated outside a file scan, or was not pushed down into one), direct
+evaluation returns an error.
+
+```sql
+file_row_index()
+```
+
+#### Example
+
+```sql
+SELECT file_row_index() FROM t;
+```
+
### `get_field`
Returns a field within a map or a struct with the given key.
From 98495131ff5966e1e2d8518ec1932824c2c266da Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 16 Jun 2026 18:09:45 +1000
Subject: [PATCH 21/39] chore(deps-dev): bump launch-editor from 2.10.0 to
2.14.1 in /datafusion/wasmtest/datafusion-wasm-app (#22970)
Bumps [launch-editor](https://github.com/vitejs/launch-editor) from
2.10.0 to 2.14.1.
Commits
This version was pushed to npm by GitHub Actions, a new
releaser for launch-editor since your current version.
[](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)
Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.
[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)
---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot show ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)
You can disable automated security fix PRs for this repo from the
[Security Alerts
page](https://github.com/apache/datafusion/network/alerts).
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
.../datafusion-wasm-app/package-lock.json | 21 +++++++++----------
1 file changed, 10 insertions(+), 11 deletions(-)
diff --git a/datafusion/wasmtest/datafusion-wasm-app/package-lock.json b/datafusion/wasmtest/datafusion-wasm-app/package-lock.json
index c476ea76347ab..526853d841421 100644
--- a/datafusion/wasmtest/datafusion-wasm-app/package-lock.json
+++ b/datafusion/wasmtest/datafusion-wasm-app/package-lock.json
@@ -2431,14 +2431,13 @@
}
},
"node_modules/launch-editor": {
- "version": "2.10.0",
- "resolved": "https://registry.npmjs.org/launch-editor/-/launch-editor-2.10.0.tgz",
- "integrity": "sha512-D7dBRJo/qcGX9xlvt/6wUYzQxjh5G1RvZPgPv8vi4KRU99DVQL/oW7tnVOCCTm2HGeo3C5HvGE5Yrh6UBoZ0vA==",
+ "version": "2.14.1",
+ "resolved": "https://registry.npmjs.org/launch-editor/-/launch-editor-2.14.1.tgz",
+ "integrity": "sha512-QWBrQsMpH7gPr965dsKD/3cKWiNoTjpATQf++Xq63N6sKRGMwlVXz41O1IZTMfZQgBctD/K5Zt06+/I6pP6+HA==",
"dev": true,
- "license": "MIT",
"dependencies": {
- "picocolors": "^1.0.0",
- "shell-quote": "^1.8.1"
+ "picocolors": "^1.1.1",
+ "shell-quote": "^1.8.4"
}
},
"node_modules/loader-runner": {
@@ -6028,13 +6027,13 @@
"dev": true
},
"launch-editor": {
- "version": "2.10.0",
- "resolved": "https://registry.npmjs.org/launch-editor/-/launch-editor-2.10.0.tgz",
- "integrity": "sha512-D7dBRJo/qcGX9xlvt/6wUYzQxjh5G1RvZPgPv8vi4KRU99DVQL/oW7tnVOCCTm2HGeo3C5HvGE5Yrh6UBoZ0vA==",
+ "version": "2.14.1",
+ "resolved": "https://registry.npmjs.org/launch-editor/-/launch-editor-2.14.1.tgz",
+ "integrity": "sha512-QWBrQsMpH7gPr965dsKD/3cKWiNoTjpATQf++Xq63N6sKRGMwlVXz41O1IZTMfZQgBctD/K5Zt06+/I6pP6+HA==",
"dev": true,
"requires": {
- "picocolors": "^1.0.0",
- "shell-quote": "^1.8.1"
+ "picocolors": "^1.1.1",
+ "shell-quote": "^1.8.4"
}
},
"loader-runner": {
From 15bc9333cb1a4e1fa0ce961f54433ec0b8fb9df5 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 16 Jun 2026 18:10:18 +1000
Subject: [PATCH 22/39] chore(deps): bump cryptography from 46.0.7 to 48.0.1
(#22968)
Bumps [cryptography](https://github.com/pyca/cryptography) from 46.0.7
to 48.0.1.
Changelog
* Updated Windows, macOS, and Linux wheels to be compiled with OpenSSL
4.0.1.
.. _v48-0-0:
48.0.0 - 2026-05-04
BACKWARDS INCOMPATIBLE: Support for Python 3.8 has
been removed.
cryptography now requires Python 3.9 or later.
BACKWARDS INCOMPATIBLE: Loading an X.509 CRL whose
inner
TBSCertList.signature algorithm does not match the outer
signatureAlgorithm now raises ValueError.
Previously, such CRLs
were parsed successfully and only rejected during signature
validation.
Added support for
:doc:/hazmat/primitives/asymmetric/mlkem and
:doc:/hazmat/primitives/asymmetric/mldsa when using OpenSSL
3.5.0 or
later, in addition to the existing AWS-LC and BoringSSL support. This
means
post-quantum algorithms are now available to users of our wheels.
Note: Going forward, we do not guarantee that all
functionality
in cryptography will be available when building against
OpenSSL. See :doc:/statements/state-of-openssl for more
information.
.. _v47-0-0:
47.0.0 - 2026-04-24
* Support for Python 3.8 is deprecated and will be removed in the next
``cryptography`` release.
* **BACKWARDS INCOMPATIBLE:** Support for binary elliptic curves
(``SECT*`` classes) has been removed. These curves are rarely used and
have additional security considerations that make them undesirable.
* **BACKWARDS INCOMPATIBLE:** Support for OpenSSL 1.1.x has been
removed.
OpenSSL 3.0.0 or later is now required. LibreSSL, BoringSSL, and AWS-LC
continue to be supported.
* **BACKWARDS INCOMPATIBLE:** Dropped support for LibreSSL < 4.1.
* **BACKWARDS INCOMPATIBLE:** Loading keys with unsupported algorithms
or
keys with unsupported explicit curve encodings now raises
:class:`~cryptography.exceptions.UnsupportedAlgorithm` instead of
``ValueError``. This change affects
:func:`~cryptography.hazmat.primitives.serialization.load_pem_private_key`,
:func:`~cryptography.hazmat.primitives.serialization.load_der_private_key`,
:func:`~cryptography.hazmat.primitives.serialization.load_pem_public_key`,
:func:`~cryptography.hazmat.primitives.serialization.load_der_public_key`,
and :meth:`~cryptography.x509.Certificate.public_key` when called on
certificates with unsupported public key algorithms.
</tr></table>
[](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)
Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.
[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)
---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot show ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
Cargo.lock | 4 ++--
datafusion-cli/Cargo.toml | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index df6b263adb009..543750f8d8355 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3721,9 +3721,9 @@ dependencies = [
[[package]]
name = "insta-cmd"
-version = "0.6.0"
+version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ffeeefa927925cced49ccb01bf3e57c9d4cd132df21e576eb9415baeab2d3de6"
+checksum = "bffdf4af1db390cf0401535d7c1303cd079a074d28d8473b026fdb6559c41403"
dependencies = [
"insta",
"serde",
diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml
index 441ae00c11db0..62eedafe798d4 100644
--- a/datafusion-cli/Cargo.toml
+++ b/datafusion-cli/Cargo.toml
@@ -75,7 +75,7 @@ workspace = true
[dev-dependencies]
ctor = { workspace = true }
insta = { workspace = true }
-insta-cmd = "0.6.0"
+insta-cmd = "0.7.0"
rstest = { workspace = true }
testcontainers-modules = { workspace = true, features = ["minio"] }
# Makes sure `test_display_pg_json` behaves in a consistent way regardless of
From 46d241d1cea07543d4341867b008560978d22bf8 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 16 Jun 2026 09:49:52 +0000
Subject: [PATCH 27/39] chore(deps): update maturin requirement from
<2,>=1.13.3 to >=1.14.0,<2 in /docs (#22974)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Updates the requirements on [maturin](https://github.com/pyo3/maturin)
to permit the latest version.
Release notes
Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.
[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)
---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot show ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
docs/pyproject.toml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/pyproject.toml b/docs/pyproject.toml
index d8fa4f9ec1775..1f4044d63674b 100644
--- a/docs/pyproject.toml
+++ b/docs/pyproject.toml
@@ -7,7 +7,7 @@ dependencies = [
"sphinx-reredirects>=1.1,<2",
"pydata-sphinx-theme>=0.18.0,<1",
"myst-parser>=5.1.0,<6",
- "maturin>=1.13.3,<2",
+ "maturin>=1.14.0,<2",
"jinja2>=3.1.6,<4",
"setuptools>=82.0.1,<83",
]
From 2282d23d4ff0af91463b63aa99cd793635ecef8e Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 16 Jun 2026 19:51:01 +1000
Subject: [PATCH 28/39] chore(deps): bump taiki-e/install-action from 2.81.8 to
2.81.11 (#22973)
Bumps
[taiki-e/install-action](https://github.com/taiki-e/install-action) from
2.81.8 to 2.81.11.
Release notes
[](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)
Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.
[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)
---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot show ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
.github/workflows/audit.yml | 2 +-
.github/workflows/breaking_changes_detector.yml | 2 +-
.github/workflows/dev.yml | 2 +-
.github/workflows/rust.yml | 4 ++--
4 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml
index a6eec722ed3e1..bca3390370175 100644
--- a/.github/workflows/audit.yml
+++ b/.github/workflows/audit.yml
@@ -45,7 +45,7 @@ jobs:
steps:
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
- name: Install cargo-audit
- uses: taiki-e/install-action@0631aa6515c7d545823c67cfae7ef4fc7f490154 # v2.81.8
+ uses: taiki-e/install-action@15449e3094499af05d8d964a1c884208e4b8b595 # v2.81.11
with:
tool: cargo-audit
- name: Run audit check
diff --git a/.github/workflows/breaking_changes_detector.yml b/.github/workflows/breaking_changes_detector.yml
index 264ba16e3c8d0..a0d7bf6520ee5 100644
--- a/.github/workflows/breaking_changes_detector.yml
+++ b/.github/workflows/breaking_changes_detector.yml
@@ -89,7 +89,7 @@ jobs:
- name: Install cargo-semver-checks
if: steps.changed_crates.outputs.packages != ''
- uses: taiki-e/install-action@0631aa6515c7d545823c67cfae7ef4fc7f490154 # v2.81.8
+ uses: taiki-e/install-action@15449e3094499af05d8d964a1c884208e4b8b595 # v2.81.11
with:
tool: cargo-semver-checks
diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml
index 48f5ea8939c8c..43da64c569162 100644
--- a/.github/workflows/dev.yml
+++ b/.github/workflows/dev.yml
@@ -64,7 +64,7 @@ jobs:
source ci/scripts/utils/tool_versions.sh
echo "LYCHEE_VERSION=${LYCHEE_VERSION}" >> "$GITHUB_ENV"
- name: Install lychee
- uses: taiki-e/install-action@0631aa6515c7d545823c67cfae7ef4fc7f490154 # v2.81.8
+ uses: taiki-e/install-action@15449e3094499af05d8d964a1c884208e4b8b595 # v2.81.11
with:
tool: lychee@${{ env.LYCHEE_VERSION }}
- name: Run markdown link check
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 770e705dddc90..cc48a54f27ec4 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -429,7 +429,7 @@ jobs:
sudo apt-get update -qq
sudo apt-get install -y -qq clang
- name: Setup wasm-pack
- uses: taiki-e/install-action@0631aa6515c7d545823c67cfae7ef4fc7f490154 # v2.81.8
+ uses: taiki-e/install-action@15449e3094499af05d8d964a1c884208e4b8b595 # v2.81.11
with:
tool: wasm-pack
- name: Run tests with headless mode
@@ -773,7 +773,7 @@ jobs:
- name: Setup Rust toolchain
uses: ./.github/actions/setup-builder
- name: Install cargo-msrv
- uses: taiki-e/install-action@0631aa6515c7d545823c67cfae7ef4fc7f490154 # v2.81.8
+ uses: taiki-e/install-action@15449e3094499af05d8d964a1c884208e4b8b595 # v2.81.11
with:
tool: cargo-msrv
From fbd64b4471a688fa8a8be201db0283861244e7a1 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 16 Jun 2026 10:37:00 +0000
Subject: [PATCH 29/39] chore(deps): update pydata-sphinx-theme requirement
from <1,>=0.18.0 to >=0.19.0,<1 in /docs (#22972)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Updates the requirements on
[pydata-sphinx-theme](https://github.com/pydata/pydata-sphinx-theme) to
permit the latest version.
Release notes
Bump pydata/pydata-sphinx-theme/.github/workflows/CI.yml from
178df7a9d69695be4e49fa56822d76b048977387 to
047227ad41f26a993ea7b2182955d26aa837acea by @dependabot[bot]
in pydata/pydata-sphinx-theme#2398
Bump pydata/pydata-sphinx-theme from
178df7a9d69695be4e49fa56822d76b048977387 to
047227ad41f26a993ea7b2182955d26aa837acea by @dependabot[bot]
in pydata/pydata-sphinx-theme#2397
Bump pydata/pydata-sphinx-theme/.github/workflows/docs.yml from
178df7a9d69695be4e49fa56822d76b048977387 to
047227ad41f26a993ea7b2182955d26aa837acea by @dependabot[bot]
in pydata/pydata-sphinx-theme#2395