diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index 29957f25e370d..0ca40eb78d57c 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -102,6 +102,8 @@ topk_tpch: Benchmark of top-k (sorting with limit) queries on TPC-H external_aggr: External aggregation benchmark on TPC-H dataset (SF=1) wide_schema: Small-projection queries on a wide synthetic dataset (1024 cols × 256 files) — measures per-file metadata overhead (runs both 'wide' and 'narrow' subgroups: narrow is an internal baseline; the wide-vs-narrow ratio is the signal) +adversarial_filter: Conjunct-ordering stress test for adaptive filter reordering (synthetic data, generated inline) + (set DATAFUSION_EXECUTION_ADAPTIVE_FILTER_REORDERING=true to enable the optimization; ADV_ROWS sizes the data) # ClickBench Benchmarks clickbench_1: ClickBench queries against a single parquet file @@ -245,6 +247,10 @@ main() { wide_schema) data_wide_schema ;; + adversarial_filter) + # Data is generated inline by the suite's init SQL. + echo "adversarial_filter: no external data to generate" + ;; tpcds) data_tpcds ;; @@ -458,6 +464,9 @@ main() { wide_schema) run_wide_schema ;; + adversarial_filter) + run_adversarial_filter + ;; tpcds) run_tpcds ;; @@ -778,6 +787,22 @@ run_wide_schema() { bash -c "$SQL_CARGO_COMMAND" } +# Runs the adversarial_filter benchmark. Data is generated inline by the suite, +# so there is no data step. Toggle the optimization under test with +# DATAFUSION_EXECUTION_ADAPTIVE_FILTER_REORDERING (off by default — the same env +# var the dfbench suites read via SessionConfig::from_env) and size the data +# with ADV_ROWS; both are consumed by the suite's init SQL, so flag-on vs +# flag-off comparisons are driven entirely by the environment. +run_adversarial_filter() { + echo "Running adversarial_filter benchmark (adaptive_filter_reordering=${DATAFUSION_EXECUTION_ADAPTIVE_FILTER_REORDERING:-false})..." + debug_run env BENCH_NAME=adversarial_filter \ + DATAFUSION_EXECUTION_ADAPTIVE_FILTER_REORDERING="${DATAFUSION_EXECUTION_ADAPTIVE_FILTER_REORDERING:-false}" \ + ADV_ROWS="${ADV_ROWS:-10000000}" \ + SIMULATE_LATENCY="${SIMULATE_LATENCY}" \ + ${QUERY:+BENCH_QUERY="${QUERY}"} \ + bash -c "$SQL_CARGO_COMMAND" +} + # Runs the tpch in memory (needs tpch parquet data) run_tpch_mem() { SCALE_FACTOR=$1 diff --git a/benchmarks/sql_benchmarks/adversarial_filter/adversarial_filter.suite b/benchmarks/sql_benchmarks/adversarial_filter/adversarial_filter.suite new file mode 100644 index 0000000000000..42422eff880ec --- /dev/null +++ b/benchmarks/sql_benchmarks/adversarial_filter/adversarial_filter.suite @@ -0,0 +1,2 @@ +name = "adversarial_filter" +description = "Conjunct-ordering stress test for adaptive filter reordering: five equally-expensive regexp predicates whose selective member is written last. Enable the optimization with DATAFUSION_EXECUTION_ADAPTIVE_FILTER_REORDERING=true; size the synthetic data with ADV_ROWS (default 10M)." diff --git a/benchmarks/sql_benchmarks/adversarial_filter/benchmarks/q01.benchmark b/benchmarks/sql_benchmarks/adversarial_filter/benchmarks/q01.benchmark new file mode 100644 index 0000000000000..46d6d0c542225 --- /dev/null +++ b/benchmarks/sql_benchmarks/adversarial_filter/benchmarks/q01.benchmark @@ -0,0 +1,27 @@ +-- Five expensive regexp predicates with the VERY selective one ('rare', ~0.1%) +-- written LAST. Neither SQL order, the cheap/expensive heuristic (#22343), nor +-- BinaryExpr's leftmost-only pre-selection reorders it, so the baseline scans +-- every predicate over ~every row. Only runtime measurement promotes `rare` to +-- gate the rest. This is where adaptive reordering wins. + +name Q01 +group adversarial_filter + +init sql_benchmarks/adversarial_filter/init/set_config.sql + +load sql_benchmarks/adversarial_filter/init/load.sql + +assert I +SELECT COUNT(*) > 0 FROM adv WHERE regexp_like(s, 'rare'); +---- +true + +run +SELECT count(*) FROM adv +WHERE regexp_like(s, 'aaa') + AND regexp_like(s, 'bbb') + AND regexp_like(s, 'ccc') + AND regexp_like(s, 'ddd') + AND regexp_like(s, 'rare'); + +cleanup sql_benchmarks/adversarial_filter/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/adversarial_filter/benchmarks/q02.benchmark b/benchmarks/sql_benchmarks/adversarial_filter/benchmarks/q02.benchmark new file mode 100644 index 0000000000000..5a79230b36a3b --- /dev/null +++ b/benchmarks/sql_benchmarks/adversarial_filter/benchmarks/q02.benchmark @@ -0,0 +1,27 @@ +-- Control: the same five predicates, but the selective one ('rare') FIRST. +-- BinaryExpr's AND short-circuit already gates on a leftmost selective +-- conjunct, so the baseline is already near-optimal and the flag is ~neutral +-- here. This isolates the Q01 win as purely an ordering fix (and confirms the +-- adaptive path adds no measurable overhead once it can't help). + +name Q02 +group adversarial_filter + +init sql_benchmarks/adversarial_filter/init/set_config.sql + +load sql_benchmarks/adversarial_filter/init/load.sql + +assert I +SELECT COUNT(*) > 0 FROM adv WHERE regexp_like(s, 'rare'); +---- +true + +run +SELECT count(*) FROM adv +WHERE regexp_like(s, 'rare') + AND regexp_like(s, 'aaa') + AND regexp_like(s, 'bbb') + AND regexp_like(s, 'ccc') + AND regexp_like(s, 'ddd'); + +cleanup sql_benchmarks/adversarial_filter/init/cleanup.sql diff --git a/benchmarks/sql_benchmarks/adversarial_filter/init/cleanup.sql b/benchmarks/sql_benchmarks/adversarial_filter/init/cleanup.sql new file mode 100644 index 0000000000000..a0eae620eb718 --- /dev/null +++ b/benchmarks/sql_benchmarks/adversarial_filter/init/cleanup.sql @@ -0,0 +1 @@ +DROP TABLE IF EXISTS adv; diff --git a/benchmarks/sql_benchmarks/adversarial_filter/init/load.sql b/benchmarks/sql_benchmarks/adversarial_filter/init/load.sql new file mode 100644 index 0000000000000..319bc98a26ccb --- /dev/null +++ b/benchmarks/sql_benchmarks/adversarial_filter/init/load.sql @@ -0,0 +1,25 @@ +-- Synthetic dataset generated inline (no external data files). Each row's +-- string column `s` embeds five markers inside ~180 chars of filler so every +-- regexp_like must scan the whole value (i.e. each predicate is expensive), +-- with deliberately different selectivity. The moduli are coprime so the +-- markers are independent; `rare` is keyed on a prime so the AND is non-empty. +-- +-- 'aaa' present in 90% of rows (i % 10 <> 0) +-- 'bbb' present in ~86% of rows (i % 7 <> 0) +-- 'ccc' present in 80% of rows (i % 5 <> 0) +-- 'ddd' present in 75% of rows (i % 4 <> 0) +-- 'rare' present in ~0.1% of rows (i % 1009 = 5) <- the selective one +CREATE TABLE adv AS +SELECT + repeat('q', 30) + || CASE WHEN value % 10 <> 0 THEN 'aaa' ELSE 'zzz' END + || repeat('q', 30) + || CASE WHEN value % 7 <> 0 THEN 'bbb' ELSE 'zzz' END + || repeat('q', 30) + || CASE WHEN value % 5 <> 0 THEN 'ccc' ELSE 'zzz' END + || repeat('q', 30) + || CASE WHEN value % 4 <> 0 THEN 'ddd' ELSE 'zzz' END + || repeat('q', 30) + || CASE WHEN value % 1009 = 5 THEN 'rare' ELSE 'zzzz' END + || repeat('q', 30) AS s +FROM generate_series(1, ${ADV_ROWS:-10000000}); diff --git a/benchmarks/sql_benchmarks/adversarial_filter/init/set_config.sql b/benchmarks/sql_benchmarks/adversarial_filter/init/set_config.sql new file mode 100644 index 0000000000000..d19515a7a8b78 --- /dev/null +++ b/benchmarks/sql_benchmarks/adversarial_filter/init/set_config.sql @@ -0,0 +1,6 @@ +-- Toggle the optimization under test. Off by default (acts as the baseline); +-- set DATAFUSION_EXECUTION_ADAPTIVE_FILTER_REORDERING=true to enable it. This +-- is the same env var the dfbench suites pick up via SessionConfig::from_env; +-- the SQL bench harness uses SessionContext::new(), so we wire it in explicitly +-- here via env interpolation. +set datafusion.execution.adaptive_filter_reordering = ${DATAFUSION_EXECUTION_ADAPTIVE_FILTER_REORDERING:-false}; diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 9d960e3bf694c..27addabbf559c 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -662,6 +662,17 @@ config_namespace! { /// tables with a highly-selective join filter, but is also slightly slower. pub enforce_batch_size_in_joins: bool, default = false + /// (experimental) When enabled, `FilterExec` adaptively reorders the + /// conjuncts of a conjunctive predicate at runtime. It measures each + /// conjunct's selectivity and evaluation cost on the rows that reach it + /// and runs the conjuncts that discard the most rows per unit of CPU + /// time first, so cheap-and-selective predicates gate expensive ones. + /// Reordering never changes query results (only the evaluation order of + /// a conjunction) but can change observable side effects of fallible + /// predicates, so it is off by default. Predicates containing volatile + /// expressions are never reordered. + pub adaptive_filter_reordering: bool, default = false + /// Size (bytes) of data buffer DataFusion uses when writing output files. /// This affects the size of the data chunks that are uploaded to remote /// object stores (e.g. AWS S3). If very large (>= 100 GiB) output files are being diff --git a/datafusion/physical-expr-common/src/adaptive/mod.rs b/datafusion/physical-expr-common/src/adaptive/mod.rs new file mode 100644 index 0000000000000..42bbf4d6a7eb9 --- /dev/null +++ b/datafusion/physical-expr-common/src/adaptive/mod.rs @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Shared substrate for adaptive (measurement-driven) filtering. +//! +//! Adaptive filter policies observe how predicates behave at runtime and +//! re-decide accordingly — the parquet scan adapts filter *placement* +//! (row-level vs. post-scan vs. dropped), and an adaptive `FilterExec` could +//! adapt conjunct evaluation *order*. Both need the same ingredients: +//! +//! - per-predicate online **selectivity + cost** measurement with confidence +//! intervals — [`SelectivityStats`]; +//! - a concurrent **registry** keyed by a caller-local [`FilterId`], with +//! per-predicate skip flags so an optional predicate can be made a no-op +//! mid-stream — [`AdaptiveStatsRegistry`]. +//! +//! What stays with each consumer is *policy*: the per-batch effectiveness +//! metric it feeds in, and the ranking/decision function it computes over the +//! snapshots. This module intentionally contains no placement or ordering +//! logic. + +pub mod registry; +pub mod stats; + +pub use registry::AdaptiveStatsRegistry; +pub use stats::{FilterId, SelectivityStats}; diff --git a/datafusion/physical-expr-common/src/adaptive/registry.rs b/datafusion/physical-expr-common/src/adaptive/registry.rs new file mode 100644 index 0000000000000..5ac46e69d8c45 --- /dev/null +++ b/datafusion/physical-expr-common/src/adaptive/registry.rs @@ -0,0 +1,286 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Concurrent registry of per-predicate [`SelectivityStats`] plus the +//! "skip" flags that let an *optional* predicate be turned into a no-op +//! mid-stream. +//! +//! This is shared plumbing, free of any placement/ordering policy. A consumer +//! [`register`](AdaptiveStatsRegistry::register)s the predicates it tracks, +//! calls [`record`](AdaptiveStatsRegistry::record) on the per-batch hot path, +//! and reads back [`snapshot`](AdaptiveStatsRegistry::snapshot)s when it +//! periodically re-decides (placement, ordering, drop, …). + +use std::collections::HashMap; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; + +use parking_lot::{Mutex, RwLock}; + +use super::stats::{FilterId, SelectivityStats}; + +/// Thread-safe map of [`FilterId`] → online [`SelectivityStats`], with +/// per-predicate skip flags. +/// +/// # Locking +/// +/// The outer [`RwLock`] over the stats map is almost always *read*-locked: both +/// [`record`](Self::record) (hot, per-batch) and the snapshot readers only need +/// shared access to look up an existing entry. The write lock is taken only by +/// [`register`](Self::register) when a new [`FilterId`] is first seen — a brief, +/// infrequent operation. +/// +/// Each entry is an independent [`Mutex`], so concurrent +/// `record` calls on *different* predicates proceed in parallel with zero +/// contention. +#[derive(Debug, Default)] +pub struct AdaptiveStatsRegistry { + /// Per-predicate selectivity statistics, each individually `Mutex`-guarded. + stats: RwLock>>, + /// Per-predicate "skip" flags. When set, the consumer treats the predicate + /// as a no-op for subsequent batches. Only ever set for predicates whose + /// [`SelectivityStats::is_optional`] is `true` — mandatory predicates must + /// always execute or queries return wrong rows. + skip_flags: RwLock>>, +} + +impl AdaptiveStatsRegistry { + /// Create an empty registry. + pub fn new() -> Self { + Self::default() + } + + /// Register a predicate so future [`record`](Self::record) calls can find + /// it. Idempotent: an already-registered id keeps its accumulated stats and + /// its existing optional flag. + /// + /// `is_optional` records whether the predicate may be dropped without + /// affecting correctness (see [`SelectivityStats::is_optional`]). + pub fn register(&self, id: FilterId, is_optional: bool) { + if self.stats.read().contains_key(&id) { + return; + } + let mut stats = self.stats.write(); + stats + .entry(id) + .or_insert_with(|| Mutex::new(SelectivityStats::new(is_optional))); + self.skip_flags + .write() + .entry(id) + .or_insert_with(|| Arc::new(AtomicBool::new(false))); + } + + /// Register many predicates at once (see [`register`](Self::register)). + pub fn register_all(&self, entries: impl IntoIterator) { + let mut stats = self.stats.write(); + let mut flags = self.skip_flags.write(); + for (id, is_optional) in entries { + stats + .entry(id) + .or_insert_with(|| Mutex::new(SelectivityStats::new(is_optional))); + flags + .entry(id) + .or_insert_with(|| Arc::new(AtomicBool::new(false))); + } + } + + /// Record one batch of observations for `id` (per-batch hot path). + /// + /// Takes only a shared lock on the map plus the per-predicate mutex, so it + /// never contends with `record` calls on other predicates. A no-op if `id` + /// was never [`register`](Self::register)ed. + pub fn record( + &self, + id: FilterId, + matched: u64, + total: u64, + eval_nanos: u64, + effectiveness_sample: f64, + ) { + let map = self.stats.read(); + if let Some(entry) = map.get(&id) { + entry + .lock() + .record(matched, total, eval_nanos, effectiveness_sample); + } + } + + /// Copy out the current stats for `id`, or `None` if unregistered. + /// + /// [`SelectivityStats`] is `Copy`, so consumers read every derived metric + /// (pass rate, cost-per-row, effectiveness, confidence bounds) off the + /// returned value without holding any lock. + pub fn snapshot(&self, id: FilterId) -> Option { + self.stats.read().get(&id).map(|entry| *entry.lock()) + } + + /// Clear the accumulated stats for `id`, preserving its optional flag and + /// skip flag. Used when a dynamic predicate re-arms under a stable id (see + /// [`SelectivityStats::reset`]). A no-op if `id` is unregistered. + pub fn reset(&self, id: FilterId) { + if let Some(entry) = self.stats.read().get(&id) { + entry.lock().reset(); + } + } + + /// The shared skip flag for `id`, registering `id` as optional if it was + /// not already present. The returned `Arc` can be cached by an evaluator so + /// it can cheaply check the flag without touching the registry. + pub fn skip_flag(&self, id: FilterId) -> Arc { + if let Some(flag) = self.skip_flags.read().get(&id) { + return Arc::clone(flag); + } + // First sighting: register as optional and create the flag. + self.register(id, true); + Arc::clone( + self.skip_flags + .read() + .get(&id) + .expect("skip flag inserted by register"), + ) + } + + /// Whether `id`'s skip flag is currently set. `false` if unregistered. + pub fn is_skipped(&self, id: FilterId) -> bool { + self.skip_flags + .read() + .get(&id) + .is_some_and(|flag| flag.load(Ordering::Relaxed)) + } + + /// Set or clear `id`'s skip flag. A no-op if `id` is unregistered. + pub fn set_skipped(&self, id: FilterId, skipped: bool) { + if let Some(flag) = self.skip_flags.read().get(&id) { + flag.store(skipped, Ordering::Relaxed); + } + } + + /// Whether `id` has been registered. + pub fn contains(&self, id: FilterId) -> bool { + self.stats.read().contains_key(&id) + } + + /// Number of registered predicates. + pub fn len(&self) -> usize { + self.stats.read().len() + } + + /// Whether no predicates are registered. + pub fn is_empty(&self) -> bool { + self.stats.read().is_empty() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn record_requires_registration() { + let reg = AdaptiveStatsRegistry::new(); + // recording an unregistered id is a silent no-op + reg.record(0, 1, 10, 100, 5.0); + assert!(reg.snapshot(0).is_none()); + assert_eq!(reg.len(), 0); + + reg.register(0, false); + reg.record(0, 1, 10, 100, 5.0); + let s = reg.snapshot(0).unwrap(); + assert_eq!(s.pass_rate(), Some(0.1)); + assert_eq!(s.sample_count(), 1); + } + + #[test] + fn register_is_idempotent_and_keeps_stats() { + let reg = AdaptiveStatsRegistry::new(); + reg.register(7, false); + reg.record(7, 2, 10, 100, 9.0); + // re-register must not wipe accumulated stats + reg.register(7, true); + let s = reg.snapshot(7).unwrap(); + assert_eq!(s.sample_count(), 1); + // optional flag is not flipped by a redundant register + assert!(!s.is_optional()); + } + + #[test] + fn register_all_bulk() { + let reg = AdaptiveStatsRegistry::new(); + reg.register_all([(0, false), (1, true), (2, false)]); + assert_eq!(reg.len(), 3); + assert!(reg.snapshot(1).unwrap().is_optional()); + assert!(!reg.snapshot(0).unwrap().is_optional()); + } + + #[test] + fn skip_flag_round_trips_and_is_shared() { + let reg = AdaptiveStatsRegistry::new(); + reg.register(3, true); + assert!(!reg.is_skipped(3)); + let flag = reg.skip_flag(3); + reg.set_skipped(3, true); + // the cached Arc observes the change made through the registry + assert!(flag.load(Ordering::Relaxed)); + assert!(reg.is_skipped(3)); + } + + #[test] + fn skip_flag_autoregisters_as_optional() { + let reg = AdaptiveStatsRegistry::new(); + let _ = reg.skip_flag(42); + assert!(reg.contains(42)); + assert!(reg.snapshot(42).unwrap().is_optional()); + } + + #[test] + fn reset_clears_stats_keeps_flag() { + let reg = AdaptiveStatsRegistry::new(); + reg.register(1, true); + reg.record(1, 5, 10, 100, 3.0); + reg.set_skipped(1, true); + reg.reset(1); + let s = reg.snapshot(1).unwrap(); + assert_eq!(s.sample_count(), 0); + assert_eq!(s.pass_rate(), None); + assert!(s.is_optional()); + // skip flag is independent of stats reset + assert!(reg.is_skipped(1)); + } + + #[test] + fn concurrent_records_on_distinct_ids() { + use std::thread; + let reg = Arc::new(AdaptiveStatsRegistry::new()); + reg.register_all((0..8).map(|i| (i, false))); + thread::scope(|scope| { + for id in 0..8usize { + let reg = Arc::clone(®); + scope.spawn(move || { + for _ in 0..1000 { + reg.record(id, 1, 2, 10, id as f64); + } + }); + } + }); + for id in 0..8usize { + let s = reg.snapshot(id).unwrap(); + assert_eq!(s.sample_count(), 1000); + assert_eq!(s.pass_rate(), Some(0.5)); + assert!((s.effectiveness().unwrap() - id as f64).abs() < 1e-9); + } + } +} diff --git a/datafusion/physical-expr-common/src/adaptive/stats.rs b/datafusion/physical-expr-common/src/adaptive/stats.rs new file mode 100644 index 0000000000000..6d8ef8b353d7a --- /dev/null +++ b/datafusion/physical-expr-common/src/adaptive/stats.rs @@ -0,0 +1,316 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Online per-predicate selectivity and cost accumulator. +//! +//! This is the domain-agnostic measurement substrate shared by the adaptive +//! filter machinery. It records, for a single predicate (conjunct), the +//! quantities every adaptive policy needs: +//! +//! - **selectivity** — `rows_matched / rows_total` (the *pass rate*), +//! - **cost** — cumulative `eval_nanos`, from which a per-row cost is derived, +//! - **effectiveness** — a caller-supplied per-batch scalar, accumulated with +//! [Welford's online algorithm] so callers can put a confidence interval on +//! its mean. +//! +//! The accumulator deliberately does *not* define what "effectiveness" means or +//! what to do with these numbers — that is policy, and lives with the consumer: +//! +//! - the parquet scan ranks filters by *bytes-saved-per-second* to decide +//! row-level vs. post-scan **placement**; +//! - an adaptive `FilterExec` ranks conjuncts by `cost_per_row / (1 - pass_rate)` +//! to decide evaluation **order**. +//! +//! Both feed the same accumulator; only the per-batch sample and the ranking +//! function differ. +//! +//! [Welford's online algorithm]: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm + +/// Identifier for a tracked predicate, **local to a single +/// [`AdaptiveStatsRegistry`](super::registry::AdaptiveStatsRegistry)**. +/// +/// There is no global expression id. Each consumer mints its own ids — in +/// practice the index of a conjunct in *that consumer's* predicate `Vec` — and +/// the same numeric value in two different registries refers to two unrelated +/// predicates. Ids are opaque here so the accumulator works regardless of how a +/// consumer enumerates its predicates. +pub type FilterId = usize; + +/// Online selectivity + cost statistics for a single predicate expression. +/// +/// Cheap to copy; a consumer typically owns one behind a per-predicate lock +/// (see [`AdaptiveStatsRegistry`](super::registry::AdaptiveStatsRegistry)) so +/// concurrent updates on *different* predicates never contend. +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct SelectivityStats { + /// Number of rows that passed (matched) the predicate. + rows_matched: u64, + /// Total number of rows the predicate was evaluated on. + rows_total: u64, + /// Cumulative evaluation time in nanoseconds. + eval_nanos: u64, + /// Welford's online algorithm: number of per-batch effectiveness samples. + sample_count: u64, + /// Welford's online algorithm: running mean of the per-batch effectiveness + /// sample. + eff_mean: f64, + /// Welford's online algorithm: running sum of squared deviations (M2). + eff_m2: f64, + /// Whether the underlying predicate is *optional* — i.e. may be skipped + /// entirely without affecting query results (e.g. a dynamic join filter). + /// + /// Cached here so the per-batch hot path can decide whether the + /// skip/drop logic applies with a single field load, without re-inspecting + /// the expression. Mandatory predicates must always execute or queries + /// return wrong rows. + is_optional: bool, +} + +impl Default for SelectivityStats { + fn default() -> Self { + Self::new(false) + } +} + +impl SelectivityStats { + /// Create an empty accumulator. `is_optional` records whether the predicate + /// may be dropped without affecting correctness. + pub fn new(is_optional: bool) -> Self { + Self { + rows_matched: 0, + rows_total: 0, + eval_nanos: 0, + sample_count: 0, + eff_mean: 0.0, + eff_m2: 0.0, + is_optional, + } + } + + /// Whether the predicate may be dropped without affecting correctness. + pub fn is_optional(&self) -> bool { + self.is_optional + } + + /// Record one batch of observations. + /// + /// - `matched` / `total` — rows that passed / were evaluated this batch. + /// - `eval_nanos` — wall time spent evaluating the predicate this batch. + /// - `effectiveness_sample` — the caller's per-batch effectiveness metric + /// (e.g. bytes-saved-per-second for placement, or any scalar the + /// consumer wants confidence intervals on). Its unit is opaque here. + /// + /// The raw counters are always updated. The Welford accumulator only + /// ingests the sample when `total > 0 && eval_nanos > 0` and the sample is + /// finite — an empty or zero-time batch is not a meaningful sample. + pub fn record( + &mut self, + matched: u64, + total: u64, + eval_nanos: u64, + effectiveness_sample: f64, + ) { + self.rows_matched += matched; + self.rows_total += total; + self.eval_nanos += eval_nanos; + + if total > 0 && eval_nanos > 0 && effectiveness_sample.is_finite() { + self.sample_count += 1; + let delta = effectiveness_sample - self.eff_mean; + self.eff_mean += delta / self.sample_count as f64; + let delta2 = effectiveness_sample - self.eff_mean; + self.eff_m2 += delta * delta2; + } + } + + /// Cumulative pass rate `rows_matched / rows_total` in `[0, 1]`. + /// + /// `None` until at least one row has been evaluated. Lower means more + /// selective — `1 - pass_rate()` is the fraction of rows discarded. + pub fn pass_rate(&self) -> Option { + if self.rows_total == 0 { + return None; + } + Some(self.rows_matched as f64 / self.rows_total as f64) + } + + /// Average evaluation cost per row, in nanoseconds. + /// + /// Unlike [`pass_rate`](Self::pass_rate) this is roughly independent of the + /// predicate's position in a conjunction (it is a property of the predicate + /// and the data it sees), which makes it the stable term to rank on. + /// `None` until at least one row has been evaluated. + pub fn cost_per_row_nanos(&self) -> Option { + if self.rows_total == 0 { + return None; + } + Some(self.eval_nanos as f64 / self.rows_total as f64) + } + + /// Mean of the per-batch effectiveness samples (Welford `eff_mean`). + /// + /// `None` until at least one sample has been recorded. The unit is whatever + /// the consumer fed to [`record`](Self::record); callers should not assume + /// it. + pub fn effectiveness(&self) -> Option { + if self.sample_count == 0 { + return None; + } + Some(self.eff_mean) + } + + /// Number of per-batch effectiveness samples recorded so far. + pub fn sample_count(&self) -> u64 { + self.sample_count + } + + /// Sample variance of the per-batch effectiveness samples. + /// + /// Uses the unbiased (`n - 1`) estimator; `None` with fewer than 2 samples. + fn variance(&self) -> Option { + if self.sample_count < 2 { + return None; + } + Some(self.eff_m2 / (self.sample_count - 1) as f64) + } + + /// Lower bound of a one-sided confidence interval on mean effectiveness: + /// `mean - z * stderr`. + /// + /// `z` is the caller's confidence multiplier (e.g. `2.0` ≈ 97.5% one-sided). + /// `None` with fewer than 2 samples. + pub fn confidence_lower_bound(&self, z: f64) -> Option { + let stderr = (self.variance()? / self.sample_count as f64).sqrt(); + Some(self.eff_mean - z * stderr) + } + + /// Upper bound of a one-sided confidence interval on mean effectiveness: + /// `mean + z * stderr`. + /// + /// `None` with fewer than 2 samples. + pub fn confidence_upper_bound(&self, z: f64) -> Option { + let stderr = (self.variance()? / self.sample_count as f64).sqrt(); + Some(self.eff_mean + z * stderr) + } + + /// Clear all accumulated observations, preserving [`is_optional`]. + /// + /// Used when a predicate's identity changes underneath a stable + /// [`FilterId`] — e.g. a dynamic filter that re-arms with new values — so + /// stale measurements don't bias the new predicate. + /// + /// [`is_optional`]: Self::is_optional + pub fn reset(&mut self) { + *self = Self::new(self.is_optional); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn empty_stats_report_none() { + let s = SelectivityStats::default(); + assert_eq!(s.pass_rate(), None); + assert_eq!(s.cost_per_row_nanos(), None); + assert_eq!(s.effectiveness(), None); + assert_eq!(s.confidence_lower_bound(2.0), None); + assert_eq!(s.sample_count(), 0); + assert!(!s.is_optional()); + } + + #[test] + fn pass_rate_and_cost_per_row() { + let mut s = SelectivityStats::default(); + // 2 of 10 rows pass, 1000ns spent. + s.record(2, 10, 1000, 0.0); + assert_eq!(s.pass_rate(), Some(0.2)); + assert_eq!(s.cost_per_row_nanos(), Some(100.0)); + // accumulates across batches + s.record(8, 10, 1000, 0.0); + assert_eq!(s.pass_rate(), Some(0.5)); // 10/20 + assert_eq!(s.cost_per_row_nanos(), Some(100.0)); // 2000/20 + } + + #[test] + fn welford_mean_matches_naive_average() { + let mut s = SelectivityStats::default(); + let samples = [10.0, 20.0, 30.0, 40.0]; + for &x in &samples { + s.record(1, 1, 1, x); + } + assert_eq!(s.sample_count(), 4); + let mean = s.effectiveness().unwrap(); + assert!((mean - 25.0).abs() < 1e-9, "mean was {mean}"); + } + + #[test] + fn welford_variance_matches_naive() { + let mut s = SelectivityStats::default(); + let samples = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]; + for &x in &samples { + s.record(1, 1, 1, x); + } + // Known sample variance (n-1) of this set is 4.571428... + let var = s.variance().unwrap(); + assert!((var - 32.0 / 7.0).abs() < 1e-9, "variance was {var}"); + } + + #[test] + fn confidence_interval_brackets_mean() { + let mut s = SelectivityStats::default(); + for &x in &[10.0, 12.0, 8.0, 11.0, 9.0] { + s.record(1, 1, 1, x); + } + let mean = s.effectiveness().unwrap(); + let lo = s.confidence_lower_bound(2.0).unwrap(); + let hi = s.confidence_upper_bound(2.0).unwrap(); + assert!(lo < mean && mean < hi, "lo={lo} mean={mean} hi={hi}"); + // symmetric around the mean + assert!(((hi - mean) - (mean - lo)).abs() < 1e-9); + } + + #[test] + fn ci_needs_two_samples() { + let mut s = SelectivityStats::default(); + s.record(1, 1, 1, 5.0); + assert_eq!(s.confidence_lower_bound(2.0), None); + assert_eq!(s.confidence_upper_bound(2.0), None); + } + + #[test] + fn empty_or_zero_time_batches_are_not_samples() { + let mut s = SelectivityStats::default(); + s.record(0, 0, 1000, 5.0); // no rows + s.record(5, 10, 0, 5.0); // no time + s.record(5, 10, 100, f64::NAN); // non-finite sample + assert_eq!(s.sample_count(), 0); + // but raw counters still moved where rows/time were present + assert_eq!(s.pass_rate(), Some(0.5)); // from the latter two row-bearing batches + } + + #[test] + fn reset_preserves_optional_flag() { + let mut s = SelectivityStats::new(true); + s.record(2, 10, 1000, 7.0); + s.reset(); + assert_eq!(s.sample_count(), 0); + assert_eq!(s.pass_rate(), None); + assert!(s.is_optional()); + } +} diff --git a/datafusion/physical-expr-common/src/lib.rs b/datafusion/physical-expr-common/src/lib.rs index b6eaacdca2505..bc53abc993a0c 100644 --- a/datafusion/physical-expr-common/src/lib.rs +++ b/datafusion/physical-expr-common/src/lib.rs @@ -30,6 +30,7 @@ //! //! [DataFusion]: +pub mod adaptive; pub mod binary_map; pub mod binary_view_map; pub mod datum; diff --git a/datafusion/physical-plan/src/adaptive_filter.rs b/datafusion/physical-plan/src/adaptive_filter.rs new file mode 100644 index 0000000000000..f92430b3eb99a --- /dev/null +++ b/datafusion/physical-plan/src/adaptive_filter.rs @@ -0,0 +1,671 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Runtime-adaptive evaluation of a conjunctive (`AND`) predicate for +//! [`FilterExec`](crate::filter::FilterExec). +//! +//! This is the *ordering* policy that sits on top of the shared measurement +//! substrate in +//! [`datafusion_physical_expr_common::adaptive`]. The substrate +//! ([`AdaptiveStatsRegistry`]) measures per-conjunct selectivity and cost; this +//! module decides, from those measurements, what order to evaluate the +//! conjuncts in. +//! +//! ## How it evaluates +//! +//! The conjuncts are evaluated sequentially, combining their boolean results +//! with `AND`. The working batch is physically compacted to the surviving rows +//! only once the accumulated mask becomes selective enough (the same +//! pre-selection gate [`BinaryExpr`]'s `AND` short-circuit uses) — so a run of +//! non-selective conjuncts costs only cheap bitwise `AND`s, while a selective +//! conjunct shrinks the batch the expensive conjuncts after it must decode. +//! Gating is what makes ordering matter: a conjunct that compacts early saves +//! every later conjunct work, so the cheap-and-selective ones should run first. +//! Each conjunct is timed and counted on exactly the rows it evaluated, giving +//! the *marginal* selectivity and cost on the current working population. +//! +//! ## How it reorders +//! +//! Every conjunct accrues a per-batch *effectiveness* sample of **rows +//! discarded per second** (`(total - matched) * 1e9 / eval_nanos`). Maximising +//! discards-per-second is exactly minimising `cost_per_row / (1 - pass_rate)`, +//! the classic optimal ordering key for independent conjuncts — so an expensive +//! but very selective predicate (e.g. a `LIKE` that keeps one row) correctly +//! sorts ahead of a cheap but unselective one. +//! +//! Conjuncts are ranked by mean effectiveness, and the order is committed only +//! once it is *statistically certain* (see below), so per-batch noise never +//! thrashes the order. +//! +//! The evaluator *freezes* as soon as the ranking is statistically certain — +//! every adjacent pair of conjuncts has non-overlapping effectiveness +//! confidence intervals — rather than after a fixed number of batches, so a +//! clear winner is locked in within a handful of batches. (If the conjuncts +//! turn out indistinguishable, a small sample cap freezes anyway, since their +//! order does not matter.) When frozen, the conjuncts are fused into a single +//! left-deep `AND` in the learned order and evaluated as an ordinary +//! predicate — no measurement, so the steady state costs what a normal +//! `FilterExec` predicate would and inherits `BinaryExpr`'s own pre-selection. +//! +//! To stay correct under distribution drift, a frozen evaluator periodically +//! *re-thaws*: it re-measures a short window and re-decides. Each re-thaw that +//! confirms the order backs the next one off exponentially, so a stable filter +//! is re-checked geometrically less often and steady-state overhead decays +//! toward zero; a changed order resets the interval so drift is caught quickly. +//! +//! ## Known limitation +//! +//! The measured selectivity of a conjunct is *conditional* on the conjuncts +//! ordered before it (it only sees their survivors). This greedy hill-climb +//! observes only the current arrangement, so with strongly correlated +//! predicates it can settle into a local optimum. A proper exploration phase +//! (measuring each conjunct's marginal selectivity on a common population) is +//! future work. +//! +//! [`BinaryExpr`]: datafusion_physical_expr::expressions::BinaryExpr + +use std::sync::Arc; + +use arrow::array::{Array, ArrayRef, BooleanArray, BooleanBufferBuilder, UInt32Array}; +use arrow::compute::kernels::boolean::and; +use arrow::compute::{filter, filter_record_batch, prep_null_mask_filter}; +use arrow::record_batch::RecordBatch; +use datafusion_common::Result; +use datafusion_common::cast::as_boolean_array; +use datafusion_common::instant::Instant; +use datafusion_expr::Operator; +use datafusion_physical_expr::PhysicalExpr; +use datafusion_physical_expr::expressions::BinaryExpr; +use datafusion_physical_expr::utils::split_conjunction; +use datafusion_physical_expr_common::adaptive::SelectivityStats; +use datafusion_physical_expr_common::physical_expr::is_volatile; + +/// Confidence multiplier for the one-sided interval on effectiveness +/// (~97.5% one-sided at 2.0). +const CONFIDENCE_Z: f64 = 2.0; +/// Minimum per-conjunct samples before the confidence intervals are trusted +/// enough to make a freeze decision. +const MIN_SAMPLES_FOR_CI: u64 = 4; +/// Freeze after this many learning samples even if the order's confidence +/// intervals still overlap: if the conjuncts cannot be told apart by now, their +/// relative order does not matter, so stop paying to measure it. +const MAX_LEARNING_SAMPLES: u64 = 64; +/// Batches the fast frozen path runs before the first re-thaw check. +const INITIAL_THAW_INTERVAL: u64 = 64; +/// Each re-thaw that confirms the order is unchanged multiplies the next +/// interval by this factor (exponential backoff), so a stable filter is +/// re-checked geometrically less often and steady-state overhead decays to ~0. +const THAW_BACKOFF: u64 = 4; +/// Upper bound on the re-thaw interval. +const MAX_THAW_INTERVAL: u64 = 16_384; +/// Batches measured during a re-thaw before re-deciding the order. +const REMEASURE_WINDOW: u64 = 16; +/// Physically compact the working batch to the surviving rows only when the +/// accumulated mask keeps at most this fraction of them. Above this, the cost +/// of materializing a barely-smaller batch is not repaid, so we keep evaluating +/// against the full working batch and just AND the boolean masks — mirroring +/// the pre-selection gate in `BinaryExpr`'s `AND` short-circuit. +const COMPACTION_SELECTIVITY_THRESHOLD: f64 = 0.2; + +/// Lifecycle of the adaptive evaluator. +#[derive(Debug)] +enum Phase { + /// Measuring every batch, building confidence in the conjunct order. + Learning, + /// Order settled and fused into a single predicate; evaluated as a plain + /// predicate (no measurement) until `thaw_at` batches have been processed. + Frozen { + expr: Arc, + /// Batch count at which to re-measure. + thaw_at: u64, + /// Interval that produced `thaw_at`; grows on each confirmation. + interval: u64, + }, + /// Briefly measuring again after a thaw, to detect distribution drift. + Remeasuring { + /// Batch count at which the re-measurement window ends. + until: u64, + /// Order in effect before this thaw, to detect whether it changed. + prev_order: Vec, + /// The frozen interval before this thaw (for backoff bookkeeping). + interval: u64, + }, +} + +/// Adaptive evaluator for a single conjunctive predicate. +/// +/// Owned per partition stream (single-threaded), so all state — order, stats, +/// and lifecycle [`Phase`] — is held directly and mutated through `&mut self` +/// with no locking. The stats are a plain `Vec` indexed by conjunct id (ids are +/// dense `0..n`), so the per-batch hot path is a direct index, not a locked map +/// lookup. The multi-threaded [`AdaptiveStatsRegistry`] in the substrate is for +/// a shared consumer (e.g. the parquet scan); a single-threaded stream does not +/// need it. +/// +/// [`AdaptiveStatsRegistry`]: datafusion_physical_expr_common::adaptive::AdaptiveStatsRegistry +#[derive(Debug)] +pub(crate) struct AdaptiveConjunction { + /// The conjuncts. Stats/order indices refer to positions in this `Vec`. + conjuncts: Vec>, + /// Per-conjunct online selectivity + cost, indexed by conjunct id. + stats: Vec, + /// Current evaluation order: indices into [`conjuncts`](Self::conjuncts). + order: Vec, + /// Total batches processed; drives the re-thaw schedule. + batches: u64, + /// Current lifecycle phase. + phase: Phase, +} + +impl AdaptiveConjunction { + /// Build an adaptive evaluator for `predicate`, or `None` if adaptive + /// reordering does not apply: + /// + /// - `enabled` is false (the config flag is off); + /// - the predicate has fewer than two `AND` conjuncts (nothing to reorder); + /// - any conjunct is volatile (reordering could change results). + pub(crate) fn try_new( + predicate: &Arc, + enabled: bool, + ) -> Option { + if !enabled { + return None; + } + let conjuncts: Vec> = split_conjunction(predicate) + .into_iter() + .map(Arc::clone) + .collect(); + if conjuncts.len() < 2 { + return None; + } + if conjuncts.iter().any(is_volatile) { + return None; + } + + let stats = vec![SelectivityStats::new(false); conjuncts.len()]; + let order = (0..conjuncts.len()).collect(); + + Some(Self { + conjuncts, + stats, + order, + batches: 0, + phase: Phase::Learning, + }) + } + + /// Evaluate the conjunction against `batch`, returning the boolean mask + /// (over the batch's original rows) of rows that passed every conjunct. + /// + /// While [`Learning`](Phase::Learning) or [`Remeasuring`](Phase::Remeasuring) + /// the conjuncts are evaluated and measured individually (see + /// [`evaluate_measured`](Self::evaluate_measured)); once + /// [`Frozen`](Phase::Frozen) a fused predicate is evaluated directly with no + /// measurement. The phase advances after each batch in + /// [`update_phase`](Self::update_phase). + pub(crate) fn evaluate(&mut self, batch: &RecordBatch) -> Result { + self.batches += 1; + + // Frozen fast path; when the interval elapses, drop into a fresh + // measurement window to check whether the data has drifted. + if let Phase::Frozen { + expr, + thaw_at, + interval, + } = &self.phase + { + if self.batches < *thaw_at { + return expr.evaluate(batch)?.into_array(batch.num_rows()); + } + let interval = *interval; + let prev_order = self.order.clone(); + self.stats.iter_mut().for_each(SelectivityStats::reset); + self.phase = Phase::Remeasuring { + until: self.batches + REMEASURE_WINDOW, + prev_order, + interval, + }; + } + + let result = self.evaluate_measured(batch)?; + self.update_phase(); + Ok(result) + } + + /// Evaluate the conjuncts in the current order, measuring each, and return + /// the boolean mask over the batch's original rows. + /// + /// The working batch is physically compacted to the surviving rows only once + /// the accumulated mask becomes selective enough (see + /// [`COMPACTION_SELECTIVITY_THRESHOLD`]); until then masks are combined with + /// a cheap bitwise `AND`, so a run of non-selective conjuncts pays no + /// materialization cost. Each conjunct is measured on the rows it actually + /// evaluated (its *marginal* selectivity and cost on the current working + /// population). + fn evaluate_measured(&mut self, batch: &RecordBatch) -> Result { + let num_rows = batch.num_rows(); + if num_rows == 0 { + return Ok(Arc::new(BooleanArray::from(Vec::::new()))); + } + + // `working` is the batch conjuncts are evaluated against; `live` maps + // each of its rows back to the original row index. `acc` is the + // accumulated (`AND`-combined) result over `working`'s rows since the + // last compaction — kept null-free so `acc.true_count()` is the live + // count and the final scatter is exact. + let mut working = batch.clone(); + let mut live: ArrayRef = + Arc::new(UInt32Array::from_iter_values(0..num_rows as u32)); + let mut acc = BooleanArray::from(vec![true; num_rows]); + + for k in 0..self.order.len() { + let id = self.order[k]; + let rows_in = working.num_rows(); + + let timer = Instant::now(); + let array = self.conjuncts[id].evaluate(&working)?.into_array(rows_in)?; + let eval_nanos = timer.elapsed().as_nanos() as u64; + let mask = as_boolean_array(&array)?; + + // `matched` counts non-null trues (SQL filter semantics). + let matched = mask.true_count() as u64; + let discarded = rows_in as u64 - matched; + let sample = if eval_nanos > 0 { + discarded as f64 * 1e9 / eval_nanos as f64 + } else { + 0.0 + }; + self.stats[id].record(matched, rows_in as u64, eval_nanos, sample); + + // Fold this conjunct into the accumulated mask (null -> false). + let normalized; + let mask = if mask.null_count() > 0 { + normalized = prep_null_mask_filter(mask); + &normalized + } else { + mask + }; + acc = and(&acc, mask)?; + + let alive = acc.true_count(); + if alive == 0 { + break; + } + // Compact only when the survivors are a small fraction of the + // working batch — otherwise the copy is not worth it. + if (alive as f64) <= COMPACTION_SELECTIVITY_THRESHOLD * rows_in as f64 { + working = filter_record_batch(&working, &acc)?; + live = filter(&live, &acc)?; + acc = BooleanArray::from(vec![true; alive]); + } + } + + // Scatter the surviving original indices (`live` masked by `acc`) into a + // full-length mask over the original batch. + let live = filter(&live, &acc)?; + let live = live + .as_any() + .downcast_ref::() + .expect("u32 live"); + let mut builder = BooleanBufferBuilder::new(num_rows); + builder.append_n(num_rows, false); + for &idx in live.values() { + builder.set_bit(idx as usize, true); + } + Ok(Arc::new(BooleanArray::new(builder.finish(), None))) + } + + /// Advance the lifecycle phase after measuring a batch: + /// - [`Learning`](Phase::Learning) freezes once the order is statistically + /// settled (see [`settled_order`](Self::settled_order)); + /// - [`Remeasuring`](Phase::Remeasuring) re-freezes when its window ends, + /// keeping the new order and either backing off the next interval (order + /// unchanged) or resetting it (drift detected). + fn update_phase(&mut self) { + match std::mem::replace(&mut self.phase, Phase::Learning) { + Phase::Learning => { + self.phase = match self.settled_order() { + Some(order) => { + self.order = order; + self.freeze(INITIAL_THAW_INTERVAL) + } + None => Phase::Learning, + }; + } + Phase::Remeasuring { + until, + prev_order, + interval, + } => { + self.phase = if self.batches >= until { + let new_order = self.rank_by_effectiveness(); + let next = if new_order == prev_order { + interval.saturating_mul(THAW_BACKOFF).min(MAX_THAW_INTERVAL) + } else { + INITIAL_THAW_INTERVAL + }; + self.order = new_order; + self.freeze(next) + } else { + Phase::Remeasuring { + until, + prev_order, + interval, + } + }; + } + // Not reachable: a frozen evaluator returns early or has already + // transitioned to `Remeasuring` before measuring. Restore it. + frozen => self.phase = frozen, + } + } + + /// Build a [`Frozen`](Phase::Frozen) phase for the current order, due to + /// re-measure after `interval` more batches. + fn freeze(&self, interval: u64) -> Phase { + Phase::Frozen { + expr: self.build_fused(), + thaw_at: self.batches + interval, + interval, + } + } + + /// During learning, decide whether the order has settled enough to freeze. + /// Returns the order to freeze, or `None` to keep learning. + /// + /// We freeze once we are statistically sure of the ranking (adjacent + /// confidence intervals do not overlap), or once enough samples have accrued + /// that more measurement would not resolve the order — meaning the conjuncts + /// are effectively indistinguishable and their order does not matter. + fn settled_order(&self) -> Option> { + let min_samples = self + .stats + .iter() + .map(SelectivityStats::sample_count) + .min() + .unwrap_or(0); + if min_samples < MIN_SAMPLES_FOR_CI { + return None; + } + let order = self.rank_by_effectiveness(); + if self.order_is_certain(&order) || min_samples >= MAX_LEARNING_SAMPLES { + Some(order) + } else { + None + } + } + + /// Rank conjunct ids by mean effectiveness (discards-per-second) descending; + /// ids without samples sort last. Stable, so equal ids keep ascending order. + fn rank_by_effectiveness(&self) -> Vec { + let mut ids: Vec = (0..self.conjuncts.len()).collect(); + ids.sort_by(|&a, &b| { + match (self.stats[a].effectiveness(), self.stats[b].effectiveness()) { + (Some(x), Some(y)) => { + y.partial_cmp(&x).unwrap_or(std::cmp::Ordering::Equal) + } + (Some(_), None) => std::cmp::Ordering::Less, + (None, Some(_)) => std::cmp::Ordering::Greater, + (None, None) => std::cmp::Ordering::Equal, + } + }); + ids + } + + /// Whether every adjacent pair in `order` has non-overlapping one-sided + /// effectiveness confidence intervals — i.e. the ranking is statistically + /// certain. + fn order_is_certain(&self, order: &[usize]) -> bool { + order.windows(2).all(|w| { + match ( + self.stats[w[0]].confidence_lower_bound(CONFIDENCE_Z), + self.stats[w[1]].confidence_upper_bound(CONFIDENCE_Z), + ) { + (Some(lo), Some(up)) => lo >= up, + _ => false, + } + }) + } + + /// Fuse the conjuncts into a single left-deep `AND` in the learned order, so + /// the frozen steady state evaluates as a normal predicate. + fn build_fused(&self) -> Arc { + let mut it = self.order.iter().map(|&i| Arc::clone(&self.conjuncts[i])); + let first = it.next().expect("at least two conjuncts"); + it.fold(first, |acc, e| { + Arc::new(BinaryExpr::new(acc, Operator::And, e)) as Arc + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use arrow::array::Int32Array; + use arrow::datatypes::{DataType, Field, Schema}; + use datafusion_expr::Operator; + use datafusion_physical_expr::expressions::{binary, col, lit}; + + fn test_batch(schema: &Arc, a: Vec, b: Vec) -> RecordBatch { + RecordBatch::try_new( + Arc::clone(schema), + vec![Arc::new(Int32Array::from(a)), Arc::new(Int32Array::from(b))], + ) + .unwrap() + } + + fn schema() -> Arc { + Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + ])) + } + + /// `a > 2 AND b < 5` + fn predicate(schema: &Arc) -> Arc { + let left = + binary(col("a", schema).unwrap(), Operator::Gt, lit(2i32), schema).unwrap(); + let right = + binary(col("b", schema).unwrap(), Operator::Lt, lit(5i32), schema).unwrap(); + binary(left, Operator::And, right, schema).unwrap() + } + + fn passing_rows(mask: &ArrayRef) -> Vec { + let mask = as_boolean_array(mask).unwrap(); + (0..mask.len()).filter(|&i| mask.value(i)).collect() + } + + #[test] + fn single_conjunct_is_not_adaptive() { + let schema = schema(); + let p = + binary(col("a", &schema).unwrap(), Operator::Gt, lit(2i32), &schema).unwrap(); + assert!(AdaptiveConjunction::try_new(&p, true).is_none()); + } + + #[test] + fn disabled_is_none() { + let schema = schema(); + assert!(AdaptiveConjunction::try_new(&predicate(&schema), false).is_none()); + } + + #[test] + fn matches_plain_conjunction_evaluation() { + let schema = schema(); + let p = predicate(&schema); + let mut adaptive = AdaptiveConjunction::try_new(&p, true).unwrap(); + + let batch = test_batch( + &schema, + vec![1, 3, 5, 2, 4], // a + vec![9, 4, 6, 1, 0], // b + ); + // a > 2 AND b < 5: rows where a in {3,5,2,4} AND b in {<5} + // idx0 a=1 -> false + // idx1 a=3,b=4 -> true + // idx2 a=5,b=6 -> false (b) + // idx3 a=2 -> false (a) + // idx4 a=4,b=0 -> true + let mask = adaptive.evaluate(&batch).unwrap(); + assert_eq!(passing_rows(&mask), vec![1, 4]); + + // Result is independent of the internal order: force a reordering and + // re-check on the same data. + adaptive.order = vec![1, 0]; + let mask = adaptive.evaluate(&batch).unwrap(); + assert_eq!(passing_rows(&mask), vec![1, 4]); + } + + fn frozen_interval(adaptive: &AdaptiveConjunction) -> Option { + match &adaptive.phase { + Phase::Frozen { interval, .. } => Some(*interval), + _ => None, + } + } + + /// Run batches until frozen (panics if it never freezes within a bound). + fn run_until_frozen(adaptive: &mut AdaptiveConjunction, batch: &RecordBatch) { + for _ in 0..MAX_LEARNING_SAMPLES + 5 { + adaptive.evaluate(batch).unwrap(); + if matches!(adaptive.phase, Phase::Frozen { .. }) { + return; + } + } + panic!("did not freeze"); + } + + /// b<5 discards almost everything, a>2 discards nothing, so the + /// effectiveness CIs separate fast and we freeze on certainty. + fn selective_b_batch(schema: &Arc) -> RecordBatch { + let a: Vec = (0..1000).map(|_| 100).collect(); + let b: Vec = (0..1000).map(|i| if i == 0 { 1 } else { 100 }).collect(); + test_batch(schema, a, b) + } + + #[test] + fn freezes_on_certainty_and_stops_measuring() { + let schema = schema(); + let mut adaptive = + AdaptiveConjunction::try_new(&predicate(&schema), true).unwrap(); + let batch = selective_b_batch(&schema); + + run_until_frozen(&mut adaptive, &batch); + assert_eq!(adaptive.order.first().copied(), Some(1)); + + // Once frozen, further batches (within the thaw interval) do not record. + let before: u64 = adaptive.stats.iter().map(|s| s.sample_count()).sum(); + for _ in 0..10 { + let mask = adaptive.evaluate(&batch).unwrap(); + assert_eq!(passing_rows(&mask), vec![0]); // ...and stay correct + } + let after: u64 = adaptive.stats.iter().map(|s| s.sample_count()).sum(); + assert_eq!(before, after, "frozen evaluator must not keep measuring"); + } + + #[test] + fn rethaw_backs_off_when_order_is_stable() { + let schema = schema(); + let mut adaptive = + AdaptiveConjunction::try_new(&predicate(&schema), true).unwrap(); + let batch = selective_b_batch(&schema); + + run_until_frozen(&mut adaptive, &batch); + let interval1 = frozen_interval(&adaptive).unwrap(); + assert_eq!(interval1, INITIAL_THAW_INTERVAL); + + // Cross the thaw point and the re-measurement window; same data, so the + // order is reconfirmed and the next interval backs off. + for _ in 0..interval1 + REMEASURE_WINDOW + 2 { + adaptive.evaluate(&batch).unwrap(); + } + let interval2 = frozen_interval(&adaptive).unwrap(); + assert_eq!(interval2, interval1 * THAW_BACKOFF); + assert_eq!(adaptive.order.first().copied(), Some(1)); + } + + #[test] + fn rethaw_adapts_to_drift() { + let schema = schema(); + let mut adaptive = + AdaptiveConjunction::try_new(&predicate(&schema), true).unwrap(); + + // Freeze with b<5 as the selective conjunct (id 1 leads). + run_until_frozen(&mut adaptive, &selective_b_batch(&schema)); + assert_eq!(adaptive.order.first().copied(), Some(1)); + + // Drift: now a>2 is the selective one (only row 0), b<5 is always true. + let a: Vec = (0..1000).map(|i| if i == 0 { 100 } else { 0 }).collect(); + let b: Vec = (0..1000).map(|_| 0).collect(); + let drift = test_batch(&schema, a, b); + + for _ in 0..INITIAL_THAW_INTERVAL + REMEASURE_WINDOW + 2 { + let mask = adaptive.evaluate(&drift).unwrap(); + assert_eq!(passing_rows(&mask), vec![0]); // a>2 AND b<5 -> row 0 only + } + assert_eq!( + adaptive.order.first().copied(), + Some(0), + "re-thaw should adapt the order to the new distribution" + ); + } + + #[test] + fn non_selective_conjuncts_never_compact_but_are_correct() { + // Both conjuncts keep well over the compaction threshold, so the + // working batch is never compacted and the result is produced purely + // by AND-combining masks. Result must still be exact. + let schema = schema(); + let mut adaptive = + AdaptiveConjunction::try_new(&predicate(&schema), true).unwrap(); + // a > 2 keeps 8/10; b < 5 keeps 7/10 — neither is <= 20%. + let a = vec![5, 6, 7, 8, 9, 10, 11, 12, 1, 2]; // last two fail a>2 + let b = vec![0, 1, 2, 3, 4, 9, 9, 9, 0, 0]; // idx5..7 fail b<5 + let batch = test_batch(&schema, a, b); + // a>2 AND b<5: idx0..4 pass both; idx5..7 fail b; idx8..9 fail a. + let mask = adaptive.evaluate(&batch).unwrap(); + assert_eq!(passing_rows(&mask), vec![0, 1, 2, 3, 4]); + } + + #[test] + fn empty_batch() { + let schema = schema(); + let mut adaptive = + AdaptiveConjunction::try_new(&predicate(&schema), true).unwrap(); + let batch = test_batch(&schema, vec![], vec![]); + let mask = adaptive.evaluate(&batch).unwrap(); + assert_eq!(as_boolean_array(&mask).unwrap().len(), 0); + } + + #[test] + fn reorders_selective_conjunct_first() { + let schema = schema(); + let p = predicate(&schema); // [a>2, b<5] + let mut adaptive = AdaptiveConjunction::try_new(&p, true).unwrap(); + + // Conjunct 1 (b < 5) is far more selective than conjunct 0 (a > 2): + // a is always > 2 (never discards), b is almost always >= 5 (discards + // ~everything). Discards-per-second is highest for conjunct 1, so it + // should be promoted ahead of conjunct 0. + let batch = selective_b_batch(&schema); + run_until_frozen(&mut adaptive, &batch); + assert_eq!(adaptive.order.first().copied(), Some(1)); + // Result is correct regardless of order. + let mask = adaptive.evaluate(&batch).unwrap(); + assert_eq!(passing_rows(&mask), vec![0]); + } +} diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index b3b107dc580df..6d2223f3d4327 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -28,6 +28,7 @@ use super::{ ColumnStatistics, DisplayAs, ExecutionPlanProperties, PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics, }; +use crate::adaptive_filter::AdaptiveConjunction; use crate::check_if_same_properties; use crate::coalesce::{LimitedBatchCoalescer, PushBatchStatus}; use crate::common::can_project; @@ -549,9 +550,18 @@ impl ExecutionPlan for FilterExec { context.task_id() ); let metrics = FilterExecMetrics::new(&self.metrics, partition); + let adaptive = AdaptiveConjunction::try_new( + &self.predicate, + context + .session_config() + .options() + .execution + .adaptive_filter_reordering, + ); Ok(Box::pin(FilterExecStream { schema: self.schema(), predicate: Arc::clone(&self.predicate), + adaptive, input: self.input.execute(partition, context)?, metrics, projection: self.projection.clone(), @@ -935,6 +945,9 @@ struct FilterExecStream { schema: SchemaRef, /// The expression to filter on. This expression must evaluate to a boolean value. predicate: Arc, + /// When set, the predicate is a reorderable conjunction and is evaluated + /// adaptively (per-conjunct, in measured order) instead of via `predicate`. + adaptive: Option, /// The input partition to filter. input: SendableRecordBatchStream, /// Runtime metrics recording @@ -1030,9 +1043,15 @@ impl Stream for FilterExecStream { } Some(Ok(batch)) => { let timer = elapsed_compute.timer(); - let status = self.predicate.as_ref() - .evaluate(&batch) - .and_then(|v| v.into_array(batch.num_rows())) + let array = match self.adaptive.as_mut() { + Some(adaptive) => adaptive.evaluate(&batch), + None => self + .predicate + .as_ref() + .evaluate(&batch) + .and_then(|v| v.into_array(batch.num_rows())), + }; + let status = array .and_then(|array| { Ok(match self.projection.as_ref() { Some(projection) => { diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index c7b1d4729e21d..27dcdc040bcbb 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -60,6 +60,7 @@ mod render_tree; mod topk; mod visitor; +mod adaptive_filter; pub mod aggregates; pub mod analyze; pub mod async_func; diff --git a/datafusion/sqllogictest/test_files/adaptive_filter.slt b/datafusion/sqllogictest/test_files/adaptive_filter.slt new file mode 100644 index 0000000000000..b0e2d958a3be9 --- /dev/null +++ b/datafusion/sqllogictest/test_files/adaptive_filter.slt @@ -0,0 +1,80 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Tests for execution.adaptive_filter_reordering. Runtime reordering of a +# conjunction must never change query results, only evaluation order. + +statement ok +CREATE TABLE t AS +SELECT + i AS a, + i % 7 AS b, + arrow_cast(i, 'Utf8') AS s +FROM generate_series(1, 1000) AS tbl(i); + +# Baseline (flag off): multi-conjunct filter mixing a cheap comparison with an +# expensive LIKE. +query I +SELECT count(*) FROM t WHERE b = 3 AND s LIKE '1%'; +---- +17 + +statement ok +SET datafusion.execution.adaptive_filter_reordering = true; + +# Same query with adaptive reordering on must return the same result. +query I +SELECT count(*) FROM t WHERE b = 3 AND s LIKE '1%'; +---- +17 + +# A three-conjunct predicate, including an expensive-but-selective LIKE. +query I +SELECT count(*) FROM t WHERE a > 100 AND s LIKE '5%' AND b <> 0; +---- +86 + +# Full row materialization (not just count) is unchanged by reordering. +query IIT +SELECT a, b, s FROM t WHERE b = 1 AND a > 990 AND s LIKE '99%' ORDER BY a; +---- +995 1 995 + +# EXPLAIN: runtime reordering is invisible to the plan — the FilterExec +# predicate is unchanged whether the flag is on or off. +query TT +EXPLAIN SELECT count(*) FROM t WHERE b = 3 AND s LIKE '1%'; +---- +logical_plan +01)Projection: count(Int64(1)) AS count(*) +02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] +03)----Projection: +04)------Filter: t.b = Int64(3) AND t.s LIKE Utf8("1%") +05)--------TableScan: t projection=[b, s] +physical_plan +01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] +02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] +03)----CoalescePartitionsExec +04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] +05)--------FilterExec: b@0 = 3 AND s@1 LIKE 1%, projection=[] +06)----------DataSourceExec: partitions=4, partition_sizes=[1, 0, 0, 0] + +statement ok +SET datafusion.execution.adaptive_filter_reordering = false; + +statement ok +DROP TABLE t; diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 387ef2262e1cf..45fe2a746d29c 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -214,6 +214,7 @@ datafusion.catalog.has_header true datafusion.catalog.information_schema true datafusion.catalog.location NULL datafusion.catalog.newlines_in_values false +datafusion.execution.adaptive_filter_reordering false datafusion.execution.batch_size 8192 datafusion.execution.coalesce_batches true datafusion.execution.collect_statistics true @@ -365,6 +366,7 @@ datafusion.catalog.has_header true Default value for `format.has_header` for `CR datafusion.catalog.information_schema true Should DataFusion provide access to `information_schema` virtual tables for displaying schema information datafusion.catalog.location NULL Location scanned to load tables for `default` schema datafusion.catalog.newlines_in_values false Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance. +datafusion.execution.adaptive_filter_reordering false (experimental) When enabled, `FilterExec` adaptively reorders the conjuncts of a conjunctive predicate at runtime. It measures each conjunct's selectivity and evaluation cost on the rows that reach it and runs the conjuncts that discard the most rows per unit of CPU time first, so cheap-and-selective predicates gate expensive ones. Reordering never changes query results (only the evaluation order of a conjunction) but can change observable side effects of fallible predicates, so it is off by default. Predicates containing volatile expressions are never reordered. datafusion.execution.batch_size 8192 Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption datafusion.execution.coalesce_batches true When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting datafusion.execution.collect_statistics true Should DataFusion collect statistics when first creating a table. Has no effect after the table is created. Applies to the default `ListingTableProvider` in DataFusion. Defaults to true. diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index e0e2a5d21c8fd..b0bbaf91b2116 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -134,6 +134,7 @@ The following configuration settings are available: | datafusion.execution.skip_partial_aggregation_probe_rows_threshold | 100000 | Number of input rows partial aggregation partition should process, before aggregation ratio check and trying to switch to skipping aggregation mode | | datafusion.execution.use_row_number_estimates_to_optimize_partitioning | false | Should DataFusion use row number estimates at the input to decide whether increasing parallelism is beneficial or not. By default, only exact row numbers (not estimates) are used for this decision. Setting this flag to `true` will likely produce better plans. if the source of statistics is accurate. We plan to make this the default in the future. | | datafusion.execution.enforce_batch_size_in_joins | false | Should DataFusion enforce batch size in joins or not. By default, DataFusion will not enforce batch size in joins. Enforcing batch size in joins can reduce memory usage when joining large tables with a highly-selective join filter, but is also slightly slower. | +| datafusion.execution.adaptive_filter_reordering | false | (experimental) When enabled, `FilterExec` adaptively reorders the conjuncts of a conjunctive predicate at runtime. It measures each conjunct's selectivity and evaluation cost on the rows that reach it and runs the conjuncts that discard the most rows per unit of CPU time first, so cheap-and-selective predicates gate expensive ones. Reordering never changes query results (only the evaluation order of a conjunction) but can change observable side effects of fallible predicates, so it is off by default. Predicates containing volatile expressions are never reordered. | | datafusion.execution.objectstore_writer_buffer_size | 10485760 | Size (bytes) of data buffer DataFusion uses when writing output files. This affects the size of the data chunks that are uploaded to remote object stores (e.g. AWS S3). If very large (>= 100 GiB) output files are being written, it may be necessary to increase this size to avoid errors from the remote end point. | | datafusion.execution.enable_ansi_mode | false | Whether to enable ANSI SQL mode. The flag is experimental and relevant only for DataFusion Spark built-in functions When `enable_ansi_mode` is set to `true`, the query engine follows ANSI SQL semantics for expressions, casting, and error handling. This means: - **Strict type coercion rules:** implicit casts between incompatible types are disallowed. - **Standard SQL arithmetic behavior:** operations such as division by zero, numeric overflow, or invalid casts raise runtime errors rather than returning `NULL` or adjusted values. - **Consistent ANSI behavior** for string concatenation, comparisons, and `NULL` handling. When `enable_ansi_mode` is `false` (the default), the engine uses a more permissive, non-ANSI mode designed for user convenience and backward compatibility. In this mode: - Implicit casts between types are allowed (e.g., string to integer when possible). - Arithmetic operations are more lenient — for example, `abs()` on the minimum representable integer value returns the input value instead of raising overflow. - Division by zero or invalid casts may return `NULL` instead of failing. # Default `false` — ANSI SQL mode is disabled by default. | | datafusion.execution.hash_join_buffering_capacity | 0 | How many bytes to buffer in the probe side of hash joins while the build side is concurrently being built. Without this, hash joins will wait until the full materialization of the build side before polling the probe side. This is useful in scenarios where the query is not completely CPU bounded, allowing to do some early work concurrently and reducing the latency of the query. Note that when hash join buffering is enabled, the probe side will start eagerly polling data, not giving time for the producer side of dynamic filters to produce any meaningful predicate. Queries with dynamic filters might see performance degradation. Disabled by default, set to a number greater than 0 for enabling it. |