diff --git a/benchmarks/queries/push_down_topk/q1.sql b/benchmarks/queries/push_down_topk/q1.sql new file mode 100644 index 0000000000000..3a2b796f3d70a --- /dev/null +++ b/benchmarks/queries/push_down_topk/q1.sql @@ -0,0 +1,8 @@ +-- LEFT JOIN, ORDER BY column from preserved (left) side, small LIMIT. +-- Canonical case for push_down_topk_through_join: the Sort(fetch=10) is +-- duplicated below the join over the customer scan, so only the top 10 +-- rows (by c_acctbal) are joined against orders. +SELECT c_custkey, c_acctbal +FROM customer LEFT JOIN orders ON c_custkey = o_custkey +ORDER BY c_acctbal +LIMIT 10 \ No newline at end of file diff --git a/benchmarks/queries/push_down_topk/q2.sql b/benchmarks/queries/push_down_topk/q2.sql new file mode 100644 index 0000000000000..1675babcb93cb --- /dev/null +++ b/benchmarks/queries/push_down_topk/q2.sql @@ -0,0 +1,7 @@ +-- RIGHT JOIN, ORDER BY column from preserved (right) side. +-- Symmetric to q1: the Sort(fetch) is pushed below the join over the +-- orders scan (the right/preserved side). +SELECT o_orderkey, o_totalprice +FROM customer RIGHT JOIN orders ON c_custkey = o_custkey +ORDER BY o_totalprice +LIMIT 10 \ No newline at end of file diff --git a/benchmarks/queries/push_down_topk/q3.sql b/benchmarks/queries/push_down_topk/q3.sql new file mode 100644 index 0000000000000..4f53d8ca91ee8 --- /dev/null +++ b/benchmarks/queries/push_down_topk/q3.sql @@ -0,0 +1,7 @@ +-- LEFT JOIN, multi-column ORDER BY (both columns from preserved side). +-- All sort exprs must come from the preserved side for the rule to fire; +-- this query checks that multi-column sorts are still pushed. 
+SELECT c_custkey, c_acctbal, c_nationkey +FROM customer LEFT JOIN orders ON c_custkey = o_custkey +ORDER BY c_acctbal, c_nationkey +LIMIT 100 \ No newline at end of file diff --git a/benchmarks/queries/push_down_topk/q4.sql b/benchmarks/queries/push_down_topk/q4.sql new file mode 100644 index 0000000000000..c21604b678d50 --- /dev/null +++ b/benchmarks/queries/push_down_topk/q4.sql @@ -0,0 +1,7 @@ +-- CROSS JOIN, ORDER BY column from one side. +-- Cross joins preserve every row from both sides; the rule pushes the +-- Sort(fetch) below the join over the side referenced by ORDER BY. +SELECT c_custkey, c_acctbal +FROM customer CROSS JOIN nation +ORDER BY c_acctbal +LIMIT 10 \ No newline at end of file diff --git a/benchmarks/queries/push_down_topk/q5.sql b/benchmarks/queries/push_down_topk/q5.sql new file mode 100644 index 0000000000000..0db3a8b36ea50 --- /dev/null +++ b/benchmarks/queries/push_down_topk/q5.sql @@ -0,0 +1,9 @@ +-- Negative case: ORDER BY references the probe (non-preserved) side. +-- The rule MUST NOT fire here — orders is the right side of a LEFT JOIN +-- so it isn't preserved (rows can be NULL when there's no match), and +-- pushing a Sort with fetch onto orders would change semantics. +-- Included so the bench harness can verify the rule's selectivity. 
+SELECT c_custkey, o_totalprice +FROM customer LEFT JOIN orders ON c_custkey = o_custkey +ORDER BY o_totalprice +LIMIT 10 \ No newline at end of file diff --git a/benchmarks/src/bin/dfbench.rs b/benchmarks/src/bin/dfbench.rs index 50dd99368b7f0..e660fda268f45 100644 --- a/benchmarks/src/bin/dfbench.rs +++ b/benchmarks/src/bin/dfbench.rs @@ -32,7 +32,8 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; use datafusion_benchmarks::{ - cancellation, clickbench, dict, h2o, hj, imdb, nlj, smj, sort_tpch, tpcds, tpch, + cancellation, clickbench, dict, h2o, hj, imdb, nlj, push_down_topk, smj, sort_tpch, + tpcds, tpch, }; #[derive(Debug, Parser)] @@ -51,6 +52,7 @@ enum Options { HJ(hj::RunOpt), Imdb(imdb::RunOpt), Nlj(nlj::RunOpt), + PushDownTopk(push_down_topk::RunOpt), Smj(smj::RunOpt), SortPushdown(sort_pushdown::RunOpt), SortTpch(sort_tpch::RunOpt), @@ -72,6 +74,7 @@ pub async fn main() -> Result<()> { Options::HJ(opt) => opt.run().await, Options::Imdb(opt) => Box::pin(opt.run()).await, Options::Nlj(opt) => opt.run().await, + Options::PushDownTopk(opt) => opt.run().await, Options::Smj(opt) => opt.run().await, Options::SortPushdown(opt) => opt.run().await, Options::SortTpch(opt) => opt.run().await, diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs index f41fd5ebed205..0148eab6a9b04 100644 --- a/benchmarks/src/lib.rs +++ b/benchmarks/src/lib.rs @@ -24,6 +24,7 @@ pub mod h2o; pub mod hj; pub mod imdb; pub mod nlj; +pub mod push_down_topk; pub mod smj; pub mod sort_pushdown; pub mod sort_tpch; diff --git a/benchmarks/src/push_down_topk.rs b/benchmarks/src/push_down_topk.rs new file mode 100644 index 0000000000000..568792e4f8550 --- /dev/null +++ b/benchmarks/src/push_down_topk.rs @@ -0,0 +1,264 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Benchmark for `push_down_topk_through_join`. +//! +//! Runs SQL files from `queries/push_down_topk/` against TPC-H +//! `customer`, `orders`, and `nation`. Intended to be run on a branch +//! with the `push_down_topk_through_join` rule registered and +//! against a baseline that does not register the rule, with results +//! compared via `compare.py`. +//! +//! # Usage +//! +//! ```text +//! # Generate TPC-H SF=1 (one-time) +//! ./bench.sh data tpch +//! +//! # Run with rule registered (this branch) and write results +//! ./bench.sh run push_down_topk -o pr.json +//! +//! # Run again on a baseline (e.g. main, or this branch with rule +//! # registration reverted) and write results +//! ./bench.sh run push_down_topk -o baseline.json +//! +//! ./compare.py baseline.json pr.json +//! 
``` + +use clap::Args; +use futures::StreamExt; +use std::path::PathBuf; +use std::sync::Arc; + +use datafusion::datasource::TableProvider; +use datafusion::datasource::file_format::parquet::ParquetFormat; +use datafusion::datasource::listing::{ + ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, +}; +use datafusion::error::Result; +use datafusion::execution::SessionStateBuilder; +use datafusion::physical_plan::{ + display::DisplayableExecutionPlan, displayable, execute_stream, +}; +use datafusion::prelude::*; +use datafusion_common::DEFAULT_PARQUET_EXTENSION; +use datafusion_common::instant::Instant; + +use crate::util::{BenchmarkRun, CommonOpt, QueryResult, print_memory_stats}; + +const PUSH_DOWN_TOPK_QUERY_DIR: &str = "queries/push_down_topk"; + +#[derive(Debug, Args)] +pub struct RunOpt { + #[command(flatten)] + common: CommonOpt, + + /// Query number (1-N). If unset, runs every query in the directory. + #[arg(short, long)] + pub query: Option, + + /// Path to TPC-H parquet directory (must contain `customer`, `orders`, + /// `nation` subdirectories). + #[arg(required = true, short = 'p', long = "path")] + path: PathBuf, + + /// Path to JSON benchmark result, comparable via `compare.py`. + #[arg(short = 'o', long = "output")] + output_path: Option, + + /// Path to directory containing query SQL files. + /// Defaults to `queries/push_down_topk/` relative to current directory. 
+ #[arg(long = "queries-path")] + queries_path: Option, +} + +impl RunOpt { + const TABLES: [&'static str; 3] = ["customer", "orders", "nation"]; + + fn queries_dir(&self) -> PathBuf { + self.queries_path + .clone() + .unwrap_or_else(|| PathBuf::from(PUSH_DOWN_TOPK_QUERY_DIR)) + } + + fn load_query(&self, query_id: usize) -> Result { + let path = self.queries_dir().join(format!("q{query_id}.sql")); + std::fs::read_to_string(&path).map_err(|e| { + datafusion_common::DataFusionError::Execution(format!( + "Failed to read query file {}: {e}", + path.display() + )) + }) + } + + fn available_queries(&self) -> Vec { + let dir = self.queries_dir(); + let mut ids = Vec::new(); + if let Ok(entries) = std::fs::read_dir(&dir) { + for entry in entries.flatten() { + let name = entry.file_name(); + let name = name.to_string_lossy(); + if let Some(rest) = name.strip_prefix('q') + && let Some(num_str) = rest.strip_suffix(".sql") + && let Ok(id) = num_str.parse::() + { + ids.push(id); + } + } + } + ids.sort(); + ids + } + + pub async fn run(&self) -> Result<()> { + let mut benchmark_run = BenchmarkRun::new(); + + let query_ids = match self.query { + Some(query_id) => vec![query_id], + None => self.available_queries(), + }; + + for query_id in query_ids { + benchmark_run.start_new_case(&format!("{query_id}")); + + match self.benchmark_query(query_id).await { + Ok(query_results) => { + for iter in query_results { + benchmark_run.write_iter(iter.elapsed, iter.row_count); + } + } + Err(e) => { + benchmark_run.mark_failed(); + eprintln!("Query {query_id} failed: {e}"); + } + } + } + + benchmark_run.maybe_write_json(self.output_path.as_ref())?; + benchmark_run.maybe_print_failures(); + Ok(()) + } + + async fn benchmark_query(&self, query_id: usize) -> Result> { + let sql = self.load_query(query_id)?; + + let config = self.common.config()?; + let rt = self.common.build_runtime()?; + let state = SessionStateBuilder::new() + .with_config(config) + .with_runtime_env(rt) + 
.with_default_features() + .build(); + let ctx = SessionContext::from(state); + + self.register_tables(&ctx).await?; + + let mut millis = vec![]; + let mut query_results = vec![]; + for i in 0..self.iterations() { + let start = Instant::now(); + let row_count = self.execute_query(&ctx, &sql).await?; + let elapsed = start.elapsed(); + let ms = elapsed.as_secs_f64() * 1000.0; + millis.push(ms); + + println!( + "Query {query_id} iteration {i} took {ms:.1} ms and returned {row_count} rows" + ); + query_results.push(QueryResult { elapsed, row_count }); + } + + let avg = millis.iter().sum::() / millis.len() as f64; + println!("Query {query_id} avg time: {avg:.2} ms"); + + print_memory_stats(); + Ok(query_results) + } + + async fn register_tables(&self, ctx: &SessionContext) -> Result<()> { + for table in Self::TABLES { + let provider = self.get_table(ctx, table).await?; + ctx.register_table(table, provider)?; + } + Ok(()) + } + + async fn execute_query(&self, ctx: &SessionContext, sql: &str) -> Result { + let debug = self.common.debug; + let plan = ctx.sql(sql).await?; + let (state, plan) = plan.into_parts(); + + if debug { + println!("=== Logical plan ===\n{plan}\n"); + } + + let plan = state.optimize(&plan)?; + if debug { + println!("=== Optimized logical plan ===\n{plan}\n"); + } + let physical_plan = state.create_physical_plan(&plan).await?; + if debug { + println!( + "=== Physical plan ===\n{}\n", + displayable(physical_plan.as_ref()).indent(true) + ); + } + + let mut row_count = 0; + let mut stream = execute_stream(physical_plan.clone(), state.task_ctx())?; + while let Some(batch) = stream.next().await { + row_count += batch?.num_rows(); + } + + if debug { + println!( + "=== Physical plan with metrics ===\n{}\n", + DisplayableExecutionPlan::with_metrics(physical_plan.as_ref()) + .indent(true) + ); + } + + Ok(row_count) + } + + async fn get_table( + &self, + ctx: &SessionContext, + table: &str, + ) -> Result> { + let path = self.path.to_str().unwrap(); + let state = 
ctx.state(); + let table_path = format!("{path}/{table}"); + let format = Arc::new( + ParquetFormat::default() + .with_options(ctx.state().table_options().parquet.clone()), + ); + let options = ListingOptions::new(format) + .with_file_extension(DEFAULT_PARQUET_EXTENSION) + .with_collect_stat(true); + let table_path = ListingTableUrl::parse(table_path)?; + let schema = options.infer_schema(&state, &table_path).await?; + let config = ListingTableConfig::new(table_path) + .with_listing_options(options) + .with_schema(schema); + Ok(Arc::new(ListingTable::try_new(config)?)) + } + + fn iterations(&self) -> usize { + self.common.iterations + } +} diff --git a/datafusion/core/src/optimizer_rule_reference.md b/datafusion/core/src/optimizer_rule_reference.md index 1f9f37f530557..94789a067358c 100644 --- a/datafusion/core/src/optimizer_rule_reference.md +++ b/datafusion/core/src/optimizer_rule_reference.md @@ -35,33 +35,34 @@ Rule order matters. The default pipeline may change between releases. ### Logical Optimizer Rules -| order | rule | summary | -| ----- | ----------------------------------------- | --------------------------------------------------------------------------------------------------------------------------- | -| 1 | `rewrite_set_comparison` | Rewrites `ANY` and `ALL` set-comparison subqueries into `EXISTS`-based boolean expressions with correct SQL NULL semantics. | -| 2 | `optimize_unions` | Flattens nested unions and removes unions with a single input. | -| 3 | `unions_to_filter` | Merges `UNION DISTINCT` branches that share the same source into a single filtered branch with a disjunctive predicate. | -| 4 | `simplify_expressions` | Constant-folds and simplifies expressions while preserving output names. | -| 5 | `replace_distinct_aggregate` | Rewrites `DISTINCT` and `DISTINCT ON` operators into aggregate-based plans that later rules can optimize further. 
| -| 6 | `eliminate_join` | Replaces keyless inner joins with a literal `false` filter by an empty relation. | -| 7 | `decorrelate_predicate_subquery` | Converts eligible `IN` and `EXISTS` predicate subqueries into semi or anti joins. | -| 8 | `scalar_subquery_to_join` | Rewrites eligible scalar subqueries into joins and adds schema-preserving projections. | -| 9 | `decorrelate_lateral_join` | Rewrites eligible lateral joins into regular joins. | -| 10 | `extract_equijoin_predicate` | Splits join filters into equijoin keys and residual predicates. | -| 11 | `eliminate_duplicated_expr` | Removes duplicate expressions from projections, aggregates, and similar operators. | -| 12 | `eliminate_filter` | Drops always-true filters and replaces always-false or NULL filters with empty relations. | -| 13 | `eliminate_cross_join` | Uses filter predicates to replace cross joins with inner joins when join keys can be found. | -| 14 | `eliminate_limit` | Removes no-op limits and simplifies trivial limit shapes. | -| 15 | `propagate_empty_relation` | Pushes empty-relation knowledge upward so operators fed by no rows collapse early. | -| 16 | `filter_null_join_keys` | Adds `IS NOT NULL` filters to nullable equijoin keys that can never match. | -| 17 | `eliminate_outer_join` | Rewrites outer joins to inner joins when later filters reject the NULL-extended rows. | -| 18 | `push_down_limit` | Moves literal limits closer to scans and unions and merges adjacent limits. | -| 19 | `push_down_filter` | Moves filters as early as possible through filter-commutative operators. | -| 20 | `single_distinct_aggregation_to_group_by` | Rewrites single-column `DISTINCT` aggregations into two-stage `GROUP BY` plans. | -| 21 | `eliminate_group_by_constant` | Removes constant or functionally redundant expressions from `GROUP BY`. | -| 22 | `common_sub_expression_eliminate` | Computes repeated subexpressions once and reuses the result. 
| -| 23 | `extract_leaf_expressions` | Pulls cheap leaf expressions closer to data sources so later pruning and filter rules can act earlier. | -| 24 | `push_down_leaf_projections` | Pushes the helper projections created by leaf extraction toward leaf inputs. | -| 25 | `optimize_projections` | Prunes unused columns and removes unnecessary logical projections. | +| order | rule | summary | +| ----------------------------------------------------------------------------------- | ----------------------------------------- | --------------------------------------------------------------------------------------------------------------------------- | +| 1 | `rewrite_set_comparison` | Rewrites `ANY` and `ALL` set-comparison subqueries into `EXISTS`-based boolean expressions with correct SQL NULL semantics. | +| 2 | `optimize_unions` | Flattens nested unions and removes unions with a single input. | +| 3 | `unions_to_filter` | Merges `UNION DISTINCT` branches that share the same source into a single filtered branch with a disjunctive predicate. | +| 4 | `simplify_expressions` | Constant-folds and simplifies expressions while preserving output names. | +| 5 | `replace_distinct_aggregate` | Rewrites `DISTINCT` and `DISTINCT ON` operators into aggregate-based plans that later rules can optimize further. | +| 6 | `eliminate_join` | Replaces keyless inner joins with a literal `false` filter by an empty relation. | +| 7 | `decorrelate_predicate_subquery` | Converts eligible `IN` and `EXISTS` predicate subqueries into semi or anti joins. | +| 8 | `scalar_subquery_to_join` | Rewrites eligible scalar subqueries into joins and adds schema-preserving projections. | +| 9 | `decorrelate_lateral_join` | Rewrites eligible lateral joins into regular joins. | +| 10 | `extract_equijoin_predicate` | Splits join filters into equijoin keys and residual predicates. | +| 11 | `eliminate_duplicated_expr` | Removes duplicate expressions from projections, aggregates, and similar operators. 
| +| 12 | `eliminate_filter` | Drops always-true filters and replaces always-false or NULL filters with empty relations. | +| 13 | `eliminate_cross_join` | Uses filter predicates to replace cross joins with inner joins when join keys can be found. | +| 14 | `eliminate_limit` | Removes no-op limits and simplifies trivial limit shapes. | +| 15 | `propagate_empty_relation` | Pushes empty-relation knowledge upward so operators fed by no rows collapse early. | +| 16 | `filter_null_join_keys` | Adds `IS NOT NULL` filters to nullable equijoin keys that can never match. | +| 17 | `eliminate_outer_join` | Rewrites outer joins to inner joins when later filters reject the NULL-extended rows. | +| 18 | `push_down_limit` | Moves literal limits closer to scans and unions and merges adjacent limits, and pushes `Sort(fetch=N)` onto a join's preserved-side child for LEFT/RIGHT/CROSS/MARK joins. | +| 19 | `push_down_filter` | Moves filters as early as possible through filter-commutative operators. | +| 20 | `single_distinct_aggregation_to_group_by` | Rewrites single-column `DISTINCT` aggregations into two-stage `GROUP BY` plans. | +| 21 | `eliminate_group_by_constant` | Removes constant or functionally redundant expressions from `GROUP BY`. | +| 22 | `common_sub_expression_eliminate` | Computes repeated subexpressions once and reuses the result. | +| 23 | `extract_leaf_expressions` | Pulls cheap leaf expressions closer to data sources so later pruning and filter rules can act earlier. | +| 24 | `push_down_leaf_projections` | Pushes the helper projections created by leaf extraction toward leaf inputs. | +| 25 | `optimize_projections` | Prunes unused columns and removes unnecessary logical projections. 
| ### Physical Optimizer Rules diff --git a/datafusion/optimizer/src/push_down_limit.rs b/datafusion/optimizer/src/push_down_limit.rs index 4a26cd5884f6b..34db732ecdca9 100644 --- a/datafusion/optimizer/src/push_down_limit.rs +++ b/datafusion/optimizer/src/push_down_limit.rs @@ -29,6 +29,9 @@ use datafusion_common::utils::combine_limit; use datafusion_expr::logical_plan::{Join, JoinType, Limit, LogicalPlan}; use datafusion_expr::{FetchType, SkipType, lit}; +mod topk_through_join; +use topk_through_join::push_topk_through_join; + /// Optimization rule that tries to push down `LIMIT`. //. It will push down through projection, limits (taking the smaller limit) #[derive(Default, Debug)] @@ -47,146 +50,159 @@ impl OptimizerRule for PushDownLimit { true } - #[expect(clippy::only_used_in_recursion)] fn rewrite( &self, plan: LogicalPlan, config: &dyn OptimizerConfig, ) -> Result> { - let LogicalPlan::Limit(mut limit) = plan else { - return Ok(Transformed::no(plan)); - }; + match plan { + LogicalPlan::Limit(limit) => rewrite_limit(limit, config), + LogicalPlan::Sort(s) if s.fetch.is_some() => { + push_topk_through_join(LogicalPlan::Sort(s)) + } + other => Ok(Transformed::no(other)), + } + } - // Currently only rewrite if skip and fetch are both literals - let SkipType::Literal(skip) = limit.get_skip_type()? else { + fn name(&self) -> &str { + "push_down_limit" + } + + fn apply_order(&self) -> Option { + Some(ApplyOrder::TopDown) + } +} + +/// Limit-side dispatch (split out from `rewrite` so that the top-level +/// match in `OptimizerRule::rewrite` reads as a parallel branch alongside +/// the Sort-with-fetch handler). +#[expect(clippy::only_used_in_recursion)] +fn rewrite_limit( + mut limit: Limit, + config: &dyn OptimizerConfig, +) -> Result> { + // Currently only rewrite if skip and fetch are both literals + let SkipType::Literal(skip) = limit.get_skip_type()? 
else { + return Ok(Transformed::no(LogicalPlan::Limit(limit))); + }; + let FetchType::Literal(fetch) = limit.get_fetch_type()? else { + return Ok(Transformed::no(LogicalPlan::Limit(limit))); + }; + + // Merge the Parent Limit and the Child Limit. + if let LogicalPlan::Limit(child) = limit.input.as_ref() { + let SkipType::Literal(child_skip) = child.get_skip_type()? else { return Ok(Transformed::no(LogicalPlan::Limit(limit))); }; - let FetchType::Literal(fetch) = limit.get_fetch_type()? else { + let FetchType::Literal(child_fetch) = child.get_fetch_type()? else { return Ok(Transformed::no(LogicalPlan::Limit(limit))); }; - // Merge the Parent Limit and the Child Limit. - if let LogicalPlan::Limit(child) = limit.input.as_ref() { - let SkipType::Literal(child_skip) = child.get_skip_type()? else { - return Ok(Transformed::no(LogicalPlan::Limit(limit))); - }; - let FetchType::Literal(child_fetch) = child.get_fetch_type()? else { - return Ok(Transformed::no(LogicalPlan::Limit(limit))); - }; - - let (skip, fetch) = combine_limit(skip, fetch, child_skip, child_fetch); - let plan = LogicalPlan::Limit(Limit { - skip: Some(Box::new(lit(skip as i64))), - fetch: fetch.map(|f| Box::new(lit(f as i64))), - input: Arc::clone(&child.input), - }); + let (skip, fetch) = combine_limit(skip, fetch, child_skip, child_fetch); + let new_limit = Limit { + skip: Some(Box::new(lit(skip as i64))), + fetch: fetch.map(|f| Box::new(lit(f as i64))), + input: Arc::clone(&child.input), + }; - // recursively reapply the rule on the new plan - return self.rewrite(plan, config); - } + // recursively reapply the rule on the new limit + return rewrite_limit(new_limit, config); + } - // no fetch to push, so return the original plan - let Some(fetch) = fetch else { - return Ok(Transformed::no(LogicalPlan::Limit(limit))); - }; + // no fetch to push, so return the original plan + let Some(fetch) = fetch else { + return Ok(Transformed::no(LogicalPlan::Limit(limit))); + }; - match 
Arc::unwrap_or_clone(limit.input) { - LogicalPlan::TableScan(mut scan) => { - let rows_needed = if fetch != 0 { fetch + skip } else { 0 }; - let new_fetch = scan + match Arc::unwrap_or_clone(limit.input) { + LogicalPlan::TableScan(mut scan) => { + let rows_needed = if fetch != 0 { fetch + skip } else { 0 }; + let new_fetch = scan + .fetch + .map(|x| min(x, rows_needed)) + .or(Some(rows_needed)); + if new_fetch == scan.fetch { + original_limit(skip, fetch, LogicalPlan::TableScan(scan)) + } else { + // push limit into the table scan itself + scan.fetch = scan .fetch .map(|x| min(x, rows_needed)) .or(Some(rows_needed)); - if new_fetch == scan.fetch { - original_limit(skip, fetch, LogicalPlan::TableScan(scan)) - } else { - // push limit into the table scan itself - scan.fetch = scan - .fetch - .map(|x| min(x, rows_needed)) - .or(Some(rows_needed)); - transformed_limit(skip, fetch, LogicalPlan::TableScan(scan)) - } - } - LogicalPlan::Union(mut union) => { - // push limits to each input of the union - union.inputs = union - .inputs - .into_iter() - .map(|input| make_arc_limit(0, fetch + skip, input)) - .collect(); - transformed_limit(skip, fetch, LogicalPlan::Union(union)) + transformed_limit(skip, fetch, LogicalPlan::TableScan(scan)) } + } + LogicalPlan::Union(mut union) => { + // push limits to each input of the union + union.inputs = union + .inputs + .into_iter() + .map(|input| make_arc_limit(0, fetch + skip, input)) + .collect(); + transformed_limit(skip, fetch, LogicalPlan::Union(union)) + } + + LogicalPlan::Join(join) => { + Ok(push_down_join(join, fetch + skip).update_data(|join| { + make_limit(skip, fetch, Arc::new(LogicalPlan::Join(join))) + })) + } - LogicalPlan::Join(join) => Ok(push_down_join(join, fetch + skip) - .update_data(|join| { - make_limit(skip, fetch, Arc::new(LogicalPlan::Join(join))) - })), - - LogicalPlan::Sort(mut sort) => { - let new_fetch = { - let sort_fetch = skip + fetch; - Some(sort.fetch.map(|f| f.min(sort_fetch)).unwrap_or(sort_fetch)) 
- }; - if new_fetch == sort.fetch { - if skip > 0 { - original_limit(skip, fetch, LogicalPlan::Sort(sort)) - } else { - Ok(Transformed::yes(LogicalPlan::Sort(sort))) - } + LogicalPlan::Sort(mut sort) => { + let new_fetch = { + let sort_fetch = skip + fetch; + Some(sort.fetch.map(|f| f.min(sort_fetch)).unwrap_or(sort_fetch)) + }; + if new_fetch == sort.fetch { + if skip > 0 { + original_limit(skip, fetch, LogicalPlan::Sort(sort)) } else { - sort.fetch = new_fetch; - limit.input = Arc::new(LogicalPlan::Sort(sort)); - Ok(Transformed::yes(LogicalPlan::Limit(limit))) + Ok(Transformed::yes(LogicalPlan::Sort(sort))) } + } else { + sort.fetch = new_fetch; + limit.input = Arc::new(LogicalPlan::Sort(sort)); + Ok(Transformed::yes(LogicalPlan::Limit(limit))) } - LogicalPlan::Projection(mut proj) => { - // commute - limit.input = Arc::clone(&proj.input); - let new_limit = LogicalPlan::Limit(limit); - proj.input = Arc::new(new_limit); - Ok(Transformed::yes(LogicalPlan::Projection(proj))) - } - LogicalPlan::SubqueryAlias(mut subquery_alias) => { - // commute - limit.input = Arc::clone(&subquery_alias.input); - let new_limit = LogicalPlan::Limit(limit); - subquery_alias.input = Arc::new(new_limit); - Ok(Transformed::yes(LogicalPlan::SubqueryAlias(subquery_alias))) - } - LogicalPlan::Extension(extension_plan) - if extension_plan.node.supports_limit_pushdown() => - { - let new_children = extension_plan - .node - .inputs() - .into_iter() - .map(|child| { - LogicalPlan::Limit(Limit { - skip: None, - fetch: Some(Box::new(lit((fetch + skip) as i64))), - input: Arc::new(child.clone()), - }) + } + LogicalPlan::Projection(mut proj) => { + // commute + limit.input = Arc::clone(&proj.input); + let new_limit = LogicalPlan::Limit(limit); + proj.input = Arc::new(new_limit); + Ok(Transformed::yes(LogicalPlan::Projection(proj))) + } + LogicalPlan::SubqueryAlias(mut subquery_alias) => { + // commute + limit.input = Arc::clone(&subquery_alias.input); + let new_limit = LogicalPlan::Limit(limit); + 
subquery_alias.input = Arc::new(new_limit); + Ok(Transformed::yes(LogicalPlan::SubqueryAlias(subquery_alias))) + } + LogicalPlan::Extension(extension_plan) + if extension_plan.node.supports_limit_pushdown() => + { + let new_children = extension_plan + .node + .inputs() + .into_iter() + .map(|child| { + LogicalPlan::Limit(Limit { + skip: None, + fetch: Some(Box::new(lit((fetch + skip) as i64))), + input: Arc::new(child.clone()), }) - .collect::>(); + }) + .collect::>(); - // Create a new extension node with updated inputs - let child_plan = LogicalPlan::Extension(extension_plan); - let new_extension = - child_plan.with_new_exprs(child_plan.expressions(), new_children)?; + // Create a new extension node with updated inputs + let child_plan = LogicalPlan::Extension(extension_plan); + let new_extension = + child_plan.with_new_exprs(child_plan.expressions(), new_children)?; - transformed_limit(skip, fetch, new_extension) - } - input => original_limit(skip, fetch, input), + transformed_limit(skip, fetch, new_extension) } - } - - fn name(&self) -> &str { - "push_down_limit" - } - - fn apply_order(&self) -> Option { - Some(ApplyOrder::TopDown) + input => original_limit(skip, fetch, input), } } diff --git a/datafusion/optimizer/src/push_down_limit/topk_through_join.rs b/datafusion/optimizer/src/push_down_limit/topk_through_join.rs new file mode 100644 index 0000000000000..567ca5cab6f25 --- /dev/null +++ b/datafusion/optimizer/src/push_down_limit/topk_through_join.rs @@ -0,0 +1,1185 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Sort(fetch) → Join pushdown — a sub-module of `push_down_limit`. +//! +//! When a `Sort` with a fetch limit (TopK) sits above a join whose +//! preserved side is known (LEFT / RIGHT / LeftMark / RightMark / CROSS) +//! and all sort expressions come from the preserved side, we insert a +//! copy of the `Sort(fetch)` onto that input to reduce rows entering +//! the join. The outer `Sort` is kept because a 1-to-many join can +//! produce more than N output rows from N preserved-side rows. +//! +//! Dispatched from `PushDownLimit::rewrite` when the plan node is +//! `LogicalPlan::Sort` with `fetch.is_some()`. + +use std::collections::HashMap; +use std::sync::Arc; + +use crate::utils::{has_all_column_refs, schema_columns}; + +use datafusion_common::tree_node::{Transformed, TreeNode}; +use datafusion_common::{Column, Result, internal_err}; +use datafusion_expr::logical_plan::{ + JoinType, LogicalPlan, Projection, Sort as SortPlan, SubqueryAlias, +}; +use datafusion_expr::{Expr, SortExpr}; + +/// Which child of a join is being treated as the preserved side. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum Side { + Left, + Right, +} + +/// Top-level pushdown for `Sort(fetch) → ... → Join` patterns. The plan +/// passed in is guaranteed by the caller to be `LogicalPlan::Sort` with +/// `fetch.is_some()`; we re-bind to a borrow inside. 
+pub(super) fn push_topk_through_join(
+    plan: LogicalPlan,
+) -> Result<Transformed<LogicalPlan>> {
+    // Every early exit below hands the untouched plan back wrapped in
+    // `Transformed::no`, so callers can tell that nothing was rewritten.
+    let LogicalPlan::Sort(sort) = &plan else {
+        return Ok(Transformed::no(plan));
+    };
+    let Some(fetch) = sort.fetch else {
+        return Ok(Transformed::no(plan));
+    };
+
+    // Don't push if any sort expression is non-deterministic (e.g.
+    // `random()`). Duplicating such expressions would produce different
+    // values at each evaluation point, potentially changing results.
+    if sort.expr.iter().any(|se| se.expr.is_volatile()) {
+        return Ok(Transformed::no(plan));
+    }
+
+    // Peel through transparent nodes (SubqueryAlias, Projection) to
+    // find the Join. Track intermediates (outermost first) so we can
+    // reconstruct the tree and resolve sort expressions through them.
+    let mut current = sort.input.as_ref();
+    let mut intermediates: Vec<&LogicalPlan> = Vec::new();
+    let join = loop {
+        match current {
+            LogicalPlan::Join(join) => break join,
+            LogicalPlan::Projection(proj) => {
+                intermediates.push(current);
+                current = proj.input.as_ref();
+            }
+            LogicalPlan::SubqueryAlias(sq) => {
+                intermediates.push(current);
+                current = sq.input.as_ref();
+            }
+            _ => return Ok(Transformed::no(plan)),
+        }
+    };
+
+    // Determine which side(s) of the join are preserved.
+    //
+    // - LEFT / LeftMark: only left preserved.
+    // - RIGHT / RightMark: symmetric.
+    // - CROSS JOIN (Inner with no `on` keys and no filter):
+    //   every row from both sides appears in the output (Cartesian
+    //   product), so we can push to whichever side has all the sort
+    //   columns.
+    //
+    // For LEFT/RIGHT, non-equijoin filters in the ON clause are safe:
+    // outer joins guarantee all preserved-side rows appear in the
+    // output regardless of the filter. For Inner joins (cross-join
+    // detection), the filter check is strict (`filter.is_none()`) —
+    // any filter on Inner can drop rows from either side.
+    let preserved_candidates: &[Side] = match join.join_type {
+        JoinType::Left | JoinType::LeftMark => &[Side::Left],
+        JoinType::Right | JoinType::RightMark => &[Side::Right],
+        JoinType::Inner if join.on.is_empty() && join.filter.is_none() => {
+            &[Side::Left, Side::Right]
+        }
+        _ => return Ok(Transformed::no(plan)),
+    };
+
+    // Resolve sort expressions through all intermediate nodes
+    // (Projection, SubqueryAlias) so column references match the
+    // join's schema.
+    let mut resolved_sort_exprs = sort.expr.clone();
+    for node in &intermediates {
+        match node {
+            LogicalPlan::Projection(proj) => {
+                resolved_sort_exprs =
+                    resolve_sort_exprs_through_projection(&resolved_sort_exprs, proj)?;
+            }
+            LogicalPlan::SubqueryAlias(sq) => {
+                resolved_sort_exprs =
+                    resolve_sort_exprs_through_subquery_alias(&resolved_sort_exprs, sq)?;
+            }
+            _ => {
+                return internal_err!(
+                    "push_topk_through_join: unexpected intermediate node: {}",
+                    node.display()
+                );
+            }
+        }
+    }
+
+    // After resolving through projections, sort expressions may now
+    // contain volatile functions (e.g. `random() AS col`). Duplicating
+    // them would change results.
+    if resolved_sort_exprs.iter().any(|se| se.expr.is_volatile()) {
+        return Ok(Transformed::no(plan));
+    }
+
+    // Pick the first preserved-side candidate whose schema contains all
+    // referenced sort columns. For LEFT/RIGHT this is the fixed side;
+    // for CROSS we try both.
+    let Some(preserved_side) = preserved_candidates.iter().copied().find(|&side| {
+        let schema = match side {
+            Side::Left => join.left.schema(),
+            Side::Right => join.right.schema(),
+        };
+        let cols = schema_columns(schema);
+        resolved_sort_exprs
+            .iter()
+            .all(|se| has_all_column_refs(&se.expr, &cols))
+    }) else {
+        return Ok(Transformed::no(plan));
+    };
+
+    let preserved_child = match preserved_side {
+        Side::Left => &join.left,
+        Side::Right => &join.right,
+    };
+
+    // Scan deep inside the preserved child (through SubqueryAlias and
+    // Projection layers) to find an existing Sort. If found with same
+    // exprs, tighten its fetch in-place. Otherwise, insert a new Sort
+    // directly below the join as the preserved child's wrapper.
+    let mut inner_child = preserved_child.as_ref();
+    let mut deep_resolved_exprs = resolved_sort_exprs.clone();
+    loop {
+        match inner_child {
+            LogicalPlan::SubqueryAlias(sq) => {
+                deep_resolved_exprs =
+                    resolve_sort_exprs_through_subquery_alias(&deep_resolved_exprs, sq)?;
+                inner_child = sq.input.as_ref();
+            }
+            LogicalPlan::Projection(proj) => {
+                deep_resolved_exprs =
+                    resolve_sort_exprs_through_projection(&deep_resolved_exprs, proj)?;
+                inner_child = proj.input.as_ref();
+            }
+            _ => break,
+        }
+    }
+
+    // If the inner child is a Limit (PushDownLimit's own Limit handling
+    // hasn't merged it with the Sort yet), skip this iteration.
+    if matches!(inner_child, LogicalPlan::Limit(_)) {
+        return Ok(Transformed::no(plan));
+    }
+
+    // Determine action based on existing inner Sort:
+    // - Same exprs, tighter-or-equal fetch → skip (already optimal)
+    // - Same exprs, larger/no fetch        → tighten in-place
+    // - Different exprs or no Sort         → insert new Sort below the join
+    //
+    // If `deep_resolved_exprs` became volatile while resolving through
+    // projections inside the preserved child (e.g. `random() AS col`),
+    // structural equality with an existing inner Sort is unsound: two
+    // identical `random()` exprs evaluate to different values. Fall
+    // back to inserting a new Sort with `resolved_sort_exprs`.
+    let deep_exprs_volatile = deep_resolved_exprs.iter().any(|se| se.expr.is_volatile());
+    let inner_sort = match inner_child {
+        LogicalPlan::Sort(s) if !deep_exprs_volatile => Some(s),
+        _ => None,
+    };
+    // Only an inner Sort ordering by exactly the same expressions can be
+    // reused; anything else is treated like "no inner Sort".
+    let reusable_sort =
+        inner_sort.filter(|s| sort_exprs_equal(&s.expr, &deep_resolved_exprs));
+    let new_preserved_child = match reusable_sort {
+        Some(child_sort) => {
+            let child_fetch_tighter =
+                matches!(child_sort.fetch, Some(child_fetch) if child_fetch <= fetch);
+            if child_fetch_tighter {
+                return Ok(Transformed::no(plan));
+            }
+            rebuild_with_tightened_sort(
+                preserved_child.as_ref(),
+                &deep_resolved_exprs,
+                fetch,
+            )?
+        }
+        // Different exprs (or no inner Sort at all) — insert a new Sort
+        // above the preserved child. If the inner Sort has no fetch, our
+        // pushed Sort is the only row reduction. If it has a fetch,
+        // re-sorting a small set is cheap and still reduces join input.
+        None => Arc::new(LogicalPlan::Sort(SortPlan {
+            expr: resolved_sort_exprs,
+            input: Arc::clone(preserved_child),
+            fetch: Some(fetch),
+        })),
+    };
+
+    let mut new_join = join.clone();
+    match preserved_side {
+        Side::Left => new_join.left = new_preserved_child,
+        Side::Right => new_join.right = new_preserved_child,
+    }
+
+    // Rebuild the tree: join → intermediate nodes → top-level sort.
+    // `intermediates` is outermost-first, so wrap in reverse order.
+    let mut new_sort_input = Arc::new(LogicalPlan::Join(new_join));
+    for node in intermediates.into_iter().rev() {
+        new_sort_input = Arc::new(match node {
+            LogicalPlan::Projection(proj) => {
+                let mut new_proj = proj.clone();
+                new_proj.input = new_sort_input;
+                LogicalPlan::Projection(new_proj)
+            }
+            LogicalPlan::SubqueryAlias(sq) => LogicalPlan::SubqueryAlias(
+                SubqueryAlias::try_new(new_sort_input, sq.alias.clone())?,
+            ),
+            _ => {
+                return internal_err!(
+                    "push_topk_through_join: unexpected intermediate node: {}",
+                    node.display()
+                );
+            }
+        });
+    }
+
+    // The outer Sort is kept unchanged; only its input was rewritten.
+    Ok(Transformed::yes(LogicalPlan::Sort(SortPlan {
+        expr: sort.expr.clone(),
+        input: new_sort_input,
+        fetch: sort.fetch,
+    })))
+}
+
+/// Replace column references in sort expressions using a name→expr map.
+///
+/// Keys of `replace_map` are compared against `Column::flat_name()`.
+/// Columns whose flat name is absent from the map are left unchanged;
+/// `asc` / `nulls_first` of every `SortExpr` are carried over as-is.
+fn replace_columns_in_sort_exprs(
+    sort_exprs: &[SortExpr],
+    replace_map: &HashMap<String, Expr>,
+) -> Result<Vec<SortExpr>> {
+    sort_exprs
+        .iter()
+        .map(|sort_expr| {
+            let new_expr = sort_expr.expr.clone().transform(|expr| {
+                let replacement = match &expr {
+                    Expr::Column(col) => replace_map.get(&col.flat_name()).cloned(),
+                    _ => None,
+                };
+                Ok(replacement.map_or_else(|| Transformed::no(expr), Transformed::yes))
+            })?;
+            Ok(SortExpr {
+                expr: new_expr.data,
+                ..*sort_expr
+            })
+        })
+        .collect()
+}
+
+/// Resolve sort expressions through a projection by replacing column
+/// references with the underlying projection expressions.
+///
+/// Each output column of `projection` (schema field, by flat name) is
+/// mapped to the projection expression at the same position, with any
+/// top-level alias stripped via `unalias()`.
+fn resolve_sort_exprs_through_projection(
+    sort_exprs: &[SortExpr],
+    projection: &Projection,
+) -> Result<Vec<SortExpr>> {
+    let replace_map: HashMap<String, Expr> = projection
+        .schema
+        .iter()
+        .zip(projection.expr.iter())
+        .map(|((qualifier, field), expr)| {
+            let key = Column::from((qualifier, field)).flat_name();
+            (key, expr.clone().unalias())
+        })
+        .collect();
+
+    replace_columns_in_sort_exprs(sort_exprs, &replace_map)
+}
+
+/// Compare two slices of `SortExpr` for structural equality.
+fn sort_exprs_equal(a: &[SortExpr], b: &[SortExpr]) -> bool {
+    // Lengths must agree first: `zip` alone would silently ignore the
+    // extra trailing entries of the longer slice.
+    a.len() == b.len()
+        && a.iter().zip(b.iter()).all(|(left, right)| {
+            left.asc == right.asc
+                && left.nulls_first == right.nulls_first
+                && left.expr == right.expr
+        })
+}
+
+/// Resolve sort expressions through a `SubqueryAlias` by replacing the
+/// alias qualifier with the input schema's qualifier.
+///
+/// The alias schema and the input schema are paired position by
+/// position; every alias-side column (by flat name) is mapped to the
+/// input-side column at the same index.
+fn resolve_sort_exprs_through_subquery_alias(
+    sort_exprs: &[SortExpr],
+    subquery_alias: &SubqueryAlias,
+) -> Result<Vec<SortExpr>> {
+    let replace_map: HashMap<String, Expr> = subquery_alias
+        .schema
+        .iter()
+        .zip(subquery_alias.input.schema().iter())
+        .map(|((alias_qual, alias_field), (input_qual, input_field))| {
+            let alias_col = Column::from((alias_qual, alias_field));
+            let input_col = Column::from((input_qual, input_field));
+            (alias_col.flat_name(), Expr::Column(input_col))
+        })
+        .collect();
+
+    replace_columns_in_sort_exprs(sort_exprs, &replace_map)
+}
+
+/// Rebuild the tree from `root` down to an existing Sort whose expressions
+/// match `target_exprs`, tightening its fetch to `new_fetch`.
+fn rebuild_with_tightened_sort(
+    root: &LogicalPlan,
+    target_exprs: &[SortExpr],
+    new_fetch: usize,
+) -> Result<Arc<LogicalPlan>> {
+    match root {
+        // Found the Sort to tighten: keep its exprs and input, replace
+        // only the fetch.
+        LogicalPlan::Sort(s) if sort_exprs_equal(&s.expr, target_exprs) => {
+            Ok(Arc::new(LogicalPlan::Sort(SortPlan {
+                expr: s.expr.clone(),
+                input: Arc::clone(&s.input),
+                fetch: Some(new_fetch),
+            })))
+        }
+        // Projection / SubqueryAlias wrappers are recreated around the
+        // rewritten input on the way back up.
+        LogicalPlan::Projection(proj) => {
+            let new_input = rebuild_with_tightened_sort(
+                proj.input.as_ref(),
+                target_exprs,
+                new_fetch,
+            )?;
+            let mut new_proj = proj.clone();
+            new_proj.input = new_input;
+            Ok(Arc::new(LogicalPlan::Projection(new_proj)))
+        }
+        LogicalPlan::SubqueryAlias(sq) => {
+            let new_input =
+                rebuild_with_tightened_sort(sq.input.as_ref(), target_exprs, new_fetch)?;
+            Ok(Arc::new(LogicalPlan::SubqueryAlias(
+                SubqueryAlias::try_new(new_input, sq.alias.clone())?,
+            )))
+        }
+        // Any other node (including a Sort with non-matching exprs) is
+        // reported as an internal error.
+        _ => internal_err!(
+            "rebuild_with_tightened_sort: unexpected node: {}",
+            root.display()
+        ),
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use crate::OptimizerContext;
+    use crate::assert_optimized_plan_eq_snapshot;
+    use crate::push_down_limit::PushDownLimit;
+    use crate::test::*;
+
+    use datafusion_expr::col;
+    use datafusion_expr::logical_plan::builder::LogicalPlanBuilder;
+
+    macro_rules! assert_optimized_plan_equal {
+        (
+            $plan:expr,
+            @ $expected:literal $(,)?
+        ) => {{
+            let optimizer_ctx = OptimizerContext::new().with_max_passes(1);
+            let rules: Vec<Arc<dyn crate::OptimizerRule + Send + Sync>> =
+                vec![Arc::new(PushDownLimit::new())];
+            assert_optimized_plan_eq_snapshot!(
+                optimizer_ctx,
+                rules,
+                $plan,
+                @ $expected,
+            )
+        }};
+    }
+
+    /// TopK on left-side columns above a LEFT JOIN → pushed to left child.
+    #[test]
+    fn topk_pushed_to_left_of_left_join() -> Result<()> {
+        let t1 = test_table_scan_with_name("t1")?;
+        let t2 = test_table_scan_with_name("t2")?;
+
+        let plan = LogicalPlanBuilder::from(t1)
+            .join(
+                LogicalPlanBuilder::from(t2).build()?,
+                JoinType::Left,
+                (vec!["a"], vec!["a"]),
+                None,
+            )?
+ .sort_with_limit(vec![col("t1.b").sort(true, false)], Some(3))? + .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t1.b ASC NULLS LAST, fetch=3 + Left Join: t1.a = t2.a + Sort: t1.b ASC NULLS LAST, fetch=3 + TableScan: t1 + TableScan: t2 + " + ) + } + + /// TopK on right-side columns above a RIGHT JOIN → pushed to right child. + #[test] + fn topk_pushed_to_right_of_right_join() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let plan = LogicalPlanBuilder::from(t1) + .join( + LogicalPlanBuilder::from(t2).build()?, + JoinType::Right, + (vec!["a"], vec!["a"]), + None, + )? + .sort_with_limit(vec![col("t2.b").sort(true, false)], Some(5))? + .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t2.b ASC NULLS LAST, fetch=5 + Right Join: t1.a = t2.a + TableScan: t1 + Sort: t2.b ASC NULLS LAST, fetch=5 + TableScan: t2 + " + ) + } + + /// TopK pushed through a Projection between Sort and Join. + #[test] + fn topk_pushed_through_projection() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let plan = LogicalPlanBuilder::from(t1) + .join( + LogicalPlanBuilder::from(t2).build()?, + JoinType::Left, + (vec!["a"], vec!["a"]), + None, + )? + .project(vec![col("t1.a"), col("t1.b"), col("t2.c")])? + .sort_with_limit(vec![col("t1.b").sort(true, false)], Some(3))? + .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t1.b ASC NULLS LAST, fetch=3 + Projection: t1.a, t1.b, t2.c + Left Join: t1.a = t2.a + Sort: t1.b ASC NULLS LAST, fetch=3 + TableScan: t1 + TableScan: t2 + " + ) + } + + /// INNER JOIN → no pushdown. + #[test] + fn topk_not_pushed_for_inner_join() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let plan = LogicalPlanBuilder::from(t1) + .join( + LogicalPlanBuilder::from(t2).build()?, + JoinType::Inner, + (vec!["a"], vec!["a"]), + None, + )? 
+ .sort_with_limit(vec![col("t1.b").sort(true, false)], Some(3))? + .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t1.b ASC NULLS LAST, fetch=3 + Inner Join: t1.a = t2.a + TableScan: t1 + TableScan: t2 + " + ) + } + + /// CROSS JOIN sorted by left-side columns → pushed to left child. + #[test] + fn topk_pushed_to_left_of_cross_join() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let plan = LogicalPlanBuilder::from(t1) + .cross_join(LogicalPlanBuilder::from(t2).build()?)? + .sort_with_limit(vec![col("t1.b").sort(true, false)], Some(3))? + .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t1.b ASC NULLS LAST, fetch=3 + Cross Join: + Sort: t1.b ASC NULLS LAST, fetch=3 + TableScan: t1 + TableScan: t2 + " + ) + } + + /// CROSS JOIN sorted by right-side columns → pushed to right child. + #[test] + fn topk_pushed_to_right_of_cross_join() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let plan = LogicalPlanBuilder::from(t1) + .cross_join(LogicalPlanBuilder::from(t2).build()?)? + .sort_with_limit(vec![col("t2.b").sort(true, false)], Some(3))? + .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t2.b ASC NULLS LAST, fetch=3 + Cross Join: + TableScan: t1 + Sort: t2.b ASC NULLS LAST, fetch=3 + TableScan: t2 + " + ) + } + + /// CROSS JOIN sorted by columns from both sides → no pushdown. + #[test] + fn topk_not_pushed_for_cross_join_mixed_side_sort() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let plan = LogicalPlanBuilder::from(t1) + .cross_join(LogicalPlanBuilder::from(t2).build()?)? + .sort_with_limit( + vec![(col("t1.b") + col("t2.b")).sort(true, false)], + Some(3), + )? 
+ .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t1.b + t2.b ASC NULLS LAST, fetch=3 + Cross Join: + TableScan: t1 + TableScan: t2 + " + ) + } + + /// Inner join with no equi-keys but a non-empty filter: filter can drop + /// rows from either side, so pushing fetch=N is unsafe. + #[test] + fn topk_not_pushed_for_inner_with_filter_no_on() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let plan = LogicalPlanBuilder::from(t1) + .join_on( + LogicalPlanBuilder::from(t2).build()?, + JoinType::Inner, + vec![col("t1.b").gt(col("t2.b"))], + )? + .sort_with_limit(vec![col("t1.b").sort(true, false)], Some(3))? + .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t1.b ASC NULLS LAST, fetch=3 + Inner Join: Filter: t1.b > t2.b + TableScan: t1 + TableScan: t2 + " + ) + } + + /// LEFT MARK join: one record per left row → pushdown to left. + #[test] + fn topk_pushed_to_left_of_left_mark_join() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let plan = LogicalPlanBuilder::from(t1) + .join( + LogicalPlanBuilder::from(t2).build()?, + JoinType::LeftMark, + (vec!["a"], vec!["a"]), + None, + )? + .sort_with_limit(vec![col("t1.b").sort(true, false)], Some(3))? + .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t1.b ASC NULLS LAST, fetch=3 + LeftMark Join: t1.a = t2.a + Sort: t1.b ASC NULLS LAST, fetch=3 + TableScan: t1 + TableScan: t2 + " + ) + } + + /// RIGHT MARK join: symmetric to LeftMark. + #[test] + fn topk_pushed_to_right_of_right_mark_join() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let plan = LogicalPlanBuilder::from(t1) + .join( + LogicalPlanBuilder::from(t2).build()?, + JoinType::RightMark, + (vec!["a"], vec!["a"]), + None, + )? + .sort_with_limit(vec![col("t2.b").sort(true, false)], Some(3))? 
+ .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t2.b ASC NULLS LAST, fetch=3 + RightMark Join: t1.a = t2.a + TableScan: t1 + Sort: t2.b ASC NULLS LAST, fetch=3 + TableScan: t2 + " + ) + } + + /// LEFT JOIN but sort on right-side columns → no pushdown. + #[test] + fn topk_not_pushed_for_wrong_side() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let plan = LogicalPlanBuilder::from(t1) + .join( + LogicalPlanBuilder::from(t2).build()?, + JoinType::Left, + (vec!["a"], vec!["a"]), + None, + )? + .sort_with_limit(vec![col("t2.b").sort(true, false)], Some(3))? + .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t2.b ASC NULLS LAST, fetch=3 + Left Join: t1.a = t2.a + TableScan: t1 + TableScan: t2 + " + ) + } + + /// Join with non-equijoin filter → pushdown still happens. + #[test] + fn topk_pushed_with_join_filter() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let plan = LogicalPlanBuilder::from(t1) + .join_on( + LogicalPlanBuilder::from(t2).build()?, + JoinType::Left, + vec![col("t1.a").eq(col("t2.a"))], + )? + .sort_with_limit(vec![col("t1.b").sort(true, false)], Some(3))? + .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t1.b ASC NULLS LAST, fetch=3 + Left Join: Filter: t1.a = t2.a + Sort: t1.b ASC NULLS LAST, fetch=3 + TableScan: t1 + TableScan: t2 + " + ) + } + + /// Sort without fetch → no pushdown. + #[test] + fn topk_not_pushed_without_fetch() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let plan = LogicalPlanBuilder::from(t1) + .join( + LogicalPlanBuilder::from(t2).build()?, + JoinType::Left, + (vec!["a"], vec!["a"]), + None, + )? + .sort(vec![col("t1.b").sort(true, false)])? 
+ .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t1.b ASC NULLS LAST + Left Join: t1.a = t2.a + TableScan: t1 + TableScan: t2 + " + ) + } + + /// LEFT SEMI JOIN: not all left rows appear in output → no pushdown. + #[test] + fn topk_not_pushed_for_left_semi_join() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let plan = LogicalPlanBuilder::from(t1) + .join( + LogicalPlanBuilder::from(t2).build()?, + JoinType::LeftSemi, + (vec!["a"], vec!["a"]), + None, + )? + .sort_with_limit(vec![col("t1.b").sort(true, false)], Some(3))? + .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t1.b ASC NULLS LAST, fetch=3 + LeftSemi Join: t1.a = t2.a + TableScan: t1 + TableScan: t2 + " + ) + } + + /// LEFT ANTI JOIN: not all left rows appear in output → no pushdown. + #[test] + fn topk_not_pushed_for_left_anti_join() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let plan = LogicalPlanBuilder::from(t1) + .join( + LogicalPlanBuilder::from(t2).build()?, + JoinType::LeftAnti, + (vec!["a"], vec!["a"]), + None, + )? + .sort_with_limit(vec![col("t1.b").sort(true, false)], Some(3))? + .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t1.b ASC NULLS LAST, fetch=3 + LeftAnti Join: t1.a = t2.a + TableScan: t1 + TableScan: t2 + " + ) + } + + /// RIGHT SEMI JOIN: not all right rows appear in output → no pushdown. + #[test] + fn topk_not_pushed_for_right_semi_join() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let plan = LogicalPlanBuilder::from(t1) + .join( + LogicalPlanBuilder::from(t2).build()?, + JoinType::RightSemi, + (vec!["a"], vec!["a"]), + None, + )? + .sort_with_limit(vec![col("t2.b").sort(true, false)], Some(3))? 
+ .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t2.b ASC NULLS LAST, fetch=3 + RightSemi Join: t1.a = t2.a + TableScan: t1 + TableScan: t2 + " + ) + } + + /// RIGHT ANTI JOIN: not all right rows appear in output → no pushdown. + #[test] + fn topk_not_pushed_for_right_anti_join() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let plan = LogicalPlanBuilder::from(t1) + .join( + LogicalPlanBuilder::from(t2).build()?, + JoinType::RightAnti, + (vec!["a"], vec!["a"]), + None, + )? + .sort_with_limit(vec![col("t2.b").sort(true, false)], Some(3))? + .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t2.b ASC NULLS LAST, fetch=3 + RightAnti Join: t1.a = t2.a + TableScan: t1 + TableScan: t2 + " + ) + } + + /// Multi-column sort with columns from both sides → no pushdown. + #[test] + fn topk_not_pushed_for_mixed_side_sort() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let plan = LogicalPlanBuilder::from(t1) + .join( + LogicalPlanBuilder::from(t2).build()?, + JoinType::Left, + (vec!["a"], vec!["a"]), + None, + )? + .sort_with_limit( + vec![col("t1.b").sort(true, false), col("t2.b").sort(true, false)], + Some(3), + )? + .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t1.b ASC NULLS LAST, t2.b ASC NULLS LAST, fetch=3 + Left Join: t1.a = t2.a + TableScan: t1 + TableScan: t2 + " + ) + } + + /// Preserved child has a larger fetch → push our tighter limit. + #[test] + fn topk_pushed_when_child_has_larger_fetch() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let t1_with_sort = LogicalPlanBuilder::from(t1) + .sort_with_limit(vec![col("t1.b").sort(true, false)], Some(10))? + .build()?; + + let plan = LogicalPlanBuilder::from(t1_with_sort) + .join( + LogicalPlanBuilder::from(t2).build()?, + JoinType::Left, + (vec!["a"], vec!["a"]), + None, + )? 
+ .sort_with_limit(vec![col("t1.b").sort(true, false)], Some(3))? + .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t1.b ASC NULLS LAST, fetch=3 + Left Join: t1.a = t2.a + Sort: t1.b ASC NULLS LAST, fetch=3 + TableScan: t1 + TableScan: t2 + " + ) + } + + /// Preserved child already has a tighter fetch → skip pushdown. + #[test] + fn topk_not_pushed_when_child_has_smaller_fetch() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let t1_with_sort = LogicalPlanBuilder::from(t1) + .sort_with_limit(vec![col("t1.b").sort(true, false)], Some(2))? + .build()?; + + let plan = LogicalPlanBuilder::from(t1_with_sort) + .join( + LogicalPlanBuilder::from(t2).build()?, + JoinType::Left, + (vec!["a"], vec!["a"]), + None, + )? + .sort_with_limit(vec![col("t1.b").sort(true, false)], Some(5))? + .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t1.b ASC NULLS LAST, fetch=5 + Left Join: t1.a = t2.a + Sort: t1.b ASC NULLS LAST, fetch=2 + TableScan: t1 + TableScan: t2 + " + ) + } + + /// Projection passthrough: sort expr matches a projected column directly. + #[test] + fn resolve_through_projection_passthrough() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let plan = LogicalPlanBuilder::from(t1) + .project(vec![col("t1.a"), col("t1.b")])? + .build()?; + let LogicalPlan::Projection(proj) = &plan else { + panic!("expected Projection"); + }; + let sort_exprs = vec![col("t1.b").sort(true, false)]; + let resolved = resolve_sort_exprs_through_projection(&sort_exprs, proj)?; + assert_eq!(resolved.len(), 1); + assert_eq!(resolved[0].expr.to_string(), "t1.b"); + assert!(resolved[0].asc); + Ok(()) + } + + /// Projection alias: sort expr references an alias mapping to a negation. 
+ #[test] + fn resolve_through_projection_alias() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let plan = LogicalPlanBuilder::from(t1) + .project(vec![ + col("t1.a"), + (Expr::Negative(Box::new(col("t1.b")))).alias("neg_b"), + ])? + .build()?; + let LogicalPlan::Projection(proj) = &plan else { + panic!("expected Projection"); + }; + let sort_exprs = vec![col("neg_b").sort(true, false)]; + let resolved = resolve_sort_exprs_through_projection(&sort_exprs, proj)?; + assert_eq!(resolved.len(), 1); + assert_eq!(resolved[0].expr.to_string(), "(- t1.b)"); + Ok(()) + } + + /// Multi-column resolution preserves direction and nulls_first per column. + #[test] + fn resolve_through_projection_multi_column() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let plan = LogicalPlanBuilder::from(t1) + .project(vec![col("t1.a"), col("t1.b"), col("t1.c")])? + .build()?; + let LogicalPlan::Projection(proj) = &plan else { + panic!("expected Projection"); + }; + let sort_exprs = + vec![col("t1.a").sort(true, false), col("t1.b").sort(false, true)]; + let resolved = resolve_sort_exprs_through_projection(&sort_exprs, proj)?; + assert_eq!(resolved.len(), 2); + assert_eq!(resolved[0].expr.to_string(), "t1.a"); + assert!(resolved[0].asc); + assert_eq!(resolved[1].expr.to_string(), "t1.b"); + assert!(!resolved[1].asc); + assert!(resolved[1].nulls_first); + Ok(()) + } + + /// Stacked Projection + SubqueryAlias: resolve through both layers. + #[test] + fn resolve_through_projection_and_subquery_alias() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let plan = LogicalPlanBuilder::from(t1) + .alias("sub")? + .project(vec![ + col("sub.a"), + (Expr::Negative(Box::new(col("sub.b")))).alias("neg_b"), + ])? 
+ .build()?; + let LogicalPlan::Projection(proj) = &plan else { + panic!("expected Projection"); + }; + let LogicalPlan::SubqueryAlias(sq) = proj.input.as_ref() else { + panic!("expected SubqueryAlias"); + }; + let sort_exprs = vec![col("neg_b").sort(true, false)]; + let after_proj = resolve_sort_exprs_through_projection(&sort_exprs, proj)?; + assert_eq!(after_proj[0].expr.to_string(), "(- sub.b)"); + let after_sq = resolve_sort_exprs_through_subquery_alias(&after_proj, sq)?; + assert_eq!(after_sq[0].expr.to_string(), "(- t1.b)"); + assert!(after_sq[0].asc); + assert!(!after_sq[0].nulls_first); + Ok(()) + } + + /// Simple SubqueryAlias resolution: sub.b → t1.b. + #[test] + fn resolve_through_subquery_alias_simple() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let plan = LogicalPlanBuilder::from(t1).alias("sub")?.build()?; + let LogicalPlan::SubqueryAlias(sq) = &plan else { + panic!("expected SubqueryAlias"); + }; + let sort_exprs = vec![col("sub.b").sort(true, false)]; + let resolved = resolve_sort_exprs_through_subquery_alias(&sort_exprs, sq)?; + assert_eq!(resolved.len(), 1); + assert_eq!(resolved[0].expr.to_string(), "t1.b"); + assert!(resolved[0].asc); + assert!(!resolved[0].nulls_first); + Ok(()) + } + + /// Multi-column SubqueryAlias resolution preserves direction per column. 
+ #[test] + fn resolve_through_subquery_alias_multi_column() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let plan = LogicalPlanBuilder::from(t1).alias("sub")?.build()?; + let LogicalPlan::SubqueryAlias(sq) = &plan else { + panic!("expected SubqueryAlias"); + }; + let sort_exprs = vec![ + col("sub.a").sort(true, false), + col("sub.b").sort(false, true), + ]; + let resolved = resolve_sort_exprs_through_subquery_alias(&sort_exprs, sq)?; + assert_eq!(resolved.len(), 2); + assert_eq!(resolved[0].expr.to_string(), "t1.a"); + assert!(resolved[0].asc); + assert_eq!(resolved[1].expr.to_string(), "t1.b"); + assert!(!resolved[1].asc); + assert!(resolved[1].nulls_first); + Ok(()) + } + + /// SubqueryAlias with a different alias name (foo ≠ t1). + #[test] + fn resolve_through_subquery_alias_different_name() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let plan = LogicalPlanBuilder::from(t1).alias("foo")?.build()?; + let LogicalPlan::SubqueryAlias(sq) = &plan else { + panic!("expected SubqueryAlias"); + }; + let sort_exprs = vec![col("foo.b").sort(true, false)]; + let resolved = resolve_sort_exprs_through_subquery_alias(&sort_exprs, sq)?; + assert_eq!(resolved.len(), 1); + assert_eq!(resolved[0].expr.to_string(), "t1.b"); + Ok(()) + } + + /// SubqueryAlias with nested expression: (- sub.b) → (- t1.b). + #[test] + fn resolve_through_subquery_alias_nested_expr() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let plan = LogicalPlanBuilder::from(t1).alias("sub")?.build()?; + let LogicalPlan::SubqueryAlias(sq) = &plan else { + panic!("expected SubqueryAlias"); + }; + let sort_exprs = vec![Expr::Negative(Box::new(col("sub.b"))).sort(true, false)]; + let resolved = resolve_sort_exprs_through_subquery_alias(&sort_exprs, sq)?; + assert_eq!(resolved.len(), 1); + assert_eq!(resolved[0].expr.to_string(), "(- t1.b)"); + assert!(resolved[0].asc); + Ok(()) + } + + /// Inner Sort has different exprs WITH fetch → stacked sorts. 
+ #[test] + fn topk_stacked_when_child_has_different_exprs_with_fetch() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let t1_with_sort = LogicalPlanBuilder::from(t1) + .sort_with_limit(vec![col("t1.a").sort(true, false)], Some(5))? + .build()?; + + let plan = LogicalPlanBuilder::from(t1_with_sort) + .join( + LogicalPlanBuilder::from(t2).build()?, + JoinType::Left, + (vec!["a"], vec!["a"]), + None, + )? + .sort_with_limit(vec![col("t1.b").sort(true, false)], Some(2))? + .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t1.b ASC NULLS LAST, fetch=2 + Left Join: t1.a = t2.a + Sort: t1.b ASC NULLS LAST, fetch=2 + Sort: t1.a ASC NULLS LAST, fetch=5 + TableScan: t1 + TableScan: t2 + " + ) + } + + /// Inner Sort has different exprs WITHOUT fetch → stacked sorts. + #[test] + fn topk_stacked_when_child_has_different_exprs_no_fetch() -> Result<()> { + let t1 = test_table_scan_with_name("t1")?; + let t2 = test_table_scan_with_name("t2")?; + + let t1_with_sort = LogicalPlanBuilder::from(t1) + .sort(vec![col("t1.a").sort(true, false)])? + .build()?; + + let plan = LogicalPlanBuilder::from(t1_with_sort) + .join( + LogicalPlanBuilder::from(t2).build()?, + JoinType::Left, + (vec!["a"], vec!["a"]), + None, + )? + .sort_with_limit(vec![col("t1.b").sort(true, false)], Some(2))? 
+ .build()?; + + assert_optimized_plan_equal!( + plan, + @r" + Sort: t1.b ASC NULLS LAST, fetch=2 + Left Join: t1.a = t2.a + Sort: t1.b ASC NULLS LAST, fetch=2 + Sort: t1.a ASC NULLS LAST + TableScan: t1 + TableScan: t2 + " + ) + } +} diff --git a/datafusion/sqllogictest/test_files/push_down_topk_through_join.slt b/datafusion/sqllogictest/test_files/push_down_topk_through_join.slt new file mode 100644 index 0000000000000..328d6f0b26f69 --- /dev/null +++ b/datafusion/sqllogictest/test_files/push_down_topk_through_join.slt @@ -0,0 +1,1224 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Tests for pushing TopK (Sort with fetch) through outer joins + +statement ok +set datafusion.execution.target_partitions = 1; + +statement ok +set datafusion.explain.logical_plan_only = true; + +# Create test tables +statement ok +CREATE TABLE t1 (a INT, b INT, c VARCHAR) AS VALUES + (1, 10, 'one'), + (2, 20, 'two'), + (3, 30, 'three'), + (4, 40, 'four'), + (5, 50, 'five'); + +statement ok +CREATE TABLE t2 (x INT, y INT, z VARCHAR) AS VALUES + (1, 100, 'alpha'), + (2, 200, 'beta'), + (3, 300, 'gamma'), + (6, 600, 'delta'), + (7, 700, 'epsilon'); + +### +### Positive cases — TopK should be pushed down +### + +# LEFT JOIN: TopK on left-side columns pushed to left child +query TT +EXPLAIN SELECT t1.a, t1.b, t2.x +FROM t1 LEFT JOIN t2 ON t1.a = t2.x +ORDER BY t1.b ASC LIMIT 3; +---- +logical_plan +01)Sort: t1.b ASC NULLS LAST, fetch=3 +02)--Left Join: t1.a = t2.x +03)----Sort: t1.b ASC NULLS LAST, fetch=3 +04)------TableScan: t1 projection=[a, b] +05)----TableScan: t2 projection=[x] + +# Verify correctness of the above query +query III +SELECT t1.a, t1.b, t2.x +FROM t1 LEFT JOIN t2 ON t1.a = t2.x +ORDER BY t1.b ASC LIMIT 3; +---- +1 10 1 +2 20 2 +3 30 3 + +# RIGHT JOIN: TopK on right-side columns pushed to right child +query TT +EXPLAIN SELECT t1.a, t2.x, t2.y +FROM t1 RIGHT JOIN t2 ON t1.a = t2.x +ORDER BY t2.y ASC LIMIT 3; +---- +logical_plan +01)Sort: t2.y ASC NULLS LAST, fetch=3 +02)--Right Join: t1.a = t2.x +03)----TableScan: t1 projection=[a] +04)----Sort: t2.y ASC NULLS LAST, fetch=3 +05)------TableScan: t2 projection=[x, y] + +# Verify correctness +query III +SELECT t1.a, t2.x, t2.y +FROM t1 RIGHT JOIN t2 ON t1.a = t2.x +ORDER BY t2.y ASC LIMIT 3; +---- +1 1 100 +2 2 200 +3 3 300 + +### +### Negative cases — TopK should NOT be pushed down +### + +# INNER JOIN: no pushdown +query TT +EXPLAIN SELECT t1.a, t2.x +FROM t1 INNER JOIN t2 ON t1.a = t2.x +ORDER BY t1.b ASC LIMIT 3; +---- +logical_plan +01)Projection: t1.a, t2.x +02)--Sort: t1.b ASC NULLS LAST, 
fetch=3 +03)----Projection: t1.a, t2.x, t1.b +04)------Inner Join: t1.a = t2.x +05)--------TableScan: t1 projection=[a, b] +06)--------TableScan: t2 projection=[x] + +# LEFT JOIN but sort on right-side columns: no pushdown +query TT +EXPLAIN SELECT t1.a, t2.x, t2.y +FROM t1 LEFT JOIN t2 ON t1.a = t2.x +ORDER BY t2.y ASC LIMIT 3; +---- +logical_plan +01)Sort: t2.y ASC NULLS LAST, fetch=3 +02)--Left Join: t1.a = t2.x +03)----TableScan: t1 projection=[a] +04)----TableScan: t2 projection=[x, y] + +# FULL OUTER JOIN: no pushdown +query TT +EXPLAIN SELECT t1.a, t2.x +FROM t1 FULL OUTER JOIN t2 ON t1.a = t2.x +ORDER BY t1.b ASC LIMIT 3; +---- +logical_plan +01)Projection: t1.a, t2.x +02)--Sort: t1.b ASC NULLS LAST, fetch=3 +03)----Projection: t1.a, t2.x, t1.b +04)------Full Join: t1.a = t2.x +05)--------TableScan: t1 projection=[a, b] +06)--------TableScan: t2 projection=[x] + +# LEFT JOIN with non-equijoin filter on BOTH sides: pushdown OK +# Filter t1.b > t2.y is in the ON clause — it only controls matching, not +# which preserved (left) rows appear. All left rows are preserved. 
+query TT +EXPLAIN SELECT t1.a, t1.b, t2.x +FROM t1 LEFT JOIN t2 ON t1.a = t2.x AND t1.b > t2.y +ORDER BY t1.b ASC LIMIT 3; +---- +logical_plan +01)Sort: t1.b ASC NULLS LAST, fetch=3 +02)--Projection: t1.a, t1.b, t2.x +03)----Left Join: t1.a = t2.x Filter: t1.b > t2.y +04)------Sort: t1.b ASC NULLS LAST, fetch=3 +05)--------TableScan: t1 projection=[a, b] +06)------TableScan: t2 projection=[x, y] + +# Verify correctness: all left rows appear, filter only affects matching +query III +SELECT t1.a, t1.b, t2.x +FROM t1 LEFT JOIN t2 ON t1.a = t2.x AND t1.b > t2.y +ORDER BY t1.b ASC LIMIT 3; +---- +1 10 NULL +2 20 NULL +3 30 NULL + +# LEFT JOIN with non-equijoin filter on non-preserved side only: pushdown OK +query TT +EXPLAIN SELECT t1.a, t1.b, t2.x +FROM t1 LEFT JOIN t2 ON t1.a = t2.x AND t2.y > 100 +ORDER BY t1.b ASC LIMIT 3; +---- +logical_plan +01)Sort: t1.b ASC NULLS LAST, fetch=3 +02)--Left Join: t1.a = t2.x +03)----Sort: t1.b ASC NULLS LAST, fetch=3 +04)------TableScan: t1 projection=[a, b] +05)----Projection: t2.x +06)------Filter: t2.y > Int32(100) +07)--------TableScan: t2 projection=[x, y] + +# LEFT JOIN with preserved-side-only filter: pushdown OK +# Filter t1.b > 20 prevents matching for left rows with b <= 20, +# but those rows still appear with NULL-filled right columns. 
+query TT +EXPLAIN SELECT t1.a, t1.b, t2.x +FROM t1 LEFT JOIN t2 ON t1.a = t2.x AND t1.b > 20 +ORDER BY t1.b ASC LIMIT 3; +---- +logical_plan +01)Sort: t1.b ASC NULLS LAST, fetch=3 +02)--Left Join: t1.a = t2.x Filter: t1.b > Int32(20) +03)----Sort: t1.b ASC NULLS LAST, fetch=3 +04)------TableScan: t1 projection=[a, b] +05)----TableScan: t2 projection=[x] + +# Verify correctness: rows with b <= 20 get NULL right columns +query III +SELECT t1.a, t1.b, t2.x +FROM t1 LEFT JOIN t2 ON t1.a = t2.x AND t1.b > 20 +ORDER BY t1.b ASC LIMIT 3; +---- +1 10 NULL +2 20 NULL +3 30 3 + +# Verify correctness: non-preserved side filter +query III +SELECT t1.a, t1.b, t2.x +FROM t1 LEFT JOIN t2 ON t1.a = t2.x AND t2.y > 100 +ORDER BY t1.b ASC LIMIT 3; +---- +1 10 NULL +2 20 2 +3 30 3 + +# Sort without LIMIT: no pushdown +query TT +EXPLAIN SELECT t1.a, t1.b, t2.x +FROM t1 LEFT JOIN t2 ON t1.a = t2.x +ORDER BY t1.b ASC; +---- +logical_plan +01)Sort: t1.b ASC NULLS LAST +02)--Left Join: t1.a = t2.x +03)----TableScan: t1 projection=[a, b] +04)----TableScan: t2 projection=[x] + +### +### Sort child cases — push vs skip based on existing child Sort +### + +# Child has larger fetch: push our tighter limit +# The inner Sort(fetch=5) is tightened to fetch=2 in-place. 
+query TT +EXPLAIN SELECT * FROM ( + SELECT t1.a, t1.b, t2.x + FROM (SELECT * FROM t1 ORDER BY b ASC LIMIT 5) t1 + LEFT JOIN t2 ON t1.a = t2.x +) sub +ORDER BY b ASC LIMIT 2; +---- +logical_plan +01)Sort: sub.b ASC NULLS LAST, fetch=2 +02)--SubqueryAlias: sub +03)----Left Join: t1.a = t2.x +04)------SubqueryAlias: t1 +05)--------Sort: t1.b ASC NULLS LAST, fetch=2 +06)----------TableScan: t1 projection=[a, b] +07)------TableScan: t2 projection=[x] + +# Verify correctness +query III +SELECT * FROM ( + SELECT t1.a, t1.b, t2.x + FROM (SELECT * FROM t1 ORDER BY b ASC LIMIT 5) t1 + LEFT JOIN t2 ON t1.a = t2.x +) sub +ORDER BY b ASC LIMIT 2; +---- +1 10 1 +2 20 2 + +# Child has smaller fetch with same sort: our rule skips (already tighter). +# PushDownLimit inserts a Sort(fetch=5) that gets collapsed with the inner +# Sort(fetch=2) to Sort(fetch=2) +query TT +EXPLAIN SELECT * FROM ( + SELECT t1.a, t1.b, t2.x + FROM (SELECT * FROM t1 ORDER BY b ASC LIMIT 2) t1 + LEFT JOIN t2 ON t1.a = t2.x +) sub +ORDER BY b ASC LIMIT 5; +---- +logical_plan +01)Sort: sub.b ASC NULLS LAST, fetch=5 +02)--SubqueryAlias: sub +03)----Left Join: t1.a = t2.x +04)------SubqueryAlias: t1 +05)--------Sort: t1.b ASC NULLS LAST, fetch=2 +06)----------TableScan: t1 projection=[a, b] +07)------TableScan: t2 projection=[x] + +# Verify correctness +query III +SELECT * FROM ( + SELECT t1.a, t1.b, t2.x + FROM (SELECT * FROM t1 ORDER BY b ASC LIMIT 2) t1 + LEFT JOIN t2 ON t1.a = t2.x +) sub +ORDER BY b ASC LIMIT 5; +---- +1 10 1 +2 20 2 + +### +### Semi/Anti join cases — pushdown NOT supported +### (not all preserved-side rows appear in output, so pushing fetch +### could drop rows that would have survived the semi/anti filter) +### + +# LEFT SEMI JOIN: no pushdown +query TT +EXPLAIN SELECT t1.a, t1.b +FROM t1 LEFT SEMI JOIN t2 ON t1.a = t2.x +ORDER BY t1.b ASC LIMIT 3; +---- +logical_plan +01)Sort: t1.b ASC NULLS LAST, fetch=3 +02)--LeftSemi Join: t1.a = t2.x +03)----TableScan: t1 projection=[a, b] 
+04)----TableScan: t2 projection=[x] + +# LEFT ANTI JOIN: no pushdown +query TT +EXPLAIN SELECT t1.a, t1.b +FROM t1 LEFT ANTI JOIN t2 ON t1.a = t2.x +ORDER BY t1.b ASC LIMIT 3; +---- +logical_plan +01)Sort: t1.b ASC NULLS LAST, fetch=3 +02)--LeftAnti Join: t1.a = t2.x +03)----TableScan: t1 projection=[a, b] +04)----TableScan: t2 projection=[x] + +# RIGHT SEMI JOIN: no pushdown +query TT +EXPLAIN SELECT t2.x, t2.y +FROM t1 RIGHT SEMI JOIN t2 ON t1.a = t2.x +ORDER BY t2.y ASC LIMIT 3; +---- +logical_plan +01)Sort: t2.y ASC NULLS LAST, fetch=3 +02)--RightSemi Join: t1.a = t2.x +03)----TableScan: t1 projection=[a] +04)----TableScan: t2 projection=[x, y] + +# RIGHT ANTI JOIN: no pushdown +query TT +EXPLAIN SELECT t2.x, t2.y +FROM t1 RIGHT ANTI JOIN t2 ON t1.a = t2.x +ORDER BY t2.y ASC LIMIT 3; +---- +logical_plan +01)Sort: t2.y ASC NULLS LAST, fetch=3 +02)--RightAnti Join: t1.a = t2.x +03)----TableScan: t1 projection=[a] +04)----TableScan: t2 projection=[x, y] + +### +### Multi-column sort and OFFSET cases +### + +# ORDER BY columns from both sides: no pushdown +# Sort uses t1.b (left) and t2.y (right) — not all from preserved side +query TT +EXPLAIN SELECT t1.a, t1.b, t2.x, t2.y +FROM t1 LEFT JOIN t2 ON t1.a = t2.x +ORDER BY t1.b ASC, t2.y ASC LIMIT 3; +---- +logical_plan +01)Sort: t1.b ASC NULLS LAST, t2.y ASC NULLS LAST, fetch=3 +02)--Left Join: t1.a = t2.x +03)----TableScan: t1 projection=[a, b] +04)----TableScan: t2 projection=[x, y] + +# Verify correctness +query IIII +SELECT t1.a, t1.b, t2.x, t2.y +FROM t1 LEFT JOIN t2 ON t1.a = t2.x +ORDER BY t1.b ASC, t2.y ASC LIMIT 3; +---- +1 10 1 100 +2 20 2 200 +3 30 3 300 + +# LIMIT with OFFSET: pushdown still applies (Sort fetch = limit + offset = 3) +query TT +EXPLAIN SELECT t1.a, t1.b, t2.x +FROM t1 LEFT JOIN t2 ON t1.a = t2.x +ORDER BY t1.b ASC LIMIT 2 OFFSET 1; +---- +logical_plan +01)Limit: skip=1, fetch=2 +02)--Sort: t1.b ASC NULLS LAST, fetch=3 +03)----Left Join: t1.a = t2.x +04)------Sort: t1.b ASC NULLS LAST, 
fetch=3 +05)--------TableScan: t1 projection=[a, b] +06)------TableScan: t2 projection=[x] + +# Verify correctness: skip 1, take 2 +query III +SELECT t1.a, t1.b, t2.x +FROM t1 LEFT JOIN t2 ON t1.a = t2.x +ORDER BY t1.b ASC LIMIT 2 OFFSET 1; +---- +2 20 2 +3 30 3 + +### +### Projection expression resolution cases +### + +# Sort on a projected expression: the pushed Sort should use the +# pre-projection expression, not the aliased column name. +# ORDER BY neg_b (which is -t1.b) should push Sort(-t1.b) below the join. +query TT +EXPLAIN SELECT -t1.b AS neg_b, t2.x +FROM t1 LEFT JOIN t2 ON t1.a = t2.x +ORDER BY neg_b ASC LIMIT 3; +---- +logical_plan +01)Sort: neg_b ASC NULLS LAST, fetch=3 +02)--Projection: (- t1.b) AS neg_b, t2.x +03)----Left Join: t1.a = t2.x +04)------Sort: (- t1.b) ASC NULLS LAST, fetch=3 +05)--------TableScan: t1 projection=[a, b] +06)------TableScan: t2 projection=[x] + +# Verify correctness: -b ascending means largest b first +query II +SELECT -t1.b AS neg_b, t2.x +FROM t1 LEFT JOIN t2 ON t1.a = t2.x +ORDER BY neg_b ASC LIMIT 3; +---- +-50 NULL +-40 NULL +-30 3 + +# Non-deterministic sort expression (random()): no pushdown +# Duplicating random() would produce different values at each evaluation point. +query TT +EXPLAIN SELECT t1.a, t1.b, t2.x +FROM t1 LEFT JOIN t2 ON t1.a = t2.x +ORDER BY t1.b + random() ASC LIMIT 3; +---- +logical_plan +01)Sort: CAST(t1.b AS Float64) + random() ASC NULLS LAST, fetch=3 +02)--Left Join: t1.a = t2.x +03)----TableScan: t1 projection=[a, b] +04)----TableScan: t2 projection=[x] + +# Non-deterministic projected expression (random() AS col): no pushdown +# Sort references a column that resolves to random() through the projection. 
+query TT +EXPLAIN SELECT rand_col, t2.x +FROM ( + SELECT random() AS rand_col, t1.a, t2.x + FROM t1 LEFT JOIN t2 ON t1.a = t2.x +) +ORDER BY rand_col ASC LIMIT 3; +---- +logical_plan +01)Sort: rand_col ASC NULLS LAST, fetch=3 +02)--Projection: random() AS rand_col, t2.x +03)----Left Join: t1.a = t2.x +04)------TableScan: t1 projection=[a] +05)------TableScan: t2 projection=[x] + +### +### SubqueryAlias edge cases +### + +# SubqueryAlias without inner Sort: push new Sort below the join. +# Preserved child is SubqueryAlias(t1, TableScan) — no existing Sort to tighten, +# so a new Sort(fetch=2) is inserted above the SubqueryAlias. +query TT +EXPLAIN SELECT * FROM ( + SELECT t1.a, t1.b, t2.x + FROM (SELECT * FROM t1) t1 + LEFT JOIN t2 ON t1.a = t2.x +) sub +ORDER BY b ASC LIMIT 2; +---- +logical_plan +01)Sort: sub.b ASC NULLS LAST, fetch=2 +02)--SubqueryAlias: sub +03)----Left Join: t1.a = t2.x +04)------Sort: t1.b ASC NULLS LAST, fetch=2 +05)--------SubqueryAlias: t1 +06)----------TableScan: t1 projection=[a, b] +07)------TableScan: t2 projection=[x] + +# Verify correctness +query III +SELECT * FROM ( + SELECT t1.a, t1.b, t2.x + FROM (SELECT * FROM t1) t1 + LEFT JOIN t2 ON t1.a = t2.x +) sub +ORDER BY b ASC LIMIT 2; +---- +1 10 1 +2 20 2 + +# RIGHT JOIN with SubqueryAlias on preserved (right) side +# Inner Sort(fetch=10) is tightened to fetch=3 +query TT +EXPLAIN SELECT * FROM ( + SELECT t1.a, t2.x, t2.y + FROM t1 + RIGHT JOIN (SELECT * FROM t2 ORDER BY y ASC LIMIT 10) t2 + ON t1.a = t2.x +) sub +ORDER BY y ASC LIMIT 3; +---- +logical_plan +01)Sort: sub.y ASC NULLS LAST, fetch=3 +02)--SubqueryAlias: sub +03)----Right Join: t1.a = t2.x +04)------TableScan: t1 projection=[a] +05)------SubqueryAlias: t2 +06)--------Sort: t2.y ASC NULLS LAST, fetch=3 +07)----------TableScan: t2 projection=[x, y] + +# Verify correctness +query III +SELECT * FROM ( + SELECT t1.a, t2.x, t2.y + FROM t1 + RIGHT JOIN (SELECT * FROM t2 ORDER BY y ASC LIMIT 10) t2 + ON t1.a = t2.x +) sub +ORDER 
BY y ASC LIMIT 3; +---- +1 1 100 +2 2 200 +3 3 300 + +# SubqueryAlias with different alias name (foo ≠ t1) +# Column resolution: foo.b → t1.b through SubqueryAlias renaming. +query TT +EXPLAIN SELECT * FROM ( + SELECT foo.a, foo.b, t2.x + FROM (SELECT * FROM t1) foo + LEFT JOIN t2 ON foo.a = t2.x +) sub +ORDER BY b ASC LIMIT 3; +---- +logical_plan +01)Sort: sub.b ASC NULLS LAST, fetch=3 +02)--SubqueryAlias: sub +03)----Left Join: foo.a = t2.x +04)------Sort: foo.b ASC NULLS LAST, fetch=3 +05)--------SubqueryAlias: foo +06)----------TableScan: t1 projection=[a, b] +07)------TableScan: t2 projection=[x] + +# Verify correctness +query III +SELECT * FROM ( + SELECT foo.a, foo.b, t2.x + FROM (SELECT * FROM t1) foo + LEFT JOIN t2 ON foo.a = t2.x +) sub +ORDER BY b ASC LIMIT 3; +---- +1 10 1 +2 20 2 +3 30 3 + +# Sort on non-preserved side column through SubqueryAlias: no pushdown +# ORDER BY t2.x is from the non-preserved (right) side of a LEFT JOIN. +query TT +EXPLAIN SELECT * FROM ( + SELECT t1.a, t1.b, t2.x + FROM (SELECT * FROM t1) t1 + LEFT JOIN t2 ON t1.a = t2.x +) sub +ORDER BY x ASC LIMIT 3; +---- +logical_plan +01)Sort: sub.x ASC NULLS LAST, fetch=3 +02)--SubqueryAlias: sub +03)----Left Join: t1.a = t2.x +04)------SubqueryAlias: t1 +05)--------TableScan: t1 projection=[a, b] +06)------TableScan: t2 projection=[x] + +# INNER JOIN through SubqueryAlias: no pushdown (only LEFT/RIGHT outer joins and CROSS joins are eligible) +query TT +EXPLAIN SELECT * FROM ( + SELECT t1.a, t1.b, t2.x + FROM (SELECT * FROM t1) t1 + INNER JOIN t2 ON t1.a = t2.x +) sub +ORDER BY b ASC LIMIT 3; +---- +logical_plan +01)Sort: sub.b ASC NULLS LAST, fetch=3 +02)--SubqueryAlias: sub +03)----Inner Join: t1.a = t2.x +04)------SubqueryAlias: t1 +05)--------TableScan: t1 projection=[a, b] +06)------TableScan: t2 projection=[x] + +# Multiple sort columns from preserved side through SubqueryAlias +query TT +EXPLAIN SELECT * FROM ( + SELECT t1.a, t1.b, t2.x + FROM (SELECT * FROM t1) t1 + LEFT JOIN t2 ON t1.a = t2.x +) sub +ORDER BY a ASC, 
b ASC LIMIT 3; +---- +logical_plan +01)Sort: sub.a ASC NULLS LAST, sub.b ASC NULLS LAST, fetch=3 +02)--SubqueryAlias: sub +03)----Left Join: t1.a = t2.x +04)------Sort: t1.a ASC NULLS LAST, t1.b ASC NULLS LAST, fetch=3 +05)--------SubqueryAlias: t1 +06)----------TableScan: t1 projection=[a, b] +07)------TableScan: t2 projection=[x] + +# Verify correctness +query III +SELECT * FROM ( + SELECT t1.a, t1.b, t2.x + FROM (SELECT * FROM t1) t1 + LEFT JOIN t2 ON t1.a = t2.x +) sub +ORDER BY a ASC, b ASC LIMIT 3; +---- +1 10 1 +2 20 2 +3 30 3 + +# WHERE filter on preserved side: pushdown still happens +# PushDownFilter pushes the WHERE filter below the Join first, +# then our rule sees Sort → Join (no Filter in between) and pushes TopK. +query TT +EXPLAIN SELECT t1.a, t1.b, t2.x +FROM t1 LEFT JOIN t2 ON t1.a = t2.x +WHERE t1.b > 10 +ORDER BY t1.b ASC LIMIT 3; +---- +logical_plan +01)Sort: t1.b ASC NULLS LAST, fetch=3 +02)--Left Join: t1.a = t2.x +03)----Sort: t1.b ASC NULLS LAST, fetch=3 +04)------Filter: t1.b > Int32(10) +05)--------TableScan: t1 projection=[a, b] +06)----TableScan: t2 projection=[x] + +# Verify correctness +query III +SELECT t1.a, t1.b, t2.x +FROM t1 LEFT JOIN t2 ON t1.a = t2.x +WHERE t1.b > 10 +ORDER BY t1.b ASC LIMIT 3; +---- +2 20 2 +3 30 3 +4 40 NULL + +### +### Descending order and NULLS FIRST cases +### + +# LEFT JOIN: TopK with DESC sort pushed to left child +query TT +EXPLAIN SELECT t1.a, t1.b, t2.x +FROM t1 LEFT JOIN t2 ON t1.a = t2.x +ORDER BY t1.b DESC LIMIT 3; +---- +logical_plan +01)Sort: t1.b DESC NULLS FIRST, fetch=3 +02)--Left Join: t1.a = t2.x +03)----Sort: t1.b DESC NULLS FIRST, fetch=3 +04)------TableScan: t1 projection=[a, b] +05)----TableScan: t2 projection=[x] + +# Verify correctness +query III +SELECT t1.a, t1.b, t2.x +FROM t1 LEFT JOIN t2 ON t1.a = t2.x +ORDER BY t1.b DESC LIMIT 3; +---- +5 50 NULL +4 40 NULL +3 30 3 + +# LEFT JOIN: TopK with ASC NULLS FIRST pushed to left child +query TT +EXPLAIN SELECT t1.a, t1.b, t2.x +FROM t1 
LEFT JOIN t2 ON t1.a = t2.x +ORDER BY t1.b ASC NULLS FIRST LIMIT 3; +---- +logical_plan +01)Sort: t1.b ASC NULLS FIRST, fetch=3 +02)--Left Join: t1.a = t2.x +03)----Sort: t1.b ASC NULLS FIRST, fetch=3 +04)------TableScan: t1 projection=[a, b] +05)----TableScan: t2 projection=[x] + +# Verify correctness +query III +SELECT t1.a, t1.b, t2.x +FROM t1 LEFT JOIN t2 ON t1.a = t2.x +ORDER BY t1.b ASC NULLS FIRST LIMIT 3; +---- +1 10 1 +2 20 2 +3 30 3 + +# RIGHT JOIN: TopK with DESC NULLS LAST pushed to right child +query TT +EXPLAIN SELECT t1.a, t2.x, t2.y +FROM t1 RIGHT JOIN t2 ON t1.a = t2.x +ORDER BY t2.y DESC NULLS LAST LIMIT 3; +---- +logical_plan +01)Sort: t2.y DESC NULLS LAST, fetch=3 +02)--Right Join: t1.a = t2.x +03)----TableScan: t1 projection=[a] +04)----Sort: t2.y DESC NULLS LAST, fetch=3 +05)------TableScan: t2 projection=[x, y] + +# Verify correctness +query III +SELECT t1.a, t2.x, t2.y +FROM t1 RIGHT JOIN t2 ON t1.a = t2.x +ORDER BY t2.y DESC NULLS LAST LIMIT 3; +---- +NULL 7 700 +NULL 6 600 +3 3 300 + +### +### CROSS JOIN — pushdown to whichever side has the sort columns +### + +# CROSS JOIN: TopK on left-side columns pushed to left child. +# Every left row appears |t2| times in the output, so the top-N by +# left columns must come from the top-N left rows. +query TT +EXPLAIN SELECT t1.a, t1.b, t2.x +FROM t1 CROSS JOIN t2 +ORDER BY t1.b ASC LIMIT 3; +---- +logical_plan +01)Sort: t1.b ASC NULLS LAST, fetch=3 +02)--Cross Join: +03)----Sort: t1.b ASC NULLS LAST, fetch=3 +04)------TableScan: t1 projection=[a, b] +05)----TableScan: t2 projection=[x] + +# Verify correctness (t2.x is added to ORDER BY only as a tie-breaker so the output is deterministic) +query III +SELECT t1.a, t1.b, t2.x +FROM t1 CROSS JOIN t2 +ORDER BY t1.b ASC, t2.x ASC LIMIT 3; +---- +1 10 1 +1 10 2 +1 10 3 + +# CROSS JOIN: TopK on right-side columns pushed to right child. 
+query TT +EXPLAIN SELECT t1.a, t2.x, t2.y +FROM t1 CROSS JOIN t2 +ORDER BY t2.y ASC LIMIT 3; +---- +logical_plan +01)Sort: t2.y ASC NULLS LAST, fetch=3 +02)--Cross Join: +03)----TableScan: t1 projection=[a] +04)----Sort: t2.y ASC NULLS LAST, fetch=3 +05)------TableScan: t2 projection=[x, y] + +# Verify correctness (t1.a is added to ORDER BY only as a tie-breaker so the output is deterministic) +query III +SELECT t1.a, t2.x, t2.y +FROM t1 CROSS JOIN t2 +ORDER BY t2.y ASC, t1.a ASC LIMIT 3; +---- +1 1 100 +2 1 100 +3 1 100 + +# CROSS JOIN: sort spans both sides → no pushdown +query TT +EXPLAIN SELECT t1.a, t1.b, t2.y +FROM t1 CROSS JOIN t2 +ORDER BY t1.b + t2.y ASC LIMIT 3; +---- +logical_plan +01)Sort: t1.b + t2.y ASC NULLS LAST, fetch=3 +02)--Cross Join: +03)----TableScan: t1 projection=[a, b] +04)----TableScan: t2 projection=[y] + +# Inner join with no equi-keys but a non-empty filter: +# the filter can drop rows from either side, so pushing fetch=N may select +# rows that get filtered out while discarding rows that would have matched. +# Sort stays above the join, no pushdown to t1. 
+query TT +EXPLAIN SELECT t1.a, t1.b, t2.x +FROM t1 INNER JOIN t2 ON t1.b > t2.y +ORDER BY t1.b ASC LIMIT 3; +---- +logical_plan +01)Sort: t1.b ASC NULLS LAST, fetch=3 +02)--Projection: t1.a, t1.b, t2.x +03)----Inner Join: Filter: t1.b > t2.y +04)------TableScan: t1 projection=[a, b] +05)------TableScan: t2 projection=[x, y] + +### +### Multi-level outer joins +### + +# Chained LEFT JOINs: TopK pushed to leftmost preserved child +statement ok +CREATE TABLE t3 (p INT, q INT) AS VALUES + (1, 1000), + (2, 2000), + (3, 3000); + +query TT +EXPLAIN SELECT t1.a, t1.b, t2.x, t3.p +FROM t1 +LEFT JOIN t2 ON t1.a = t2.x +LEFT JOIN t3 ON t1.a = t3.p +ORDER BY t1.b ASC LIMIT 2; +---- +logical_plan +01)Sort: t1.b ASC NULLS LAST, fetch=2 +02)--Left Join: t1.a = t3.p +03)----Sort: t1.b ASC NULLS LAST, fetch=2 +04)------Left Join: t1.a = t2.x +05)--------Sort: t1.b ASC NULLS LAST, fetch=2 +06)----------TableScan: t1 projection=[a, b] +07)--------TableScan: t2 projection=[x] +08)----TableScan: t3 projection=[p] + +# Verify correctness +query IIII +SELECT t1.a, t1.b, t2.x, t3.p +FROM t1 +LEFT JOIN t2 ON t1.a = t2.x +LEFT JOIN t3 ON t1.a = t3.p +ORDER BY t1.b ASC LIMIT 2; +---- +1 10 1 1 +2 20 2 2 + +statement ok +DROP TABLE t3; + +### +### Tied sort key scenarios +### + +# Tied sort keys: pushdown is safe, all tied rows from preserved side appear +statement ok +CREATE TABLE t_tied (a INT, b INT) AS VALUES + (1, 10), + (2, 10), + (3, 10), + (4, 20), + (5, 30); + +statement ok +CREATE TABLE t_other (x INT) AS VALUES (1), (2), (3); + +query TT +EXPLAIN SELECT t_tied.a, t_tied.b, t_other.x +FROM t_tied LEFT JOIN t_other ON t_tied.a = t_other.x +ORDER BY t_tied.b ASC, t_tied.a ASC LIMIT 3; +---- +logical_plan +01)Sort: t_tied.b ASC NULLS LAST, t_tied.a ASC NULLS LAST, fetch=3 +02)--Left Join: t_tied.a = t_other.x +03)----Sort: t_tied.b ASC NULLS LAST, t_tied.a ASC NULLS LAST, fetch=3 +04)------TableScan: t_tied projection=[a, b] +05)----TableScan: t_other projection=[x] + +# Verify 
correctness: 3 rows with b=10, tied but only 3 emitted by LIMIT +query III +SELECT t_tied.a, t_tied.b, t_other.x +FROM t_tied LEFT JOIN t_other ON t_tied.a = t_other.x +ORDER BY t_tied.b ASC, t_tied.a ASC LIMIT 3; +---- +1 10 1 +2 10 2 +3 10 3 + +statement ok +DROP TABLE t_tied; + +statement ok +DROP TABLE t_other; + +### +### Nested SubqueryAlias cases +### + +# Nested SubqueryAlias: resolve through multiple alias layers +query TT +EXPLAIN SELECT * FROM ( + SELECT inner_sub.a, inner_sub.b, t2.x + FROM ( + SELECT * FROM (SELECT * FROM t1) inner_alias + ) inner_sub + LEFT JOIN t2 ON inner_sub.a = t2.x +) outer_sub +ORDER BY b ASC LIMIT 2; +---- +logical_plan +01)Sort: outer_sub.b ASC NULLS LAST, fetch=2 +02)--SubqueryAlias: outer_sub +03)----Left Join: inner_sub.a = t2.x +04)------Sort: inner_sub.b ASC NULLS LAST, fetch=2 +05)--------SubqueryAlias: inner_sub +06)----------SubqueryAlias: inner_alias +07)------------TableScan: t1 projection=[a, b] +08)------TableScan: t2 projection=[x] + +# Verify correctness +query III +SELECT * FROM ( + SELECT inner_sub.a, inner_sub.b, t2.x + FROM ( + SELECT * FROM (SELECT * FROM t1) inner_alias + ) inner_sub + LEFT JOIN t2 ON inner_sub.a = t2.x +) outer_sub +ORDER BY b ASC LIMIT 2; +---- +1 10 1 +2 20 2 + +# Nested SubqueryAlias with existing inner Sort: tighten fetch in-place. +# The inner Sort(fetch=5) is behind two SubqueryAlias layers. The deep +# resolution finds it, confirms same sort exprs, and tightens to fetch=2. 
+query TT +EXPLAIN SELECT * FROM ( + SELECT inner_sub.a, inner_sub.b, t2.x + FROM ( + SELECT * FROM (SELECT * FROM t1 ORDER BY b ASC LIMIT 5) inner_alias + ) inner_sub + LEFT JOIN t2 ON inner_sub.a = t2.x +) outer_sub +ORDER BY b ASC LIMIT 2; +---- +logical_plan +01)Sort: outer_sub.b ASC NULLS LAST, fetch=2 +02)--SubqueryAlias: outer_sub +03)----Left Join: inner_sub.a = t2.x +04)------SubqueryAlias: inner_sub +05)--------SubqueryAlias: inner_alias +06)----------Sort: t1.b ASC NULLS LAST, fetch=2 +07)------------TableScan: t1 projection=[a, b] +08)------TableScan: t2 projection=[x] + +# Verify correctness +query III +SELECT * FROM ( + SELECT inner_sub.a, inner_sub.b, t2.x + FROM ( + SELECT * FROM (SELECT * FROM t1 ORDER BY b ASC LIMIT 5) inner_alias + ) inner_sub + LEFT JOIN t2 ON inner_sub.a = t2.x +) outer_sub +ORDER BY b ASC LIMIT 2; +---- +1 10 1 +2 20 2 + +# Nested SubqueryAlias with tighter existing inner Sort: skip (already tighter). +# Inner Sort(fetch=2) is tighter than outer fetch=5, so rule skips. +query TT +EXPLAIN SELECT * FROM ( + SELECT inner_sub.a, inner_sub.b, t2.x + FROM ( + SELECT * FROM (SELECT * FROM t1 ORDER BY b ASC LIMIT 2) inner_alias + ) inner_sub + LEFT JOIN t2 ON inner_sub.a = t2.x +) outer_sub +ORDER BY b ASC LIMIT 5; +---- +logical_plan +01)Sort: outer_sub.b ASC NULLS LAST, fetch=5 +02)--SubqueryAlias: outer_sub +03)----Left Join: inner_sub.a = t2.x +04)------SubqueryAlias: inner_sub +05)--------SubqueryAlias: inner_alias +06)----------Sort: t1.b ASC NULLS LAST, fetch=2 +07)------------TableScan: t1 projection=[a, b] +08)------TableScan: t2 projection=[x] + +# Verify correctness +query III +SELECT * FROM ( + SELECT inner_sub.a, inner_sub.b, t2.x + FROM ( + SELECT * FROM (SELECT * FROM t1 ORDER BY b ASC LIMIT 2) inner_alias + ) inner_sub + LEFT JOIN t2 ON inner_sub.a = t2.x +) outer_sub +ORDER BY b ASC LIMIT 5; +---- +1 10 1 +2 20 2 + +# Inner Sort with DIFFERENT exprs and fetch: pushdown still happens. 
+# Inner Sort(a, fetch=5) already limits to 5 rows. Pushed Sort(b, fetch=2) +# re-sorts those 5 rows (cheap) and reduces to 2 rows entering the join. +query TT +EXPLAIN SELECT * FROM ( + SELECT inner_sub.a, inner_sub.b, t2.x + FROM ( + SELECT * FROM (SELECT * FROM t1 ORDER BY a ASC LIMIT 5) inner_alias + ) inner_sub + LEFT JOIN t2 ON inner_sub.a = t2.x +) outer_sub +ORDER BY b ASC LIMIT 2; +---- +logical_plan +01)Sort: outer_sub.b ASC NULLS LAST, fetch=2 +02)--SubqueryAlias: outer_sub +03)----Left Join: inner_sub.a = t2.x +04)------Sort: inner_sub.b ASC NULLS LAST, fetch=2 +05)--------SubqueryAlias: inner_sub +06)----------SubqueryAlias: inner_alias +07)------------Sort: t1.a ASC NULLS LAST, fetch=5 +08)--------------TableScan: t1 projection=[a, b] +09)------TableScan: t2 projection=[x] + +# Verify correctness +query III +SELECT * FROM ( + SELECT inner_sub.a, inner_sub.b, t2.x + FROM ( + SELECT * FROM (SELECT * FROM t1 ORDER BY a ASC LIMIT 5) inner_alias + ) inner_sub + LEFT JOIN t2 ON inner_sub.a = t2.x +) outer_sub +ORDER BY b ASC LIMIT 2; +---- +1 10 1 +2 20 2 + +# Inner Sort with different exprs but NO fetch (full sort): pushdown OK. +# Our rule inserts Sort(b, fetch=2) above Sort(a, no fetch). The inner full +# sort is then eliminated by other optimizer rules since it's redundant. 
+query TT +EXPLAIN SELECT * FROM ( + SELECT inner_sub.a, inner_sub.b, t2.x + FROM ( + SELECT * FROM (SELECT * FROM t1 ORDER BY a ASC) inner_alias + ) inner_sub + LEFT JOIN t2 ON inner_sub.a = t2.x +) outer_sub +ORDER BY b ASC LIMIT 2; +---- +logical_plan +01)Sort: outer_sub.b ASC NULLS LAST, fetch=2 +02)--SubqueryAlias: outer_sub +03)----Left Join: inner_sub.a = t2.x +04)------Sort: inner_sub.b ASC NULLS LAST, fetch=2 +05)--------SubqueryAlias: inner_sub +06)----------SubqueryAlias: inner_alias +07)------------TableScan: t1 projection=[a, b] +08)------TableScan: t2 projection=[x] + +# Verify correctness +query III +SELECT * FROM ( + SELECT inner_sub.a, inner_sub.b, t2.x + FROM ( + SELECT * FROM (SELECT * FROM t1 ORDER BY a ASC) inner_alias + ) inner_sub + LEFT JOIN t2 ON inner_sub.a = t2.x +) outer_sub +ORDER BY b ASC LIMIT 2; +---- +1 10 1 +2 20 2 + +# Tighten Sort through SubqueryAlias + Projection + SubqueryAlias. +# The inner Sort(fetch=5) is behind SubqueryAlias(inner_sub) → Projection(rename) → SubqueryAlias(inner_alias). +# The Projection renames b → renamed_b, so it survives as a plan node. +# Deep resolution looks through the Projection to find the Sort and tightens it to fetch=2. 
+query TT +EXPLAIN SELECT * FROM ( + SELECT inner_sub.a, inner_sub.renamed_b, t2.x + FROM ( + SELECT a, b AS renamed_b FROM (SELECT * FROM t1 ORDER BY b ASC LIMIT 5) inner_alias + ) inner_sub + LEFT JOIN t2 ON inner_sub.a = t2.x +) outer_sub +ORDER BY renamed_b ASC LIMIT 2; +---- +logical_plan +01)Sort: outer_sub.renamed_b ASC NULLS LAST, fetch=2 +02)--SubqueryAlias: outer_sub +03)----Left Join: inner_sub.a = t2.x +04)------SubqueryAlias: inner_sub +05)--------Projection: inner_alias.a, inner_alias.b AS renamed_b +06)----------SubqueryAlias: inner_alias +07)------------Sort: t1.b ASC NULLS LAST, fetch=2 +08)--------------TableScan: t1 projection=[a, b] +09)------TableScan: t2 projection=[x] + +# Verify correctness +query III +SELECT * FROM ( + SELECT inner_sub.a, inner_sub.renamed_b, t2.x + FROM ( + SELECT a, b AS renamed_b FROM (SELECT * FROM t1 ORDER BY b ASC LIMIT 5) inner_alias + ) inner_sub + LEFT JOIN t2 ON inner_sub.a = t2.x +) outer_sub +ORDER BY renamed_b ASC LIMIT 2; +---- +1 10 1 +2 20 2 + +# Sort above Projection only needs to sort the projected column subset, +# which is more efficient than sorting all pre-projection columns. +query TT +EXPLAIN SELECT * FROM ( + SELECT t1.a, t1.renamed_b, t2.x + FROM (SELECT a, b AS renamed_b FROM t1) t1 + LEFT JOIN t2 ON t1.a = t2.x +) sub +ORDER BY renamed_b ASC LIMIT 2; +---- +logical_plan +01)Sort: sub.renamed_b ASC NULLS LAST, fetch=2 +02)--SubqueryAlias: sub +03)----Left Join: t1.a = t2.x +04)------Sort: t1.renamed_b ASC NULLS LAST, fetch=2 +05)--------SubqueryAlias: t1 +06)----------Projection: t1.a, t1.b AS renamed_b +07)------------TableScan: t1 projection=[a, b] +08)------TableScan: t2 projection=[x] + +# Verify correctness +query III +SELECT * FROM ( + SELECT t1.a, t1.renamed_b, t2.x + FROM (SELECT a, b AS renamed_b FROM t1) t1 + LEFT JOIN t2 ON t1.a = t2.x +) sub +ORDER BY renamed_b ASC LIMIT 2; +---- +1 10 1 +2 20 2 + +# Volatile expression inside Projection of preserved child: pushdown IS safe. 
+# The Projection computes random() once and names it rand_col. The pushed +# Sort only reorders by the pre-computed rand_col column — it does NOT +# re-evaluate random(). This is different from having random() directly +# in the sort expression (which IS blocked by the volatility check). +query TT +EXPLAIN SELECT * FROM ( + SELECT t1.rand_col, t2.x + FROM (SELECT random() AS rand_col, a FROM t1) t1 + LEFT JOIN t2 ON t1.a = t2.x +) sub +ORDER BY rand_col ASC LIMIT 2; +---- +logical_plan +01)Sort: sub.rand_col ASC NULLS LAST, fetch=2 +02)--SubqueryAlias: sub +03)----Projection: t1.rand_col, t2.x +04)------Left Join: t1.a = t2.x +05)--------Sort: t1.rand_col ASC NULLS LAST, fetch=2 +06)----------SubqueryAlias: t1 +07)------------Projection: random() AS rand_col, t1.a +08)--------------TableScan: t1 projection=[a] +09)--------TableScan: t2 projection=[x] + +# Volatile expression inside the preserved child *and* an existing inner +# Sort whose expr is also `random()`: must NOT tighten the inner Sort. +# +# After resolving the outer ORDER BY (a non-volatile column reference) +# through the inner `random() AS rand_col` Projection, the deep-resolved +# sort expr becomes `random()`. The existing inner Sort below the +# Projection happens to also be on `random()` — but those are independent +# random() invocations producing different orderings. Treating them as +# "same expr" and tightening fetch=10 → fetch=2 would discard rows the +# outer ordering would have ranked high. +# +# Expected: fall back to inserting a new Sort(t1.rand_col, fetch=2) above +# the preserved-side SubqueryAlias; the inner Sort(random(), fetch=10) +# stays untouched. 
+query TT +EXPLAIN SELECT * FROM ( + SELECT t1.rand_col, t2.x + FROM ( + SELECT random() AS rand_col, a + FROM (SELECT a FROM t1 ORDER BY random() LIMIT 10) + ) t1 + LEFT JOIN t2 ON t1.a = t2.x +) sub +ORDER BY rand_col ASC LIMIT 2; +---- +logical_plan +01)Sort: sub.rand_col ASC NULLS LAST, fetch=2 +02)--SubqueryAlias: sub +03)----Projection: t1.rand_col, t2.x +04)------Left Join: t1.a = t2.x +05)--------Sort: t1.rand_col ASC NULLS LAST, fetch=2 +06)----------SubqueryAlias: t1 +07)------------Projection: random() AS rand_col, t1.a +08)--------------Sort: random() ASC NULLS LAST, fetch=10 +09)----------------TableScan: t1 projection=[a] +10)--------TableScan: t2 projection=[x] + +### +### Config reset +### + +statement ok +set datafusion.execution.target_partitions = 4; + +statement ok +reset datafusion.explain.logical_plan_only; + +statement ok +DROP TABLE t1; + +statement ok +DROP TABLE t2; \ No newline at end of file