diff --git a/rust/lance/src/dataset/write/merge_insert.rs b/rust/lance/src/dataset/write/merge_insert.rs
index b14421c963f..075c4551f73 100644
--- a/rust/lance/src/dataset/write/merge_insert.rs
+++ b/rust/lance/src/dataset/write/merge_insert.rs
@@ -58,7 +58,7 @@ use crate::{
},
index::DatasetIndexInternalExt,
io::exec::{
- AddRowAddrExec, Planner, TakeExec, project,
+ AddRowAddrExec, LateMaterializeJoin, LateTakePlanner, Planner, TakeExec, project,
scalar_index::{IndexLookup, MapIndexExec},
utils::ReplayExec,
},
@@ -72,6 +72,7 @@ use arrow_select::take::take_record_batch;
use datafusion::common::NullEquality;
use datafusion::common::tree_node::{Transformed, TreeNode};
use datafusion::error::DataFusionError;
+use datafusion::optimizer::{OptimizerContext, OptimizerRule};
use datafusion::{
execution::{
context::{SessionConfig, SessionContext},
@@ -88,7 +89,7 @@ use datafusion::{
stream::RecordBatchStreamAdapter,
union::UnionExec,
},
- physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner},
+ physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner},
prelude::DataFrame,
scalar::ScalarValue,
};
@@ -1538,11 +1539,30 @@ impl MergeInsertJob {
node: Arc::new(write_node),
});
+ // First pass: standard optimization. The non-source "fill" columns are
+ // still referenced (each copies `target.<col>`), so projection pushdown
+ // keeps them — and `target._rowaddr` — in the target scan.
let logical_plan = session_state.optimize(&logical_plan)?;
- let planner =
- DefaultPhysicalPlanner::with_extension_planners(vec![Arc::new(MergeInsertPlanner {})]);
- // This method already does the optimization for us.
+ // Defer reading non-source columns: a partial-schema upsert reads the
+ // missing columns from the target side of the join only to rewrite full
+ // rows. This rule inserts a `LateTake` above the join so those wide
+ // columns are fetched by `_rowaddr` for the matched rows only, instead
+ // of being scanned for every target row. It is applied only here (not in
+ // the session-wide optimizer) to bound its blast radius.
+ let logical_plan = LateMaterializeJoin::new()
+ .rewrite(logical_plan, &OptimizerContext::default())?
+ .data;
+
+ // Second pass: the deferred columns are now absent from the `LateTake`'s
+ // input, so projection pushdown narrows the target scan to drop them
+ // while keeping `_rowaddr` (which the take forces in).
+ let logical_plan = session_state.optimize(&logical_plan)?;
+
+ let planner = DefaultPhysicalPlanner::with_extension_planners(vec![
+ Arc::new(MergeInsertPlanner {}) as Arc,
+ Arc::new(LateTakePlanner) as Arc,
+ ]);
let physical_plan = planner
.create_physical_plan(&logical_plan, &session_state)
.await?;
@@ -4521,20 +4541,379 @@ mod tests {
"expected HashJoinExec in plan, got: {}",
plan
);
- // Evidence that the partial-schema fix is active: the target
- // side of the join reads the `other` column (which is missing
- // from the source) and an explicit projection carries it
- // through to the write exec alongside source columns.
+ // Late materialization is active: the `other` column (missing from
+ // the source) is *not* read by the target scan. Instead it is
+ // fetched by a `Take` inserted above the join, so a selective match
+ // does not scan `other` for every target row.
+ assert!(
+ plan.contains("LanceRead") && plan.contains("projection=[key]"),
+ "target-side scan should only read the join key, not `other`: {}",
+ plan
+ );
+ assert!(
+ !plan.contains("projection=[other"),
+ "deferred `other` column must not be in the target scan projection: {}",
+ plan
+ );
+ assert!(
+ plan.contains("Take") && plan.contains("(other)"),
+ "expected a Take above the join fetching the deferred `other` column: {}",
+ plan
+ );
+ }
+
+ /// Extract the `output_rows` metric of the first plan node whose
+ /// (trimmed) display line starts with `node_prefix`, from an
+ /// `analyze_plan` string.
+ fn output_rows_for_node(analysis: &str, node_prefix: &str) -> Option {
+ let line = analysis
+ .lines()
+ .find(|l| l.trim_start().starts_with(node_prefix))?;
+ let start = line.find("output_rows=")? + "output_rows=".len();
+ let rest = &line[start..];
+ let end = rest
+ .find(|c: char| !c.is_ascii_digit())
+ .unwrap_or(rest.len());
+ rest[..end].parse().ok()
+ }
+
+ /// The read-amplification payoff: on a selective partial-schema update,
+ /// the wide non-source column must be fetched only for the matched rows
+ /// (via the `Take`), not scanned for every target row.
+ #[tokio::test]
+ async fn test_merge_insert_subcols_defers_wide_column_reads() {
+ // 100 rows across 4 fragments; `other` is a wide (string) column,
+ // `key`/`value` are narrow. Keys are 0..100 so matches are precise.
+ let batch = lance_datagen::gen_batch()
+ .with_seed(Seed::from(1))
+ .col("other", array::rand_utf8(64.into(), false))
+ .col("value", array::step::())
+ .col("key", array::step_custom::(0, 1))
+ .into_batch_rows(RowCount::from(100))
+ .unwrap();
+ let schema = batch.schema();
+ let ds = Dataset::write(
+ RecordBatchIterator::new([Ok(batch)], schema.clone()),
+ "memory://",
+ Some(WriteParams {
+ max_rows_per_file: 25,
+ ..Default::default()
+ }),
+ )
+ .await
+ .unwrap();
+ let ds = Arc::new(ds);
+
+ // Partial-schema source (key, value) updating only 5 of 100 keys.
+ let update_schema = Arc::new(schema.project(&[2, 1]).unwrap());
+ let new_data = RecordBatch::try_new(
+ update_schema,
+ vec![
+ Arc::new(UInt32Array::from(vec![0u32, 1, 2, 3, 4])),
+ Arc::new(UInt32Array::from(vec![1000u32, 1001, 1002, 1003, 1004])),
+ ],
+ )
+ .unwrap();
+
+ let job = MergeInsertBuilder::try_new(ds.clone(), vec!["key".to_string()])
+ .unwrap()
+ .when_matched(WhenMatched::UpdateAll)
+ .when_not_matched(WhenNotMatched::DoNothing)
+ .try_build()
+ .unwrap();
+
+ let source = reader_to_stream(Box::new(RecordBatchIterator::new(
+ [Ok(new_data.clone())],
+ new_data.schema(),
+ )));
+ let analysis = job.analyze_plan(source).await.unwrap();
+
+ // The target scan reads every row but only the narrow key column...
+ let lance_read = analysis
+ .lines()
+ .find(|l| l.trim_start().starts_with("LanceRead"))
+ .unwrap_or_else(|| panic!("no LanceRead node:\n{}", analysis));
+ assert!(
+ lance_read.contains("projection=[key]"),
+ "target scan must defer the wide `other` column: {}",
+ lance_read
+ );
+ assert_eq!(
+ output_rows_for_node(&analysis, "LanceRead"),
+ Some(100),
+ "target scan should still visit all rows:\n{}",
+ analysis
+ );
+
+ // ...and `other` is materialized only for the 5 matched rows.
+ assert_eq!(
+ output_rows_for_node(&analysis, "Take"),
+ Some(5),
+ "wide column should be taken for only the matched rows:\n{}",
+ analysis
+ );
+ }
+
+ /// A partial-schema `UpdateIf` whose condition references the deferred
+ /// (non-source) target column must still evaluate correctly: the column
+ /// is fetched once by the `Take` and read from there by the action
+ /// expression, not double-fetched or lost.
+ #[tokio::test]
+ async fn test_merge_insert_subcols_update_if_on_deferred_column() {
+ let schema = Arc::new(Schema::new(vec![
+ Field::new("key", DataType::UInt32, false),
+ Field::new("value", DataType::UInt32, true),
+ Field::new("other", DataType::Utf8, true),
+ ]));
+ // `other` gates the update; keys 0,2,4 are "keep", 1,3,5 are "skip".
+ let batch = RecordBatch::try_new(
+ schema.clone(),
+ vec![
+ Arc::new(UInt32Array::from(vec![0u32, 1, 2, 3, 4, 5])),
+ Arc::new(UInt32Array::from(vec![0u32, 1, 2, 3, 4, 5])),
+ Arc::new(StringArray::from(vec![
+ "keep", "skip", "keep", "skip", "keep", "skip",
+ ])),
+ ],
+ )
+ .unwrap();
+ let ds = Dataset::write(
+ RecordBatchIterator::new([Ok(batch)], schema.clone()),
+ "memory://",
+ Some(WriteParams {
+ max_rows_per_file: 3, // two fragments
+ ..Default::default()
+ }),
+ )
+ .await
+ .unwrap();
+ let ds = Arc::new(ds);
+
+ // Partial-schema source (key, value) matching keys 0..=3.
+ let source_schema = Arc::new(Schema::new(vec![
+ Field::new("key", DataType::UInt32, false),
+ Field::new("value", DataType::UInt32, true),
+ ]));
+ let new_data = RecordBatch::try_new(
+ source_schema,
+ vec![
+ Arc::new(UInt32Array::from(vec![0u32, 1, 2, 3])),
+ Arc::new(UInt32Array::from(vec![100u32, 100, 100, 100])),
+ ],
+ )
+ .unwrap();
+ let reader = Box::new(RecordBatchIterator::new(
+ [Ok(new_data.clone())],
+ new_data.schema(),
+ ));
+
+ let job = MergeInsertBuilder::try_new(ds.clone(), vec!["key".to_string()])
+ .unwrap()
+ .when_matched(WhenMatched::update_if(&ds, "target.other = 'keep'").unwrap())
+ .when_not_matched(WhenNotMatched::DoNothing)
+ .try_build()
+ .unwrap();
+ let (updated, _stats) = job.execute_reader(reader).await.unwrap();
+
+ let batch = updated.scan().try_into_batch().await.unwrap();
+ let keys = batch["key"].as_any().downcast_ref::().unwrap();
+ let values = batch["value"]
+ .as_any()
+ .downcast_ref::()
+ .unwrap();
+ let others = batch["other"]
+ .as_any()
+ .downcast_ref::()
+ .unwrap();
+ let by_key = (0..batch.num_rows())
+ .map(|i| {
+ (
+ keys.value(i),
+ (values.value(i), others.value(i).to_string()),
+ )
+ })
+ .collect::>();
+
+ // Matched + condition true (other == "keep"): value updated to 100.
+ assert_eq!(by_key[&0], (100, "keep".to_string()));
+ assert_eq!(by_key[&2], (100, "keep".to_string()));
+ // Matched + condition false (other == "skip"): value unchanged.
+ assert_eq!(by_key[&1], (1, "skip".to_string()));
+ assert_eq!(by_key[&3], (3, "skip".to_string()));
+ // Unmatched rows untouched.
+ assert_eq!(by_key[&4], (4, "keep".to_string()));
+ assert_eq!(by_key[&5], (5, "skip".to_string()));
+ }
+
+ /// The width gate's negative branch: a *narrow* missing column must NOT
+ /// be deferred — a sequential scan of it is cheaper than a per-row take,
+ /// so it stays in the target scan and no `Take` is introduced.
+ #[tokio::test]
+ async fn test_merge_insert_subcols_narrow_column_not_deferred() {
+ let schema = Arc::new(Schema::new(vec![
+ Field::new("key", DataType::UInt32, false),
+ Field::new("value", DataType::UInt32, true),
+ Field::new("small", DataType::UInt32, true),
+ ]));
+ let batch = RecordBatch::try_new(
+ schema.clone(),
+ vec![
+ Arc::new(UInt32Array::from(vec![0u32, 1, 2, 3])),
+ Arc::new(UInt32Array::from(vec![0u32, 1, 2, 3])),
+ Arc::new(UInt32Array::from(vec![10u32, 11, 12, 13])),
+ ],
+ )
+ .unwrap();
+ let ds = Arc::new(
+ Dataset::write(
+ RecordBatchIterator::new([Ok(batch)], schema.clone()),
+ "memory://",
+ None,
+ )
+ .await
+ .unwrap(),
+ );
+
+ // Source omits the narrow `small` column.
+ let source_schema = Arc::new(Schema::new(vec![
+ Field::new("key", DataType::UInt32, false),
+ Field::new("value", DataType::UInt32, true),
+ ]));
+ let job = MergeInsertBuilder::try_new(ds.clone(), vec!["key".to_string()])
+ .unwrap()
+ .when_matched(WhenMatched::UpdateAll)
+ .when_not_matched(WhenNotMatched::DoNothing)
+ .try_build()
+ .unwrap();
+ let plan = job.explain_plan(Some(&source_schema), false).await.unwrap();
+
assert!(
- plan.contains("LanceRead") && plan.contains("projection=[other"),
- "target-side scan should include the filled `other` column: {}",
+ !plan.contains("Take"),
+ "a narrow column must not be deferred via a Take: {}",
plan
);
+ let lance_read = plan
+ .lines()
+ .find(|l| l.trim_start().starts_with("LanceRead"))
+ .unwrap_or_else(|| panic!("no LanceRead node: {}", plan));
+ assert!(
+ lance_read.contains("small"),
+ "narrow `small` should be read directly by the target scan: {}",
+ lance_read
+ );
+ }
+
+ /// Deferral must remain correct when more than one wide column is
+ /// dropped from the scan: this exercises the multi-column index remap of
+ /// the join output and the name-based re-index of the parent projection.
+ #[tokio::test]
+ async fn test_merge_insert_subcols_defers_multiple_wide_columns() {
+ let schema = Arc::new(Schema::new(vec![
+ Field::new("key", DataType::UInt32, false),
+ Field::new("value", DataType::UInt32, true),
+ Field::new("wide_a", DataType::Utf8, true),
+ Field::new("wide_b", DataType::Utf8, true),
+ ]));
+ let batch = RecordBatch::try_new(
+ schema.clone(),
+ vec![
+ Arc::new(UInt32Array::from(vec![0u32, 1, 2, 3])),
+ Arc::new(UInt32Array::from(vec![0u32, 1, 2, 3])),
+ Arc::new(StringArray::from(vec!["a0", "a1", "a2", "a3"])),
+ Arc::new(StringArray::from(vec!["b0", "b1", "b2", "b3"])),
+ ],
+ )
+ .unwrap();
+ let ds = Arc::new(
+ Dataset::write(
+ RecordBatchIterator::new([Ok(batch)], schema.clone()),
+ "memory://",
+ Some(WriteParams {
+ max_rows_per_file: 2, // two fragments
+ ..Default::default()
+ }),
+ )
+ .await
+ .unwrap(),
+ );
+
+ // Source omits both wide columns; updates keys 0 and 1.
+ let source_schema = Arc::new(Schema::new(vec![
+ Field::new("key", DataType::UInt32, false),
+ Field::new("value", DataType::UInt32, true),
+ ]));
+ let new_data = RecordBatch::try_new(
+ source_schema,
+ vec![
+ Arc::new(UInt32Array::from(vec![0u32, 1])),
+ Arc::new(UInt32Array::from(vec![100u32, 100])),
+ ],
+ )
+ .unwrap();
+
+ let job = MergeInsertBuilder::try_new(ds.clone(), vec!["key".to_string()])
+ .unwrap()
+ .when_matched(WhenMatched::UpdateAll)
+ .when_not_matched(WhenNotMatched::DoNothing)
+ .try_build()
+ .unwrap();
+
+ // Both wide columns are deferred to the take, not the scan.
+ let plan = job
+ .explain_plan(Some(&new_data.schema().as_ref().clone()), false)
+ .await
+ .unwrap();
+ let lance_read = plan
+ .lines()
+ .find(|l| l.trim_start().starts_with("LanceRead"))
+ .unwrap_or_else(|| panic!("no LanceRead node: {}", plan));
+ assert!(
+ !lance_read.contains("wide_a") && !lance_read.contains("wide_b"),
+ "both wide columns must be deferred out of the scan: {}",
+ lance_read
+ );
assert!(
- plan.contains("other@0 as other"),
- "expected post-join projection to carry `other` from the target side: {}",
+ plan.contains("(wide_a)") && plan.contains("(wide_b)"),
+ "the take must fetch both deferred columns: {}",
plan
);
+
+ // And the result is correct: matched rows updated, wide columns preserved.
+ let reader = Box::new(RecordBatchIterator::new(
+ [Ok(new_data.clone())],
+ new_data.schema(),
+ ));
+ let (updated, _stats) = job.execute_reader(reader).await.unwrap();
+ let batch = updated.scan().try_into_batch().await.unwrap();
+ let keys = batch["key"].as_any().downcast_ref::().unwrap();
+ let values = batch["value"]
+ .as_any()
+ .downcast_ref::()
+ .unwrap();
+ let wide_a = batch["wide_a"]
+ .as_any()
+ .downcast_ref::()
+ .unwrap();
+ let wide_b = batch["wide_b"]
+ .as_any()
+ .downcast_ref::()
+ .unwrap();
+ let by_key = (0..batch.num_rows())
+ .map(|i| {
+ (
+ keys.value(i),
+ (
+ values.value(i),
+ wide_a.value(i).to_string(),
+ wide_b.value(i).to_string(),
+ ),
+ )
+ })
+ .collect::>();
+ assert_eq!(by_key[&0], (100, "a0".to_string(), "b0".to_string()));
+ assert_eq!(by_key[&1], (100, "a1".to_string(), "b1".to_string()));
+ assert_eq!(by_key[&2], (2, "a2".to_string(), "b2".to_string()));
+ assert_eq!(by_key[&3], (3, "a3".to_string(), "b3".to_string()));
}
/// Partial-schema upserts with `insert_not_matched=InsertAll` must
diff --git a/rust/lance/src/io/exec.rs b/rust/lance/src/io/exec.rs
index a477d60d56d..428362d83d8 100644
--- a/rust/lance/src/io/exec.rs
+++ b/rust/lance/src/io/exec.rs
@@ -15,6 +15,7 @@ pub mod filtered_read;
pub mod filtered_read_proto;
pub mod fts;
pub(crate) mod knn;
+mod late_take;
mod optimizer;
mod projection;
mod pushdown_scan;
@@ -32,6 +33,7 @@ pub use filter::LanceFilterExec;
pub use knn::{ANNIvfPartitionExec, ANNIvfSubIndexExec, KNNVectorDistanceExec};
pub use lance_datafusion::planner::Planner;
pub use lance_index::scalar::expression::FilterPlan;
+pub use late_take::{LateMaterializeJoin, LateTakeNode, LateTakePlanner};
pub use optimizer::get_physical_optimizer;
pub use projection::project;
pub use pushdown_scan::{LancePushdownScanExec, ScanConfig};
diff --git a/rust/lance/src/io/exec/late_take.rs b/rust/lance/src/io/exec/late_take.rs
new file mode 100644
index 00000000000..f9295594532
--- /dev/null
+++ b/rust/lance/src/io/exec/late_take.rs
@@ -0,0 +1,1212 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Late-materialization *logical* optimizer rule.
+//!
+//! Defers reading wide data columns that a row-reducing join only carries
+//! through, fetching them by `_rowaddr` *after* the row count has shrunk.
+//!
+//! A [`LateTakeNode`] is inserted above the join — fed by a projection that
+//! keeps only the columns carried past the join — and advertises an output
+//! schema of "carried columns plus the deferred columns appended". Its
+//! [`UserDefinedLogicalNodeCore::necessary_children_exprs`] reports that it does
+//! *not* need the deferred columns from its child, only `_rowaddr`, so
+//! DataFusion's stock `OptimizeProjections` rule prunes them from the scan
+//! automatically and downstream references resolve the deferred columns from the
+//! take by name.
+//!
+//! The node lowers to the physical [`super::TakeExec`] via [`LateTakePlanner`].
+
+use std::collections::{BTreeSet, HashSet};
+use std::sync::Arc;
+
+use arrow_schema::{Field as ArrowField, Schema as ArrowSchema};
+use async_trait::async_trait;
+use datafusion::{
+ common::{
+ Column, DFSchema, DFSchemaRef, Result as DFResult, TableReference,
+ tree_node::{Transformed, TreeNode, TreeNodeRecursion},
+ },
+ datasource::DefaultTableSource,
+ execution::SessionState,
+ logical_expr::{Expr, Extension, Join, JoinType, LogicalPlan, Projection},
+ optimizer::{OptimizerConfig, OptimizerRule},
+ physical_plan::ExecutionPlan,
+ physical_planner::{ExtensionPlanner, PhysicalPlanner},
+};
+use datafusion_expr::{UserDefinedLogicalNode, UserDefinedLogicalNodeCore};
+use lance_arrow::DataTypeExt;
+use lance_core::datatypes::OnMissing;
+use lance_core::{ROW_ADDR, ROW_ID};
+
+use super::TakeExec;
+use crate::Dataset;
+use crate::datafusion::dataframe::LanceTableProvider;
+
+/// Width/storage gate: decides whether `field` counts as "wide". Blob fields
+/// are never reported as wide. Any other field is wide when its data type has
+/// no fixed byte width (`byte_width_opt()` is `None`) or when that width is at
+/// least 1000 bytes if `is_cloud`, else at least 10 bytes. Narrow columns are
+/// cheaper to read in the sequential scan than to re-fetch by address.
+fn is_wide_column(field: &lance_core::datatypes::Field, is_cloud: bool) -> bool {
+ if field.is_blob() {
+ return false;
+ }
+ let byte_width = field.data_type().byte_width_opt();
+ if is_cloud {
+ byte_width.is_none_or(|bw| bw >= 1000)
+ } else {
+ byte_width.is_none_or(|bw| bw >= 10)
+ }
+}
+
+/// Logical plan node that re-fetches `deferred_columns` from `dataset` by
+/// `_rowaddr` after a row-reducing operator.
+///
+/// Output schema = the input's columns with `deferred_columns` removed,
+/// followed by the deferred columns appended in dataset-schema order (mirroring
+/// the physical [`TakeExec`], which appends taken columns). Constructing the
+/// schema this way makes it invariant under projection pushdown: whether or not
+/// the child still produces a deferred column, the node advertises the same
+/// output, so the rule can be inserted before pushdown prunes the scan.
+#[derive(Debug)]
+pub struct LateTakeNode {
+ input: LogicalPlan,
+ dataset: Arc,
+ /// Dataset field names to re-fetch by address, in dataset-schema order.
+ deferred_columns: Vec,
+ /// Qualifier for the appended deferred fields (e.g. `target`), matching the
+ /// relation the scanned columns came from.
+ qualifier: Option,
+ /// When true the deferred fields are nullable in the output even if the
+ /// dataset declares them non-null. Set above an outer join where the scan
+ /// side can be null-extended (its `_rowaddr` is null → NULL deferred value).
+ nullable_extra: bool,
+ schema: DFSchemaRef,
+}
+
+impl PartialEq for LateTakeNode {
+ fn eq(&self, other: &Self) -> bool {
+ self.dataset.base == other.dataset.base
+ && self.deferred_columns == other.deferred_columns
+ && self.qualifier == other.qualifier
+ && self.nullable_extra == other.nullable_extra
+ && self.input == other.input
+ }
+}
+
+impl Eq for LateTakeNode {}
+
+impl std::hash::Hash for LateTakeNode {
+ fn hash(&self, state: &mut H) {
+ self.dataset.base.hash(state);
+ self.deferred_columns.hash(state);
+ self.qualifier.hash(state);
+ self.nullable_extra.hash(state);
+ self.input.hash(state);
+ }
+}
+
+impl PartialOrd for LateTakeNode {
+ // Compares `deferred_columns` first, then `input`. `dataset`, `qualifier` and
+ // `nullable_extra` feed `eq` but are ignored here, so nodes that differ only
+ // in those compare `Equal`. Matches the sibling `MergeInsertWriteNode` impl.
+ fn partial_cmp(&self, other: &Self) -> Option {
+ match self.deferred_columns.partial_cmp(&other.deferred_columns) {
+ Some(std::cmp::Ordering::Equal) => self.input.partial_cmp(&other.input),
+ cmp => cmp,
+ }
+ }
+}
+
+impl LateTakeNode {
+ /// Build a node that re-fetches `deferred_columns` (dataset field names, in
+ /// dataset-schema order) from `dataset` by row address.
+ ///
+ /// `qualifier` is the relation the deferred columns came from; the appended
+ /// output fields carry it so downstream references still resolve. Set
+ /// `nullable_extra` when the take sits above an outer join on whose optional
+ /// side the scan lives, so unmatched rows (null row address) yield NULL
+ /// deferred values. Errors if a deferred name is absent from the dataset.
+ pub fn try_new(
+ input: LogicalPlan,
+ dataset: Arc,
+ deferred_columns: Vec,
+ qualifier: Option,
+ nullable_extra: bool,
+ ) -> DFResult {
+ let schema = Self::build_output_schema(
+ &input,
+ &dataset,
+ &deferred_columns,
+ &qualifier,
+ nullable_extra,
+ )?;
+ Ok(Self {
+ input,
+ dataset,
+ deferred_columns,
+ qualifier,
+ nullable_extra,
+ schema,
+ })
+ }
+
+ /// Build `input columns (minus deferred) ++ deferred columns appended`.
+ fn build_output_schema(
+ input: &LogicalPlan,
+ dataset: &Dataset,
+ deferred_columns: &[String],
+ qualifier: &Option,
+ nullable_extra: bool,
+ ) -> DFResult {
+ let input_schema = input.schema();
+ let deferred_set: HashSet<&str> = deferred_columns.iter().map(|s| s.as_str()).collect();
+
+ // A field is deferred only when both its name and its qualifier match the
+ // deferred columns' relation; a same-named field from another relation is
+ // a distinct column and must stay in place.
+ let mut qualified_fields: Vec<(Option, Arc)> = input_schema
+ .iter()
+ .filter(|(q, f)| {
+ !(*q == qualifier.as_ref() && deferred_set.contains(f.name().as_str()))
+ })
+ .map(|(q, f)| (q.cloned(), f.clone()))
+ .collect();
+
+ let dataset_arrow = ArrowSchema::from(dataset.schema());
+ for name in deferred_columns {
+ let field = dataset_arrow.field_with_name(name).map_err(|e| {
+ datafusion::error::DataFusionError::Plan(format!(
+ "late-materialization: deferred column '{name}' not found in dataset schema: {e}"
+ ))
+ })?;
+ let field = if nullable_extra && !field.is_nullable() {
+ field.clone().with_nullable(true)
+ } else {
+ field.clone()
+ };
+ qualified_fields.push((qualifier.clone(), Arc::new(field)));
+ }
+
+ Ok(Arc::new(DFSchema::new_with_metadata(
+ qualified_fields,
+ input_schema.metadata().clone(),
+ )?))
+ }
+
+ /// Index of the row-address (or, failing that, row-id) column in the child.
+ fn row_locator_index(&self) -> Option {
+ let input_schema = self.input.schema();
+ input_schema
+ .index_of_column_by_name(self.qualifier.as_ref(), ROW_ADDR)
+ .or_else(|| input_schema.index_of_column_by_name(self.qualifier.as_ref(), ROW_ID))
+ }
+}
+
+impl UserDefinedLogicalNodeCore for LateTakeNode {
+ fn name(&self) -> &str {
+ "LateTake"
+ }
+
+ fn inputs(&self) -> Vec<&LogicalPlan> {
+ vec![&self.input]
+ }
+
+ fn schema(&self) -> &DFSchemaRef {
+ &self.schema
+ }
+
+ fn expressions(&self) -> Vec {
+ vec![]
+ }
+
+ fn fmt_for_explain(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+ write!(
+ f,
+ "LateTake: deferred=[{}], nullable_extra={}",
+ self.deferred_columns.join(", "),
+ self.nullable_extra
+ )
+ }
+
+ fn with_exprs_and_inputs(
+ &self,
+ exprs: Vec,
+ mut inputs: Vec,
+ ) -> DFResult {
+ if !exprs.is_empty() {
+ return Err(datafusion::error::DataFusionError::Internal(
+ "LateTakeNode does not accept expressions".to_string(),
+ ));
+ }
+ if inputs.len() != 1 {
+ return Err(datafusion::error::DataFusionError::Internal(
+ "LateTakeNode requires exactly one input".to_string(),
+ ));
+ }
+ Self::try_new(
+ inputs.remove(0),
+ self.dataset.clone(),
+ self.deferred_columns.clone(),
+ self.qualifier.clone(),
+ self.nullable_extra,
+ )
+ }
+
+ /// Drive projection pushdown: deferred columns are produced by this node
+ /// (fetched by address), so they are never requested from the child; the row
+ /// locator (`ROW_ADDR`, else `ROW_ID`) always is. `None` if the child lacks both.
+ fn necessary_children_exprs(&self, output_columns: &[usize]) -> Option>> {
+ let input_schema = self.input.schema();
+ let deferred_set: HashSet<&str> =
+ self.deferred_columns.iter().map(|s| s.as_str()).collect();
+
+ // Output positions [0..passthrough_len) map back to these child indices,
+ // in order; positions beyond are the appended (fetched) deferred columns.
+ // The deferred match is qualified: a same-named field from another
+ // relation is a distinct passthrough column, not a deferred one.
+ let passthrough: Vec = input_schema
+ .iter()
+ .enumerate()
+ .filter(|(_, (q, f))| {
+ !(*q == self.qualifier.as_ref() && deferred_set.contains(f.name().as_str()))
+ })
+ .map(|(i, _)| i)
+ .collect();
+
+ let row_locator = self.row_locator_index()?;
+
+ let mut needed = BTreeSet::new();
+ for &oc in output_columns {
+ if let Some(child_idx) = passthrough.get(oc) {
+ needed.insert(*child_idx);
+ }
+ }
+ needed.insert(row_locator);
+ Some(vec![needed.into_iter().collect()])
+ }
+
+/// Logical optimizer rule that inserts a [`LateTakeNode`] above a join when a
+/// wide column from a Lance table relation is only carried through the join.
+///
+/// Detection is qualifier-driven, not side-specific: it inspects both join
+/// inputs for a [`LanceTableProvider`] scan that emits `_rowaddr`, so it keeps
+/// working if build/probe sides are swapped. The actual scan narrowing is left
+/// to `OptimizeProjections`, which must run after this rule.
+#[derive(Debug, Default)]
+pub struct LateMaterializeJoin;
+
+impl LateMaterializeJoin {
+ pub fn new() -> Self {
+ Self
+ }
+
+ /// Recover the Lance dataset backing a join input, descending only through
+ /// single-input nodes (alias/projection/filter) so a nested join's scan is
+ /// never picked up by mistake.
+ fn find_lance_dataset(plan: &LogicalPlan) -> Option> {
+ if let LogicalPlan::TableScan(scan) = plan {
+ let source = scan.source.as_any().downcast_ref::()?;
+ let provider = source
+ .table_provider
+ .as_any()
+ .downcast_ref::()?;
+ return Some(provider.dataset());
+ }
+ let inputs = plan.inputs();
+ if inputs.len() == 1 {
+ Self::find_lance_dataset(inputs[0])
+ } else {
+ None
+ }
+ }
+
+ /// Unqualified names of the `side` equi-keys that are bare `Expr::Column`s; the
+ /// join reads them, so they must stay in the scan. Other key exprs are skipped.
+ fn join_key_names(join: &Join, side: JoinSide) -> HashSet {
+ join.on
+ .iter()
+ .filter_map(|(left, right)| {
+ let expr = match side {
+ JoinSide::Left => left,
+ JoinSide::Right => right,
+ };
+ match expr {
+ Expr::Column(col) => Some(col.name.clone()),
+ _ => None,
+ }
+ })
+ .collect()
+ }
+
+ /// Collect every `(qualifier, name)` column reference appearing in the
+ /// expressions of `plan`'s nodes, except join nodes' own expressions. Used to
+ /// tell which scan-side columns are actually consumed *above* a join (and so
+ /// worth re-fetching) rather than merely produced by the scan.
+ fn collect_referenced_columns(
+ plan: &LogicalPlan,
+ ) -> DFResult, String)>> {
+ let mut referenced = HashSet::new();
+ plan.apply(|node| {
+ // A join's own on-clause / filter columns are consumed by the join
+ // itself, not by an operator above it, so they must not count as
+ // "used above the join". (Equi-keys are the common case: both sides'
+ // keys share a name, which would otherwise look like a duplicate
+ // column flowing into the take.) Children are still visited.
+ if matches!(node, LogicalPlan::Join(_)) {
+ return Ok(TreeNodeRecursion::Continue);
+ }
+ for expr in node.expressions() {
+ expr.apply(|e| {
+ if let Expr::Column(col) = e {
+ referenced.insert((col.relation.clone(), col.name.clone()));
+ }
+ Ok(TreeNodeRecursion::Continue)
+ })?;
+ }
+ Ok(TreeNodeRecursion::Continue)
+ })?;
+ Ok(referenced)
+ }
+
+ fn try_defer_join(
+ join: &Join,
+ referenced: &HashSet<(Option, String)>,
+ ) -> DFResult