From d697df2605b4602eac953e5b336c3942d160efac Mon Sep 17 00:00:00 2001 From: Vova Kolmakov Date: Thu, 18 Jun 2026 10:53:55 +0700 Subject: [PATCH 1/2] feat(index): support Utf8View prefixes in SargableQueryParser Follow-up to #7310, which added Utf8View handling to the ngram TextQueryParser and explicitly left the identical gap in SargableQueryParser out of scope. The BTree/ZoneMap parser only matched Utf8 / LargeUtf8 for starts_with and infix-free LIKE prefixes, so a Utf8View predicate literal was dropped and the query silently fell back to a full scan instead of using the scalar index. Unlike the ngram path (where the pattern is only ever used as a regex string), here the parser emits a SargableQuery::LikePrefix whose ScalarValue flows downstream into the BTree, which compares the query bound against Utf8 page statistics with Arrow's type-dispatched comparator. A Utf8View bound cannot be compared against Utf8 stats arrays. Because Lance already normalizes Utf8View columns to Utf8 at write time (the stored index data is always Utf8), the fix normalizes a Utf8View prefix to Utf8 in the parser rather than threading a new type through the shared comparison code. Adds test_sargable_query_parser_utf8view, which exercises visit_scalar_function (starts_with) and visit_like directly with Utf8View literals and asserts the resulting LikePrefix(Utf8) query, with a Utf8 parity control. The test fails on the pre-change parser (the Utf8View literal is dropped) and passes after. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- rust/lance-index/src/scalar/expression.rs | 112 +++++++++++++++++++++- 1 file changed, 108 insertions(+), 4 deletions(-) diff --git a/rust/lance-index/src/scalar/expression.rs b/rust/lance-index/src/scalar/expression.rs index ea7fbabc813..6a81eaef194 100644 --- a/rust/lance-index/src/scalar/expression.rs +++ b/rust/lance-index/src/scalar/expression.rs @@ -386,6 +386,13 @@ impl ScalarQueryParser for SargableQueryParser { Expr::Literal(ScalarValue::LargeUtf8(Some(s)), _) => { ScalarValue::LargeUtf8(Some(s.clone())) } + // Lance stores `Utf8View` columns as `Utf8` (normalized at write time), so a + // `Utf8View` literal is normalized to `Utf8` to match the indexed data: the + // BTree compares the query bound against `Utf8` page statistics at the Arrow + // level, which rejects a `Utf8View` bound. + Expr::Literal(ScalarValue::Utf8View(Some(s)), _) => { + ScalarValue::Utf8(Some(s.clone())) + } _ => return None, }; @@ -415,17 +422,21 @@ impl ScalarQueryParser for SargableQueryParser { // Extract the pattern string let pattern_str = match pattern { - ScalarValue::Utf8(Some(s)) => s.as_str(), - ScalarValue::LargeUtf8(Some(s)) => s.as_str(), + ScalarValue::Utf8(Some(s)) + | ScalarValue::LargeUtf8(Some(s)) + | ScalarValue::Utf8View(Some(s)) => s.as_str(), _ => return None, }; // Try to extract a prefix from the LIKE pattern let (prefix, needs_refine) = extract_like_leading_prefix(pattern_str, like.escape_char)?; - // Create the prefix ScalarValue with the same type as the pattern + // Create the prefix ScalarValue with the same type as the pattern. `Utf8View` is + // normalized to `Utf8` because Lance stores `Utf8View` columns as `Utf8`, and the + // downstream BTree compares the query bound against `Utf8` page statistics at the + // Arrow level (a `Utf8View` bound would fail that comparison). 
let prefix_value = match pattern { - ScalarValue::Utf8(_) => ScalarValue::Utf8(Some(prefix)), + ScalarValue::Utf8(_) | ScalarValue::Utf8View(_) => ScalarValue::Utf8(Some(prefix)), ScalarValue::LargeUtf8(_) => ScalarValue::LargeUtf8(Some(prefix)), _ => return None, }; @@ -3204,6 +3215,99 @@ mod tests { } } + #[test] + fn test_sargable_query_parser_utf8view() { + // Follow-up to PR #7310 / #7139: the BTree `SargableQueryParser` must accept + // `Utf8View` prefixes for `starts_with` and infix-free LIKE, not only `Utf8` / + // `LargeUtf8`. DataFusion can coerce the predicate literal to `ScalarValue::Utf8View`; + // dropping that variant silently skips the index. The `Utf8View` prefix is normalized + // to `Utf8` (Lance stores `Utf8View` columns as `Utf8`), so the emitted query is a + // `LikePrefix(Utf8(..))`. `visit_scalar_function` / `visit_like` are exercised directly + // so the test does not depend on the planner's coercion choices, and the `Utf8` case + // is a parity control: the pre-existing path must keep behaving identically. + let parser = SargableQueryParser::new("color_idx".to_string(), "BTree".to_string(), false); + + let assert_like_prefix = + |indexed: &IndexedExpression, expected: &ScalarValue, needs_refine: bool| { + assert_eq!( + indexed.refine_expr.is_some(), + needs_refine, + "unexpected refine_expr presence" + ); + let Some(ScalarIndexExpr::Query(search)) = &indexed.scalar_query else { + panic!("expected a scalar index query"); + }; + match search + .query + .as_any() + .downcast_ref::<SargableQuery>() + .expect("query should be a SargableQuery") + { + SargableQuery::LikePrefix(prefix) => assert_eq!(prefix, expected), + _ => panic!("expected a LikePrefix query"), + } + }; + + // starts_with(col, <Utf8View literal>) -> LikePrefix(Utf8). Reuse a real + // `starts_with` UDF parsed from SQL, then swap in a `Utf8View` literal argument. 
+ let schema = Schema::new(vec![Field::new("color", DataType::Utf8View, false)]); + let df_schema: DFSchema = schema.try_into().unwrap(); + let ctx = get_session_context(&LanceExecutionOptions::default()); + let state = ctx.state(); + let Expr::ScalarFunction(starts_with) = state + .create_logical_expr("starts_with(color, 'foo')", &df_schema) + .unwrap() + else { + panic!("expected starts_with to parse as a scalar function"); + }; + let args = vec![ + starts_with.args[0].clone(), + Expr::Literal(ScalarValue::Utf8View(Some("foo".to_string())), None), + ]; + let indexed = parser + .visit_scalar_function( + "color", + &DataType::Utf8View, + starts_with.func.as_ref(), + &args, + ) + .expect("starts_with should use the BTree index"); + assert_like_prefix(&indexed, &ScalarValue::Utf8(Some("foo".to_string())), false); + + // col LIKE <Utf8View pattern>. `visit_like` is called directly so the test does not + // depend on DataFusion's LIKE type coercion choosing `Utf8View` for the pattern. + let like = |pattern: ScalarValue| { + Like::new( + false, + Box::new(Expr::Column(Column::new_unqualified("color"))), + Box::new(Expr::Literal(pattern, None)), + None, + false, + ) + }; + + // Pure prefix: routed to the index with no recheck needed. + let pattern = ScalarValue::Utf8View(Some("foo%".to_string())); + let indexed = parser + .visit_like("color", &like(pattern.clone()), &pattern) + .expect("LIKE prefix should use the BTree index"); + assert_like_prefix(&indexed, &ScalarValue::Utf8(Some("foo".to_string())), false); + + // Wildcards beyond the leading prefix keep the original LIKE as a recheck. + let pattern = ScalarValue::Utf8View(Some("foo%bar%".to_string())); + let indexed = parser + .visit_like("color", &like(pattern.clone()), &pattern) + .expect("LIKE prefix should use the BTree index"); + assert_like_prefix(&indexed, &ScalarValue::Utf8(Some("foo".to_string())), true); + + // Parity control: the pre-existing `Utf8` path is unchanged. 
+ let pattern = ScalarValue::Utf8(Some("foo%".to_string())); + let indexed = parser + .visit_like("color", &like(pattern.clone()), &pattern) + .expect("LIKE prefix should use the BTree index"); + assert_like_prefix(&indexed, &ScalarValue::Utf8(Some("foo".to_string())), false); + } + #[test] fn test_serialize_index_expr_result_round_trip() { use lance_select::{RowAddrMask, RowAddrTreeMap}; From 528b9f08aa1e5658b953d52259a187cdd5286287 Mon Sep 17 00:00:00 2001 From: Vova Kolmakov Date: Sat, 20 Jun 2026 10:03:41 +0700 Subject: [PATCH 2/2] fix(index): escape LikePrefix recheck pattern and skip prefix queries for bitmap indexes Escape LIKE metacharacters (_, %, \) when rebuilding the LikePrefix recheck predicate so a literal prefix no longer over-matches on the inexact (zone map) path. Configure the bitmap index parser with without_like_prefix so LIKE/starts_with fall back to ordinary filtering instead of failing at search time. --- rust/lance-index/src/scalar.rs | 91 +++++++++-- rust/lance-index/src/scalar/bitmap.rs | 11 +- rust/lance-index/src/scalar/expression.rs | 88 ++++++++++ rust/lance/src/dataset/scanner.rs | 185 ++++++++++++++++++++++ 4 files changed, 360 insertions(+), 15 deletions(-) diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs index a287d277a81..44b8a8b14f9 100644 --- a/rust/lance-index/src/scalar.rs +++ b/rust/lance-index/src/scalar.rs @@ -18,7 +18,10 @@ use std::fmt::Debug; use std::pin::Pin; use std::{any::Any, ops::Bound, sync::Arc}; -use datafusion_expr::{Expr, expr::ScalarFunction}; +use datafusion_expr::{ + Expr, + expr::{Like, ScalarFunction}, +}; use inverted::query::{FtsQuery, FtsQueryNode, FtsSearchParams, MatchQuery, fill_fts_query_column}; use lance_core::deepsize::DeepSizeOf; use lance_core::{Error, Result}; @@ -452,6 +455,19 @@ pub enum SargableQuery { LikePrefix(ScalarValue), } +/// Escape the LIKE metacharacters (`\`, `%`, `_`) in a literal string so it can be +/// embedded in a LIKE pattern and matched 
literally (paired with `ESCAPE '\'`). +fn escape_like_pattern(s: &str) -> String { + let mut out = String::with_capacity(s.len() + 2); + for c in s.chars() { + if matches!(c, '\\' | '%' | '_') { + out.push('\\'); + } + out.push(c); + } + out +} + impl AnyQuery for SargableQuery { fn as_any(&self) -> &dyn Any { self @@ -553,16 +569,36 @@ impl AnyQuery for SargableQuery { )), Self::IsNull() => col_expr.is_null(), Self::Equals(value) => col_expr.eq(Expr::Literal(value.clone(), None)), - Self::LikePrefix(prefix) => { - let pattern = match prefix { - ScalarValue::Utf8(Some(s)) => ScalarValue::Utf8(Some(format!("{}%", s))), - ScalarValue::LargeUtf8(Some(s)) => { - ScalarValue::LargeUtf8(Some(format!("{}%", s))) + Self::LikePrefix(prefix) => match prefix { + ScalarValue::Utf8(Some(s)) | ScalarValue::LargeUtf8(Some(s)) => { + // The prefix is a literal string. If it contains LIKE metacharacters + // (`_`, `%`, `\`) they must be escaped before appending the `%` wildcard; + // otherwise an inexact recheck (e.g. zone maps) would treat them as + // wildcards and over-match rows that do not start with the literal prefix. + // When the prefix has no metacharacters we keep the plain + // `col LIKE 'prefix%'` form (no `ESCAPE`), identical to the prior behavior, + // so DataFusion's optimized prefix matcher still applies. 
+ let escaped = escape_like_pattern(s); + let needs_escape = escaped.as_str() != s.as_str(); + let pattern = format!("{}%", escaped); + let pattern_value = match prefix { + ScalarValue::LargeUtf8(_) => ScalarValue::LargeUtf8(Some(pattern)), + _ => ScalarValue::Utf8(Some(pattern)), + }; + if needs_escape { + Expr::Like(Like { + negated: false, + expr: Box::new(col_expr), + pattern: Box::new(Expr::Literal(pattern_value, None)), + escape_char: Some('\\'), + case_insensitive: false, + }) + } else { + col_expr.like(Expr::Literal(pattern_value, None)) } - other => other.clone(), - }; - col_expr.like(Expr::Literal(pattern, None)) - } + } + other => col_expr.like(Expr::Literal(other.clone(), None)), + }, } } @@ -1076,3 +1112,38 @@ pub trait ScalarIndex: Send + Sync + std::fmt::Debug + Index + DeepSizeOf { /// with the same configuration on another dataset. fn derive_index_params(&self) -> Result<ScalarIndexParams>; } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_like_prefix_to_expr_escapes_metacharacters() { + // The stored prefix is a literal string, so LIKE metacharacters in it must be + // escaped when the recheck predicate is rebuilt; otherwise `_`/`%` would act as + // wildcards and over-match. The reconstructed expression uses `ESCAPE '\'`. + let query = SargableQuery::LikePrefix(ScalarValue::Utf8(Some("a_b%x".to_string()))); + let Expr::Like(like) = query.to_expr("name".to_string()) else { + panic!("expected a LIKE expression"); + }; + assert_eq!(like.escape_char, Some('\\')); + assert!(!like.negated); + assert!(!like.case_insensitive); + let Expr::Literal(ScalarValue::Utf8(Some(pattern)), _) = like.pattern.as_ref() else { + panic!("expected a Utf8 literal pattern"); + }; + assert_eq!(pattern.as_str(), "a\\_b\\%x%"); + + // A prefix without metacharacters only gains the trailing wildcard and keeps the + // plain `LIKE 'app%'` form (no `ESCAPE`) so the optimized prefix matcher still applies. 
+ let query = SargableQuery::LikePrefix(ScalarValue::Utf8(Some("app".to_string()))); + let Expr::Like(like) = query.to_expr("name".to_string()) else { + panic!("expected a LIKE expression"); + }; + assert_eq!(like.escape_char, None); + let Expr::Literal(ScalarValue::Utf8(Some(pattern)), _) = like.pattern.as_ref() else { + panic!("expected a Utf8 literal pattern"); + }; + assert_eq!(pattern.as_str(), "app%"); + } +} diff --git a/rust/lance-index/src/scalar/bitmap.rs b/rust/lance-index/src/scalar/bitmap.rs index c2a6e80e82b..a5b34f0ab96 100644 --- a/rust/lance-index/src/scalar/bitmap.rs +++ b/rust/lance-index/src/scalar/bitmap.rs @@ -1712,11 +1712,12 @@ impl ScalarIndexPlugin for BitmapIndexPlugin { index_name: String, _index_details: &prost_types::Any, ) -> Option<Box<dyn ScalarQueryParser>> { - Some(Box::new(SargableQueryParser::new( - index_name, - self.name().to_string(), - false, - ))) + // Bitmap indexes cannot answer `LikePrefix` queries (see `search`), so the parser + // is configured to skip them and let such predicates fall back to ordinary filtering. + Some(Box::new( + SargableQueryParser::new(index_name, self.name().to_string(), false) + .without_like_prefix(), + )) } async fn train_index( diff --git a/rust/lance-index/src/scalar/expression.rs b/rust/lance-index/src/scalar/expression.rs index 6a81eaef194..500c7f061cf 100644 --- a/rust/lance-index/src/scalar/expression.rs +++ b/rust/lance-index/src/scalar/expression.rs @@ -257,6 +257,7 @@ pub struct SargableQueryParser { index_name: String, index_type: String, needs_recheck: bool, + supports_like_prefix: bool, } impl SargableQueryParser { @@ -265,8 +266,17 @@ impl SargableQueryParser { index_name, index_type, needs_recheck, + supports_like_prefix: true, } } + + /// Bitmap (and similar) indexes cannot answer prefix queries; disabling + /// `LikePrefix` emission makes `LIKE`/`starts_with` predicates fall back to + /// ordinary filtering instead of failing at search time. 
+ pub fn without_like_prefix(mut self) -> Self { + self.supports_like_prefix = false; + self + } } impl ScalarQueryParser for SargableQueryParser { @@ -380,6 +390,11 @@ impl ScalarQueryParser for SargableQueryParser { ) -> Option<IndexedExpression> { // Handle starts_with(col, 'prefix') -> convert to LikePrefix query if func.name() == "starts_with" && args.len() == 2 { + // Indexes that cannot answer prefix queries (e.g. bitmap) fall back to + // ordinary filtering rather than emitting a query they would reject. + if !self.supports_like_prefix { + return None; + } // Extract the prefix from the second argument let prefix = match &args[1] { Expr::Literal(ScalarValue::Utf8(Some(s)), _) => ScalarValue::Utf8(Some(s.clone())), @@ -420,6 +435,12 @@ impl ScalarQueryParser for SargableQueryParser { return None; } + // Indexes that cannot answer prefix queries (e.g. bitmap) fall back to + // ordinary filtering rather than emitting a query they would reject. + if !self.supports_like_prefix { + return None; + } + // Extract the pattern string let pattern_str = match pattern { ScalarValue::Utf8(Some(s)) @@ -3308,6 +3329,73 @@ mod tests { assert_like_prefix(&indexed, &ScalarValue::Utf8(Some("foo".to_string())), false); } + #[test] + fn test_sargable_query_parser_without_like_prefix() { + // Bitmap indexes configure the parser with `without_like_prefix`: a bitmap index + // cannot answer `LikePrefix` queries (its `search` rejects them), so `starts_with` / + // `LIKE 'prefix%'` must not be turned into an index query. Returning `None` lets the + // predicate fall back to ordinary filtering instead of failing at search time. 
+ let bitmap_parser = + SargableQueryParser::new("color_idx".to_string(), "BITMAP".to_string(), false) + .without_like_prefix(); + let btree_parser = + SargableQueryParser::new("color_idx".to_string(), "BTree".to_string(), false); + + let schema = Schema::new(vec![Field::new("color", DataType::Utf8, false)]); + let df_schema: DFSchema = schema.try_into().unwrap(); + let ctx = get_session_context(&LanceExecutionOptions::default()); + let state = ctx.state(); + let Expr::ScalarFunction(starts_with) = state + .create_logical_expr("starts_with(color, 'foo')", &df_schema) + .unwrap() + else { + panic!("expected starts_with to parse as a scalar function"); + }; + + let pattern = ScalarValue::Utf8(Some("foo%".to_string())); + let like = Like::new( + false, + Box::new(Expr::Column(Column::new_unqualified("color"))), + Box::new(Expr::Literal(pattern.clone(), None)), + None, + false, + ); + + // Bitmap parser: both prefix paths fall back (return `None`). + assert!( + bitmap_parser + .visit_scalar_function( + "color", + &DataType::Utf8, + starts_with.func.as_ref(), + &starts_with.args, + ) + .is_none(), + "bitmap parser must not emit a LikePrefix for starts_with" + ); + assert!( + bitmap_parser.visit_like("color", &like, &pattern).is_none(), + "bitmap parser must not emit a LikePrefix for LIKE" + ); + + // A prefix-capable parser (e.g. BTree) still emits the index query. 
+ assert!( + btree_parser + .visit_scalar_function( + "color", + &DataType::Utf8, + starts_with.func.as_ref(), + &starts_with.args, + ) + .is_some(), + "BTree parser should still emit a LikePrefix for starts_with" + ); + assert!( + btree_parser.visit_like("color", &like, &pattern).is_some(), + "BTree parser should still emit a LikePrefix for LIKE" + ); + } + #[test] fn test_serialize_index_expr_result_round_trip() { use lance_select::{RowAddrMask, RowAddrTreeMap}; diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index 09cd7023e74..6d71fbe4c0a 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -9037,6 +9037,191 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") ); } + /// Regression for over-matching on the zone-map recheck path: a literal prefix + /// containing LIKE metacharacters (`_`, `%`) must be matched literally, not as + /// wildcards. The indexed (recheck) result must equal the unindexed ground truth. 
+ #[tokio::test] + async fn test_like_prefix_zone_map_escapes_metacharacters() { + use lance_index::scalar::BuiltinIndexType; + + let names: Vec<&str> = vec!["a_b", "a_c", "axb", "a1c", "b%c", "bxc", "bcc", "zoo"]; + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("name", DataType::Utf8, false), + ArrowField::new("id", DataType::Int32, false), + ])); + let data = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(StringArray::from(names.clone())), + Arc::new(Int32Array::from_iter_values(0..names.len() as i32)), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(data)], schema.clone()); + let mut dataset = Dataset::write(reader, "memory://test_like_zonemap_escape", None) + .await + .unwrap(); + + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::ZoneMap); + dataset + .create_index( + &["name"], + IndexType::Scalar, + Some("name_zonemap".to_string()), + &params, + true, + ) + .await + .unwrap(); + + let collect_names = |batch: &RecordBatch| -> BTreeSet<String> { + batch + .column_by_name("name") + .unwrap() + .as_any() + .downcast_ref::<StringArray>() + .unwrap() + .iter() + .map(|s| s.unwrap().to_string()) + .collect() + }; + + // Ensure the predicate actually exercises the zone-map LikePrefix recheck path. + let mut scanner = dataset.scan(); + scanner.filter("starts_with(name, 'a_')").unwrap(); + let plan_str = format!("{:?}", scanner.create_plan().await.unwrap()); + assert!( + plan_str.contains("LikePrefix"), + "expected a zone-map LikePrefix plan, got: {plan_str}" + ); + + // `_` and `%` in the prefix are literal characters; the indexed result must match + // the unindexed evaluation for each predicate (no wildcard over-match). 
+ for predicate in ["starts_with(name, 'a_')", "starts_with(name, 'b%')"] { + let with_index = dataset + .scan() + .filter(predicate) + .unwrap() + .try_into_batch() + .await + .unwrap(); + let without_index = dataset + .scan() + .use_scalar_index(false) + .filter(predicate) + .unwrap() + .try_into_batch() + .await + .unwrap(); + assert_eq!( + collect_names(&with_index), + collect_names(&without_index), + "indexed result over-matched for predicate `{predicate}`" + ); + } + + // Explicit expectation so the intended (non-over-matching) result is obvious. + let result = dataset + .scan() + .filter("starts_with(name, 'a_')") + .unwrap() + .try_into_batch() + .await + .unwrap(); + assert_eq!( + collect_names(&result), + BTreeSet::from(["a_b".to_string(), "a_c".to_string()]) + ); + } + + /// A bitmap index cannot answer prefix queries, so `LIKE 'prefix%'` / `starts_with` + /// must fall back to ordinary filtering (returning correct results) instead of being + /// planned as a `LikePrefix` index scan that bitmap search would reject. 
+ #[tokio::test] + async fn test_like_prefix_bitmap_falls_back_to_filter() { + use lance_index::scalar::BuiltinIndexType; + + let names: Vec<&str> = vec!["apple", "app", "application", "banana", "band", "zoo"]; + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("name", DataType::Utf8, false), + ArrowField::new("id", DataType::Int32, false), + ])); + let data = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(StringArray::from(names.clone())), + Arc::new(Int32Array::from_iter_values(0..names.len() as i32)), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(data)], schema.clone()); + let mut dataset = Dataset::write(reader, "memory://test_like_bitmap_fallback", None) + .await + .unwrap(); + + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap); + dataset + .create_index( + &["name"], + IndexType::Scalar, + Some("name_bitmap".to_string()), + &params, + true, + ) + .await + .unwrap(); + + // The predicate must not be planned as a LikePrefix index scan (bitmap rejects it). + let mut scanner = dataset.scan(); + scanner.filter("name LIKE 'app%'").unwrap(); + let plan_str = format!("{:?}", scanner.create_plan().await.unwrap()); + assert!( + !plan_str.contains("LikePrefix"), + "bitmap LIKE must not use a LikePrefix index scan, got: {plan_str}" + ); + + let collect_names = |batch: &RecordBatch| -> BTreeSet<String> { + batch + .column_by_name("name") + .unwrap() + .as_any() + .downcast_ref::<StringArray>() + .unwrap() + .iter() + .map(|s| s.unwrap().to_string()) + .collect() + }; + + // And it must execute successfully with correct results (previously errored). 
+ let result = dataset + .scan() + .filter("name LIKE 'app%'") + .unwrap() + .try_into_batch() + .await + .unwrap(); + assert_eq!( + collect_names(&result), + BTreeSet::from([ + "apple".to_string(), + "app".to_string(), + "application".to_string(), + ]) + ); + + let result = dataset + .scan() + .filter("starts_with(name, 'ban')") + .unwrap() + .try_into_batch() + .await + .unwrap(); + assert_eq!( + collect_names(&result), + BTreeSet::from(["banana".to_string(), "band".to_string()]) + ); + } + /// Build an in-memory dataset with a single `Dictionary(Int16, Utf8)` column. /// The dictionary cycles through "a", "b", "c" so each value appears in a /// predictable, repeated pattern.