Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 81 additions & 10 deletions rust/lance-index/src/scalar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,10 @@ use std::fmt::Debug;
use std::pin::Pin;
use std::{any::Any, ops::Bound, sync::Arc};

use datafusion_expr::{Expr, expr::ScalarFunction};
use datafusion_expr::{
Expr,
expr::{Like, ScalarFunction},
};
use inverted::query::{FtsQuery, FtsQueryNode, FtsSearchParams, MatchQuery, fill_fts_query_column};
use lance_core::deepsize::DeepSizeOf;
use lance_core::{Error, Result};
Expand Down Expand Up @@ -452,6 +455,19 @@ pub enum SargableQuery {
LikePrefix(ScalarValue),
}

/// Escape the LIKE metacharacters (`\`, `%`, `_`) in a literal string so it can be
/// embedded in a LIKE pattern and matched literally (paired with `ESCAPE '\'`).
fn escape_like_pattern(s: &str) -> String {
let mut out = String::with_capacity(s.len() + 2);
for c in s.chars() {
if matches!(c, '\\' | '%' | '_') {
out.push('\\');
}
out.push(c);
}
out
}

impl AnyQuery for SargableQuery {
fn as_any(&self) -> &dyn Any {
self
Expand Down Expand Up @@ -553,16 +569,36 @@ impl AnyQuery for SargableQuery {
)),
Self::IsNull() => col_expr.is_null(),
Self::Equals(value) => col_expr.eq(Expr::Literal(value.clone(), None)),
Self::LikePrefix(prefix) => {
let pattern = match prefix {
ScalarValue::Utf8(Some(s)) => ScalarValue::Utf8(Some(format!("{}%", s))),
ScalarValue::LargeUtf8(Some(s)) => {
ScalarValue::LargeUtf8(Some(format!("{}%", s)))
Self::LikePrefix(prefix) => match prefix {
ScalarValue::Utf8(Some(s)) | ScalarValue::LargeUtf8(Some(s)) => {
// The prefix is a literal string. If it contains LIKE metacharacters
// (`_`, `%`, `\`) they must be escaped before appending the `%` wildcard;
// otherwise an inexact recheck (e.g. zone maps) would treat them as
// wildcards and over-match rows that do not start with the literal prefix.
// When the prefix has no metacharacters we keep the plain
// `col LIKE 'prefix%'` form (no `ESCAPE`), identical to the prior behavior,
// so DataFusion's optimized prefix matcher still applies.
let escaped = escape_like_pattern(s);
let needs_escape = escaped.as_str() != s.as_str();
let pattern = format!("{}%", escaped);
let pattern_value = match prefix {
ScalarValue::LargeUtf8(_) => ScalarValue::LargeUtf8(Some(pattern)),
_ => ScalarValue::Utf8(Some(pattern)),
};
if needs_escape {
Expr::Like(Like {
negated: false,
expr: Box::new(col_expr),
pattern: Box::new(Expr::Literal(pattern_value, None)),
escape_char: Some('\\'),
case_insensitive: false,
})
} else {
col_expr.like(Expr::Literal(pattern_value, None))
}
other => other.clone(),
};
col_expr.like(Expr::Literal(pattern, None))
}
}
other => col_expr.like(Expr::Literal(other.clone(), None)),
},
}
}

Expand Down Expand Up @@ -1076,3 +1112,38 @@ pub trait ScalarIndex: Send + Sync + std::fmt::Debug + Index + DeepSizeOf {
/// with the same configuration on another dataset.
fn derive_index_params(&self) -> Result<ScalarIndexParams>;
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_like_prefix_to_expr_escapes_metacharacters() {
// The stored prefix is a literal string, so LIKE metacharacters in it must be
// escaped when the recheck predicate is rebuilt; otherwise `_`/`%` would act as
// wildcards and over-match. The reconstructed expression uses `ESCAPE '\'`.
let query = SargableQuery::LikePrefix(ScalarValue::Utf8(Some("a_b%x".to_string())));
let Expr::Like(like) = query.to_expr("name".to_string()) else {
panic!("expected a LIKE expression");
};
assert_eq!(like.escape_char, Some('\\'));
assert!(!like.negated);
assert!(!like.case_insensitive);
let Expr::Literal(ScalarValue::Utf8(Some(pattern)), _) = like.pattern.as_ref() else {
panic!("expected a Utf8 literal pattern");
};
assert_eq!(pattern.as_str(), "a\\_b\\%x%");

// A prefix without metacharacters only gains the trailing wildcard and keeps the
// plain `LIKE 'app%'` form (no `ESCAPE`) so the optimized prefix matcher still applies.
let query = SargableQuery::LikePrefix(ScalarValue::Utf8(Some("app".to_string())));
let Expr::Like(like) = query.to_expr("name".to_string()) else {
panic!("expected a LIKE expression");
};
assert_eq!(like.escape_char, None);
let Expr::Literal(ScalarValue::Utf8(Some(pattern)), _) = like.pattern.as_ref() else {
panic!("expected a Utf8 literal pattern");
};
assert_eq!(pattern.as_str(), "app%");
}
}
11 changes: 6 additions & 5 deletions rust/lance-index/src/scalar/bitmap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1712,11 +1712,12 @@ impl ScalarIndexPlugin for BitmapIndexPlugin {
index_name: String,
_index_details: &prost_types::Any,
) -> Option<Box<dyn ScalarQueryParser>> {
Some(Box::new(SargableQueryParser::new(
index_name,
self.name().to_string(),
false,
)))
// Bitmap indexes cannot answer `LikePrefix` queries (see `search`), so the parser
// is configured to skip them and let such predicates fall back to ordinary filtering.
Some(Box::new(
SargableQueryParser::new(index_name, self.name().to_string(), false)
.without_like_prefix(),
))
}

async fn train_index(
Expand Down
200 changes: 196 additions & 4 deletions rust/lance-index/src/scalar/expression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,7 @@ pub struct SargableQueryParser {
index_name: String,
index_type: String,
needs_recheck: bool,
supports_like_prefix: bool,
}

impl SargableQueryParser {
Expand All @@ -265,8 +266,17 @@ impl SargableQueryParser {
index_name,
index_type,
needs_recheck,
supports_like_prefix: true,
}
}

/// Bitmap (and similar) indexes cannot answer prefix queries; disabling
/// `LikePrefix` emission makes `LIKE`/`starts_with` predicates fall back to
/// ordinary filtering instead of failing at search time.
pub fn without_like_prefix(mut self) -> Self {
self.supports_like_prefix = false;
self
}
}

impl ScalarQueryParser for SargableQueryParser {
Expand Down Expand Up @@ -380,12 +390,24 @@ impl ScalarQueryParser for SargableQueryParser {
) -> Option<IndexedExpression> {
// Handle starts_with(col, 'prefix') -> convert to LikePrefix query
if func.name() == "starts_with" && args.len() == 2 {
// Indexes that cannot answer prefix queries (e.g. bitmap) fall back to
// ordinary filtering rather than emitting a query they would reject.
if !self.supports_like_prefix {
return None;
}
// Extract the prefix from the second argument
let prefix = match &args[1] {
Expr::Literal(ScalarValue::Utf8(Some(s)), _) => ScalarValue::Utf8(Some(s.clone())),
Expr::Literal(ScalarValue::LargeUtf8(Some(s)), _) => {
ScalarValue::LargeUtf8(Some(s.clone()))
}
// Lance stores `Utf8View` columns as `Utf8` (normalized at write time), so a
// `Utf8View` literal is normalized to `Utf8` to match the indexed data: the
// BTree compares the query bound against `Utf8` page statistics at the Arrow
// level, which rejects a `Utf8View` bound.
Expr::Literal(ScalarValue::Utf8View(Some(s)), _) => {

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new Utf8View prefix path can feed ZoneMap an inexact LikePrefix query, but the later recheck reconstructs the predicate as an unescaped LIKE pattern. Prefixes containing _ or % can therefore match rows that do not satisfy the original starts_with predicate.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done 528b9f0 - LikePrefix::to_expr now escapes _/%/\ and reconstructs the recheck as LIKE 'prefix%' ESCAPE '\' (only when the prefix actually contains metacharacters), so the inexact zone-map recheck no longer over-matches.

ScalarValue::Utf8(Some(s.clone()))
}
_ => return None,
};

Expand Down Expand Up @@ -413,19 +435,29 @@ impl ScalarQueryParser for SargableQueryParser {
return None;
}

// Indexes that cannot answer prefix queries (e.g. bitmap) fall back to
// ordinary filtering rather than emitting a query they would reject.
if !self.supports_like_prefix {
return None;
}

// Extract the pattern string
let pattern_str = match pattern {
ScalarValue::Utf8(Some(s)) => s.as_str(),
ScalarValue::LargeUtf8(Some(s)) => s.as_str(),
ScalarValue::Utf8(Some(s))
| ScalarValue::LargeUtf8(Some(s))
| ScalarValue::Utf8View(Some(s)) => s.as_str(),
_ => return None,
};

// Try to extract a prefix from the LIKE pattern
let (prefix, needs_refine) = extract_like_leading_prefix(pattern_str, like.escape_char)?;

// Create the prefix ScalarValue with the same type as the pattern
// Create the prefix ScalarValue with the same type as the pattern. `Utf8View` is
// normalized to `Utf8` because Lance stores `Utf8View` columns as `Utf8`, and the
// downstream BTree compares the query bound against `Utf8` page statistics at the
// Arrow level (a `Utf8View` bound would fail that comparison).
let prefix_value = match pattern {
ScalarValue::Utf8(_) => ScalarValue::Utf8(Some(prefix)),
ScalarValue::Utf8(_) | ScalarValue::Utf8View(_) => ScalarValue::Utf8(Some(prefix)),

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The shared sargable parser now emits LikePrefix for Utf8View prefixes even when the selected index is Bitmap, which rejects that query during search. A query that previously fell back to ordinary filtering can now fail once planned as an index scan.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done 528b9f0 - the bitmap index now configures the parser with without_like_prefix, so LIKE/starts_with fall back to ordinary filtering instead of emitting a LikePrefix that bitmap search would reject.

ScalarValue::LargeUtf8(_) => ScalarValue::LargeUtf8(Some(prefix)),
_ => return None,
};
Expand Down Expand Up @@ -3204,6 +3236,166 @@ mod tests {
}
}

#[test]
fn test_sargable_query_parser_utf8view() {
// Follow-up to PR #7310 / #7139: the BTree `SargableQueryParser` must accept
// `Utf8View` prefixes for `starts_with` and infix-free LIKE, not only `Utf8` /
// `LargeUtf8`. DataFusion can coerce the predicate literal to `ScalarValue::Utf8View`;
// dropping that variant silently skips the index. The `Utf8View` prefix is normalized
// to `Utf8` (Lance stores `Utf8View` columns as `Utf8`), so the emitted query is a
// `LikePrefix(Utf8(..))`. `visit_scalar_function` / `visit_like` are exercised directly
// so the test does not depend on the planner's coercion choices, and the `Utf8` case
// is a parity control: the pre-existing path must keep behaving identically.
let parser = SargableQueryParser::new("color_idx".to_string(), "BTree".to_string(), false);

let assert_like_prefix =
|indexed: &IndexedExpression, expected: &ScalarValue, needs_refine: bool| {
assert_eq!(
indexed.refine_expr.is_some(),
needs_refine,
"unexpected refine_expr presence"
);
let Some(ScalarIndexExpr::Query(search)) = &indexed.scalar_query else {
panic!("expected a scalar index query");
};
match search
.query
.as_any()
.downcast_ref::<SargableQuery>()
.expect("query should be a SargableQuery")
{
SargableQuery::LikePrefix(prefix) => assert_eq!(prefix, expected),
_ => panic!("expected a LikePrefix query"),
}
};

// starts_with(col, <Utf8View prefix>) -> LikePrefix(Utf8). Reuse a real
// `starts_with` UDF parsed from SQL, then swap in a `Utf8View` literal argument.
let schema = Schema::new(vec![Field::new("color", DataType::Utf8View, false)]);
let df_schema: DFSchema = schema.try_into().unwrap();
let ctx = get_session_context(&LanceExecutionOptions::default());
let state = ctx.state();
let Expr::ScalarFunction(starts_with) = state
.create_logical_expr("starts_with(color, 'foo')", &df_schema)
.unwrap()
else {
panic!("expected starts_with to parse as a scalar function");
};
let args = vec![
starts_with.args[0].clone(),
Expr::Literal(ScalarValue::Utf8View(Some("foo".to_string())), None),
];
let indexed = parser
.visit_scalar_function(
"color",
&DataType::Utf8View,
starts_with.func.as_ref(),
&args,
)
.expect("starts_with should use the BTree index");
assert_like_prefix(&indexed, &ScalarValue::Utf8(Some("foo".to_string())), false);

// col LIKE <Utf8View pattern>. `visit_like` is called directly so the test does not
// depend on DataFusion's LIKE type coercion choosing `Utf8View` for the pattern.
let like = |pattern: ScalarValue| {
Like::new(
false,
Box::new(Expr::Column(Column::new_unqualified("color"))),
Box::new(Expr::Literal(pattern, None)),
None,
false,
)
};

// Pure prefix: routed to the index with no recheck needed.
let pattern = ScalarValue::Utf8View(Some("foo%".to_string()));
let indexed = parser
.visit_like("color", &like(pattern.clone()), &pattern)
.expect("LIKE prefix should use the BTree index");
assert_like_prefix(&indexed, &ScalarValue::Utf8(Some("foo".to_string())), false);

// Wildcards beyond the leading prefix keep the original LIKE as a recheck.
let pattern = ScalarValue::Utf8View(Some("foo%bar%".to_string()));
let indexed = parser
.visit_like("color", &like(pattern.clone()), &pattern)
.expect("LIKE prefix should use the BTree index");
assert_like_prefix(&indexed, &ScalarValue::Utf8(Some("foo".to_string())), true);

// Parity control: the pre-existing `Utf8` path is unchanged.
let pattern = ScalarValue::Utf8(Some("foo%".to_string()));
let indexed = parser
.visit_like("color", &like(pattern.clone()), &pattern)
.expect("LIKE prefix should use the BTree index");
assert_like_prefix(&indexed, &ScalarValue::Utf8(Some("foo".to_string())), false);
}

#[test]
fn test_sargable_query_parser_without_like_prefix() {
// Bitmap indexes configure the parser with `without_like_prefix`: a bitmap index
// cannot answer `LikePrefix` queries (its `search` rejects them), so `starts_with` /
// `LIKE 'prefix%'` must not be turned into an index query. Returning `None` lets the
// predicate fall back to ordinary filtering instead of failing at search time.
let bitmap_parser =
SargableQueryParser::new("color_idx".to_string(), "BITMAP".to_string(), false)
.without_like_prefix();
let btree_parser =
SargableQueryParser::new("color_idx".to_string(), "BTree".to_string(), false);

let schema = Schema::new(vec![Field::new("color", DataType::Utf8, false)]);
let df_schema: DFSchema = schema.try_into().unwrap();
let ctx = get_session_context(&LanceExecutionOptions::default());
let state = ctx.state();
let Expr::ScalarFunction(starts_with) = state
.create_logical_expr("starts_with(color, 'foo')", &df_schema)
.unwrap()
else {
panic!("expected starts_with to parse as a scalar function");
};

let pattern = ScalarValue::Utf8(Some("foo%".to_string()));
let like = Like::new(
false,
Box::new(Expr::Column(Column::new_unqualified("color"))),
Box::new(Expr::Literal(pattern.clone(), None)),
None,
false,
);

// Bitmap parser: both prefix paths fall back (return `None`).
assert!(
bitmap_parser
.visit_scalar_function(
"color",
&DataType::Utf8,
starts_with.func.as_ref(),
&starts_with.args,
)
.is_none(),
"bitmap parser must not emit a LikePrefix for starts_with"
);
assert!(
bitmap_parser.visit_like("color", &like, &pattern).is_none(),
"bitmap parser must not emit a LikePrefix for LIKE"
);

// A prefix-capable parser (e.g. BTree) still emits the index query.
assert!(
btree_parser
.visit_scalar_function(
"color",
&DataType::Utf8,
starts_with.func.as_ref(),
&starts_with.args,
)
.is_some(),
"BTree parser should still emit a LikePrefix for starts_with"
);
assert!(
btree_parser.visit_like("color", &like, &pattern).is_some(),
"BTree parser should still emit a LikePrefix for LIKE"
);
}

#[test]
fn test_serialize_index_expr_result_round_trip() {
use lance_select::{RowAddrMask, RowAddrTreeMap};
Expand Down
Loading
Loading