Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
323 changes: 260 additions & 63 deletions rust/lance-index/src/scalar/inverted/tokenizer/document_tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@ use lance_arrow::json::JSON_EXT_NAME;
use lance_tokenizer::{BoxTokenStream, TextAnalyzer, Token, TokenStream};
use serde_json::Value;

const JSON_ARRAY_WILDCARD: &str = "*";
const JSON_ARRAY_INDEX_FIELD: &str = "$index";
const JSON_NUMBER_TYPE: &str = "number";
const JSON_BOOL_TYPE: &str = "bool";
const JSON_NULL_TYPE: &str = "null";
const JSON_STRING_TYPE: &str = "str";

/// Document type for full text search.
#[derive(Debug, Clone)]
pub enum DocType {
Expand Down Expand Up @@ -184,32 +191,19 @@ fn flatten_triplet(text: &str, tokenizer: &mut TextAnalyzer) -> lance_core::Resu
let field = parts[0];
let v_type = parts[1];
let value = parts[2];
let normalized_field = normalize_json_query_path(field)?;

match v_type {
"number" | "bool" | "null" => {
let token = Token {
offset_from: 0,
offset_to: 0,
position: idx,
text: format!("{},{},{}", field, v_type, value),
position_length: 1,
};
token_vec.push(token);
idx += 1;
}
"str" => {
let mut tokens = tokenizer.token_stream(value);
while let Some(token) = tokens.next() {
token_vec.push(Token {
offset_from: 0,
offset_to: 0,
position: idx,
text: format!("{},{},{}", field, v_type, token.text),
position_length: 1,
});
idx += 1;
}
JSON_NUMBER_TYPE | JSON_BOOL_TYPE | JSON_NULL_TYPE => {
emit_scalar_token(&normalized_field, v_type, value, &mut token_vec, &mut idx)
}
JSON_STRING_TYPE => emit_string_tokens(
&[normalized_field],
value,
&mut token_vec,
&mut idx,
tokenizer,
),
_ => {
return Err(lance_core::Error::invalid_input_source(
format!("Invalid triple type: {}", v_type).into(),
Expand All @@ -226,53 +220,188 @@ fn flatten_json(
out: &mut Vec<Token>,
position: &mut usize,
tokenizer: &mut TextAnalyzer,
) {
flatten_json_with_paths(
value,
prefix,
&[prefix.to_string()],
out,
position,
tokenizer,
);
}

fn flatten_json_with_paths(
value: &Value,
legacy_prefix: &str,
array_prefixes: &[String],
out: &mut Vec<Token>,
position: &mut usize,
tokenizer: &mut TextAnalyzer,
) {
match value {
Value::Object(map) => {
for (k, v) in map {
let next_prefix = if prefix.is_empty() {
let next_legacy_prefix = if legacy_prefix.is_empty() {
k.clone()
} else {
format!("{}.{}", prefix, k)
format!("{}.{}", legacy_prefix, k)
};
flatten_json(v, &next_prefix, out, position, tokenizer);
let next_array_prefixes = array_prefixes
.iter()
.map(|prefix| join_json_path(prefix, k))
.collect::<Vec<_>>();
flatten_json_with_paths(
v,
&next_legacy_prefix,
&next_array_prefixes,
out,
position,
tokenizer,
);
}
}
Value::Array(arr) => {
for v in arr.iter() {
flatten_json(v, prefix, out, position, tokenizer);
for (array_index, v) in arr.iter().enumerate() {
for index_path in json_array_index_paths(array_prefixes) {
emit_scalar_token(
&index_path,
JSON_NUMBER_TYPE,
&array_index.to_string(),
out,
position,
);
}

let next_array_prefixes = json_array_child_paths(array_prefixes, array_index);
flatten_json_with_paths(
v,
legacy_prefix,
&next_array_prefixes,
out,
position,
tokenizer,
);
}
}
Value::String(text) => {
let mut tokens = tokenizer.token_stream(text);
while let Some(token) = tokens.next() {
let token = Token {
offset_from: 0,
offset_to: 0,
position: *position,
text: format!("{},{},{}", prefix, "str", token.text),
position_length: 1,
};
*position += 1;
out.push(token);
}
let paths = json_value_paths(legacy_prefix, array_prefixes);
emit_string_tokens(&paths, text, out, position, tokenizer);
}
_ => {
let value_type = match value {
Value::Null => "null",
Value::Bool(_) => "bool",
Value::Number(_) => "number",
Value::Null => JSON_NULL_TYPE,
Value::Bool(_) => JSON_BOOL_TYPE,
Value::Number(_) => JSON_NUMBER_TYPE,
_ => unreachable!(),
};
let token = Token {
offset_from: 0,
offset_to: 0,
position: *position,
text: format!("{},{},{}", prefix, value_type, value),
position_length: 1,
};
*position += 1;
out.push(token);
for path in json_value_paths(legacy_prefix, array_prefixes) {
emit_scalar_token(&path, value_type, &value.to_string(), out, position);
}
}
}
}

fn normalize_json_query_path(path: &str) -> lance_core::Result<String> {
let mut normalized = path
.strip_prefix("$.")
.or_else(|| path.strip_prefix('$'))
.unwrap_or(path)
.to_string();

let mut search_start = 0;
while let Some(relative_left_bracket_idx) = normalized[search_start..].find('[') {
let left_bracket_idx = search_start + relative_left_bracket_idx;
let Some(relative_right_bracket_idx) = normalized[left_bracket_idx + 1..].find(']') else {
return Err(lance_core::Error::invalid_input_source(
format!("Missing right bracket in JSON path: {}", path).into(),
));
};
let right_bracket_idx = left_bracket_idx + 1 + relative_right_bracket_idx;
let left_part = &normalized[..left_bracket_idx];
let array_index = &normalized[left_bracket_idx + 1..right_bracket_idx];
let right_part = &normalized[right_bracket_idx + 1..];

if array_index == JSON_ARRAY_WILDCARD {
normalized = format!("{}.{}", left_part, right_part);
search_start = left_bracket_idx + 1;
} else {
search_start = right_bracket_idx + 1;
}
}

Ok(normalized)
}

fn json_value_paths(legacy_prefix: &str, array_prefixes: &[String]) -> Vec<String> {
let mut paths = Vec::with_capacity(array_prefixes.len() + 1);
push_unique_path(&mut paths, legacy_prefix.to_string());
for path in array_prefixes {
push_unique_path(&mut paths, path.clone());
}
paths
}

fn json_array_child_paths(array_prefixes: &[String], array_index: usize) -> Vec<String> {
let mut paths = Vec::with_capacity(array_prefixes.len() * 2);
for prefix in array_prefixes {
push_unique_path(&mut paths, format!("{}[{}]", prefix, array_index));
push_unique_path(&mut paths, format!("{}.", prefix));
}
paths
}

fn json_array_index_paths(array_prefixes: &[String]) -> Vec<String> {
let mut paths = Vec::with_capacity(array_prefixes.len());
for prefix in array_prefixes {
push_unique_path(&mut paths, format!("{}.{}", prefix, JSON_ARRAY_INDEX_FIELD));
}
paths
}

fn join_json_path(prefix: &str, field: &str) -> String {
if prefix.is_empty() {
field.to_string()
} else {
format!("{}.{}", prefix, field)
}
}

fn push_unique_path(paths: &mut Vec<String>, path: String) {
if !paths.iter().any(|existing| existing == &path) {
paths.push(path);
}
}

fn emit_scalar_token(
path: &str,
value_type: &str,
value: &str,
out: &mut Vec<Token>,
position: &mut usize,
) {
let token = Token {
offset_from: 0,
offset_to: 0,
position: *position,
text: format!("{},{},{}", path, value_type, value),
position_length: 1,
};
*position += 1;
out.push(token);
}

fn emit_string_tokens(
paths: &[String],
text: &str,
out: &mut Vec<Token>,
position: &mut usize,
tokenizer: &mut TextAnalyzer,
) {
for path in paths {
let mut tokens = tokenizer.token_stream(text);
while let Some(token) = tokens.next() {
emit_scalar_token(path, JSON_STRING_TYPE, &token.text, out, position);
}
}
}
Expand Down Expand Up @@ -304,9 +433,10 @@ impl TokenStream for TTStream {
#[cfg(test)]
mod tests {
use crate::scalar::inverted::tokenizer::document_tokenizer::{
JsonTokenizer, LanceTokenizer, flatten_json, flatten_triplet,
JsonTokenizer, LanceTokenizer, flatten_json, flatten_triplet, normalize_json_query_path,
};
use lance_tokenizer::{SimpleTokenizer, TextAnalyzer, Token};
use rstest::rstest;
use serde_json::Value;

#[test]
Expand All @@ -327,10 +457,16 @@ mod tests {
tokens.push(token.clone());
}

assert_eq!(tokens.len(), 3);
assert_eq!(tokens.len(), 9);
assert_token(&tokens[0], 0, "a,number,1");
assert_token(&tokens[1], 1, "b.c,str,d");
assert_token(&tokens[2], 2, "b.c,str,e");
assert_token(&tokens[1], 1, "b.$index,number,0");
assert_token(&tokens[2], 2, "b.c,str,d");
assert_token(&tokens[3], 3, "b[0].c,str,d");
assert_token(&tokens[4], 4, "b..c,str,d");
assert_token(&tokens[5], 5, "b.$index,number,1");
assert_token(&tokens[6], 6, "b.c,str,e");
assert_token(&tokens[7], 7, "b[1].c,str,e");
assert_token(&tokens[8], 8, "b..c,str,e");
}

#[test]
Expand All @@ -354,14 +490,22 @@ mod tests {
let mut position = 0;
flatten_json(&value, "", &mut tokens, &mut position, &mut tokenizer);

assert_eq!(7, tokens.len());
assert_eq!(15, tokens.len());
assert_token(&tokens[0], 0, "a,number,1");
assert_token(&tokens[1], 1, "b.c,str,hello");
assert_token(&tokens[2], 2, "b.c,str,world");
assert_token(&tokens[3], 3, "b.c,str,e");
assert_token(&tokens[4], 4, "c,bool,true");
assert_token(&tokens[5], 5, "d,null,null");
assert_token(&tokens[6], 6, "e.f,number,1.0");
assert_token(&tokens[1], 1, "b.$index,number,0");
assert_token(&tokens[2], 2, "b.c,str,hello");
assert_token(&tokens[3], 3, "b.c,str,world");
assert_token(&tokens[4], 4, "b[0].c,str,hello");
assert_token(&tokens[5], 5, "b[0].c,str,world");
assert_token(&tokens[6], 6, "b..c,str,hello");
assert_token(&tokens[7], 7, "b..c,str,world");
assert_token(&tokens[8], 8, "b.$index,number,1");
assert_token(&tokens[9], 9, "b.c,str,e");
assert_token(&tokens[10], 10, "b[1].c,str,e");
assert_token(&tokens[11], 11, "b..c,str,e");
assert_token(&tokens[12], 12, "c,bool,true");
assert_token(&tokens[13], 13, "d,null,null");
assert_token(&tokens[14], 14, "e.f,number,1.0");
}

#[test]
Expand All @@ -379,6 +523,59 @@ mod tests {
assert_token(&tokens[5], 5, "e,number,1.0");
}

#[rstest]
#[case::wildcard_object_array("addresses[*].country", "addresses..country")]
#[case::rooted_wildcard_object_array("$.addresses[*].country", "addresses..country")]
#[case::indexed_object_array("addresses[1].country", "addresses[1].country")]
#[case::wildcard_scalar_array("skills[*]", "skills.")]
#[case::indexed_scalar_array("skills[0]", "skills[0]")]
#[case::mixed_wildcard_then_index("addresses[*].types[1]", "addresses..types[1]")]
#[case::mixed_index_then_wildcard("addresses[0].types[*]", "addresses[0].types.")]
fn test_normalize_json_query_path(#[case] path: &str, #[case] expected: &str) {
assert_eq!(normalize_json_query_path(path).unwrap(), expected);
}

#[test]
fn test_flatten_triplet_with_json_array_paths() {
let text = r#"addresses[*].country,str,United States;addresses[1].number,number,123;skills[*],str,hello world"#;
let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()).build();
let tokens = flatten_triplet(text, &mut tokenizer).unwrap();

assert_eq!(tokens.len(), 5);
assert_token(&tokens[0], 0, "addresses..country,str,United");
assert_token(&tokens[1], 1, "addresses..country,str,States");
assert_token(&tokens[2], 2, "addresses[1].number,number,123");
assert_token(&tokens[3], 3, "skills.,str,hello");
assert_token(&tokens[4], 4, "skills.,str,world");
}

#[test]
fn test_flatten_json_nested_array_path_combinations() {
let json = r#"{
"addresses": [
{"types": ["home", "office"]}
]
}"#;
let value: Value = serde_json::from_str(json).unwrap();

let mut tokens = vec![];
let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()).build();
let mut position = 0;
flatten_json(&value, "", &mut tokens, &mut position, &mut tokenizer);
let token_texts = tokens
.iter()
.map(|token| token.text.as_str())
.collect::<Vec<_>>();

assert!(token_texts.contains(&"addresses.types,str,office"));
assert!(token_texts.contains(&"addresses[0].types[1],str,office"));
assert!(token_texts.contains(&"addresses[0].types.,str,office"));
assert!(token_texts.contains(&"addresses..types[1],str,office"));
assert!(token_texts.contains(&"addresses..types.,str,office"));
assert!(token_texts.contains(&"addresses[0].types.$index,number,1"));
assert!(token_texts.contains(&"addresses..types.$index,number,1"));
}

fn assert_token(token: &Token, position: usize, text: &str) {
assert_eq!(
token.position, position,
Expand Down
Loading
Loading