From 172547788462a6aa69534de4c76d7ddbe4c57932 Mon Sep 17 00:00:00 2001 From: xuanyili Date: Fri, 19 Jun 2026 18:19:38 +0000 Subject: [PATCH] feat(index): support array object paths in JSON FTS Summary: - Emit legacy, wildcard, and exact indexed path tokens for JSON arrays. - Normalize JSON query triplets with JSONPath-style $ roots, wildcards, and indexes. - Add tokenizer and dataset regressions for array-of-object JSON FTS. Test Plan: - cargo fmt --all --check - cargo test -p lance-index --features "protoc lance-encoding/protoc lance-file/protoc lance-table/protoc lance-datafusion/protoc" scalar::inverted::tokenizer::document_tokenizer::tests -- --nocapture - PROTOC=/home/user/lance-json/target/debug/build/protobuf-src-0bbb645409253599/out/bin/protoc-27.2.0 cargo test -p lance --features "protoc lance-encoding/protoc lance-file/protoc lance-index/protoc lance-table/protoc lance-datafusion/protoc" test_json_inverted_ -- --nocapture - PROTOC=/home/user/lance-json/target/debug/build/protobuf-src-0bbb645409253599/out/bin/protoc-27.2.0 cargo clippy --all --tests --benches -- -D warnings --- .../inverted/tokenizer/document_tokenizer.rs | 323 ++++++++++++++---- rust/lance/src/dataset/tests/dataset_index.rs | 90 +++++ 2 files changed, 350 insertions(+), 63 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/tokenizer/document_tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer/document_tokenizer.rs index 62dd7b1aa3b..09425513c84 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer/document_tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer/document_tokenizer.rs @@ -7,6 +7,13 @@ use lance_arrow::json::JSON_EXT_NAME; use lance_tokenizer::{BoxTokenStream, TextAnalyzer, Token, TokenStream}; use serde_json::Value; +const JSON_ARRAY_WILDCARD: &str = "*"; +const JSON_ARRAY_INDEX_FIELD: &str = "$index"; +const JSON_NUMBER_TYPE: &str = "number"; +const JSON_BOOL_TYPE: &str = "bool"; +const JSON_NULL_TYPE: &str = "null"; +const JSON_STRING_TYPE: &str = "str"; + /// Document type for full text search. #[derive(Debug, Clone)] pub enum DocType { @@ -184,32 +191,19 @@ fn flatten_triplet(text: &str, tokenizer: &mut TextAnalyzer) -> lance_core::Resu let field = parts[0]; let v_type = parts[1]; let value = parts[2]; + let normalized_field = normalize_json_query_path(field)?; match v_type { - "number" | "bool" | "null" => { - let token = Token { - offset_from: 0, - offset_to: 0, - position: idx, - text: format!("{},{},{}", field, v_type, value), - position_length: 1, - }; - token_vec.push(token); - idx += 1; - } - "str" => { - let mut tokens = tokenizer.token_stream(value); - while let Some(token) = tokens.next() { - token_vec.push(Token { - offset_from: 0, - offset_to: 0, - position: idx, - text: format!("{},{},{}", field, v_type, token.text), - position_length: 1, - }); - idx += 1; - } + JSON_NUMBER_TYPE | JSON_BOOL_TYPE | JSON_NULL_TYPE => { + emit_scalar_token(&normalized_field, v_type, value, &mut token_vec, &mut idx) } + JSON_STRING_TYPE => emit_string_tokens( + &[normalized_field], + value, + &mut token_vec, + &mut idx, + tokenizer, + ), _ => { return Err(lance_core::Error::invalid_input_source( format!("Invalid triple type: {}", v_type).into(), @@ -226,53 +220,188 @@ fn flatten_json( out: &mut Vec, position: &mut usize, tokenizer: &mut TextAnalyzer, +) { + flatten_json_with_paths( + value, + prefix, + &[prefix.to_string()], + out, + position, + tokenizer, + ); +} + +fn flatten_json_with_paths( + value: &Value, + legacy_prefix: &str, + array_prefixes: &[String], + out: &mut Vec, + position: &mut usize, + tokenizer: &mut TextAnalyzer, ) { match value { Value::Object(map) => { for (k, v) in map { - let next_prefix = if prefix.is_empty() { + let next_legacy_prefix = if legacy_prefix.is_empty() { k.clone() } else { - format!("{}.{}", prefix, k) + format!("{}.{}", legacy_prefix, k) }; - flatten_json(v, &next_prefix, out, position, tokenizer); + let next_array_prefixes = array_prefixes + .iter() + .map(|prefix| join_json_path(prefix, k)) + .collect::>(); + flatten_json_with_paths( + v, + &next_legacy_prefix, + &next_array_prefixes, + out, + position, + tokenizer, + ); } } Value::Array(arr) => { - for v in arr.iter() { - flatten_json(v, prefix, out, position, tokenizer); + for (array_index, v) in arr.iter().enumerate() { + for index_path in json_array_index_paths(array_prefixes) { + emit_scalar_token( + &index_path, + JSON_NUMBER_TYPE, + &array_index.to_string(), + out, + position, + ); + } + + let next_array_prefixes = json_array_child_paths(array_prefixes, array_index); + flatten_json_with_paths( + v, + legacy_prefix, + &next_array_prefixes, + out, + position, + tokenizer, + ); } } Value::String(text) => { - let mut tokens = tokenizer.token_stream(text); - while let Some(token) = tokens.next() { - let token = Token { - offset_from: 0, - offset_to: 0, - position: *position, - text: format!("{},{},{}", prefix, "str", token.text), - position_length: 1, - }; - *position += 1; - out.push(token); - } + let paths = json_value_paths(legacy_prefix, array_prefixes); + emit_string_tokens(&paths, text, out, position, tokenizer); } _ => { let value_type = match value { - Value::Null => "null", - Value::Bool(_) => "bool", - Value::Number(_) => "number", + Value::Null => JSON_NULL_TYPE, + Value::Bool(_) => JSON_BOOL_TYPE, + Value::Number(_) => JSON_NUMBER_TYPE, _ => unreachable!(), }; - let token = Token { - offset_from: 0, - offset_to: 0, - position: *position, - text: format!("{},{},{}", prefix, value_type, value), - position_length: 1, - }; - *position += 1; - out.push(token); + for path in json_value_paths(legacy_prefix, array_prefixes) { + emit_scalar_token(&path, value_type, &value.to_string(), out, position); + } + } + } +} + +fn normalize_json_query_path(path: &str) -> lance_core::Result { + let mut normalized = path + .strip_prefix("$.") + .or_else(|| path.strip_prefix('$')) + .unwrap_or(path) + .to_string(); + + let mut search_start = 0; + while let Some(relative_left_bracket_idx) = normalized[search_start..].find('[') { + let left_bracket_idx = search_start + relative_left_bracket_idx; + let Some(relative_right_bracket_idx) = normalized[left_bracket_idx + 1..].find(']') else { + return Err(lance_core::Error::invalid_input_source( + format!("Missing right bracket in JSON path: {}", path).into(), + )); + }; + let right_bracket_idx = left_bracket_idx + 1 + relative_right_bracket_idx; + let left_part = &normalized[..left_bracket_idx]; + let array_index = &normalized[left_bracket_idx + 1..right_bracket_idx]; + let right_part = &normalized[right_bracket_idx + 1..]; + + if array_index == JSON_ARRAY_WILDCARD { + normalized = format!("{}.{}", left_part, right_part); + search_start = left_bracket_idx + 1; + } else { + search_start = right_bracket_idx + 1; + } + } + + Ok(normalized) +} + +fn json_value_paths(legacy_prefix: &str, array_prefixes: &[String]) -> Vec { + let mut paths = Vec::with_capacity(array_prefixes.len() + 1); + push_unique_path(&mut paths, legacy_prefix.to_string()); + for path in array_prefixes { + push_unique_path(&mut paths, path.clone()); + } + paths +} + +fn json_array_child_paths(array_prefixes: &[String], array_index: usize) -> Vec { + let mut paths = Vec::with_capacity(array_prefixes.len() * 2); + for prefix in array_prefixes { + push_unique_path(&mut paths, format!("{}[{}]", prefix, array_index)); + push_unique_path(&mut paths, format!("{}.", prefix)); + } + paths +} + +fn json_array_index_paths(array_prefixes: &[String]) -> Vec { + let mut paths = Vec::with_capacity(array_prefixes.len()); + for prefix in array_prefixes { + push_unique_path(&mut paths, format!("{}.{}", prefix, JSON_ARRAY_INDEX_FIELD)); + } + paths +} + +fn join_json_path(prefix: &str, field: &str) -> String { + if prefix.is_empty() { + field.to_string() + } else { + format!("{}.{}", prefix, field) + } +} + +fn push_unique_path(paths: &mut Vec, path: String) { + if !paths.iter().any(|existing| existing == &path) { + paths.push(path); + } +} + +fn emit_scalar_token( + path: &str, + value_type: &str, + value: &str, + out: &mut Vec, + position: &mut usize, +) { + let token = Token { + offset_from: 0, + offset_to: 0, + position: *position, + text: format!("{},{},{}", path, value_type, value), + position_length: 1, + }; + *position += 1; + out.push(token); +} + +fn emit_string_tokens( + paths: &[String], + text: &str, + out: &mut Vec, + position: &mut usize, + tokenizer: &mut TextAnalyzer, +) { + for path in paths { + let mut tokens = tokenizer.token_stream(text); + while let Some(token) = tokens.next() { + emit_scalar_token(path, JSON_STRING_TYPE, &token.text, out, position); } } } @@ -304,9 +433,10 @@ impl TokenStream for TTStream { #[cfg(test)] mod tests { use crate::scalar::inverted::tokenizer::document_tokenizer::{ - JsonTokenizer, LanceTokenizer, flatten_json, flatten_triplet, + JsonTokenizer, LanceTokenizer, flatten_json, flatten_triplet, normalize_json_query_path, }; use lance_tokenizer::{SimpleTokenizer, TextAnalyzer, Token}; + use rstest::rstest; use serde_json::Value; #[test] @@ -327,10 +457,16 @@ mod tests { tokens.push(token.clone()); } - assert_eq!(tokens.len(), 3); + assert_eq!(tokens.len(), 9); assert_token(&tokens[0], 0, "a,number,1"); - assert_token(&tokens[1], 1, "b.c,str,d"); - assert_token(&tokens[2], 2, "b.c,str,e"); + assert_token(&tokens[1], 1, "b.$index,number,0"); + assert_token(&tokens[2], 2, "b.c,str,d"); + assert_token(&tokens[3], 3, "b[0].c,str,d"); + assert_token(&tokens[4], 4, "b..c,str,d"); + assert_token(&tokens[5], 5, "b.$index,number,1"); + assert_token(&tokens[6], 6, "b.c,str,e"); + assert_token(&tokens[7], 7, "b[1].c,str,e"); + assert_token(&tokens[8], 8, "b..c,str,e"); } #[test] @@ -354,14 +490,22 @@ mod tests { let mut position = 0; flatten_json(&value, "", &mut tokens, &mut position, &mut tokenizer); - assert_eq!(7, tokens.len()); + assert_eq!(15, tokens.len()); assert_token(&tokens[0], 0, "a,number,1"); - assert_token(&tokens[1], 1, "b.c,str,hello"); - assert_token(&tokens[2], 2, "b.c,str,world"); - assert_token(&tokens[3], 3, "b.c,str,e"); - assert_token(&tokens[4], 4, "c,bool,true"); - assert_token(&tokens[5], 5, "d,null,null"); - assert_token(&tokens[6], 6, "e.f,number,1.0"); + assert_token(&tokens[1], 1, "b.$index,number,0"); + assert_token(&tokens[2], 2, "b.c,str,hello"); + assert_token(&tokens[3], 3, "b.c,str,world"); + assert_token(&tokens[4], 4, "b[0].c,str,hello"); + assert_token(&tokens[5], 5, "b[0].c,str,world"); + assert_token(&tokens[6], 6, "b..c,str,hello"); + assert_token(&tokens[7], 7, "b..c,str,world"); + assert_token(&tokens[8], 8, "b.$index,number,1"); + assert_token(&tokens[9], 9, "b.c,str,e"); + assert_token(&tokens[10], 10, "b[1].c,str,e"); + assert_token(&tokens[11], 11, "b..c,str,e"); + assert_token(&tokens[12], 12, "c,bool,true"); + assert_token(&tokens[13], 13, "d,null,null"); + assert_token(&tokens[14], 14, "e.f,number,1.0"); } #[test] @@ -379,6 +523,59 @@ mod tests { assert_token(&tokens[5], 5, "e,number,1.0"); } + #[rstest] + #[case::wildcard_object_array("addresses[*].country", "addresses..country")] + #[case::rooted_wildcard_object_array("$.addresses[*].country", "addresses..country")] + #[case::indexed_object_array("addresses[1].country", "addresses[1].country")] + #[case::wildcard_scalar_array("skills[*]", "skills.")] + #[case::indexed_scalar_array("skills[0]", "skills[0]")] + #[case::mixed_wildcard_then_index("addresses[*].types[1]", "addresses..types[1]")] + #[case::mixed_index_then_wildcard("addresses[0].types[*]", "addresses[0].types.")] + fn test_normalize_json_query_path(#[case] path: &str, #[case] expected: &str) { + assert_eq!(normalize_json_query_path(path).unwrap(), expected); + } + + #[test] + fn test_flatten_triplet_with_json_array_paths() { + let text = r#"addresses[*].country,str,United States;addresses[1].number,number,123;skills[*],str,hello world"#; + let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()).build(); + let tokens = flatten_triplet(text, &mut tokenizer).unwrap(); + + assert_eq!(tokens.len(), 5); + assert_token(&tokens[0], 0, "addresses..country,str,United"); + assert_token(&tokens[1], 1, "addresses..country,str,States"); + assert_token(&tokens[2], 2, "addresses[1].number,number,123"); + assert_token(&tokens[3], 3, "skills.,str,hello"); + assert_token(&tokens[4], 4, "skills.,str,world"); + } + + #[test] + fn test_flatten_json_nested_array_path_combinations() { + let json = r#"{ + "addresses": [ + {"types": ["home", "office"]} + ] + }"#; + let value: Value = serde_json::from_str(json).unwrap(); + + let mut tokens = vec![]; + let mut tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()).build(); + let mut position = 0; + flatten_json(&value, "", &mut tokens, &mut position, &mut tokenizer); + let token_texts = tokens + .iter() + .map(|token| token.text.as_str()) + .collect::>(); + + assert!(token_texts.contains(&"addresses.types,str,office")); + assert!(token_texts.contains(&"addresses[0].types[1],str,office")); + assert!(token_texts.contains(&"addresses[0].types.,str,office")); + assert!(token_texts.contains(&"addresses..types[1],str,office")); + assert!(token_texts.contains(&"addresses..types.,str,office")); + assert!(token_texts.contains(&"addresses[0].types.$index,number,1")); + assert!(token_texts.contains(&"addresses..types.$index,number,1")); + } + fn assert_token(token: &Token, position: usize, text: &str) { assert_eq!( token.position, position, diff --git a/rust/lance/src/dataset/tests/dataset_index.rs b/rust/lance/src/dataset/tests/dataset_index.rs index beb6e2b99fd..dbaa8f736db 100644 --- a/rust/lance/src/dataset/tests/dataset_index.rs +++ b/rust/lance/src/dataset/tests/dataset_index.rs @@ -2728,6 +2728,28 @@ async fn prepare_json_dataset() -> (Dataset, String) { (dataset, json_col) } +async fn json_fts_row_ids(dataset: &Dataset, json_col: &str, terms: &str) -> Vec { + let query = FullTextSearchQuery { + query: FtsQuery::Match( + MatchQuery::new(terms.to_string()).with_column(Some(json_col.to_string())), + ), + limit: None, + wand_factor: None, + }; + let batch = dataset + .scan() + .project(&[ROW_ID]) + .unwrap() + .full_text_search(query) + .unwrap() + .try_into_batch() + .await + .unwrap(); + let mut row_ids = batch[ROW_ID].as_primitive::().values().to_vec(); + row_ids.sort_unstable(); + row_ids +} + #[tokio::test] async fn test_json_inverted_fuzziness_query() { let (mut dataset, json_col) = prepare_json_dataset().await; @@ -3007,6 +3029,74 @@ async fn test_json_inverted_flat_match_query() { assert_eq!(2, batch.num_rows()); } +#[tokio::test] +async fn test_json_inverted_array_object_paths() { + let json_col = "json_field".to_string(); + let text_col = Arc::new(StringArray::from(vec![ + r#"{ + "addresses": [ + {"country": "us", "street": "main"}, + {"country": "ca", "street": "second"} + ] + }"#, + r#"{ + "addresses": [ + {"country": "ca", "street": "main"}, + {"country": "us", "street": "second"} + ] + }"#, + ])); + + let mut metadata = HashMap::new(); + metadata.insert( + ARROW_EXT_NAME_KEY.to_string(), + ARROW_JSON_EXT_NAME.to_string(), + ); + let batch = RecordBatch::try_new( + arrow_schema::Schema::new(vec![ + Field::new(&json_col, DataType::Utf8, false).with_metadata(metadata), + ]) + .into(), + vec![text_col.clone()], + ) + .unwrap(); + let schema = batch.schema(); + let stream = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema); + let mut dataset = Dataset::write(stream, "memory://test/json_array_objects", None) + .await + .unwrap(); + + dataset + .create_index( + &[&json_col], + IndexType::Inverted, + None, + &InvertedIndexParams::default() + .lance_tokenizer("json".to_string()) + .stem(false), + true, + ) + .await + .unwrap(); + + assert_eq!( + json_fts_row_ids(&dataset, &json_col, "addresses.country,str,us").await, + vec![0, 1] + ); + assert_eq!( + json_fts_row_ids(&dataset, &json_col, "addresses[*].country,str,us").await, + vec![0, 1] + ); + assert_eq!( + json_fts_row_ids(&dataset, &json_col, "addresses[0].country,str,us").await, + vec![0] + ); + assert_eq!( + json_fts_row_ids(&dataset, &json_col, "addresses[1].country,str,us").await, + vec![1] + ); +} + #[tokio::test] async fn test_json_inverted_phrase_query() { // Prepare json dataset