From 3f41a34f96c399e00866cd6256d8351001acece3 Mon Sep 17 00:00:00 2001 From: Siri Appalaneni Date: Wed, 17 Jun 2026 13:20:23 -0500 Subject: [PATCH 1/2] test: reproduce FTS posting_lists index panic for issue #7313 Add a unit test that mirrors the production failure (posting_lists.len=1731, token_id=4456) when with_position indexing encounters a stale next_id. Co-authored-by: Cursor --- .../src/scalar/inverted/builder.rs | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/rust/lance-index/src/scalar/inverted/builder.rs b/rust/lance-index/src/scalar/inverted/builder.rs index 93932f35332..c43d640fed4 100644 --- a/rust/lance-index/src/scalar/inverted/builder.rs +++ b/rust/lance-index/src/scalar/inverted/builder.rs @@ -3750,6 +3750,52 @@ mod tests { Ok(()) } + /// Reproduces #7313: `with_position` indexing panics when `tokens.next_id` exceeds + /// `posting_lists.len()`, as observed during `optimize_indices` on legacy FTS partitions. + #[tokio::test] + #[should_panic(expected = "index out of bounds")] + async fn test_process_batch_with_position_panics_when_token_id_exceeds_posting_lists_len() { + const VOCAB_SIZE: usize = 1731; + const STALE_NEXT_ID: u32 = 4456; + + let tokenizer = InvertedIndexParams::default().with_position(true).build().unwrap(); + let store = Arc::new(CountingStore::new()); + let id_alloc = Arc::new(AtomicU64::new(0)); + let mut worker = IndexWorker::new( + tokenizer, + store, + id_alloc, + IndexWorkerConfig { + with_position: true, + format_version: InvertedListFormatVersion::V1, + fragment_mask: None, + token_set_format: TokenSetFormat::default(), + worker_memory_limit_bytes: u64::MAX, + }, + ) + .await + .unwrap(); + + let mut tokens = TokenSet::default(); + for i in 0..VOCAB_SIZE { + tokens.add(format!("tok_{i}")); + } + tokens.next_id = STALE_NEXT_ID; + worker.builder.set_tokens(tokens); + let posting_tail_codec = worker.builder.posting_tail_codec; + worker + .builder + .posting_lists + .resize_with(VOCAB_SIZE, || { + 
PostingListBuilder::new_with_posting_tail_codec(true, posting_tail_codec) + }); + + worker + .process_batch(make_doc_batch("unseen_token_xyz", 0)) + .await + .unwrap(); + } + #[test] fn test_resolve_worker_memory_limit_uses_default_when_unset() { let params = InvertedIndexParams::default(); From 66b981b4d47a01710daebe92777bc232402a04a2 Mon Sep 17 00:00:00 2001 From: Siri Appalaneni Date: Wed, 17 Jun 2026 13:57:19 -0500 Subject: [PATCH 2/2] fix: grow posting_lists before indexed access in FTS with_position builder When token_id exceeds posting_lists.len() during with_position indexing (e.g. stale next_id from legacy FTS partitions), resize posting_lists to token_id + 1 before access instead of growing only on exact equality. Fixes lance-format/lance#7313 Co-authored-by: Cursor --- .../src/scalar/inverted/builder.rs | 43 ++++++++++++------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/builder.rs b/rust/lance-index/src/scalar/inverted/builder.rs index c43d640fed4..4a0407eea83 100644 --- a/rust/lance-index/src/scalar/inverted/builder.rs +++ b/rust/lance-index/src/scalar/inverted/builder.rs @@ -1322,16 +1322,17 @@ impl IndexWorker { let token = token_stream.token_mut(); let token_text = std::mem::take(&mut token.text); let token_id = builder.tokens.add(token_text); - if token_id as usize == builder.posting_lists.len() { + let token_idx = token_id as usize; + if token_idx >= builder.posting_lists.len() { let old_posting_lists_overhead_size = (builder.posting_lists.capacity() * std::mem::size_of::<PostingListBuilder>()) as u64; - builder.posting_lists.push( + builder.posting_lists.resize_with(token_idx + 1, || { PostingListBuilder::new_with_posting_tail_codec( true, posting_tail_codec, - ), - ); + ) + }); let new_posting_lists_overhead_size = (builder.posting_lists.capacity() * std::mem::size_of::<PostingListBuilder>()) as u64; @@ -3750,15 +3751,20 @@ mod tests { Ok(()) } - /// Reproduces #7313: `with_position` indexing panics when `tokens.next_id` exceeds 
- /// `posting_lists.len()`, as observed during `optimize_indices` on legacy FTS partitions. + /// Regression test for #7313: `with_position` indexing must grow `posting_lists` when + /// `tokens.next_id` exceeds `posting_lists.len()`, as on legacy FTS partitions. #[tokio::test] - #[should_panic(expected = "index out of bounds")] - async fn test_process_batch_with_position_panics_when_token_id_exceeds_posting_lists_len() { + async fn test_process_batch_with_position_handles_token_id_gaps() { const VOCAB_SIZE: usize = 1731; const STALE_NEXT_ID: u32 = 4456; + const NEW_TOKEN: &str = "xyzzunique7313"; - let tokenizer = InvertedIndexParams::default().with_position(true).build().unwrap(); + let tokenizer = InvertedIndexParams::default() + .with_position(true) + .stem(false) + .lower_case(false) + .build() + .unwrap(); let store = Arc::new(CountingStore::new()); let id_alloc = Arc::new(AtomicU64::new(0)); let mut worker = IndexWorker::new( @@ -3783,17 +3789,22 @@ mod tests { tokens.next_id = STALE_NEXT_ID; worker.builder.set_tokens(tokens); let posting_tail_codec = worker.builder.posting_tail_codec; - worker - .builder - .posting_lists - .resize_with(VOCAB_SIZE, || { - PostingListBuilder::new_with_posting_tail_codec(true, posting_tail_codec) - }); + worker.builder.posting_lists.resize_with(VOCAB_SIZE, || { + PostingListBuilder::new_with_posting_tail_codec(true, posting_tail_codec) + }); worker - .process_batch(make_doc_batch("unseen_token_xyz", 0)) + .process_batch(make_doc_batch(NEW_TOKEN, 0)) .await .unwrap(); + + let new_token_id = worker + .builder + .tokens + .get(NEW_TOKEN) + .expect("new token indexed"); + assert!((new_token_id as usize) < worker.builder.posting_lists.len()); + assert_eq!(worker.builder.posting_lists.len(), 4457); } #[test]