Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 61 additions & 4 deletions rust/lance-index/src/scalar/inverted/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1322,16 +1322,17 @@ impl IndexWorker {
let token = token_stream.token_mut();
let token_text = std::mem::take(&mut token.text);
let token_id = builder.tokens.add(token_text);
if token_id as usize == builder.posting_lists.len() {
let token_idx = token_id as usize;
if token_idx >= builder.posting_lists.len() {
let old_posting_lists_overhead_size = (builder.posting_lists.capacity()
* std::mem::size_of::<PostingListBuilder>())
as u64;
builder.posting_lists.push(
builder.posting_lists.resize_with(token_idx + 1, || {

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This turns a stale `next_id` into a persisted sparse token id. The reload and tail-merge paths still assume dense token ids, so a later write/reload or multi-worker tail merge can reuse ids or hit another out-of-bounds access.

PostingListBuilder::new_with_posting_tail_codec(
true,
posting_tail_codec,
),
);
)
});
let new_posting_lists_overhead_size = (builder.posting_lists.capacity()
* std::mem::size_of::<PostingListBuilder>())
as u64;
Expand Down Expand Up @@ -3750,6 +3751,62 @@ mod tests {
Ok(())
}

/// Regression test for #7313: `with_position` indexing must grow `posting_lists` when
/// `tokens.next_id` exceeds `posting_lists.len()`, as on legacy FTS partitions.
///
/// The worker is seeded with `VOCAB_SIZE` tokens and the same number of posting lists, then
/// `tokens.next_id` is forced to `STALE_NEXT_ID`, leaving a gap between the vocabulary size
/// and the next id to be handed out. Processing a batch that contains a previously unseen
/// token must succeed and leave `posting_lists` long enough to be indexed by that token's id.
#[tokio::test]
async fn test_process_batch_with_position_handles_token_id_gaps() {
    const VOCAB_SIZE: usize = 1731;
    const STALE_NEXT_ID: u32 = 4456;
    const NEW_TOKEN: &str = "xyzzunique7313";

    let tokenizer = InvertedIndexParams::default()
        .with_position(true)
        .stem(false)
        .lower_case(false)
        .build()
        .unwrap();
    let store = Arc::new(CountingStore::new());
    let id_alloc = Arc::new(AtomicU64::new(0));
    let mut worker = IndexWorker::new(
        tokenizer,
        store,
        id_alloc,
        IndexWorkerConfig {
            with_position: true,
            format_version: InvertedListFormatVersion::V1,
            fragment_mask: None,
            token_set_format: TokenSetFormat::default(),
            worker_memory_limit_bytes: u64::MAX,
        },
    )
    .await
    .unwrap();

    // Build a vocabulary of VOCAB_SIZE tokens, then overwrite `next_id` so it no longer
    // matches the number of tokens (and posting lists) actually present.
    let mut tokens = TokenSet::default();
    for i in 0..VOCAB_SIZE {
        tokens.add(format!("tok_{i}"));
    }
    tokens.next_id = STALE_NEXT_ID;
    worker.builder.set_tokens(tokens);

    // One posting list per seeded token, so `posting_lists.len()` is below `next_id`.
    let posting_tail_codec = worker.builder.posting_tail_codec;
    worker.builder.posting_lists.resize_with(VOCAB_SIZE, || {
        PostingListBuilder::new_with_posting_tail_codec(true, posting_tail_codec)
    });

    worker
        .process_batch(make_doc_batch(NEW_TOKEN, 0))
        .await
        .unwrap();

    let new_token_id = worker
        .builder
        .tokens
        .get(NEW_TOKEN)
        .expect("new token indexed");
    assert!((new_token_id as usize) < worker.builder.posting_lists.len());
    // Expected length is one past STALE_NEXT_ID (4457); presumably the new token takes id
    // STALE_NEXT_ID. Derived from the constant so the two cannot drift apart.
    assert_eq!(
        worker.builder.posting_lists.len(),
        STALE_NEXT_ID as usize + 1
    );
}

#[test]
fn test_resolve_worker_memory_limit_uses_default_when_unset() {
let params = InvertedIndexParams::default();
Expand Down
Loading