Skip to content

Commit b265c57

Browse files
phernandezclaude
andcommitted
fix: address review feedback on stale embedding stats
1. Remove unused valid_entity_ids parameter from _purge_stale_search_rows 2. Add entity_exists filter to total_embeddings and orphaned_chunks queries for consistency (previously only applied to 3 of 5 queries) 3. Strengthen test assertion to exact match instead of <= Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Signed-off-by: phernandez <paul@basicmachines.co>
1 parent 6bf5343 commit b265c57

3 files changed

Lines changed: 21 additions & 11 deletions

File tree

src/basic_memory/services/project_service.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1000,6 +1000,10 @@ async def get_embedding_status(self, project_id: int) -> EmbeddingStatus:
10001000
# Filter by entity existence to exclude stale rows from deleted entities
10011001
# that remain in derived search tables (search_index, search_vector_chunks)
10021002
entity_exists = "AND entity_id IN (SELECT id FROM entity WHERE project_id = :project_id)"
1003+
# Same filter for aliased chunks table (used in JOIN queries below)
1004+
chunk_entity_exists = (
1005+
"AND c.entity_id IN (SELECT id FROM entity WHERE project_id = :project_id)"
1006+
)
10031007

10041008
si_result = await self.repository.execute_query(
10051009
text(
@@ -1034,13 +1038,13 @@ async def get_embedding_status(self, project_id: int) -> EmbeddingStatus:
10341038
embeddings_sql = text(
10351039
"SELECT COUNT(*) FROM search_vector_chunks c "
10361040
"JOIN search_vector_embeddings e ON e.chunk_id = c.id "
1037-
"WHERE c.project_id = :project_id"
1041+
f"WHERE c.project_id = :project_id {chunk_entity_exists}"
10381042
)
10391043
else:
10401044
embeddings_sql = text(
10411045
"SELECT COUNT(*) FROM search_vector_chunks c "
10421046
"JOIN search_vector_embeddings e ON e.rowid = c.id "
1043-
"WHERE c.project_id = :project_id"
1047+
f"WHERE c.project_id = :project_id {chunk_entity_exists}"
10441048
)
10451049

10461050
embeddings_result = await self.repository.execute_query(
@@ -1053,13 +1057,13 @@ async def get_embedding_status(self, project_id: int) -> EmbeddingStatus:
10531057
orphan_sql = text(
10541058
"SELECT COUNT(*) FROM search_vector_chunks c "
10551059
"LEFT JOIN search_vector_embeddings e ON e.chunk_id = c.id "
1056-
"WHERE c.project_id = :project_id AND e.chunk_id IS NULL"
1060+
f"WHERE c.project_id = :project_id AND e.chunk_id IS NULL {chunk_entity_exists}"
10571061
)
10581062
else:
10591063
orphan_sql = text(
10601064
"SELECT COUNT(*) FROM search_vector_chunks c "
10611065
"LEFT JOIN search_vector_embeddings e ON e.rowid = c.id "
1062-
"WHERE c.project_id = :project_id AND e.rowid IS NULL"
1066+
f"WHERE c.project_id = :project_id AND e.rowid IS NULL {chunk_entity_exists}"
10631067
)
10641068

10651069
orphan_result = await self.repository.execute_query(

src/basic_memory/services/search_service.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,7 @@ async def reindex_vectors(self, progress_callback=None) -> dict:
406406

407407
# Clean up stale rows in search_index and search_vector_chunks
408408
# that reference entity_ids no longer in the entity table
409-
await self._purge_stale_search_rows(set(entity_ids))
409+
await self._purge_stale_search_rows()
410410

411411
batch_result = await self.repository.sync_entity_vectors_batch(
412412
entity_ids,
@@ -424,7 +424,7 @@ async def reindex_vectors(self, progress_callback=None) -> dict:
424424

425425
return stats
426426

427-
async def _purge_stale_search_rows(self, valid_entity_ids: set[int]) -> None:
427+
async def _purge_stale_search_rows(self) -> None:
428428
"""Remove rows from search_index and search_vector_chunks for deleted entities.
429429
430430
Trigger: entities are deleted but their derived search rows remain

tests/services/test_project_service_embedding_status.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -282,14 +282,20 @@ async def test_embedding_status_excludes_stale_entity_ids(
282282
):
283283
status = await project_service.get_embedding_status(test_project.id)
284284

285-
# The stale entity_id should NOT be counted in total_indexed_entities
286-
real_entity_result = await project_service.repository.execute_query(
287-
text("SELECT COUNT(*) FROM entity WHERE project_id = :pid"),
285+
# The stale entity_id should NOT be counted in total_indexed_entities.
286+
# Count real entities that have search_index rows (the stale one should be excluded).
287+
real_indexed_result = await project_service.repository.execute_query(
288+
text(
289+
"SELECT COUNT(DISTINCT si.entity_id) FROM search_index si "
290+
"JOIN entity e ON e.id = si.entity_id "
291+
"WHERE si.project_id = :pid"
292+
),
288293
{"pid": test_project.id},
289294
)
290-
real_entity_count = real_entity_result.scalar() or 0
295+
real_indexed_count = real_indexed_result.scalar() or 0
291296

292-
assert status.total_indexed_entities <= real_entity_count
297+
# Exact match — stale entity_id must not inflate the count
298+
assert status.total_indexed_entities == real_indexed_count
293299

294300

295301
@pytest.mark.asyncio

0 commit comments

Comments
 (0)