Skip to content

Commit 0f3889f

Browse files
phernandez and claude authored
fix: Return matched chunk text in search results (#601)
Signed-off-by: phernandez <paul@basicmachines.co>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c442918 commit 0f3889f

10 files changed

Lines changed: 154 additions & 6 deletions

File tree

src/basic_memory/api/v2/utils.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -177,6 +177,7 @@ async def to_search_results(entity_service: EntityService, results: List[SearchI
177177
score=r.score, # pyright: ignore
178178
entity=entities[0].permalink if entities else None,
179179
content=r.content,
180+
matched_chunk=r.matched_chunk_text,
180181
file_path=r.file_path,
181182
metadata=r.metadata,
182183
entity_id=entity_id,

src/basic_memory/repository/postgres_search_repository.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -424,7 +424,7 @@ async def _run_vector_query(
424424
ORDER BY e.embedding <=> CAST(:query_embedding AS vector)
425425
LIMIT :vector_k
426426
)
427-
SELECT c.entity_id, c.chunk_key, vector_matches.distance AS best_distance
427+
SELECT c.entity_id, c.chunk_key, c.chunk_text, vector_matches.distance AS best_distance
428428
FROM vector_matches
429429
JOIN search_vector_chunks c ON c.id = vector_matches.chunk_id
430430
WHERE c.project_id = :project_id

src/basic_memory/repository/search_index_row.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,9 @@ class SearchIndexRow:
3838
to_id: Optional[int] = None # relations
3939
relation_type: Optional[str] = None # relations
4040

41+
# Matched chunk text from vector search (the actual content that matched the query)
42+
matched_chunk_text: Optional[str] = None
43+
4144
CONTENT_DISPLAY_LIMIT = 250
4245

4346
@property

src/basic_memory/repository/search_repository_base.py

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -871,12 +871,14 @@ async def _search_vector_only(
871871

872872
# Build per-search_index_row similarity scores from chunk-level results.
873873
# Each chunk_key encodes the search_index row type and id.
874-
# Keep the best similarity per search_index row id.
874+
# Keep the best similarity (and its chunk text) per search_index row id.
875875
similarity_by_si_id: dict[int, float] = {}
876+
best_chunk_by_si_id: dict[int, str] = {}
876877
for row in vector_rows:
877878
chunk_key = row.get("chunk_key", "")
878879
distance = float(row["best_distance"])
879880
similarity = self._distance_to_similarity(distance)
881+
chunk_text = row.get("chunk_text", "")
880882
try:
881883
_, si_id = self._parse_chunk_key(chunk_key)
882884
except (ValueError, IndexError):
@@ -885,6 +887,7 @@ async def _search_vector_only(
885887
current = similarity_by_si_id.get(si_id)
886888
if current is None or similarity > current:
887889
similarity_by_si_id[si_id] = similarity
890+
best_chunk_by_si_id[si_id] = chunk_text
888891

889892
if not similarity_by_si_id:
890893
return []
@@ -944,7 +947,13 @@ async def _search_vector_only(
944947
row = search_index_rows.get(si_id)
945948
if row is None:
946949
continue
947-
ranked_rows.append(replace(row, score=similarity))
950+
ranked_rows.append(
951+
replace(
952+
row,
953+
score=similarity,
954+
matched_chunk_text=best_chunk_by_si_id.get(si_id),
955+
)
956+
)
948957

949958
ranked_rows.sort(key=lambda item: item.score or 0.0, reverse=True)
950959
return ranked_rows[offset : offset + limit]

src/basic_memory/repository/sqlite_search_repository.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -453,7 +453,7 @@ async def _run_vector_query(
453453
" WHERE embedding MATCH :query_embedding "
454454
" AND k = :vector_k"
455455
") "
456-
"SELECT c.entity_id, c.chunk_key, vector_matches.distance AS best_distance "
456+
"SELECT c.entity_id, c.chunk_key, c.chunk_text, vector_matches.distance AS best_distance "
457457
"FROM vector_matches "
458458
"JOIN search_vector_chunks c ON c.id = vector_matches.rowid "
459459
"WHERE c.project_id = :project_id "

src/basic_memory/schemas/search.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -119,6 +119,7 @@ class SearchResult(BaseModel):
119119
entity: Optional[Permalink] = None
120120
permalink: Optional[str]
121121
content: Optional[str] = None
122+
matched_chunk: Optional[str] = None
122123
file_path: str
123124

124125
metadata: Optional[dict] = None

tests/api/v2/test_search_router.py

Lines changed: 86 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,14 @@
11
"""Tests for v2 search router endpoints."""
22

3+
from datetime import datetime, timezone
4+
35
import pytest
46
from httpx import AsyncClient
57
from pathlib import Path
68

79
from basic_memory.deps.services import get_search_service_v2_external
810
from basic_memory.models import Project
11+
from basic_memory.repository.search_index_row import SearchIndexRow
912
from basic_memory.repository.semantic_errors import (
1013
SemanticDependenciesMissingError,
1114
SemanticSearchDisabledError,
@@ -428,3 +431,86 @@ async def test_search_has_more_false_on_last_page(
428431
assert response.status_code == 200
429432
data = response.json()
430433
assert data["has_more"] is False
434+
435+
436+
@pytest.mark.asyncio
437+
async def test_search_result_includes_matched_chunk(
438+
client: AsyncClient,
439+
app,
440+
v2_project_url: str,
441+
):
442+
"""matched_chunk field appears in search API JSON when set on SearchIndexRow."""
443+
now = datetime.now(timezone.utc)
444+
fake_row = SearchIndexRow(
445+
project_id=1,
446+
id=42,
447+
type="entity",
448+
file_path="notes/pricing.md",
449+
created_at=now,
450+
updated_at=now,
451+
title="Pricing Notes",
452+
permalink="notes/pricing",
453+
content_snippet="# Pricing Notes\n\n- [pricing] Team plan is $9/mo per seat",
454+
score=0.85,
455+
matched_chunk_text="- [pricing] Team plan is $9/mo per seat",
456+
)
457+
458+
class FakeSearchService:
459+
async def search(self, *args, **kwargs):
460+
return [fake_row]
461+
462+
app.dependency_overrides[get_search_service_v2_external] = lambda: FakeSearchService()
463+
try:
464+
response = await client.post(
465+
f"{v2_project_url}/search/",
466+
json={"search_text": "pricing"},
467+
)
468+
finally:
469+
app.dependency_overrides.pop(get_search_service_v2_external, None)
470+
471+
assert response.status_code == 200
472+
data = response.json()
473+
assert len(data["results"]) == 1
474+
result = data["results"][0]
475+
assert result["matched_chunk"] == "- [pricing] Team plan is $9/mo per seat"
476+
477+
478+
@pytest.mark.asyncio
479+
async def test_search_result_omits_matched_chunk_when_none(
480+
client: AsyncClient,
481+
app,
482+
v2_project_url: str,
483+
):
484+
"""matched_chunk field is null when not set (FTS-only results)."""
485+
now = datetime.now(timezone.utc)
486+
fake_row = SearchIndexRow(
487+
project_id=1,
488+
id=43,
489+
type="entity",
490+
file_path="notes/general.md",
491+
created_at=now,
492+
updated_at=now,
493+
title="General Notes",
494+
permalink="notes/general",
495+
content_snippet="# General Notes\n\nSome content here",
496+
score=0.7,
497+
)
498+
499+
class FakeSearchService:
500+
async def search(self, *args, **kwargs):
501+
return [fake_row]
502+
503+
app.dependency_overrides[get_search_service_v2_external] = lambda: FakeSearchService()
504+
try:
505+
response = await client.post(
506+
f"{v2_project_url}/search/",
507+
json={"search_text": "general"},
508+
)
509+
finally:
510+
app.dependency_overrides.pop(get_search_service_v2_external, None)
511+
512+
assert response.status_code == 200
513+
data = response.json()
514+
assert len(data["results"]) == 1
515+
result = data["results"][0]
516+
assert result["matched_chunk"] is None

tests/repository/test_hybrid_rrf.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@ class FakeRow:
3434
created_at: str | None = None
3535
updated_at: str | None = None
3636
project_id: int = 1
37+
matched_chunk_text: str | None = None
3738

3839

3940
class ConcreteSearchRepo(SearchRepositoryBase):

tests/repository/test_vector_pagination.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@ class FakeRow:
2020
id: int
2121
type: str = "entity"
2222
score: float = 0.0
23+
matched_chunk_text: str | None = None
2324

2425

2526
class ConcreteSearchRepo(SearchRepositoryBase):
@@ -78,7 +79,13 @@ def _make_descending_vector_rows(count: int) -> list[dict]:
7879
# Similarity decreases linearly: 0.95, 0.94, 0.93, ...
7980
similarity = 0.95 - (i * 0.01)
8081
distance = (1.0 / similarity) - 1.0
81-
rows.append({"chunk_key": f"entity:{i}:0", "best_distance": distance})
82+
rows.append(
83+
{
84+
"chunk_key": f"entity:{i}:0",
85+
"best_distance": distance,
86+
"chunk_text": f"chunk text {i}",
87+
}
88+
)
8289
return rows
8390

8491

tests/repository/test_vector_threshold.py

Lines changed: 41 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@ class FakeRow:
1616
id: int
1717
type: str = "entity"
1818
score: float = 0.0
19+
matched_chunk_text: str | None = None
1920

2021

2122
class ConcreteSearchRepo(SearchRepositoryBase):
@@ -74,7 +75,13 @@ def _make_vector_rows(scores: list[float]) -> list[dict]:
7475
rows = []
7576
for i, score in enumerate(scores):
7677
distance = (1.0 / score) - 1.0
77-
rows.append({"chunk_key": f"entity:{i}:0", "best_distance": distance})
78+
rows.append(
79+
{
80+
"chunk_key": f"entity:{i}:0",
81+
"best_distance": distance,
82+
"chunk_text": f"chunk text for entity:{i}:0",
83+
}
84+
)
7885
return rows
7986

8087

@@ -257,3 +264,36 @@ async def test_per_query_min_similarity_tightens_threshold():
257264

258265
assert len(results) == 1
259266
assert results[0].id == 0
267+
268+
269+
@pytest.mark.asyncio
270+
async def test_matched_chunk_text_populated_on_vector_results():
271+
"""Vector search results carry the matched chunk text from the best-matching chunk."""
272+
repo = ConcreteSearchRepo()
273+
repo._semantic_min_similarity = 0.0
274+
275+
fake_rows = _make_vector_rows([0.9, 0.7])
276+
277+
mock_embed = AsyncMock(return_value=[0.0] * 384)
278+
repo._embedding_provider = type("EP", (), {"embed_query": mock_embed, "dimensions": 384})()
279+
280+
with (
281+
patch(
282+
"basic_memory.repository.search_repository_base.db.scoped_session", fake_scoped_session
283+
),
284+
patch.object(repo, "_ensure_vector_tables", new_callable=AsyncMock),
285+
patch.object(repo, "_prepare_vector_session", new_callable=AsyncMock),
286+
patch.object(repo, "_run_vector_query", new_callable=AsyncMock, return_value=fake_rows),
287+
patch.object(
288+
repo,
289+
"_fetch_search_index_rows_by_ids",
290+
new_callable=AsyncMock,
291+
return_value={i: FakeRow(id=i) for i in range(2)},
292+
),
293+
):
294+
results = await repo._search_vector_only(**COMMON_SEARCH_KWARGS)
295+
296+
assert len(results) == 2
297+
# Results are sorted by score descending, so id=0 (0.9) first, id=1 (0.7) second
298+
assert results[0].matched_chunk_text == "chunk text for entity:0:0"
299+
assert results[1].matched_chunk_text == "chunk text for entity:1:0"

0 commit comments

Comments
 (0)