fix: Return matched chunk text in search results (#601)

phernandez · claude · web-flow · commit 0f3889fdd087 · 2026-02-22T21:01:20.000-06:00
Signed-off-by: phernandez <paul@basicmachines.co>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/src/basic_memory/api/v2/utils.py b/src/basic_memory/api/v2/utils.py
@@ -177,6 +177,7 @@ async def to_search_results(entity_service: EntityService, results: List[SearchI
                 score=r.score,  # pyright: ignore
                 entity=entities[0].permalink if entities else None,
                 content=r.content,
+                matched_chunk=r.matched_chunk_text,
                 file_path=r.file_path,
                 metadata=r.metadata,
                 entity_id=entity_id,
diff --git a/src/basic_memory/repository/postgres_search_repository.py b/src/basic_memory/repository/postgres_search_repository.py
@@ -424,7 +424,7 @@ async def _run_vector_query(
                     ORDER BY e.embedding <=> CAST(:query_embedding AS vector)
                     LIMIT :vector_k
                 )
-                SELECT c.entity_id, c.chunk_key, vector_matches.distance AS best_distance
+                SELECT c.entity_id, c.chunk_key, c.chunk_text, vector_matches.distance AS best_distance
                 FROM vector_matches
                 JOIN search_vector_chunks c ON c.id = vector_matches.chunk_id
                 WHERE c.project_id = :project_id
diff --git a/src/basic_memory/repository/search_index_row.py b/src/basic_memory/repository/search_index_row.py
@@ -38,6 +38,9 @@ class SearchIndexRow:
     to_id: Optional[int] = None  # relations
     relation_type: Optional[str] = None  # relations
 
+    # Matched chunk text from vector search (the actual content that matched the query)
+    matched_chunk_text: Optional[str] = None
+
     CONTENT_DISPLAY_LIMIT = 250
 
     @property
diff --git a/src/basic_memory/repository/search_repository_base.py b/src/basic_memory/repository/search_repository_base.py
@@ -871,12 +871,14 @@ async def _search_vector_only(
 
         # Build per-search_index_row similarity scores from chunk-level results.
         # Each chunk_key encodes the search_index row type and id.
-        # Keep the best similarity per search_index row id.
+        # Keep the best similarity (and its chunk text) per search_index row id.
         similarity_by_si_id: dict[int, float] = {}
+        best_chunk_by_si_id: dict[int, str] = {}
         for row in vector_rows:
             chunk_key = row.get("chunk_key", "")
             distance = float(row["best_distance"])
             similarity = self._distance_to_similarity(distance)
+            chunk_text = row.get("chunk_text", "")
             try:
                 _, si_id = self._parse_chunk_key(chunk_key)
             except (ValueError, IndexError):
@@ -885,6 +887,7 @@ async def _search_vector_only(
             current = similarity_by_si_id.get(si_id)
             if current is None or similarity > current:
                 similarity_by_si_id[si_id] = similarity
+                best_chunk_by_si_id[si_id] = chunk_text
 
         if not similarity_by_si_id:
             return []
@@ -944,7 +947,13 @@ async def _search_vector_only(
             row = search_index_rows.get(si_id)
             if row is None:
                 continue
-            ranked_rows.append(replace(row, score=similarity))
+            ranked_rows.append(
+                replace(
+                    row,
+                    score=similarity,
+                    matched_chunk_text=best_chunk_by_si_id.get(si_id),
+                )
+            )
 
         ranked_rows.sort(key=lambda item: item.score or 0.0, reverse=True)
         return ranked_rows[offset : offset + limit]
diff --git a/src/basic_memory/repository/sqlite_search_repository.py b/src/basic_memory/repository/sqlite_search_repository.py
@@ -453,7 +453,7 @@ async def _run_vector_query(
                 "  WHERE embedding MATCH :query_embedding "
                 "    AND k = :vector_k"
                 ") "
-                "SELECT c.entity_id, c.chunk_key, vector_matches.distance AS best_distance "
+                "SELECT c.entity_id, c.chunk_key, c.chunk_text, vector_matches.distance AS best_distance "
                 "FROM vector_matches "
                 "JOIN search_vector_chunks c ON c.id = vector_matches.rowid "
                 "WHERE c.project_id = :project_id "
diff --git a/src/basic_memory/schemas/search.py b/src/basic_memory/schemas/search.py
@@ -119,6 +119,7 @@ class SearchResult(BaseModel):
     entity: Optional[Permalink] = None
     permalink: Optional[str]
     content: Optional[str] = None
+    matched_chunk: Optional[str] = None
     file_path: str
 
     metadata: Optional[dict] = None
diff --git a/tests/api/v2/test_search_router.py b/tests/api/v2/test_search_router.py
@@ -1,11 +1,14 @@
 """Tests for v2 search router endpoints."""
 
+from datetime import datetime, timezone
+
 import pytest
 from httpx import AsyncClient
 from pathlib import Path
 
 from basic_memory.deps.services import get_search_service_v2_external
 from basic_memory.models import Project
+from basic_memory.repository.search_index_row import SearchIndexRow
 from basic_memory.repository.semantic_errors import (
     SemanticDependenciesMissingError,
     SemanticSearchDisabledError,
@@ -428,3 +431,86 @@ async def test_search_has_more_false_on_last_page(
     assert response.status_code == 200
     data = response.json()
     assert data["has_more"] is False
+
+
+@pytest.mark.asyncio
+async def test_search_result_includes_matched_chunk(
+    client: AsyncClient,
+    app,
+    v2_project_url: str,
+):
+    """matched_chunk field appears in search API JSON when set on SearchIndexRow."""
+    now = datetime.now(timezone.utc)
+    fake_row = SearchIndexRow(
+        project_id=1,
+        id=42,
+        type="entity",
+        file_path="notes/pricing.md",
+        created_at=now,
+        updated_at=now,
+        title="Pricing Notes",
+        permalink="notes/pricing",
+        content_snippet="# Pricing Notes\n\n- [pricing] Team plan is $9/mo per seat",
+        score=0.85,
+        matched_chunk_text="- [pricing] Team plan is $9/mo per seat",
+    )
+
+    class FakeSearchService:
+        async def search(self, *args, **kwargs):
+            return [fake_row]
+
+    app.dependency_overrides[get_search_service_v2_external] = lambda: FakeSearchService()
+    try:
+        response = await client.post(
+            f"{v2_project_url}/search/",
+            json={"search_text": "pricing"},
+        )
+    finally:
+        app.dependency_overrides.pop(get_search_service_v2_external, None)
+
+    assert response.status_code == 200
+    data = response.json()
+    assert len(data["results"]) == 1
+    result = data["results"][0]
+    assert result["matched_chunk"] == "- [pricing] Team plan is $9/mo per seat"
+
+
+@pytest.mark.asyncio
+async def test_search_result_omits_matched_chunk_when_none(
+    client: AsyncClient,
+    app,
+    v2_project_url: str,
+):
+    """matched_chunk field is null when not set (FTS-only results)."""
+    now = datetime.now(timezone.utc)
+    fake_row = SearchIndexRow(
+        project_id=1,
+        id=43,
+        type="entity",
+        file_path="notes/general.md",
+        created_at=now,
+        updated_at=now,
+        title="General Notes",
+        permalink="notes/general",
+        content_snippet="# General Notes\n\nSome content here",
+        score=0.7,
+    )
+
+    class FakeSearchService:
+        async def search(self, *args, **kwargs):
+            return [fake_row]
+
+    app.dependency_overrides[get_search_service_v2_external] = lambda: FakeSearchService()
+    try:
+        response = await client.post(
+            f"{v2_project_url}/search/",
+            json={"search_text": "general"},
+        )
+    finally:
+        app.dependency_overrides.pop(get_search_service_v2_external, None)
+
+    assert response.status_code == 200
+    data = response.json()
+    assert len(data["results"]) == 1
+    result = data["results"][0]
+    assert result["matched_chunk"] is None
diff --git a/tests/repository/test_hybrid_rrf.py b/tests/repository/test_hybrid_rrf.py
@@ -34,6 +34,7 @@ class FakeRow:
     created_at: str | None = None
     updated_at: str | None = None
     project_id: int = 1
+    matched_chunk_text: str | None = None
 
 
 class ConcreteSearchRepo(SearchRepositoryBase):
diff --git a/tests/repository/test_vector_pagination.py b/tests/repository/test_vector_pagination.py
@@ -20,6 +20,7 @@ class FakeRow:
     id: int
     type: str = "entity"
     score: float = 0.0
+    matched_chunk_text: str | None = None
 
 
 class ConcreteSearchRepo(SearchRepositoryBase):
@@ -78,7 +79,13 @@ def _make_descending_vector_rows(count: int) -> list[dict]:
         # Similarity decreases linearly: 0.95, 0.94, 0.93, ...
         similarity = 0.95 - (i * 0.01)
         distance = (1.0 / similarity) - 1.0
-        rows.append({"chunk_key": f"entity:{i}:0", "best_distance": distance})
+        rows.append(
+            {
+                "chunk_key": f"entity:{i}:0",
+                "best_distance": distance,
+                "chunk_text": f"chunk text {i}",
+            }
+        )
     return rows
 
 
diff --git a/tests/repository/test_vector_threshold.py b/tests/repository/test_vector_threshold.py
@@ -16,6 +16,7 @@ class FakeRow:
     id: int
     type: str = "entity"
     score: float = 0.0
+    matched_chunk_text: str | None = None
 
 
 class ConcreteSearchRepo(SearchRepositoryBase):
@@ -74,7 +75,13 @@ def _make_vector_rows(scores: list[float]) -> list[dict]:
     rows = []
     for i, score in enumerate(scores):
         distance = (1.0 / score) - 1.0
-        rows.append({"chunk_key": f"entity:{i}:0", "best_distance": distance})
+        rows.append(
+            {
+                "chunk_key": f"entity:{i}:0",
+                "best_distance": distance,
+                "chunk_text": f"chunk text for entity:{i}:0",
+            }
+        )
     return rows
 
 
@@ -257,3 +264,36 @@ async def test_per_query_min_similarity_tightens_threshold():
 
     assert len(results) == 1
     assert results[0].id == 0
+
+
+@pytest.mark.asyncio
+async def test_matched_chunk_text_populated_on_vector_results():
+    """Vector search results carry the matched chunk text from the best-matching chunk."""
+    repo = ConcreteSearchRepo()
+    repo._semantic_min_similarity = 0.0
+
+    fake_rows = _make_vector_rows([0.9, 0.7])
+
+    mock_embed = AsyncMock(return_value=[0.0] * 384)
+    repo._embedding_provider = type("EP", (), {"embed_query": mock_embed, "dimensions": 384})()
+
+    with (
+        patch(
+            "basic_memory.repository.search_repository_base.db.scoped_session", fake_scoped_session
+        ),
+        patch.object(repo, "_ensure_vector_tables", new_callable=AsyncMock),
+        patch.object(repo, "_prepare_vector_session", new_callable=AsyncMock),
+        patch.object(repo, "_run_vector_query", new_callable=AsyncMock, return_value=fake_rows),
+        patch.object(
+            repo,
+            "_fetch_search_index_rows_by_ids",
+            new_callable=AsyncMock,
+            return_value={i: FakeRow(id=i) for i in range(2)},
+        ),
+    ):
+        results = await repo._search_vector_only(**COMMON_SEARCH_KWARGS)
+
+    assert len(results) == 2
+    # Results are sorted by score descending, so id=0 (0.9) first, id=1 (0.7) second
+    assert results[0].matched_chunk_text == "chunk text for entity:0:0"
+    assert results[1].matched_chunk_text == "chunk text for entity:1:0"

Original file line number	Diff line number	Diff line change
`@@ -424,7 +424,7 @@ async def _run_vector_query(`
`424`	`424`	`ORDER BY e.embedding <=> CAST(:query_embedding AS vector)`
`425`	`425`	`LIMIT :vector_k`
`426`	`426`	`)`
`427`		`- SELECT c.entity_id, c.chunk_key, vector_matches.distance AS best_distance`
	`427`	`+ SELECT c.entity_id, c.chunk_key, c.chunk_text, vector_matches.distance AS best_distance`
`428`	`428`	`FROM vector_matches`
`429`	`429`	`JOIN search_vector_chunks c ON c.id = vector_matches.chunk_id`
`430`	`430`	`WHERE c.project_id = :project_id`