Skip to content

Commit 9c9ff29

Browse files
groksrc, claude, phernandez
authored
fix: backend-specific distance-to-similarity conversion (#593)
Signed-off-by: Drew Cain <groksrc@gmail.com> Signed-off-by: phernandez <paul@basicmachines.co> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: Paul Hernandez <60959+phernandez@users.noreply.github.com> Co-authored-by: phernandez <paul@basicmachines.co>
1 parent bbe6c1e commit 9c9ff29

9 files changed

Lines changed: 532 additions & 1 deletion

src/basic_memory/repository/postgres_search_repository.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -509,6 +509,14 @@ async def _delete_stale_chunks(
509509
async def _update_timestamp_sql(self) -> str:
    """Return the SQL expression this backend uses for the current timestamp.

    The Postgres repository answers with the ``NOW()`` function call.
    """
    return "NOW()"  # pragma: no cover
511511

512+
def _distance_to_similarity(self, distance: float) -> float:
    """Convert pgvector cosine distance to a similarity score in [0, 1].

    pgvector's <=> operator returns cosine distance in [0, 2],
    where cos_distance = 1 - cos_similarity, so the raw conversion is
    1 - distance.

    The result is clamped on both ends:
    - distances above 1 (negative cosine similarity) are floored at 0.0;
    - a negative distance (e.g. from floating-point rounding) is capped at
      1.0 instead of yielding a score greater than 1.

    Args:
        distance: Cosine distance reported for a vector match.

    Returns:
        Similarity score, always within [0.0, 1.0].
    """
    similarity = 1.0 - distance
    # Clamp to the closed unit interval so callers can rely on the range.
    return min(1.0, max(0.0, similarity))
519+
512520
def _timestamp_now_expr(self) -> str:
    """Return the SQL expression for "now" used by the Postgres repository."""
    return "NOW()"
514522

src/basic_memory/repository/search_repository_base.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,16 @@ async def _update_timestamp_sql(self) -> str:
204204
"""Return the SQL expression for current timestamp in the backend."""
205205
pass # pragma: no cover
206206

207+
@abstractmethod
def _distance_to_similarity(self, distance: float) -> float:
    """Map a raw vector distance from the backend to a similarity in [0, 1].

    Each backend reports a different distance metric, so each subclass
    supplies its own conversion:

    - SQLite (vec0): L2/Euclidean distance, converted via 1 - d²/2
    - Postgres (pgvector <=>): cosine distance, converted via 1 - d

    Args:
        distance: Distance value as returned by the backend's vector query.

    Returns:
        Cosine similarity in [0, 1].
    """
    pass  # pragma: no cover
216+
207217
# ------------------------------------------------------------------
208218
# Shared index / delete operations
209219
# ------------------------------------------------------------------
@@ -866,7 +876,7 @@ async def _search_vector_only(
866876
for row in vector_rows:
867877
chunk_key = row.get("chunk_key", "")
868878
distance = float(row["best_distance"])
869-
similarity = 1.0 / (1.0 + max(distance, 0.0))
879+
similarity = self._distance_to_similarity(distance)
870880
try:
871881
_, si_id = self._parse_chunk_key(chunk_key)
872882
except (ValueError, IndexError):

src/basic_memory/repository/sqlite_search_repository.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ def __init__(
5858
self._vector_dimensions = 384
5959

6060
if self._semantic_enabled and self._embedding_provider is None:
61+
# Constraint: SQLite maps L2 distance to cosine similarity via 1 - L2²/2.
62+
# This conversion is correct only for unit-normalized embeddings.
63+
# Provider implementations must return normalized vectors.
6164
self._embedding_provider = create_embedding_provider(self._app_config)
6265
if self._embedding_provider is not None:
6366
self._vector_dimensions = self._embedding_provider.dimensions
@@ -543,6 +546,14 @@ async def _delete_stale_chunks(
543546
async def _update_timestamp_sql(self) -> str:
    """Return the SQL expression this backend uses for the current timestamp.

    The SQLite repository answers with the ``CURRENT_TIMESTAMP`` keyword.
    """
    return "CURRENT_TIMESTAMP"  # pragma: no cover
545548

549+
def _distance_to_similarity(self, distance: float) -> float:
    """Map a sqlite-vec L2 distance onto cosine similarity, floored at 0.

    sqlite-vec vec0 returns Euclidean (L2) distance by default. For
    unit-normalized vectors the two metrics are tied together by
    L2² = 2·(1 - cos_sim), which rearranges to cos_sim = 1 - L2²/2.
    The identity only holds when the embeddings are normalized.

    Args:
        distance: L2 distance reported for a vector match.

    Returns:
        The derived cosine similarity, with negative results replaced by 0.0.
    """
    squared = distance * distance
    similarity = 1.0 - squared / 2.0
    # Negative similarities are reported as "no similarity".
    return max(0.0, similarity)
556+
546557
def _orphan_detection_sql(self) -> str:
547558
"""SQLite sqlite-vec uses rowid-based embedding table."""
548559
return (

0 commit comments

Comments
 (0)