From e65f127c470a513327138ee953e68347aeedfcd0 Mon Sep 17 00:00:00 2001
From: Matteo Merola <mattmezza@gmail.com>
Date: Sun, 7 Jun 2026 22:13:07 +0200
Subject: [PATCH 1/7] feat(memory): unified ADD/UPDATE/DELETE/NOOP long-term
 write pipeline

Replace exact-subject + substring + longer-wins dedup with a single
unified update path: retrieve top-k lexically similar long-term memories,
then one LLM call decides ADD/UPDATE/DELETE/NOOP. Handles semantic
duplicates, refinements, and contradictions, and includes timestamps so
the model can prefer recent facts on conflict.

Both extraction's LONG_TERM writes and consolidation's promotions now
route through update_memory (single source of truth). Lexical retrieval
is dependency-free Python token overlap (Jaccard over stopword-filtered
tokens plus a same-subject boost; no FTS5/embeddings), portable across
local and container SQLite builds. Subjects are normalised (trimmed and
lowercased) in code rather than by prompt instruction alone.
Cooldown-skipped turns are buffered in memory (newest 6 kept) and
replayed into the next extraction instead of being dropped.
---
 core/memory.py | 337 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 305 insertions(+), 32 deletions(-)
diff --git a/core/memory.py b/core/memory.py
index 951e623..9243648 100644
--- a/core/memory.py
+++ b/core/memory.py
@@ -59,12 +59,43 @@
 
 Respond with ONLY the JSON array, no other text."""
 
+_UPDATE_PROMPT = """\
+You maintain the long-term memory of a personal AI assistant. Decide what to do
+with a new candidate fact relative to the existing memories it most resembles.
+
+Today's date: {today}
+
+## Candidate fact
+[{category}] {subject}: {content}
+
+## Existing related memories
+{existing}
+
+Choose exactly ONE operation:
+- ADD — the candidate is genuinely new information not already covered above.
+- UPDATE — the candidate refines, corrects, or re-words ONE existing memory.
+  Give its id and the final merged content to keep (prefer the newer fact on
+  conflict; keep it short and dense).
+- DELETE — the candidate states that an existing memory is no longer true, and
+  there is nothing worth keeping in its place. Give the id to remove.
+- NOOP — the candidate duplicates an existing memory, or is not worth keeping.
+
+Keep long-term memories short and dense: strip dates, times, and situational
+framing. Use lowercase for subject.
+
+Respond with ONLY a JSON object, no other text. One of:
+  {{"operation": "ADD"}}
+  {{"operation": "UPDATE", "id": <id>, "category": "<cat>", \
+"subject": "<subj>", "content": "<merged fact>"}}
+  {{"operation": "DELETE", "id": <id>}}
+  {{"operation": "NOOP"}}"""
+
 _EXTRACTION_PROMPT = """\
 Given this conversation exchange, identify any facts worth remembering.
 
 User: {user_msg}
 Assistant: {agent_msg}
-
+{recent_turns_block}
 {existing_memories_block}\
 For each fact, classify it into ONE of these tiers:
 
@@ -189,6 +220,127 @@ def _extract_json_array(raw: str) -> list | None:
     return None
 
 
+def _extract_json_object(raw: str) -> dict | None:
+    """Best-effort extraction of a single JSON object from an LLM response.
+
+    Mirrors :func:`_extract_json_array` but for ``{ ... }`` payloads. Returns
+    the parsed dict on success, or ``None`` if none could be extracted.
+    """
+    raw = raw.strip()
+    if not raw:
+        return None
+
+    try:
+        result = json.loads(raw)
+        if isinstance(result, dict):
+            return result
+    except json.JSONDecodeError:
+        pass
+
+    fence_match = _FENCE_RE.search(raw)
+    if fence_match:
+        try:
+            result = json.loads(fence_match.group(1).strip())
+            if isinstance(result, dict):
+                return result
+        except json.JSONDecodeError:
+            pass
+
+    start = raw.find("{")
+    if start != -1:
+        depth = 0
+        in_string = False
+        escape = False
+        end = -1
+        for i in range(start, len(raw)):
+            ch = raw[i]
+            if escape:
+                escape = False
+                continue
+            if ch == "\\":
+                escape = True
+                continue
+            if ch == '"':
+                in_string = not in_string
+                continue
+            if in_string:
+                continue
+            if ch == "{":
+                depth += 1
+            elif ch == "}":
+                depth -= 1
+                if depth == 0:
+                    end = i
+                    break
+        if end != -1:
+            try:
+                result = json.loads(raw[start : end + 1])
+                if isinstance(result, dict):
+                    return result
+            except json.JSONDecodeError:
+                pass
+
+    return None
+
+
+# Tokeniser for cheap lexical similarity (no embeddings, no new deps).
+_TOKEN_RE = re.compile(r"[a-z0-9]+")
+_STOPWORDS = frozenset(
+    {
+        "the",
+        "a",
+        "an",
+        "and",
+        "or",
+        "is",
+        "are",
+        "was",
+        "were",
+        "be",
+        "to",
+        "of",
+        "in",
+        "on",
+        "at",
+        "for",
+        "with",
+        "his",
+        "her",
+        "their",
+        "has",
+        "have",
+        "had",
+        "uses",
+        "use",
+        "that",
+        "this",
+        "it",
+        "as",
+        "by",
+    }
+)
+
+
+def _normalize_subject(subject: str) -> str:
+    """Canonicalise a memory subject (lowercase, trimmed)."""
+    return (subject or "").strip().lower()
+
+
+def _tokens(text: str) -> set[str]:
+    """Lowercase content words, dropping stopwords and single characters."""
+    return {t for t in _TOKEN_RE.findall(text.lower()) if len(t) > 1 and t not in _STOPWORDS}
+
+
+def _similarity(a: set[str], b: set[str]) -> float:
+    """Jaccard overlap between two token sets (0.0 when either is empty)."""
+    if not a or not b:
+        return 0.0
+    inter = len(a & b)
+    if not inter:
+        return 0.0
+    return inter / len(a | b)
+
+
 class MemoryStore:
     """Two-tier memory system backed by SQLite.
 
@@ -204,6 +356,9 @@ def __init__(self, db_path: str = "data/memory.db", long_term_limit: int = 50):
         self.long_term_limit = long_term_limit
         self._ready = False
         self._last_extraction: float | None = None  # monotonic timestamp of last extraction
+        # Turns skipped by the cooldown, replayed into the next extraction so
+        # back-to-back salient turns aren't dropped (issue #7).
+        self._pending_turns: list[tuple[str, str]] = []
 
     async def _ensure_schema(self) -> None:
         if self._ready:
@@ -263,6 +418,22 @@ async def format_for_prompt(self) -> str:
     # Maximum number of memories to store per extraction call.
     _MAX_PER_TURN = 3
 
+    # Maximum number of cooldown-skipped turns to buffer for the next extraction.
+    _MAX_PENDING_TURNS = 6
+
+    # Number of similar long-term memories retrieved as ADD/UPDATE/DELETE candidates.
+    _UPDATE_TOP_K = 8
+
+    def _format_pending_turns(self) -> str:
+        """Render buffered cooldown turns as a prompt section (empty if none)."""
+        if not self._pending_turns:
+            return ""
+        lines = ["", "Earlier turns since the last review (also consider these):"]
+        for user_msg, agent_msg in self._pending_turns:
+            lines.append(f"User: {user_msg}")
+            lines.append(f"Assistant: {agent_msg}")
+        return "\n".join(lines) + "\n"
+
     async def extract_memories(
         self,
         llm: LLMClient,
@@ -287,19 +458,29 @@ async def extract_memories(
             and self._last_extraction is not None
             and now - self._last_extraction < cooldown_seconds
         ):
+            # Buffer the skipped turn instead of dropping it; it is replayed
+            # into the next extraction once the cooldown elapses (issue #7).
+            self._pending_turns.append((user_msg, agent_msg))
+            del self._pending_turns[: -self._MAX_PENDING_TURNS]
             log.debug(
-                "Skipping memory extraction (cooldown: %.0fs remaining)",
+                "Buffering memory extraction (cooldown: %.0fs remaining, %d pending)",
                 cooldown_seconds - (now - self._last_extraction),
+                len(self._pending_turns),
             )
             return 0
         self._last_extraction = now
 
+        # Replay any turns buffered during the cooldown, then clear the buffer.
+        recent_turns_block = self._format_pending_turns()
+        self._pending_turns = []
+
         # Build existing-memories block so the LLM can avoid duplicates.
         existing_block = await self._existing_memories_block()
 
         prompt = _EXTRACTION_PROMPT.format(
             user_msg=user_msg,
             agent_msg=agent_msg,
+            recent_turns_block=recent_turns_block,
             existing_memories_block=existing_block,
         )
 
@@ -319,7 +500,9 @@ async def extract_memories(
             try:
                 tier = mem.get("tier", "").upper()
                 if tier == "LONG_TERM":
-                    stored += await self._store_long_term(mem)
+                    op = await self.update_memory(llm, model, mem)
+                    if op in ("ADD", "UPDATE"):
+                        stored += 1
                 elif tier == "SHORT_TERM":
                     stored += await self._store_short_term(mem)
                 else:
@@ -352,44 +535,131 @@ async def _existing_memories_block(self) -> str:
         parts.append("")  # trailing newline
         return "\n".join(parts) + "\n"
 
-    async def _store_long_term(self, mem: dict) -> int:
-        """Store a long-term memory, skipping if a similar one exists."""
-        category = mem.get("category", "fact")
-        subject = mem.get("subject", "")
-        content = mem.get("content", "")
+    async def update_memory(self, llm: LLMClient, model: str, candidate: dict) -> str:
+        """Apply a candidate fact to long-term memory via a unified pipeline.
+
+        Retrieves the most lexically similar existing long-term memories, then
+        a single LLM call decides ADD / UPDATE / DELETE / NOOP — handling
+        semantic duplicates, refinements, and contradictions (issues #1–#4, #8).
+        When nothing similar exists the candidate is added directly without an
+        LLM call. Malformed model output is a safe no-op.
+
+        Returns the operation applied: ``"ADD"``, ``"UPDATE"``, ``"DELETE"``,
+        or ``"NOOP"``.
+        """
+        category = candidate.get("category") or "fact"
+        subject = _normalize_subject(candidate.get("subject", ""))
+        content = (candidate.get("content") or "").strip()
         if not content:
-            return 0
+            return "NOOP"
+
+        similar = await self._retrieve_similar_long_term(subject, content)
+        if not similar:
+            await self._insert_long_term(category, subject, content)
+            log.debug("ADD long-term (no similar): [%s] %s: %s", category, subject, content[:80])
+            return "ADD"
+
+        existing_lines = []
+        for row in similar:
+            existing_lines.append(
+                f"- id={row['id']} [{row['category']}] {row['subject']}: {row['content']} "
+                f"(created {row['created_at']}, updated {row['updated_at']})"
+            )
+        prompt = _UPDATE_PROMPT.format(
+            today=datetime.now(tz=UTC).date().isoformat(),
+            category=category,
+            subject=subject or "(unknown)",
+            content=content,
+            existing="\n".join(existing_lines),
+        )
 
+        try:
+            raw = await llm.generate_text(model=model, prompt=prompt, max_tokens=1024)
+        except Exception:
+            log.exception("update_memory LLM call failed; skipping candidate")
+            return "NOOP"
+
+        decision = _extract_json_object(raw)
+        if not isinstance(decision, dict):
+            log.warning("update_memory returned non-JSON: %s", raw[:200])
+            return "NOOP"
+
+        operation = str(decision.get("operation", "")).upper()
+        valid_ids = {row["id"] for row in similar}
+
+        if operation == "ADD":
+            await self._insert_long_term(category, subject, content)
+            log.debug("ADD long-term: [%s] %s: %s", category, subject, content[:80])
+            return "ADD"
+
+        if operation == "UPDATE":
+            target_id = decision.get("id")
+            if target_id not in valid_ids:
+                log.warning("update_memory UPDATE with invalid id %r; no-op", target_id)
+                return "NOOP"
+            new_content = (decision.get("content") or content).strip()
+            new_category = decision.get("category") or category
+            new_subject = _normalize_subject(decision.get("subject") or subject)
+            await self._ensure_schema()
+            async with aiosqlite.connect(self.db_path) as db:
+                await db.execute(
+                    "UPDATE long_term SET category = ?, subject = ?, content = ?, "
+                    "updated_at = datetime('now') WHERE id = ?",
+                    (new_category, new_subject, new_content, target_id),
+                )
+                await db.commit()
+            log.debug("UPDATE long-term %s: %s", target_id, new_content[:80])
+            return "UPDATE"
+
+        if operation == "DELETE":
+            target_id = decision.get("id")
+            if target_id not in valid_ids:
+                log.warning("update_memory DELETE with invalid id %r; no-op", target_id)
+                return "NOOP"
+            await self._ensure_schema()
+            async with aiosqlite.connect(self.db_path) as db:
+                await db.execute("DELETE FROM long_term WHERE id = ?", (target_id,))
+                await db.commit()
+            log.debug("DELETE long-term %s (contradicted)", target_id)
+            return "DELETE"
+
+        return "NOOP"
+
+    async def _retrieve_similar_long_term(self, subject: str, content: str) -> list[dict]:
+        """Return the top-k existing long-term memories lexically similar to a
+        candidate (subject + content), ranked by token overlap with a boost for
+        a matching subject. Cheap and dependency-free; fine at <1k rows."""
         await self._ensure_schema()
         async with aiosqlite.connect(self.db_path) as db:
-            # Check for duplicates: same subject + overlapping content
+            db.row_factory = aiosqlite.Row
             cursor = await db.execute(
-                "SELECT id, content FROM long_term WHERE subject = ?",
-                (subject,),
+                "SELECT id, category, subject, content, created_at, updated_at FROM long_term"
             )
-            existing = await cursor.fetchall()
-            content_lower = content.lower()
-            for row in existing:
-                if content_lower in row[1].lower() or row[1].lower() in content_lower:
-                    # Update the existing memory if the new content is more detailed
-                    if len(content) > len(row[1]):
-                        await db.execute(
-                            "UPDATE long_term SET content = ?, updated_at = datetime('now') "
-                            "WHERE id = ?",
-                            (content, row[0]),
-                        )
-                        await db.commit()
-                        log.debug("Updated long-term memory %d: %s", row[0], content[:80])
-                    return 0
-
+            rows = [dict(r) for r in await cursor.fetchall()]
+
+        subject_norm = _normalize_subject(subject)
+        cand_tokens = _tokens(f"{subject} {content}")
+        scored: list[tuple[float, dict]] = []
+        for row in rows:
+            score = _similarity(cand_tokens, _tokens(f"{row['subject']} {row['content']}"))
+            if subject_norm and _normalize_subject(row["subject"]) == subject_norm:
+                score += 0.5
+            if score > 0:
+                scored.append((score, row))
+
+        scored.sort(key=lambda pair: pair[0], reverse=True)
+        return [row for _, row in scored[: self._UPDATE_TOP_K]]
+
+    async def _insert_long_term(self, category: str, subject: str, content: str) -> None:
+        """Insert a new long-term memory row."""
+        await self._ensure_schema()
+        async with aiosqlite.connect(self.db_path) as db:
             await db.execute(
                 "INSERT INTO long_term (category, subject, content, source, confidence) "
                 "VALUES (?, ?, ?, 'conversation', 'stated')",
                 (category, subject, content),
             )
             await db.commit()
-            log.debug("Stored long-term memory: [%s] %s: %s", category, subject, content[:80])
-            return 1
 
     async def _store_short_term(self, mem: dict) -> int:
         """Store a short-term memory with a LLM-determined TTL.
@@ -515,14 +785,17 @@ async def _run_consolidation_llm(
         stored = 0
         for mem in promotions:
             try:
-                count = await self._store_long_term(
+                op = await self.update_memory(
+                    llm,
+                    model,
                     {
                         "category": mem.get("category", "fact"),
                         "subject": mem.get("subject", ""),
                         "content": mem.get("content", ""),
-                    }
+                    },
                 )
-                stored += count
+                if op in ("ADD", "UPDATE"):
+                    stored += 1
             except Exception:
                 log.exception("Failed to store promoted memory: %s", mem)
 

From 30e350746bbc2b79767e1192d864f5fb96ed132e Mon Sep 17 00:00:00 2001
From: Matteo Merola <mattmezza@gmail.com>
Date: Sun, 7 Jun 2026 22:16:35 +0200
Subject: [PATCH 2/7] test(memory): cover unified update pipeline + cooldown
 buffering

Add test_memory_update_pipeline.py: ADD/UPDATE/DELETE/NOOP decisions,
malformed/exception safe no-op, invalid-id rejection, subject
normalisation, timestamp injection, and lexical retrieval ranking.
Update extraction tests for the new write path and add cooldown
turn-buffering tests. Document the unified write pipeline in pa.md.
---
 pa.md                                |  11 ++
 tests/test_memory_extraction.py      |  72 ++++++-
 tests/test_memory_update_pipeline.py | 282 +++++++++++++++++++++++++++
 3 files changed, 361 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_memory_update_pipeline.py

diff --git a/pa.md b/pa.md
index de03625..678739b 100644
--- a/pa.md
+++ b/pa.md
@@ -1065,6 +1065,17 @@ A scheduled job of type `memory_consolidation` runs on a configurable cron sched
 
 2. **Cleanup** — deletes all expired short-term memories regardless of whether the LLM call succeeded.
 
+#### Unified long-term write pipeline
+
+Every long-term write — both the automatic per-turn extraction and consolidation's promotions — flows through a single path, `MemoryStore.update_memory`. For each candidate fact it retrieves the most lexically similar existing long-term memories (cheap token-overlap ranking, no embeddings or extra dependencies), then makes one LLM call that decides exactly one operation:
+
+- **ADD** — genuinely new information.
+- **UPDATE** — refines, corrects, or re-words one existing memory (the LLM returns the merged content to keep).
+- **DELETE** — the candidate says an existing memory is no longer true, with nothing worth keeping in its place.
+- **NOOP** — duplicate or not worth keeping.
+
+The decision prompt includes today's date and each retrieved existing memory's `created_at`/`updated_at`, so the model can prefer recent facts when resolving contradictions. When nothing similar exists the fact is added directly without an LLM call. This replaces the earlier exact-subject + substring + "longer content wins" heuristic, which missed semantic duplicates and never resolved contradictions. Malformed model output, a failed LLM call, or an UPDATE/DELETE naming an id outside the retrieved set is a safe no-op.
+
 This is configured as a regular scheduled job in `config.yml`:
 
 ```yaml
diff --git a/tests/test_memory_extraction.py b/tests/test_memory_extraction.py
index 63a4c9b..9e5682d 100644
--- a/tests/test_memory_extraction.py
+++ b/tests/test_memory_extraction.py
@@ -19,15 +19,22 @@ async def store(tmp_path):
 
 
 class _LLMStub:
-    def __init__(self, response_json):
+    """Routes by prompt type: extraction calls return the canned array;
+    update_memory calls (ADD/UPDATE/DELETE/NOOP) return ``update_response``
+    (defaults to ADD so every extracted long-term fact is stored)."""
+
+    def __init__(self, response_json, update_response=None):
         self._response = json.dumps(response_json)
+        self._update_response = update_response or {"operation": "ADD"}
 
     async def generate_text(self, *, model: str, prompt: str, max_tokens: int = 1024) -> str:
+        if "Choose exactly ONE operation" in prompt:
+            return json.dumps(self._update_response)
         return self._response
 
 
-def _make_mock_llm(response_json):
-    return _LLMStub(response_json)
+def _make_mock_llm(response_json, update_response=None):
+    return _LLMStub(response_json, update_response)
 
 
 async def _count_rows(db_path: str, table: str) -> int:
@@ -249,9 +256,65 @@ async def test_cooldown_skips_rapid_extractions(store) -> None:
     assert await _count_rows(store.db_path, "long_term") == 1
 
 
+@pytest.mark.asyncio
+async def test_cooldown_buffers_skipped_turn_for_next_extraction(store) -> None:
+    """A turn skipped by the cooldown is replayed into the next extraction."""
+
+    class _RecordingStub:
+        def __init__(self):
+            self.prompts: list[str] = []
+
+        async def generate_text(self, *, model, prompt, max_tokens=1024):
+            self.prompts.append(prompt)
+            return "[]"
+
+    llm = _RecordingStub()
+
+    # First turn runs and arms the cooldown.
+    await store.extract_memories(
+        llm, model="m", user_msg="turn one", agent_msg="ok", cooldown_seconds=300
+    )
+    # Second turn is inside the cooldown → buffered, not dropped.
+    await store.extract_memories(
+        llm, model="m", user_msg="buffered fact", agent_msg="reply", cooldown_seconds=300
+    )
+    assert store._pending_turns == [("buffered fact", "reply")]
+    assert len(llm.prompts) == 1  # the buffered turn made no LLM call
+
+    # Third turn (cooldown disabled) replays the buffered turn and clears it.
+    await store.extract_memories(
+        llm, model="m", user_msg="turn three", agent_msg="ok", cooldown_seconds=0
+    )
+    assert store._pending_turns == []
+    assert "buffered fact" in llm.prompts[-1]
+    assert "turn three" in llm.prompts[-1]
+
+
+@pytest.mark.asyncio
+async def test_pending_turns_buffer_is_capped(store) -> None:
+    """The cooldown buffer never grows past _MAX_PENDING_TURNS."""
+
+    class _Stub:
+        async def generate_text(self, *, model, prompt, max_tokens=1024):
+            return "[]"
+
+    llm = _Stub()
+    await store.extract_memories(
+        llm, model="m", user_msg="arm", agent_msg="ok", cooldown_seconds=300
+    )
+    for i in range(store._MAX_PENDING_TURNS + 5):
+        await store.extract_memories(
+            llm, model="m", user_msg=f"turn {i}", agent_msg="ok", cooldown_seconds=300
+        )
+    assert len(store._pending_turns) == store._MAX_PENDING_TURNS
+    # Oldest dropped, newest kept.
+    assert store._pending_turns[-1][0] == f"turn {store._MAX_PENDING_TURNS + 4}"
+
+
 @pytest.mark.asyncio
 async def test_cooldown_zero_allows_all(store) -> None:
     """cooldown_seconds=0 should allow every call."""
+    # The update pipeline rules the second (duplicate) candidate a NOOP.
     llm = _make_mock_llm(
         [
             {
@@ -260,7 +323,8 @@ async def test_cooldown_zero_allows_all(store) -> None:
                 "subject": "matteo",
                 "content": "Lives in Zurich",
             }
-        ]
+        ],
+        update_response={"operation": "NOOP"},
     )
 
     stored1 = await store.extract_memories(
diff --git a/tests/test_memory_update_pipeline.py b/tests/test_memory_update_pipeline.py
new file mode 100644
index 0000000..9b46940
--- /dev/null
+++ b/tests/test_memory_update_pipeline.py
@@ -0,0 +1,282 @@
+"""Tests for the unified ADD/UPDATE/DELETE/NOOP long-term update pipeline.
+
+Covers MemoryStore.update_memory and the lexical candidate retrieval that
+feeds it (issue #5, Tier 1).
+"""
+
+from __future__ import annotations
+
+import json
+from unittest.mock import AsyncMock
+
+import aiosqlite
+import pytest
+
+from core.memory import (
+    MemoryStore,
+    _normalize_subject,
+    _similarity,
+    _tokens,
+)
+
+
+@pytest.fixture
+async def store(tmp_path):
+    db_path = str(tmp_path / "memory.db")
+    s = MemoryStore(db_path=db_path, long_term_limit=50)
+    await s._ensure_schema()
+    return s
+
+
+async def _insert_long_term(store: MemoryStore, category: str, subject: str, content: str) -> int:
+    async with aiosqlite.connect(store.db_path) as db:
+        cursor = await db.execute(
+            "INSERT INTO long_term (category, subject, content, source, confidence) "
+            "VALUES (?, ?, ?, 'test', 'stated')",
+            (category, subject, content),
+        )
+        await db.commit()
+        return cursor.lastrowid
+
+
+async def _rows(store: MemoryStore) -> list[dict]:
+    async with aiosqlite.connect(store.db_path) as db:
+        db.row_factory = aiosqlite.Row
+        cursor = await db.execute(
+            "SELECT id, category, subject, content FROM long_term ORDER BY id"
+        )
+        return [dict(r) for r in await cursor.fetchall()]
+
+
+class _DecisionLLM:
+    """LLM stub that returns a fixed update decision (dict) and records calls."""
+
+    def __init__(self, decision: dict | str):
+        self._decision = decision if isinstance(decision, str) else json.dumps(decision)
+        self.calls = 0
+        self.last_prompt: str | None = None
+
+    async def generate_text(self, *, model: str, prompt: str, max_tokens: int = 1024) -> str:
+        self.calls += 1
+        self.last_prompt = prompt
+        return self._decision
+
+
+# -- update_memory --
+
+
+class TestUpdateMemory:
+    async def test_new_fact_into_empty_db_adds_without_llm(self, store):
+        """No existing memories → ADD directly, no LLM call."""
+        llm = _DecisionLLM({"operation": "NOOP"})  # should never be consulted
+
+        op = await store.update_memory(
+            llm, "m", {"category": "fact", "subject": "matteo", "content": "Lives in Zurich"}
+        )
+
+        assert op == "ADD"
+        assert llm.calls == 0
+        rows = await _rows(store)
+        assert len(rows) == 1
+        assert rows[0]["content"] == "Lives in Zurich"
+
+    async def test_unrelated_existing_still_adds_without_llm(self, store):
+        """Existing memory shares no tokens → no candidate → ADD without LLM."""
+        await _insert_long_term(store, "food", "simge", "Allergic to peanuts")
+        llm = _DecisionLLM({"operation": "NOOP"})
+
+        op = await store.update_memory(
+            llm, "m", {"category": "work", "subject": "matteo", "content": "Software engineer"}
+        )
+
+        assert op == "ADD"
+        assert llm.calls == 0
+        assert len(await _rows(store)) == 2
+
+    async def test_semantic_duplicate_noop(self, store):
+        """A near-duplicate of an existing memory → LLM rules NOOP, nothing added."""
+        await _insert_long_term(store, "health", "matteo", "Allergic to shellfish")
+        llm = _DecisionLLM({"operation": "NOOP"})
+
+        op = await store.update_memory(
+            llm, "m", {"category": "health", "subject": "matteo", "content": "Cannot eat shellfish"}
+        )
+
+        assert op == "NOOP"
+        assert llm.calls == 1
+        assert len(await _rows(store)) == 1
+
+    async def test_refinement_updates_in_place(self, store):
+        """LLM returns UPDATE with a merged content → existing row is rewritten."""
+        rid = await _insert_long_term(store, "work", "matteo", "Uses a desk at work")
+        llm = _DecisionLLM(
+            {
+                "operation": "UPDATE",
+                "id": rid,
+                "category": "work",
+                "subject": "matteo",
+                "content": "Uses a standing desk at work",
+            }
+        )
+
+        op = await store.update_memory(
+            llm, "m", {"category": "work", "subject": "matteo", "content": "Standing desk now"}
+        )
+
+        assert op == "UPDATE"
+        rows = await _rows(store)
+        assert len(rows) == 1
+        assert rows[0]["content"] == "Uses a standing desk at work"
+
+    async def test_contradiction_deletes(self, store):
+        """LLM returns DELETE → the contradicted memory is removed, none added."""
+        rid = await _insert_long_term(store, "work", "matteo", "Uses a standing desk at work")
+        llm = _DecisionLLM({"operation": "DELETE", "id": rid})
+
+        op = await store.update_memory(
+            llm,
+            "m",
+            {"category": "work", "subject": "matteo", "content": "Switched back to sitting desk"},
+        )
+
+        assert op == "DELETE"
+        assert len(await _rows(store)) == 0
+
+    async def test_add_alongside_existing(self, store):
+        """LLM returns ADD even though a lexically similar memory exists."""
+        await _insert_long_term(store, "routine", "matteo", "Runs on Mondays")
+        llm = _DecisionLLM({"operation": "ADD"})
+
+        op = await store.update_memory(
+            llm,
+            "m",
+            {"category": "routine", "subject": "matteo", "content": "Runs on Thursdays too"},
+        )
+
+        assert op == "ADD"
+        assert len(await _rows(store)) == 2
+
+    async def test_malformed_output_is_safe_noop(self, store):
+        """Non-JSON LLM output → no-op, no mutation."""
+        await _insert_long_term(store, "health", "matteo", "Allergic to shellfish")
+        llm = _DecisionLLM("totally not json")
+
+        op = await store.update_memory(
+            llm, "m", {"category": "health", "subject": "matteo", "content": "Cannot eat shellfish"}
+        )
+
+        assert op == "NOOP"
+        assert len(await _rows(store)) == 1
+
+    async def test_llm_exception_is_safe_noop(self, store):
+        """An LLM error mid-decision must not crash or mutate state."""
+        await _insert_long_term(store, "health", "matteo", "Allergic to shellfish")
+        llm = AsyncMock()
+        llm.generate_text.side_effect = RuntimeError("API down")
+
+        op = await store.update_memory(
+            llm, "m", {"category": "health", "subject": "matteo", "content": "Cannot eat shellfish"}
+        )
+
+        assert op == "NOOP"
+        assert len(await _rows(store)) == 1
+
+    async def test_update_with_invalid_id_is_noop(self, store):
+        """UPDATE targeting an id not in the candidate set is rejected."""
+        await _insert_long_term(store, "health", "matteo", "Allergic to shellfish")
+        llm = _DecisionLLM({"operation": "UPDATE", "id": 999, "content": "x"})
+
+        op = await store.update_memory(
+            llm, "m", {"category": "health", "subject": "matteo", "content": "Cannot eat shellfish"}
+        )
+
+        assert op == "NOOP"
+        rows = await _rows(store)
+        assert rows[0]["content"] == "Allergic to shellfish"
+
+    async def test_delete_with_invalid_id_is_noop(self, store):
+        await _insert_long_term(store, "health", "matteo", "Allergic to shellfish")
+        llm = _DecisionLLM({"operation": "DELETE", "id": 999})
+
+        op = await store.update_memory(
+            llm, "m", {"category": "health", "subject": "matteo", "content": "Cannot eat shellfish"}
+        )
+
+        assert op == "NOOP"
+        assert len(await _rows(store)) == 1
+
+    async def test_empty_content_is_noop_without_llm(self, store):
+        llm = _DecisionLLM({"operation": "ADD"})
+        op = await store.update_memory(
+            llm, "m", {"category": "fact", "subject": "matteo", "content": "  "}
+        )
+        assert op == "NOOP"
+        assert llm.calls == 0
+        assert len(await _rows(store)) == 0
+
+    async def test_subject_normalised_on_add(self, store):
+        """Subjects are lowercased in code, not just via prompt instruction."""
+        llm = _DecisionLLM({"operation": "NOOP"})
+        await store.update_memory(
+            llm, "m", {"category": "fact", "subject": "Matteo", "content": "Lives in Zurich"}
+        )
+        rows = await _rows(store)
+        assert rows[0]["subject"] == "matteo"
+
+    async def test_update_prompt_includes_timestamps(self, store):
+        """The decision prompt carries created/updated timestamps (issue #8)."""
+        await _insert_long_term(store, "health", "matteo", "Allergic to shellfish")
+        llm = _DecisionLLM({"operation": "NOOP"})
+
+        await store.update_memory(
+            llm, "m", {"category": "health", "subject": "matteo", "content": "Cannot eat shellfish"}
+        )
+
+        assert "created" in (llm.last_prompt or "")
+        assert "updated" in (llm.last_prompt or "")
+
+
+# -- lexical retrieval --
+
+
+class TestRetrieveSimilar:
+    async def test_subject_boost_ranks_same_subject_first(self, store):
+        await _insert_long_term(store, "fact", "simge", "enjoys shellfish dishes")
+        await _insert_long_term(store, "fact", "matteo", "dislikes loud music")
+
+        similar = await store._retrieve_similar_long_term("matteo", "allergic to shellfish")
+
+        # Both share a token with the candidate; the same-subject row wins.
+        assert similar[0]["subject"] == "matteo"
+
+    async def test_caps_at_top_k(self, store):
+        for i in range(20):
+            await _insert_long_term(store, "fact", "matteo", f"likes hiking trip {i}")
+
+        similar = await store._retrieve_similar_long_term("matteo", "likes hiking")
+
+        assert len(similar) <= store._UPDATE_TOP_K
+
+    async def test_no_overlap_returns_empty(self, store):
+        await _insert_long_term(store, "fact", "simge", "speaks turkish")
+        similar = await store._retrieve_similar_long_term("matteo", "owns a bicycle")
+        assert similar == []
+
+
+# -- pure helpers --
+
+
+class TestHelpers:
+    def test_normalize_subject(self):
+        assert _normalize_subject("  Matteo ") == "matteo"
+        assert _normalize_subject("") == ""
+        assert _normalize_subject(None) == ""
+
+    def test_tokens_drops_stopwords_and_single_chars(self):
+        assert _tokens("The user is a developer") == {"user", "developer"}
+
+    def test_similarity_jaccard(self):
+        assert _similarity({"a", "b"}, {"a", "b"}) == 1.0
+        assert _similarity({"a", "b"}, {"c", "d"}) == 0.0
+        assert _similarity(set(), {"a"}) == 0.0
+        assert _similarity({"a", "b", "c"}, {"a"}) == pytest.approx(1 / 3)

From d4c27b3c550a0127e19eac919bd49807c0abca07 Mon Sep 17 00:00:00 2001
From: Matteo Merola <mattmezza@gmail.com>
Date: Sun, 7 Jun 2026 22:37:28 +0200
Subject: [PATCH 3/7] feat(memory): embeddings, forgetting/importance, and
 hygiene pass (#5 Tiers 2-4)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tier 2 — optional semantic retrieval. New core/embeddings.py wraps an
OpenAI-compatible /embeddings endpoint; vectors are stored as packed
float32 blobs and compared with brute-force cosine in Python (no native
extension, identical local/container). When enabled, update_memory
retrieves candidates by cosine (lexical fallback per row), and prompt
injection becomes relevance-ranked (relevance + importance + recency)
over only injection_top_k memories instead of dumping the recent N. The
inbound message is threaded through prompt building as the query. Off by
default — the pipeline still runs on Tier-1 lexical retrieval.

Tier 3 — forgetting/importance/reinforcement. long_term gains importance,
last_accessed, access_count, archived. Recalled memories are reinforced;
re-mentions raise importance; consolidation archives cold low-importance
idle memories via a soft-delete flag so long-term stops growing unbounded.

Tier 4 — hygiene pass. Consolidation clusters near-duplicate long-term
rows and resolves each cluster with one LLM call that merges duplicates
and drops contradictions, keeping the most recent fact.

New columns are added to existing DBs via in-place ALTER TABLE migration
in _ensure_schema; no manual upgrade step. All knobs configurable under
the memory config section.
---
 api/admin.py       |   5 +-
 core/agent.py      |  61 ++++++-
 core/config.py     |  31 ++++
 core/embeddings.py |  93 ++++++++++
 core/memory.py     | 425 ++++++++++++++++++++++++++++++++++++++++++---
 schema/memory.sql  |  11 +-
 6 files changed, 593 insertions(+), 33 deletions(-)
 create mode 100644 core/embeddings.py

diff --git a/api/admin.py b/api/admin.py
index 5cf7346..b9774fa 100644
--- a/api/admin.py
+++ b/api/admin.py
@@ -1277,15 +1277,16 @@ async def system_prompt_preview(body: PromptPreviewIn) -> dict:
 
         memories = ""
         if body.include_memories:
+            query = message or None
             if agent_state.agent:
-                memories = await agent_state.agent.memory.format_for_prompt()
+                memories = await agent_state.agent.memory.format_for_prompt(query=query)
             else:
                 from core.memory import MemoryStore
 
                 memories = await MemoryStore(
                     db_path=config.memory.db_path,
                     long_term_limit=config.memory.long_term_limit,
-                ).format_for_prompt()
+                ).format_for_prompt(query=query)
 
         reflections = ""
         if body.include_reflections and config.task_reflection.enabled:
diff --git a/core/agent.py b/core/agent.py
index 9175d72..89028a5 100644
--- a/core/agent.py
+++ b/core/agent.py
@@ -17,6 +17,7 @@
 
 from core.compaction import compact_messages, should_compact
 from core.config import Config
+from core.embeddings import EmbeddingClient
 from core.executor import ToolExecutor
 from core.goal_decomposition import DecomposedGoal, classify_complexity, decompose_goal
 from core.history import ConversationHistory
@@ -254,9 +255,18 @@ def __init__(self, config: Config):
             max_turns=config.history.max_turns,
         )
         self.history_mode = config.history.mode  # "injection" or "session"
+        mem_cfg = config.memory
         self.memory = MemoryStore(
-            db_path=config.memory.db_path,
-            long_term_limit=config.memory.long_term_limit,
+            db_path=mem_cfg.db_path,
+            long_term_limit=mem_cfg.long_term_limit,
+            embedder=self._build_embedder(),
+            injection_top_k=mem_cfg.embedding.injection_top_k,
+            default_importance=mem_cfg.default_importance,
+            archive_after_days=mem_cfg.archive_after_days,
+            archive_max_importance=mem_cfg.archive_max_importance,
+            archive_min_idle_days=mem_cfg.archive_min_idle_days,
+            hygiene_enabled=mem_cfg.hygiene_enabled,
+            hygiene_similarity_threshold=mem_cfg.hygiene_similarity_threshold,
         )
         self.reflections = ReflectionStore(
             db_path=config.task_reflection.db_path,
@@ -325,9 +335,9 @@ async def process(
         # is only built once, not rebuilt and re-sent each turn). In injection
         # mode the prompt is windowed/stateless, so it is rebuilt per call.
         if self.history_mode == "session":
-            system = await self._session_system_prompt(channel, user_id, chat_id)
+            system = await self._session_system_prompt(channel, user_id, chat_id, query=message)
         else:
-            system = await self._build_system_prompt()
+            system = await self._build_system_prompt(query=message)
 
         if self.config.admin.capture_prompts:
             self._record_system_prompt(
@@ -365,16 +375,20 @@ def _turn_preamble(self, decomposed_goal: DecomposedGoal | None) -> str:
             )
         return preamble
 
-    async def _session_system_prompt(self, channel: str, user_id: str, chat_id: str) -> str:
+    async def _session_system_prompt(
+        self, channel: str, user_id: str, chat_id: str, query: str | None = None
+    ) -> str:
         """Return the session's static system prompt, building it once if needed.
 
         Built fresh after a ``/new`` (when no snapshot exists), then reused for
         the lifetime of the session so the static content is sent only once.
+        Relevance-ranked memory injection therefore uses the first message of
+        the session as its query.
         """
         cached = await self.history.get_session_system(channel, user_id, chat_id)
         if cached is not None:
             return cached
-        system = await self._build_system_prompt()
+        system = await self._build_system_prompt(query=query)
         await self.history.set_session_system(channel, user_id, system, chat_id)
         return system
 
@@ -1130,6 +1144,35 @@ def _background_llm(self, provider: str) -> LLMClient:
             base_url=getattr(cfg, f"{provider}_base_url", None),
         )
 
+    def _build_embedder(self) -> EmbeddingClient | None:
+        """Construct the embedding client for semantic memory, if enabled.
+
+        Credentials fall back to the matching agent provider key / base URL when
+        not set explicitly on the embedding config. Returns None when disabled
+        or when no usable API key is available (the store then runs on Tier-1
+        lexical retrieval).
+        """
+        emb = self.config.memory.embedding
+        if not emb.enabled:
+            return None
+        cfg = self.config.agent
+        api_key = emb.api_key or getattr(cfg, f"{emb.provider}_api_key", "")
+        base_url = emb.base_url or getattr(cfg, f"{emb.provider}_base_url", "") or None
+        if not api_key:
+            log.warning("Memory embeddings enabled but no API key for provider %s", emb.provider)
+            return None
+        try:
+            return EmbeddingClient(
+                provider=emb.provider,
+                api_key=api_key,
+                model=emb.model,
+                base_url=base_url,
+                dimensions=emb.dimensions,
+            )
+        except Exception:
+            log.exception("Failed to build embedding client; disabling semantic memory")
+            return None
+
     async def _maybe_decompose(self, message: str) -> DecomposedGoal | None:
         """Classify and optionally decompose a user message into sub-goals.
 
@@ -1177,9 +1220,11 @@ async def _reflect_on_task(self, user_msg: str, agent_msg: str, tool_log: list[d
         except Exception:
             log.exception("Background task reflection failed")
 
-    async def _build_system_prompt(self, decomposed_goal: DecomposedGoal | None = None) -> str:
+    async def _build_system_prompt(
+        self, decomposed_goal: DecomposedGoal | None = None, query: str | None = None
+    ) -> str:
         skills_index = await self.skills.get_index_block()
-        memories = await self.memory.format_for_prompt()
+        memories = await self.memory.format_for_prompt(query=query)
 
         # Task reflections — lessons learned from past tasks
         reflections = ""
diff --git a/core/config.py b/core/config.py
index 870813c..cc11e1e 100644
--- a/core/config.py
+++ b/core/config.py
@@ -131,6 +131,25 @@ class HistoryConfig(BaseModel):
     mode: str = "injection"  # "injection" (windowed history) or "session" (sticky per channel)
 
 
+class EmbeddingConfig(BaseModel):
+    """Tier 2 — semantic similarity + relevance-ranked injection.
+
+    Disabled by default so the system runs on Tier-1 lexical retrieval with no
+    extra dependency or network call. When enabled, vectors are fetched from an
+    OpenAI-compatible ``/embeddings`` endpoint and stored as a blob alongside
+    each long-term memory (brute-force cosine in Python — fine at <1k rows, no
+    native extension required, identical on local and container SQLite).
+    """
+
+    enabled: bool = False
+    provider: str = "openai"  # OpenAI-compatible embeddings endpoint
+    model: str = "text-embedding-3-small"
+    api_key: str = ""  # falls back to the matching agent provider key when empty
+    base_url: str = ""  # falls back to the agent provider base URL when empty
+    dimensions: int = 0  # 0 = provider default
+    injection_top_k: int = 12  # relevance-ranked memories injected per turn
+
+
 class MemoryConfig(BaseModel):
     db_path: str = "data/memory.db"
     long_term_limit: int = 50
@@ -140,6 +159,18 @@ class MemoryConfig(BaseModel):
     consolidation_model: str = "claude-haiku-4-5"
     extraction_cooldown_seconds: int = 120  # minimum seconds between extractions
 
+    embedding: EmbeddingConfig = EmbeddingConfig()
+
+    # Tier 3 — forgetting / importance / reinforcement
+    default_importance: float = 5.0  # 1-10 scale assigned to new long-term memories
+    archive_after_days: int = 90  # min age before a cold memory may be archived
+    archive_max_importance: float = 4.0  # only archive memories at/below this importance
+    archive_min_idle_days: int = 45  # require this long since last access/creation
+
+    # Tier 4 — long-term hygiene pass (cluster + merge near-duplicates)
+    hygiene_enabled: bool = True
+    hygiene_similarity_threshold: float = 0.45  # min similarity to cluster two memories
+
 
 class GoalDecompositionConfig(BaseModel):
     enabled: bool = True
diff --git a/core/embeddings.py b/core/embeddings.py
new file mode 100644
index 0000000..f4b5f41
--- /dev/null
+++ b/core/embeddings.py
@@ -0,0 +1,93 @@
+"""Embedding client + vector helpers for semantic memory retrieval (Tier 2).
+
+Vectors are fetched from an OpenAI-compatible ``/embeddings`` endpoint and
+stored as packed float32 blobs alongside each long-term memory. Similarity is
+brute-force cosine in pure Python — no native SQLite extension, so it behaves
+identically on a local machine and inside the container.
+"""
+
+from __future__ import annotations
+
+import array
+import importlib
+import logging
+import math
+from typing import Any, cast
+
+log = logging.getLogger(__name__)
+
+# OpenAI-compatible base URLs for providers that expose an /embeddings endpoint.
+_DEFAULT_BASE_URLS = {
+    "google": "https://generativelanguage.googleapis.com/v1beta/openai",
+    "deepseek": "https://api.deepseek.com",
+}
+
+
+def pack_vector(vector: list[float]) -> bytes:
+    """Pack a float vector into a compact float32 blob for storage."""
+    return array.array("f", vector).tobytes()
+
+
+def unpack_vector(blob: bytes | None) -> list[float] | None:
+    """Unpack a float32 blob back into a list of floats (None if empty)."""
+    if not blob:
+        return None
+    arr = array.array("f")
+    arr.frombytes(blob)
+    return list(arr)
+
+
+def cosine_similarity(a: list[float], b: list[float]) -> float:
+    """Cosine similarity between two equal-length vectors (0.0 on degenerate input)."""
+    if not a or not b or len(a) != len(b):
+        return 0.0
+    dot = 0.0
+    na = 0.0
+    nb = 0.0
+    for x, y in zip(a, b, strict=False):
+        dot += x * y
+        na += x * x
+        nb += y * y
+    if na == 0.0 or nb == 0.0:
+        return 0.0
+    return dot / (math.sqrt(na) * math.sqrt(nb))
+
+
+class EmbeddingClient:
+    """Async client for an OpenAI-compatible ``/embeddings`` endpoint (via ``openai``)."""
+
+    def __init__(
+        self,
+        provider: str,
+        api_key: str,
+        model: str,
+        base_url: str | None = None,
+        dimensions: int = 0,
+    ):
+        self.provider = (provider or "openai").strip().lower()
+        self.model = model
+        self.dimensions = dimensions or 0
+        resolved_base = base_url or _DEFAULT_BASE_URLS.get(self.provider)
+        try:
+            module = importlib.import_module("openai")
+            client_class = cast(Any, getattr(module, "AsyncOpenAI"))
+        except Exception as exc:  # pragma: no cover - import guard
+            raise RuntimeError("openai package is required for embeddings") from exc
+        self._client = cast(Any, client_class)(api_key=api_key, base_url=resolved_base or None)
+
+    async def embed(self, texts: list[str]) -> list[list[float]]:
+        """Return one embedding vector per input text, ordered by response index."""
+        if not texts:
+            return []
+        kwargs: dict[str, Any] = {"model": self.model, "input": texts}
+        if self.dimensions:
+            kwargs["dimensions"] = self.dimensions
+        response = await self._client.embeddings.create(**kwargs)
+        # Sort by each item's ``index`` (0 if absent) so output lines up with ``texts``.
+        items = sorted(response.data, key=lambda d: getattr(d, "index", 0))
+        return [list(item.embedding) for item in items]
+
+    async def embed_one(self, text: str) -> list[float]:
+        """Return a single embedding vector ([] if none came back; API errors propagate)."""
+        vectors = await self.embed([text])
+        return vectors[0] if vectors else []
diff --git a/core/memory.py b/core/memory.py
index 9243648..2d271c8 100644
--- a/core/memory.py
+++ b/core/memory.py
@@ -11,6 +11,12 @@
 
 import aiosqlite
 
+from core.embeddings import (
+    EmbeddingClient,
+    cosine_similarity,
+    pack_vector,
+    unpack_vector,
+)
 from core.llm import LLMClient
 
 log = logging.getLogger(__name__)
@@ -90,6 +96,32 @@
   {{"operation": "DELETE", "id": <id>}}
   {{"operation": "NOOP"}}"""
 
+_HYGIENE_PROMPT = """\
+You are tidying a cluster of near-duplicate or possibly conflicting long-term
+memories for a personal AI assistant.
+
+Today's date: {today}
+
+## Memories in this cluster
+{cluster}
+
+Resolve the cluster into the minimal set of correct, non-redundant memories:
+- Merge duplicates and overlapping facts into one, keeping the clearest wording.
+- On contradictions, keep the most recent fact and drop the stale one.
+- Keep each memory short and dense (strip dates, times, situational framing).
+
+Return ONLY a JSON object describing the changes to apply:
+  {{"updates": [{{"id": <id>, "category": "<cat>", "subject": "<subj>", \
+"content": "<merged fact>"}}],
+   "deletes": [<id>, <id>]}}
+
+- Put the surviving memory in "updates" (reuse one of the cluster ids), with the
+  final merged content.
+- Put every other id in the cluster that should be removed in "deletes".
+- If the cluster is already clean, return {{"updates": [], "deletes": []}}.
+
+Respond with ONLY the JSON object, no other text."""
+
 _EXTRACTION_PROMPT = """\
 Given this conversation exchange, identify any facts worth remembering.
 
@@ -341,6 +373,42 @@ def _similarity(a: set[str], b: set[str]) -> float:
     return inter / len(a | b)
 
 
+# Half-life (in days) for the recency component of the retrieval score.
+_RECENCY_HALF_LIFE_DAYS = 30.0
+
+
+def _parse_sqlite_ts(ts: str | None) -> datetime | None:
+    """Parse a SQLite ``datetime('now')`` string as aware UTC; None if missing or malformed."""
+    if not ts:
+        return None
+    try:
+        return datetime.strptime(ts, "%Y-%m-%d %H:%M:%S").replace(tzinfo=UTC)
+    except (ValueError, TypeError):
+        return None
+
+
+def _recency_score(ts: str | None) -> float:
+    """Exponential-decay recency in [0, 1]; newer timestamps score higher."""
+    parsed = _parse_sqlite_ts(ts)
+    if parsed is None:
+        return 0.0
+    age_days = max(0.0, (datetime.now(tz=UTC) - parsed).total_seconds() / 86400.0)
+    return 0.5 ** (age_days / _RECENCY_HALF_LIFE_DAYS)
+
+
+def _pair_similarity(a: dict, b: dict) -> float:
+    """Similarity between two long-term rows: embedding cosine when both have a
+    stored vector, otherwise token overlap on subject + content."""
+    va = unpack_vector(a.get("embedding"))
+    vb = unpack_vector(b.get("embedding"))
+    if va and vb:
+        return cosine_similarity(va, vb)
+    return _similarity(
+        _tokens(f"{a['subject']} {a['content']}"),
+        _tokens(f"{b['subject']} {b['content']}"),
+    )
+
+
 class MemoryStore:
     """Two-tier memory system backed by SQLite.
 
@@ -351,9 +419,30 @@ class MemoryStore:
     conversation turn.
     """
 
-    def __init__(self, db_path: str = "data/memory.db", long_term_limit: int = 50):
+    def __init__(
+        self,
+        db_path: str = "data/memory.db",
+        long_term_limit: int = 50,
+        *,
+        embedder: EmbeddingClient | None = None,
+        injection_top_k: int = 12,
+        default_importance: float = 5.0,
+        archive_after_days: int = 90,
+        archive_max_importance: float = 4.0,
+        archive_min_idle_days: int = 45,
+        hygiene_enabled: bool = True,
+        hygiene_similarity_threshold: float = 0.45,
+    ):
         self.db_path = db_path
         self.long_term_limit = long_term_limit
+        self.embedder = embedder
+        self.injection_top_k = injection_top_k
+        self.default_importance = default_importance
+        self.archive_after_days = archive_after_days
+        self.archive_max_importance = archive_max_importance
+        self.archive_min_idle_days = archive_min_idle_days
+        self.hygiene_enabled = hygiene_enabled
+        self.hygiene_similarity_threshold = hygiene_similarity_threshold
         self._ready = False
         self._last_extraction: float | None = None  # monotonic timestamp of last extraction
         # Turns skipped by the cooldown, replayed into the next extraction so
@@ -367,19 +456,102 @@ async def _ensure_schema(self) -> None:
         schema = _SCHEMA_FILE.read_text()
         async with aiosqlite.connect(self.db_path) as db:
             await db.executescript(schema)
+            await self._migrate_long_term(db)
         self._ready = True
 
+    # Columns added after the original two-tier schema shipped. Each is applied
+    # via ALTER TABLE on databases created before the column existed, so an
+    # existing data/memory.db upgrades in place (defaults are constant, as
+    # required by SQLite's ALTER TABLE ADD COLUMN).
+    _LONG_TERM_MIGRATIONS = (
+        ("embedding", "embedding BLOB"),
+        ("importance", "importance REAL NOT NULL DEFAULT 5.0"),
+        ("last_accessed", "last_accessed DATETIME"),
+        ("access_count", "access_count INTEGER NOT NULL DEFAULT 0"),
+        ("archived", "archived INTEGER NOT NULL DEFAULT 0"),
+    )
+
+    async def _migrate_long_term(self, db: aiosqlite.Connection) -> None:
+        cursor = await db.execute("PRAGMA table_info(long_term)")
+        existing = {row[1] for row in await cursor.fetchall()}
+        for name, ddl in self._LONG_TERM_MIGRATIONS:
+            if name not in existing:
+                await db.execute(f"ALTER TABLE long_term ADD COLUMN {ddl}")  # noqa: S608
+        # Safe to create now: the archived column is guaranteed to exist (fresh
+        # DBs declare it; legacy DBs just had it added above).
+        await db.execute("CREATE INDEX IF NOT EXISTS idx_lt_archived ON long_term(archived)")
+        await db.commit()
+
     async def get_long_term(self) -> list[dict]:
-        """Retrieve long-term memories for system prompt injection."""
+        """Retrieve recent (non-archived) long-term memories for injection."""
         await self._ensure_schema()
         async with aiosqlite.connect(self.db_path) as db:
             db.row_factory = aiosqlite.Row
             cursor = await db.execute(
-                "SELECT category, subject, content FROM long_term ORDER BY updated_at DESC LIMIT ?",
+                "SELECT category, subject, content FROM long_term "
+                "WHERE archived = 0 ORDER BY updated_at DESC LIMIT ?",
                 (self.long_term_limit,),
             )
             return [dict(row) for row in await cursor.fetchall()]
 
+    async def get_relevant_long_term(self, query: str) -> list[dict]:
+        """Return long-term memories most relevant to *query*, relevance-ranked.
+
+        Uses a Generative-Agents-style score (recency + importance + relevance)
+        over embedding cosine similarity, and reinforces the chosen memories
+        (bumps ``access_count`` / ``last_accessed``). Falls back to recency
+        order when embeddings are unavailable or the query can't be embedded.
+        """
+        if not self.embedder or not query.strip():
+            return await self.get_long_term()
+
+        try:
+            query_vec = await self.embedder.embed_one(query)
+        except Exception:
+            log.exception("Query embedding failed; falling back to recency order")
+            return await self.get_long_term()
+        if not query_vec:
+            return await self.get_long_term()
+
+        await self._ensure_schema()
+        async with aiosqlite.connect(self.db_path) as db:
+            db.row_factory = aiosqlite.Row
+            cursor = await db.execute(
+                "SELECT id, category, subject, content, importance, embedding, "
+                "updated_at, last_accessed FROM long_term WHERE archived = 0"
+            )
+            rows = [dict(r) for r in await cursor.fetchall()]
+
+        scored: list[tuple[float, dict]] = []
+        for row in rows:
+            vec = unpack_vector(row.get("embedding"))
+            relevance = cosine_similarity(query_vec, vec) if vec else 0.0
+            importance = (row.get("importance") or self.default_importance) / 10.0
+            recency = _recency_score(row.get("last_accessed") or row.get("updated_at"))
+            score = relevance + 0.5 * importance + 0.3 * recency
+            scored.append((score, row))
+
+        scored.sort(key=lambda pair: pair[0], reverse=True)
+        top = [row for _, row in scored[: self.injection_top_k]]
+        await self._reinforce([row["id"] for row in top])
+        return [
+            {"category": r["category"], "subject": r["subject"], "content": r["content"]}
+            for r in top
+        ]
+
+    async def _reinforce(self, ids: list[int]) -> None:
+        """Strengthen recalled memories: bump access_count and last_accessed."""
+        if not ids:
+            return
+        await self._ensure_schema()
+        async with aiosqlite.connect(self.db_path) as db:
+            await db.executemany(
+                "UPDATE long_term SET access_count = access_count + 1, "
+                "last_accessed = datetime('now') WHERE id = ?",
+                [(i,) for i in ids],
+            )
+            await db.commit()
+
     async def get_short_term(self) -> list[dict]:
         """Retrieve active (non-expired) short-term memories."""
         await self._ensure_schema()
@@ -392,11 +564,19 @@ async def get_short_term(self) -> list[dict]:
             )
             return [dict(row) for row in await cursor.fetchall()]
 
-    async def format_for_prompt(self) -> str:
-        """Format both tiers into a block for the system prompt."""
+    async def format_for_prompt(self, query: str | None = None) -> str:
+        """Format both tiers into a block for the system prompt.
+
+        When *query* is given and embeddings are enabled, only the long-term
+        memories most relevant to the query are injected (relevance-ranked),
+        instead of dumping the most recent ``long_term_limit`` rows (issue #5).
+        """
         sections: list[str] = []
 
-        long_term = await self.get_long_term()
+        if query:
+            long_term = await self.get_relevant_long_term(query)
+        else:
+            long_term = await self.get_long_term()
         if long_term:
             lines = [f"- [{m['category']}] {m['subject']}: {m['content']}" for m in long_term]
             sections.append("## Long-term memories\n" + "\n".join(lines))
@@ -600,13 +780,24 @@ async def update_memory(self, llm: LLMClient, model: str, candidate: dict) -> st
             new_content = (decision.get("content") or content).strip()
             new_category = decision.get("category") or category
             new_subject = _normalize_subject(decision.get("subject") or subject)
+            blob = await self._embed_blob(f"{new_subject}: {new_content}")
             await self._ensure_schema()
             async with aiosqlite.connect(self.db_path) as db:
-                await db.execute(
-                    "UPDATE long_term SET category = ?, subject = ?, content = ?, "
-                    "updated_at = datetime('now') WHERE id = ?",
-                    (new_category, new_subject, new_content, target_id),
-                )
+                # Re-mentioning a fact reinforces it: bump importance (capped).
+                if blob is not None:
+                    await db.execute(
+                        "UPDATE long_term SET category = ?, subject = ?, content = ?, "
+                        "embedding = ?, importance = MIN(10.0, importance + 1.0), "
+                        "updated_at = datetime('now') WHERE id = ?",
+                        (new_category, new_subject, new_content, blob, target_id),
+                    )
+                else:
+                    await db.execute(
+                        "UPDATE long_term SET category = ?, subject = ?, content = ?, "
+                        "importance = MIN(10.0, importance + 1.0), "
+                        "updated_at = datetime('now') WHERE id = ?",
+                        (new_category, new_subject, new_content, target_id),
+                    )
                 await db.commit()
             log.debug("UPDATE long-term %s: %s", target_id, new_content[:80])
             return "UPDATE"
@@ -626,38 +817,77 @@ async def update_memory(self, llm: LLMClient, model: str, candidate: dict) -> st
         return "NOOP"
 
     async def _retrieve_similar_long_term(self, subject: str, content: str) -> list[dict]:
-        """Return the top-k existing long-term memories lexically similar to a
-        candidate (subject + content), ranked by token overlap with a boost for
-        a matching subject. Cheap and dependency-free; fine at <1k rows."""
+        """Return the top-k existing (non-archived) long-term memories similar to
+        a candidate (subject + content).
+
+        Uses embedding cosine similarity when an embedder is configured (with a
+        per-row lexical fallback for memories that have no stored vector yet),
+        otherwise pure token overlap. A matching subject adds a fixed boost.
+        Cheap and dependency-free at <1k rows."""
         await self._ensure_schema()
         async with aiosqlite.connect(self.db_path) as db:
             db.row_factory = aiosqlite.Row
             cursor = await db.execute(
-                "SELECT id, category, subject, content, created_at, updated_at FROM long_term"
+                "SELECT id, category, subject, content, created_at, updated_at, embedding "
+                "FROM long_term WHERE archived = 0"
             )
             rows = [dict(r) for r in await cursor.fetchall()]
 
         subject_norm = _normalize_subject(subject)
         cand_tokens = _tokens(f"{subject} {content}")
+        cand_vec = await self._safe_embed(f"{subject}: {content}")
+
         scored: list[tuple[float, dict]] = []
         for row in rows:
-            score = _similarity(cand_tokens, _tokens(f"{row['subject']} {row['content']}"))
+            row_tokens = _tokens(f"{row['subject']} {row['content']}")
+            if cand_vec:
+                vec = unpack_vector(row.get("embedding"))
+                base = (
+                    cosine_similarity(cand_vec, vec)
+                    if vec
+                    else _similarity(cand_tokens, row_tokens)
+                )
+            else:
+                base = _similarity(cand_tokens, row_tokens)
+            score = base
             if subject_norm and _normalize_subject(row["subject"]) == subject_norm:
                 score += 0.5
             if score > 0:
+                row.pop("embedding", None)
                 scored.append((score, row))
 
         scored.sort(key=lambda pair: pair[0], reverse=True)
         return [row for _, row in scored[: self._UPDATE_TOP_K]]
 
-    async def _insert_long_term(self, category: str, subject: str, content: str) -> None:
-        """Insert a new long-term memory row."""
+    async def _safe_embed(self, text: str) -> list[float] | None:
+        """Best-effort embedding; returns None if disabled or on failure."""
+        if not self.embedder:
+            return None
+        try:
+            vec = await self.embedder.embed_one(text)
+        except Exception:
+            log.exception("Embedding call failed; proceeding without a vector")
+            return None
+        return vec or None
+
+    async def _embed_blob(self, text: str) -> bytes | None:
+        """Best-effort packed embedding blob (None if disabled or on failure)."""
+        vec = await self._safe_embed(text)
+        return pack_vector(vec) if vec else None
+
+    async def _insert_long_term(
+        self, category: str, subject: str, content: str, importance: float | None = None
+    ) -> None:
+        """Insert a new long-term memory row (with embedding + importance)."""
         await self._ensure_schema()
+        blob = await self._embed_blob(f"{subject}: {content}")
+        imp = self.default_importance if importance is None else importance
         async with aiosqlite.connect(self.db_path) as db:
             await db.execute(
-                "INSERT INTO long_term (category, subject, content, source, confidence) "
-                "VALUES (?, ?, ?, 'conversation', 'stated')",
-                (category, subject, content),
+                "INSERT INTO long_term "
+                "(category, subject, content, source, confidence, embedding, importance) "
+                "VALUES (?, ?, ?, 'conversation', 'stated', ?, ?)",
+                (category, subject, content, blob, imp),
             )
             await db.commit()
 
@@ -732,19 +962,170 @@ async def consolidate_and_cleanup(self, llm: LLMClient, model: str) -> dict:
         # Delete all expired short-term memories
         expired_count = await self._delete_expired_short_term()
 
+        # Tier 4: merge near-duplicate / contradictory long-term rows.
+        merged = 0
+        if self.hygiene_enabled:
+            merged = await self._hygiene_pass(llm, model)
+
+        # Tier 3: archive cold, low-importance long-term memories.
+        archived = await self._archive_cold_memories()
+
         summary = {
             "active_reviewed": len(active_short_term),
             "promoted_to_long_term": promoted,
             "expired_deleted": expired_count,
+            "hygiene_merged": merged,
+            "archived": archived,
         }
         log.info(
-            "Memory consolidation complete: %d active reviewed, %d promoted, %d expired deleted",
+            "Memory consolidation complete: %d reviewed, %d promoted, %d expired deleted, "
+            "%d merged, %d archived",
             summary["active_reviewed"],
             summary["promoted_to_long_term"],
             summary["expired_deleted"],
+            summary["hygiene_merged"],
+            summary["archived"],
         )
         return summary
 
+    async def _archive_cold_memories(self) -> int:
+        """Soft-delete stale long-term memories and return the count (Tier 3, issue #9).
+
+        Sets ``archived = 1`` where importance <= ``archive_max_importance``, age >
+        ``archive_after_days`` and time since last access/creation > ``archive_min_idle_days``.
+        """
+        await self._ensure_schema()
+        async with aiosqlite.connect(self.db_path) as conn:
+            bindings = (
+                self.archive_max_importance,
+                f"-{self.archive_after_days} days",
+                f"-{self.archive_min_idle_days} days",
+            )
+            result = await conn.execute(
+                "UPDATE long_term SET archived = 1 WHERE archived = 0 "
+                "AND importance <= ? "
+                "AND created_at < datetime('now', ?) "
+                "AND COALESCE(last_accessed, created_at) < datetime('now', ?)",
+                bindings,
+            )
+            archived_rows = result.rowcount
+            await conn.commit()
+        if archived_rows:
+            log.info("Archived %d cold long-term memories", archived_rows)
+        return archived_rows
+
+    # Cap how many clusters one hygiene pass resolves, to bound LLM cost.
+    _HYGIENE_MAX_CLUSTERS = 10
+
+    async def _hygiene_pass(self, llm: LLMClient, model: str) -> int:
+        """Merge near-duplicate long-term memories (Tier 4, issue #6).
+
+        Resolves at most ``_HYGIENE_MAX_CLUSTERS`` clusters (one LLM call each) and returns the rows removed.
+        """
+        await self._ensure_schema()
+        async with aiosqlite.connect(self.db_path) as conn:
+            conn.row_factory = aiosqlite.Row
+            result = await conn.execute(
+                "SELECT id, category, subject, content, created_at, updated_at, embedding "
+                "FROM long_term WHERE archived = 0"
+            )
+            memories = list(map(dict, await result.fetchall()))
+        if len(memories) < 2:
+            return 0  # clustering needs at least two rows
+
+        merged_away = 0
+        for group in self._cluster_long_term(memories)[: self._HYGIENE_MAX_CLUSTERS]:
+            try:
+                merged_away += await self._resolve_cluster(llm, model, group)
+            except Exception:
+                log.exception("Hygiene cluster resolution failed")  # carry on with the next
+        if merged_away:
+            log.info("Hygiene pass merged away %d duplicate long-term memories", merged_away)
+        return merged_away
+
+    def _cluster_long_term(self, rows: list[dict]) -> list[list[dict]]:
+        """Greedily cluster memories by their similarity to a seed row.
+
+        The first unassigned row seeds a cluster and absorbs every remaining row
+        whose similarity to that seed meets ``hygiene_similarity_threshold``; the
+        rest are retried with the next seed. Singleton clusters are dropped.
+        """
+        cutoff = self.hygiene_similarity_threshold
+        pending = list(rows)
+        groups: list[list[dict]] = []
+        while pending:
+            seed, *others = pending
+            members = [seed]
+            pending = []
+            for candidate in others:
+                # Compared with the seed only, never with rows already absorbed.
+                target = members if _pair_similarity(seed, candidate) >= cutoff else pending
+                target.append(candidate)
+            if len(members) > 1:
+                groups.append(members)
+        return groups
+
+    async def _resolve_cluster(self, llm: LLMClient, model: str, cluster: list[dict]) -> int:
+        """Ask the LLM to merge one cluster and apply its plan; return rows deleted.
+
+        The plan is untrusted: entries that are malformed or name an id outside the
+        cluster are skipped. An update that omits ``subject``/``category`` keeps the
+        row's current value. Embeddings are computed before the transaction opens.
+        """
+        members = {row["id"]: row for row in cluster}
+        member_ids = list(members)  # list membership uses ==: unhashable ids can't raise
+        cluster_lines = [
+            f"- id={row['id']} [{row['category']}] {row['subject']}: {row['content']} "
+            f"(created {row['created_at']}, updated {row['updated_at']})"
+            for row in cluster
+        ]
+        prompt = _HYGIENE_PROMPT.format(
+            today=datetime.now(tz=UTC).date().isoformat(),
+            cluster="\n".join(cluster_lines),
+        )
+        try:
+            raw = await llm.generate_text(model=model, prompt=prompt, max_tokens=1024)
+        except Exception:
+            log.exception("Hygiene LLM call failed")
+            return 0
+
+        plan = _extract_json_object(raw)
+        if not isinstance(plan, dict):
+            log.warning("Hygiene LLM returned non-JSON: %s", raw[:200])
+            return 0
+        updates, deletes = plan.get("updates"), plan.get("deletes")
+
+        # Embed up front so no embedder call runs while the write lock is held.
+        changes: list[tuple] = []
+        for upd in updates if isinstance(updates, list) else []:
+            fields = upd if isinstance(upd, dict) else {}
+            uid, content = fields.get("id"), fields.get("content")
+            if uid not in member_ids or not isinstance(content, str) or not content.strip():
+                continue
+            current, content = members[uid], content.strip()
+            subject = _normalize_subject(fields.get("subject") or current["subject"])
+            category = fields.get("category") or current["category"]
+            blob = await self._embed_blob(f"{subject}: {content}")
+            changes.append((category, subject, content, blob, uid))
+        doomed = [d for d in (deletes if isinstance(deletes, list) else []) if d in member_ids]
+
+        removed = 0
+        await self._ensure_schema()
+        async with aiosqlite.connect(self.db_path) as db:
+            # COALESCE keeps the stored vector when no new embedding was produced.
+            for change in changes:
+                await db.execute(
+                    "UPDATE long_term SET category = ?, subject = ?, content = ?, "
+                    "embedding = COALESCE(?, embedding), updated_at = datetime('now') "
+                    "WHERE id = ?",
+                    change,
+                )
+            for did in dict.fromkeys(doomed):  # drop repeated ids, keep order
+                cursor = await db.execute("DELETE FROM long_term WHERE id = ?", (did,))
+                removed += cursor.rowcount  # count only rows actually deleted
+            await db.commit()
+        return removed
+
     async def _run_consolidation_llm(
         self, llm: LLMClient, model: str, short_term_rows: list[dict]
     ) -> int:
diff --git a/schema/memory.sql b/schema/memory.sql
index 49d3954..f364c8b 100644
--- a/schema/memory.sql
+++ b/schema/memory.sql
@@ -9,7 +9,14 @@ CREATE TABLE IF NOT EXISTS long_term (
     source TEXT,
     confidence TEXT DEFAULT 'stated',
     created_at DATETIME DEFAULT (datetime('now')),
-    updated_at DATETIME DEFAULT (datetime('now'))
+    updated_at DATETIME DEFAULT (datetime('now')),
+    -- Tier 2: cached embedding vector (packed float32 blob; NULL until computed)
+    embedding BLOB,
+    -- Tier 3: forgetting / importance / reinforcement
+    importance REAL NOT NULL DEFAULT 5.0,
+    last_accessed DATETIME,
+    access_count INTEGER NOT NULL DEFAULT 0,
+    archived INTEGER NOT NULL DEFAULT 0
 );
 
 CREATE TABLE IF NOT EXISTS short_term (
@@ -23,3 +30,5 @@ CREATE TABLE IF NOT EXISTS short_term (
 CREATE INDEX IF NOT EXISTS idx_lt_category ON long_term(category);
 CREATE INDEX IF NOT EXISTS idx_lt_subject ON long_term(subject);
 CREATE INDEX IF NOT EXISTS idx_st_expires ON short_term(expires_at);
+-- idx_lt_archived is created in MemoryStore._migrate_long_term, after the
+-- archived column is guaranteed to exist (so legacy DBs migrate cleanly).

From 12c803d1325ccf047f9b6f93c6214d2e9dd60656 Mon Sep 17 00:00:00 2001
From: Matteo Merola <mattmezza@gmail.com>
Date: Sun, 7 Jun 2026 22:37:36 +0200
Subject: [PATCH 4/7] test(memory): cover Tiers 2-4 + legacy schema migration;
 document new pipeline

Add test_memory_tiers.py: vector helpers, embedding-backed retrieval and
relevance-ranked injection with reinforcement, cold-memory archiving,
importance reinforcement on update, hygiene clustering/merge (incl.
malformed-plan safety), consolidation summary keys, and in-place
migration of a legacy long_term table. Update the session-prompt test
stub for the new query kwarg. Document Tiers 2-4 in pa.md and add the
embedding/forgetting/hygiene knobs to config.yml.example.
---
 config.yml.example         |  24 +++
 pa.md                      |  33 ++++
 tests/test_memory_tiers.py | 350 +++++++++++++++++++++++++++++++++++++
 tests/test_tools.py        |   2 +-
 4 files changed, 408 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_memory_tiers.py

diff --git a/config.yml.example b/config.yml.example
index dbabffd..6d7841f 100644
--- a/config.yml.example
+++ b/config.yml.example
@@ -75,3 +75,27 @@ memory:
   db_path: "data/memory.db"
   long_term_limit: 50
   extraction_model: "claude-haiku-4-5"  # cheap model for post-turn memory extraction
+  consolidation_model: "claude-haiku-4-5"  # model for scheduled consolidation + hygiene
+
+  # Tier 2 — semantic similarity + relevance-ranked injection.
+  # Off by default: the memory pipeline works on lexical retrieval with no
+  # extra dependency. When enabled, vectors come from an OpenAI-compatible
+  # /embeddings endpoint and are stored alongside each long-term memory.
+  embedding:
+    enabled: false
+    provider: "openai"               # OpenAI-compatible embeddings endpoint
+    model: "text-embedding-3-small"
+    api_key: ""                      # falls back to the agent provider key when empty
+    base_url: ""                     # falls back to the agent provider base URL when empty
+    dimensions: 0                    # 0 = provider default
+    injection_top_k: 12              # relevance-ranked memories injected per turn
+
+  # Tier 3 — forgetting / importance / reinforcement
+  default_importance: 5.0            # 1-10 scale assigned to new long-term memories
+  archive_after_days: 90             # min age before a cold memory may be archived
+  archive_max_importance: 4.0        # only archive memories at/below this importance
+  archive_min_idle_days: 45          # require this long since last access/creation
+
+  # Tier 4 — long-term hygiene pass (cluster + merge near-duplicates)
+  hygiene_enabled: true
+  hygiene_similarity_threshold: 0.45 # min similarity to cluster two memories
diff --git a/pa.md b/pa.md
index 678739b..85ec652 100644
--- a/pa.md
+++ b/pa.md
@@ -1096,6 +1096,39 @@ memory:
 
 You can also trigger consolidation manually via the admin API: `POST /memory/consolidate`.
 
+#### Semantic retrieval & relevance-ranked injection (Tier 2)
+
+Embeddings are **optional and off by default** — the pipeline runs on Tier-1 lexical retrieval with no extra dependency or network call. When `memory.embedding.enabled` is set, each long-term memory gets a vector from an OpenAI-compatible `/embeddings` endpoint, stored as a packed float32 blob in the `embedding` column. Similarity is brute-force cosine in Python (no native SQLite extension, so it behaves identically locally and in the container; trivial at <1k rows).
+
+With embeddings on:
+
+- `update_memory` retrieves ADD/UPDATE/DELETE/NOOP candidates by cosine similarity (with a lexical fallback for any memory that has no vector yet).
+- Prompt injection becomes **relevance-ranked**: instead of dumping the most recent `long_term_limit` rows, only the `injection_top_k` memories most relevant to the current message are injected, scored Generative-Agents style (relevance + importance + recency). The inbound message is threaded into prompt building as the query. In session mode (where the static prompt is snapshotted once), the first message of the session is used as the query.
+
+#### Forgetting, importance & reinforcement (Tier 3)
+
+`long_term` carries `importance` (1–10), `last_accessed`, `access_count`, and an `archived` flag. Recalled memories are reinforced (their `access_count`/`last_accessed` are bumped), and re-mentioning a fact (an UPDATE through the unified pipeline) raises its importance. The consolidation job archives **cold** memories — old, low-importance, and not accessed recently — via the `archived` flag (a soft delete, not a hard delete), so long-term memory stops growing without bound. Thresholds are configurable (`archive_after_days`, `archive_max_importance`, `archive_min_idle_days`).
+
+#### Long-term hygiene pass (Tier 4)
+
+Each consolidation run also clusters near-duplicate long-term memories (by embedding or lexical similarity, threshold `hygiene_similarity_threshold`) and resolves each cluster with one LLM call that merges duplicates and drops contradictions, keeping the most recent fact. This compacts memories that accumulated over time, not just at write time.
+
+These behaviours are configured in the `memory` section (see `config.yml.example` for the full set):
+
+```yaml
+memory:
+  embedding:
+    enabled: false
+    provider: "openai"
+    model: "text-embedding-3-small"
+    injection_top_k: 12
+  default_importance: 5.0
+  archive_after_days: 90
+  hygiene_enabled: true
+```
+
+The consolidation summary returned by `consolidate_and_cleanup` reports `active_reviewed`, `promoted_to_long_term`, `expired_deleted`, `hygiene_merged`, and `archived`. The new columns are added to existing databases by an in-place `ALTER TABLE` migration in `MemoryStore._ensure_schema`, so upgrading needs no manual steps.
+
 ### 8.5 Memory in the Agent Loop
 
 On each conversation turn, the orchestrator queries both memory tiers and injects them into the system prompt as context. This happens before the LLM call:
diff --git a/tests/test_memory_tiers.py b/tests/test_memory_tiers.py
new file mode 100644
index 0000000..bac8547
--- /dev/null
+++ b/tests/test_memory_tiers.py
@@ -0,0 +1,350 @@
+"""Tests for Tier 2 (embeddings + relevance injection), Tier 3 (forgetting /
+importance / reinforcement), and Tier 4 (long-term hygiene) of the memory
+system, plus the in-place schema migration (issue #5)."""
+
+from __future__ import annotations
+
+import json
+import re
+
+import aiosqlite
+import pytest
+
+from core.embeddings import cosine_similarity, pack_vector, unpack_vector
+from core.memory import MemoryStore
+
+
+class _HashEmbedder:
+    """Deterministic bag-of-words embedder for tests (no network).
+
+    Shared tokens give non-zero cosine, exercising the retrieval/ranking paths.
+    """
+
+    DIM = 64
+
+    async def embed_one(self, text: str) -> list[float]:
+        import zlib  # crc32 is stable across runs; builtin hash() is salted per process
+        vec = [0.0] * self.DIM
+        for tok in re.findall(r"[a-z0-9]+", text.lower()):
+            vec[zlib.crc32(tok.encode()) % self.DIM] += 1.0
+        return vec
+
+    async def embed(self, texts: list[str]) -> list[list[float]]:
+        return [await self.embed_one(t) for t in texts]
+
+
+@pytest.fixture
+async def store(tmp_path):
+    s = MemoryStore(db_path=str(tmp_path / "memory.db"), long_term_limit=50)
+    await s._ensure_schema()
+    return s
+
+
+@pytest.fixture
+async def embed_store(tmp_path):
+    s = MemoryStore(
+        db_path=str(tmp_path / "memory.db"),
+        long_term_limit=50,
+        embedder=_HashEmbedder(),
+        injection_top_k=2,
+    )
+    await s._ensure_schema()
+    return s
+
+
+class _StubLLM:
+    def __init__(self, response: str):
+        self._response = response
+        self.calls = 0
+
+    async def generate_text(self, *, model, prompt, max_tokens=1024) -> str:
+        self.calls += 1
+        return self._response
+
+
+async def _insert(
+    store: MemoryStore,
+    subject: str,
+    content: str,
+    *,
+    category: str = "fact",
+    importance: float = 5.0,
+    created_offset_days: int = 0,
+    idle_days: int | None = None,
+    embedding: list[float] | None = None,
+) -> int:
+    """Insert a long_term row and return its id; ``idle_days=None`` leaves last_accessed NULL."""
+    blob = pack_vector(embedding) if embedding is not None else None
+    idle = None if idle_days is None else f"-{idle_days} days"  # NULL modifier -> NULL
+    async with aiosqlite.connect(store.db_path) as db:
+        cur = await db.execute(
+            "INSERT INTO long_term "
+            "(category, subject, content, importance, embedding, created_at, last_accessed) "
+            "VALUES (?, ?, ?, ?, ?, datetime('now', ?), datetime('now', ?))",
+            (category, subject, content, importance, blob, f"-{created_offset_days} days", idle),
+        )
+        await db.commit()
+        return cur.lastrowid
+
+
+async def _row(store: MemoryStore, rid: int) -> dict:
+    async with aiosqlite.connect(store.db_path) as db:
+        db.row_factory = aiosqlite.Row
+        cur = await db.execute("SELECT * FROM long_term WHERE id = ?", (rid,))
+        row = await cur.fetchone()
+        return dict(row) if row else {}
+
+
+# -- vector helpers --
+
+
+class TestVectorHelpers:
+    def test_pack_unpack_roundtrip(self):
+        vec = [0.1, -2.0, 3.5, 0.0]
+        out = unpack_vector(pack_vector(vec))
+        assert out == pytest.approx(vec, abs=1e-6)
+
+    def test_unpack_none(self):
+        assert unpack_vector(None) is None
+        assert unpack_vector(b"") is None
+
+    def test_cosine(self):
+        assert cosine_similarity([1, 0], [1, 0]) == pytest.approx(1.0)
+        assert cosine_similarity([1, 0], [0, 1]) == pytest.approx(0.0)
+        assert cosine_similarity([], [1]) == 0.0
+        assert cosine_similarity([0, 0], [1, 1]) == 0.0
+
+
+# -- Tier 2: embeddings --
+
+
+class TestEmbeddingWritePath:
+    async def test_insert_stores_embedding_blob(self, embed_store):
+        await embed_store._insert_long_term("fact", "matteo", "lives in zurich")
+        rows = await _row_all(embed_store)
+        assert rows[0]["embedding"] is not None
+        assert unpack_vector(rows[0]["embedding"]) is not None
+
+    async def test_retrieval_uses_embeddings(self, embed_store):
+        # Two stored memories; candidate shares tokens with one of them.
+        await embed_store._insert_long_term("health", "matteo", "allergic to shellfish")
+        await embed_store._insert_long_term("fact", "simge", "speaks turkish fluently")
+
+        similar = await embed_store._retrieve_similar_long_term("matteo", "cannot eat shellfish")
+
+        assert similar
+        assert "shellfish" in similar[0]["content"]
+
+    async def test_relevant_injection_ranks_and_reinforces(self, embed_store):
+        rid_shell = await _insert(embed_store, "matteo", "allergic to shellfish", embedding=None)
+        await _insert(embed_store, "simge", "speaks turkish fluently", embedding=None)
+        # Give the shellfish row an embedding aligned with the query tokens.
+        emb = _HashEmbedder()
+        async with aiosqlite.connect(embed_store.db_path) as db:
+            for rid, text in [
+                (rid_shell, "allergic to shellfish"),
+            ]:
+                blob = pack_vector(await emb.embed_one(text))
+                await db.execute("UPDATE long_term SET embedding = ? WHERE id = ?", (blob, rid))
+            await db.commit()
+
+        out = await embed_store.get_relevant_long_term("what foods is he allergic to shellfish")
+
+        assert out  # relevance-ranked subset
+        assert out[0]["content"] == "allergic to shellfish"
+        # Reinforcement bumped access_count on the recalled row.
+        assert (await _row(embed_store, rid_shell))["access_count"] >= 1
+
+    async def test_relevant_injection_respects_top_k(self, embed_store):
+        for i in range(5):
+            await embed_store._insert_long_term("fact", "matteo", f"likes hobby number {i}")
+        out = await embed_store.get_relevant_long_term("matteo likes hobby")
+        assert len(out) <= embed_store.injection_top_k
+
+    async def test_format_for_prompt_without_query_uses_recency(self, embed_store):
+        await embed_store._insert_long_term("fact", "matteo", "lives in zurich")
+        block = await embed_store.format_for_prompt()
+        assert "lives in zurich" in block
+
+
+# -- Tier 3: forgetting / importance / reinforcement --
+
+
+class TestForgetting:
+    async def test_get_long_term_excludes_archived(self, store):
+        keep = await _insert(store, "matteo", "keeps this")
+        gone = await _insert(store, "matteo", "archived one")
+        async with aiosqlite.connect(store.db_path) as db:
+            await db.execute("UPDATE long_term SET archived = 1 WHERE id = ?", (gone,))
+            await db.commit()
+
+        rows = await store.get_long_term()
+        contents = {r["content"] for r in rows}
+        assert "keeps this" in contents
+        assert "archived one" not in contents
+        assert keep  # silence unused
+
+    async def test_archive_cold_low_importance(self, store):
+        store.archive_after_days = 60
+        store.archive_min_idle_days = 30
+        store.archive_max_importance = 4.0
+
+        cold = await _insert(
+            store, "matteo", "old trivia", importance=2.0, created_offset_days=200, idle_days=200
+        )
+        recent = await _insert(store, "matteo", "fresh fact", importance=2.0, created_offset_days=1)
+        important = await _insert(
+            store, "matteo", "old but key", importance=9.0, created_offset_days=200, idle_days=200
+        )
+
+        n = await store._archive_cold_memories()
+
+        assert n == 1
+        assert (await _row(store, cold))["archived"] == 1
+        assert (await _row(store, recent))["archived"] == 0
+        assert (await _row(store, important))["archived"] == 0
+
+    async def test_reinforce_bumps_counters(self, store):
+        rid = await _insert(store, "matteo", "a fact")
+        await store._reinforce([rid])
+        row = await _row(store, rid)
+        assert row["access_count"] == 1
+        assert row["last_accessed"] is not None
+
+    async def test_update_reinforces_importance(self, store):
+        rid = await _insert(store, "matteo", "uses a desk", importance=5.0)
+        llm = _StubLLM(
+            json.dumps({"operation": "UPDATE", "id": rid, "content": "uses a standing desk"})
+        )
+
+        op = await store.update_memory(
+            llm, "m", {"category": "work", "subject": "matteo", "content": "standing desk"}
+        )
+
+        assert op == "UPDATE"
+        row = await _row(store, rid)
+        assert row["importance"] == pytest.approx(6.0)
+        assert row["content"] == "uses a standing desk"
+
+
+# -- Tier 4: hygiene --
+
+
+class TestHygiene:
+    async def test_cluster_groups_similar(self, store):
+        rows = [
+            {"id": 1, "subject": "matteo", "content": "uses a standing desk", "embedding": None},
+            {
+                "id": 2,
+                "subject": "matteo",
+                "content": "has a standing desk at work",
+                "embedding": None,
+            },
+            {"id": 3, "subject": "simge", "content": "plays the violin", "embedding": None},
+        ]
+        clusters = store._cluster_long_term(rows)
+        # The two desk facts cluster; the violin fact is a singleton (excluded).
+        assert len(clusters) == 1
+        ids = {r["id"] for r in clusters[0]}
+        assert ids == {1, 2}
+
+    async def test_hygiene_pass_merges_duplicates(self, store):
+        keep = await _insert(store, "matteo", "uses a standing desk")
+        dup = await _insert(store, "matteo", "has a standing desk at work")
+        llm = _StubLLM(
+            json.dumps(
+                {
+                    "updates": [
+                        {
+                            "id": keep,
+                            "category": "work",
+                            "subject": "matteo",
+                            "content": "uses a standing desk at work",
+                        }
+                    ],
+                    "deletes": [dup],
+                }
+            )
+        )
+
+        removed = await store._hygiene_pass(llm, "m")
+
+        assert removed == 1
+        assert (await _row(store, dup)) == {}
+        assert (await _row(store, keep))["content"] == "uses a standing desk at work"
+
+    async def test_hygiene_pass_noop_when_nothing_similar(self, store):
+        await _insert(store, "matteo", "lives in zurich")
+        await _insert(store, "simge", "plays the violin")
+        llm = _StubLLM(json.dumps({"updates": [], "deletes": []}))
+
+        removed = await store._hygiene_pass(llm, "m")
+
+        assert removed == 0
+        assert llm.calls == 0  # no cluster formed → no LLM call
+
+    async def test_hygiene_malformed_plan_is_safe(self, store):
+        keep = await _insert(store, "matteo", "uses a standing desk")
+        dup = await _insert(store, "matteo", "has a standing desk at work")
+        llm = _StubLLM("not json")
+
+        removed = await store._hygiene_pass(llm, "m")
+
+        assert removed == 0
+        assert await _row(store, keep)
+        assert await _row(store, dup)
+
+
+# -- consolidation summary + migration --
+
+
+class TestConsolidationSummary:
+    async def test_summary_has_tier_keys(self, store):
+        llm = _StubLLM(json.dumps({"updates": [], "deletes": []}))
+        result = await store.consolidate_and_cleanup(llm, "m")
+        assert set(result) >= {
+            "active_reviewed",
+            "promoted_to_long_term",
+            "expired_deleted",
+            "hygiene_merged",
+            "archived",
+        }
+
+
+class TestMigration:
+    async def test_legacy_db_is_migrated_in_place(self, tmp_path):
+        db_path = str(tmp_path / "legacy.db")
+        # Create the original (pre-Tier-2/3) long_term table.
+        async with aiosqlite.connect(db_path) as db:
+            await db.execute(
+                "CREATE TABLE long_term ("
+                "id INTEGER PRIMARY KEY AUTOINCREMENT, category TEXT NOT NULL, "
+                "subject TEXT NOT NULL, content TEXT NOT NULL, source TEXT, "
+                "confidence TEXT DEFAULT 'stated', "
+                "created_at DATETIME DEFAULT (datetime('now')), "
+                "updated_at DATETIME DEFAULT (datetime('now')))"
+            )
+            await db.execute(
+                "INSERT INTO long_term (category, subject, content) "
+                "VALUES ('fact', 'matteo', 'old')"
+            )
+            await db.commit()
+
+        store = MemoryStore(db_path=db_path)
+        await store._ensure_schema()
+
+        async with aiosqlite.connect(db_path) as db:
+            cur = await db.execute("PRAGMA table_info(long_term)")
+            cols = {row[1] for row in await cur.fetchall()}
+        assert {"embedding", "importance", "last_accessed", "access_count", "archived"} <= cols
+
+        # The legacy row survives and is readable with sane defaults.
+        rows = await store.get_long_term()
+        assert any(r["content"] == "old" for r in rows)
+
+
+async def _row_all(store: MemoryStore) -> list[dict]:
+    async with aiosqlite.connect(store.db_path) as db:
+        db.row_factory = aiosqlite.Row
+        cur = await db.execute("SELECT * FROM long_term ORDER BY id")
+        return [dict(r) for r in await cur.fetchall()]
diff --git a/tests/test_tools.py b/tests/test_tools.py
index 587d760..a7f4092 100644
--- a/tests/test_tools.py
+++ b/tests/test_tools.py
@@ -150,7 +150,7 @@ def test_build_user_message_no_preamble_is_plain(agent) -> None:
 async def test_session_system_built_once_and_reused(agent, monkeypatch) -> None:
     calls = {"n": 0}
 
-    async def fake_build() -> str:
+    async def fake_build(*args, **kwargs) -> str:
         calls["n"] += 1
         return f"SYSTEM-{calls['n']}"
 

From 42c7549027d132284836d5e3f04a9d7e84bdf0e3 Mon Sep 17 00:00:00 2001
From: Matteo Merola <mattmezza@gmail.com>
Date: Sun, 7 Jun 2026 23:04:23 +0200
Subject: [PATCH 5/7] feat(memory): on-device local embeddings (fastembed),
 prefetched in Docker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a local embedding backend so semantic memory needs no API key and keeps
data on the box. core/embeddings.py gains LocalEmbeddingClient (fastembed,
ONNX/CPU, BAAI/bge-small-en-v1.5) that loads the model lazily in a worker
thread, plus prefetch_local_model() and a 'python -m core.embeddings
prefetch' CLI.

Embeddings now default ON with provider=local. The Dockerfile prefetches the
model into /app/models (outside the data volume, chowned to the runtime user)
so the image is self-contained and works offline — verified: build-prefetch
then HF_HUB_OFFLINE load + embed succeeds, related>unrelated cosine.

_build_embedder branches local vs OpenAI-compatible API; remote keys still
fall back to the matching agent provider key. fastembed added to deps/lock
(onnxruntime 1.26 has cp314 wheels).
---
 .gitignore         |   3 +
 Dockerfile         |   7 ++
 config.yml.example |  20 ++---
 core/agent.py      |  19 +++--
 core/config.py     |  13 ++--
 core/embeddings.py |  90 +++++++++++++++++++++++
 pyproject.toml     |   1 +
 uv.lock            | 178 +++++++++++++++++++++++++++++++++++++--------
 8 files changed, 282 insertions(+), 49 deletions(-)

diff --git a/.gitignore b/.gitignore
index 49f47b0..b3efad1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,3 +38,6 @@ tailwindcss
 # Tailwind CSS build output
 api/static/style.css
 node_modules
+
+# Local embedding models (fastembed cache)
+models/
diff --git a/Dockerfile b/Dockerfile
index 6f5d29e..c18bff4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -50,6 +50,13 @@ COPY tools/ tools/
 COPY voice/ voice/
 COPY api/ api/
 
+# Prefetch the local embedding model (semantic memory, Tier 2) so it is bundled
+# in the image — no runtime download, works offline. Stored in /app/models,
+# OUTSIDE the /app/data volume so the mounted volume cannot shadow it. Keep the
+# default in sync with EmbeddingConfig (core/config.py).
+ARG EMBED_MODEL=BAAI/bge-small-en-v1.5
+RUN uv run python -m core.embeddings prefetch "${EMBED_MODEL}" /app/models
+
 # Build CSS with Tailwind CSS v4 standalone CLI
 RUN ARCH=$(dpkg --print-architecture) && \
     if [ "$ARCH" = "arm64" ]; then TW_ARCH="linux-arm64"; else TW_ARCH="linux-x64"; fi && \
diff --git a/config.yml.example b/config.yml.example
index 6d7841f..4991aa0 100644
--- a/config.yml.example
+++ b/config.yml.example
@@ -78,16 +78,18 @@ memory:
   consolidation_model: "claude-haiku-4-5"  # model for scheduled consolidation + hygiene
 
   # Tier 2 — semantic similarity + relevance-ranked injection.
-  # Off by default: the memory pipeline works on lexical retrieval with no
-  # extra dependency. When enabled, vectors come from an OpenAI-compatible
-  # /embeddings endpoint and are stored alongside each long-term memory.
+  # Default: a local on-device model (fastembed, private, no API key, free).
+  # The model is bundled in the Docker image (prefetched at build). Set
+  # enabled: false to fall back to lexical (word-overlap) retrieval, or switch
+  # provider to "openai"/"google" to use a remote embeddings API instead.
   embedding:
-    enabled: false
-    provider: "openai"               # OpenAI-compatible embeddings endpoint
-    model: "text-embedding-3-small"
-    api_key: ""                      # falls back to the agent provider key when empty
-    base_url: ""                     # falls back to the agent provider base URL when empty
-    dimensions: 0                    # 0 = provider default
+    enabled: true
+    provider: "local"                # "local" (on-device) | "openai" | "google"
+    model: "BAAI/bge-small-en-v1.5"  # local model id; for API e.g. text-embedding-3-small
+    cache_dir: "models"              # where local models live (bundled in the image)
+    api_key: ""                      # API providers only; falls back to the agent provider key
+    base_url: ""                     # API providers only; falls back to the agent provider base URL
+    dimensions: 0                    # 0 = provider default (API providers only)
     injection_top_k: 12              # relevance-ranked memories injected per turn
 
   # Tier 3 — forgetting / importance / reinforcement
diff --git a/core/agent.py b/core/agent.py
index 89028a5..b73665f 100644
--- a/core/agent.py
+++ b/core/agent.py
@@ -17,7 +17,7 @@
 
 from core.compaction import compact_messages, should_compact
 from core.config import Config
-from core.embeddings import EmbeddingClient
+from core.embeddings import LOCAL_PROVIDERS, EmbeddingClient, LocalEmbeddingClient
 from core.executor import ToolExecutor
 from core.goal_decomposition import DecomposedGoal, classify_complexity, decompose_goal
 from core.history import ConversationHistory
@@ -1144,17 +1144,26 @@ def _background_llm(self, provider: str) -> LLMClient:
             base_url=getattr(cfg, f"{provider}_base_url", None),
         )
 
-    def _build_embedder(self) -> EmbeddingClient | None:
+    def _build_embedder(self):
         """Construct the embedding client for semantic memory, if enabled.
 
-        Credentials fall back to the matching agent provider key / base URL when
-        not set explicitly on the embedding config. Returns None when disabled
-        or when no usable API key is available (the store then runs on Tier-1
+        For ``provider: local`` a lazy on-device fastembed client is returned
+        (no model load until first use, so this stays cheap). For API providers
+        credentials fall back to the matching agent provider key / base URL.
+        Returns None when disabled or unusable (the store then runs on Tier-1
         lexical retrieval).
         """
         emb = self.config.memory.embedding
         if not emb.enabled:
             return None
+
+        if emb.provider in LOCAL_PROVIDERS:
+            try:
+                return LocalEmbeddingClient(model=emb.model, cache_dir=emb.cache_dir)
+            except Exception:
+                log.exception("Failed to build local embedder; disabling semantic memory")
+                return None
+
         cfg = self.config.agent
         api_key = emb.api_key or getattr(cfg, f"{emb.provider}_api_key", "")
         base_url = emb.base_url or getattr(cfg, f"{emb.provider}_base_url", "") or None
diff --git a/core/config.py b/core/config.py
index cc11e1e..1fc1a3b 100644
--- a/core/config.py
+++ b/core/config.py
@@ -141,12 +141,13 @@ class EmbeddingConfig(BaseModel):
     native extension required, identical on local and container SQLite).
     """
 
-    enabled: bool = False
-    provider: str = "openai"  # OpenAI-compatible embeddings endpoint
-    model: str = "text-embedding-3-small"
-    api_key: str = ""  # falls back to the matching agent provider key when empty
-    base_url: str = ""  # falls back to the agent provider base URL when empty
-    dimensions: int = 0  # 0 = provider default
+    enabled: bool = True
+    provider: str = "local"  # "local" (fastembed, on-device) or an OpenAI-compatible API
+    model: str = "BAAI/bge-small-en-v1.5"  # local model id; for API use e.g. text-embedding-3-small
+    cache_dir: str = "models"  # where local models are stored (bundled in the Docker image)
+    api_key: str = ""  # API providers only; falls back to the agent provider key when empty
+    base_url: str = ""  # API providers only; falls back to the agent provider base URL when empty
+    dimensions: int = 0  # 0 = provider default (API providers only)
     injection_top_k: int = 12  # relevance-ranked memories injected per turn
 
 
diff --git a/core/embeddings.py b/core/embeddings.py
index f4b5f41..ee722f0 100644
--- a/core/embeddings.py
+++ b/core/embeddings.py
@@ -9,6 +9,7 @@
 from __future__ import annotations
 
 import array
+import asyncio
 import importlib
 import logging
 import math
@@ -16,6 +17,14 @@
 
 log = logging.getLogger(__name__)
 
+# Default local model: small, CPU-friendly, 384-dim (~130MB ONNX). Good balance
+# of quality and speed on modest self-hosted hardware.
+DEFAULT_LOCAL_MODEL = "BAAI/bge-small-en-v1.5"
+DEFAULT_LOCAL_CACHE = "models"
+
+# Provider names that mean "run the model locally" rather than call an API.
+LOCAL_PROVIDERS = frozenset({"local", "fastembed"})
+
 # OpenAI-compatible base URLs for providers that expose an /embeddings endpoint.
 _DEFAULT_BASE_URLS = {
     "google": "https://generativelanguage.googleapis.com/v1beta/openai",
@@ -91,3 +100,84 @@ async def embed_one(self, text: str) -> list[float]:
         """Return a single embedding vector (empty list on failure)."""
         vectors = await self.embed([text])
         return vectors[0] if vectors else []
+
+
+class LocalEmbeddingClient:
+    """Runs a sentence-embedding model locally via ``fastembed`` (ONNX/CPU).
+
+    No API key, no network at inference time, and the data never leaves the
+    machine. The model is loaded lazily on first use (in a worker thread, so it
+    never blocks construction or the event loop) and cached for the process
+    lifetime. In Docker the model is prefetched at build time (see the
+    ``prefetch`` entry point below) so the first call has no download latency.
+    """
+
+    def __init__(self, model: str = DEFAULT_LOCAL_MODEL, cache_dir: str | None = None):
+        self.model = model or DEFAULT_LOCAL_MODEL
+        self.cache_dir = cache_dir or DEFAULT_LOCAL_CACHE
+        self._model: Any = None
+        self._lock = asyncio.Lock()
+
+    def _load_model(self) -> Any:
+        try:
+            module = importlib.import_module("fastembed")
+            text_embedding = cast(Any, getattr(module, "TextEmbedding"))
+        except Exception as exc:  # pragma: no cover - import guard
+            raise RuntimeError(
+                "fastembed is required for local embeddings (pip install fastembed)"
+            ) from exc
+        return text_embedding(model_name=self.model, cache_dir=self.cache_dir)
+
+    async def _ensure_model(self) -> Any:
+        if self._model is None:
+            async with self._lock:
+                if self._model is None:
+                    log.info(
+                        "Loading local embedding model %s (cache=%s)", self.model, self.cache_dir
+                    )
+                    self._model = await asyncio.to_thread(self._load_model)
+        return self._model
+
+    async def embed(self, texts: list[str]) -> list[list[float]]:
+        if not texts:
+            return []
+        model = await self._ensure_model()
+
+        def _run() -> list[list[float]]:
+            return [list(map(float, vec)) for vec in model.embed(list(texts))]
+
+        return await asyncio.to_thread(_run)
+
+    async def embed_one(self, text: str) -> list[float]:
+        vectors = await self.embed([text])
+        return vectors[0] if vectors else []
+
+
+def prefetch_local_model(
+    model: str = DEFAULT_LOCAL_MODEL, cache_dir: str = DEFAULT_LOCAL_CACHE
+) -> int:
+    """Download a local embedding model into *cache_dir* and verify it runs.
+
+    Returns the embedding dimension. Used by the Docker build (and the admin
+    "Download model" button) so the model is bundled ahead of time.
+    """
+    module = importlib.import_module("fastembed")
+    text_embedding = cast(Any, getattr(module, "TextEmbedding"))
+    embedder = text_embedding(model_name=model, cache_dir=cache_dir)
+    vec = next(iter(embedder.embed(["warmup"])))
+    dim = len(list(vec))
+    log.info("Prefetched local embedding model %s (dim=%d) into %s", model, dim, cache_dir)
+    return dim
+
+
+if __name__ == "__main__":  # pragma: no cover - build-time / CLI use
+    import sys
+
+    _args = sys.argv[1:]
+    if _args and _args[0] == "prefetch":
+        _model = _args[1] if len(_args) > 1 else DEFAULT_LOCAL_MODEL
+        _cache = _args[2] if len(_args) > 2 else DEFAULT_LOCAL_CACHE
+        _dim = prefetch_local_model(_model, _cache)
+        print(f"prefetched {_model} (dim={_dim}) -> {_cache}")
+    else:
+        print("usage: python -m core.embeddings prefetch [MODEL] [CACHE_DIR]")
diff --git a/pyproject.toml b/pyproject.toml
index f7d2151..13bc089 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,6 +23,7 @@ dependencies = [
     "jinja2",
     "python-multipart>=0.0.22",
     "vobject>=0.9.9",
+    "fastembed",
 ]
 
 [dependency-groups]
diff --git a/uv.lock b/uv.lock
index a2b2401..b7186ea 100644
--- a/uv.lock
+++ b/uv.lock
@@ -338,6 +338,27 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/9e/dd/d0ee25348ac58245ee9f90b6f3cbb666bf01f69be7e0911f9851bddbda16/fastapi-0.129.0-py3-none-any.whl", hash = "sha256:b4946880e48f462692b31c083be0432275cbfb6e2274566b1be91479cc1a84ec", size = 102950, upload-time = "2026-02-12T13:54:54.528Z" },
 ]
 
+[[package]]
+name = "fastembed"
+version = "0.8.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "huggingface-hub" },
+    { name = "loguru" },
+    { name = "mmh3" },
+    { name = "numpy" },
+    { name = "onnxruntime" },
+    { name = "pillow" },
+    { name = "py-rust-stemmers" },
+    { name = "requests" },
+    { name = "tokenizers" },
+    { name = "tqdm" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/26/25/58865e36b6e8a9a0d0ff905b5601aa30db97956327c0df42ec4ed6accc21/fastembed-0.8.0.tar.gz", hash = "sha256:75966edfa8b006ee78514c726bd7f6a50721dadc89305279052be9db72fd53e8", size = 75115, upload-time = "2026-03-23T16:34:41.699Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/2a/e8/26b7d78bb8972498c467ca34cb12ee2e60d26ba5eae6d8443189a1af37a5/fastembed-0.8.0-py3-none-any.whl", hash = "sha256:40bee672657574a1009e35ec50030a55f2b426842cb011845379817641bbbbd0", size = 116572, upload-time = "2026-03-23T16:34:40.69Z" },
+]
+
 [[package]]
 name = "faster-whisper"
 version = "1.2.1"
@@ -667,6 +688,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/f9/8e/7def204fea9f9be8b3c21a6f2dd6c020cf56c7d5ff753e0e23ed7f9ea57e/jiter-0.13.0-cp314-cp314t-win_arm64.whl", hash = "sha256:2c26cf47e2cad140fa23b6d58d435a7c0161f5c514284802f25e87fddfe11024", size = 187152, upload-time = "2026-02-02T12:37:22.124Z" },
 ]
 
+[[package]]
+name = "loguru"
+version = "0.7.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "colorama", marker = "sys_platform == 'win32'" },
+    { name = "win32-setctime", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/3a/05/a1dae3dffd1116099471c643b8924f5aa6524411dc6c63fdae648c4f1aca/loguru-0.7.3.tar.gz", hash = "sha256:19480589e77d47b8d85b2c827ad95d49bf31b0dcde16593892eb51dd18706eb6", size = 63559, upload-time = "2024-12-06T11:20:56.608Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/0c/29/0348de65b8cc732daa3e33e67806420b2ae89bdce2b04af740289c5c6c8c/loguru-0.7.3-py3-none-any.whl", hash = "sha256:31a33c10c8e1e10422bfd431aeb5d351c7cf7fa671e3c4df004162264b28220c", size = 61595, upload-time = "2024-12-06T11:20:54.538Z" },
+]
+
 [[package]]
 name = "lxml"
 version = "6.0.2"
@@ -762,6 +796,51 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" },
 ]
 
+[[package]]
+name = "mmh3"
+version = "5.2.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/91/1a/edb23803a168f070ded7a3014c6d706f63b90c84ccc024f89d794a3b7a6d/mmh3-5.2.1.tar.gz", hash = "sha256:bbea5b775f0ac84945191fb83f845a6fd9a21a03ea7f2e187defac7e401616ad", size = 33775, upload-time = "2026-03-05T15:55:57.716Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/63/b4/65bc1fb2bb7f83e91c30865023b1847cf89a5f237165575e8c83aa536584/mmh3-5.2.1-cp314-cp314-android_24_arm64_v8a.whl", hash = "sha256:d771f085fcdf4035786adfb1d8db026df1eb4b41dac1c3d070d1e49512843227", size = 40794, upload-time = "2026-03-05T15:55:09.773Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/86/7168b3d83be8eb553897b1fac9da8bbb06568e5cfe555ffc329ebb46f59d/mmh3-5.2.1-cp314-cp314-android_24_x86_64.whl", hash = "sha256:7f196cd7910d71e9d9860da0ff7a77f64d22c1ad931f1dd18559a06e03109fc0", size = 41923, upload-time = "2026-03-05T15:55:10.924Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/9b/b653ab611c9060ce8ff0ba25c0226757755725e789292f3ca138a58082cd/mmh3-5.2.1-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:b1f12bd684887a0a5d55e6363ca87056f361e45451105012d329b86ec19dbe0b", size = 39131, upload-time = "2026-03-05T15:55:11.961Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/b4/5a2e0d34ab4d33543f01121e832395ea510132ea8e52cdf63926d9d81754/mmh3-5.2.1-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:d106493a60dcb4aef35a0fac85105e150a11cf8bc2b0d388f5a33272d756c966", size = 39825, upload-time = "2026-03-05T15:55:13.013Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/69/81699a8f39a3f8d368bec6443435c0c392df0d200ad915bf0d222b588e03/mmh3-5.2.1-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:44983e45310ee5b9f73397350251cdf6e63a466406a105f1d16cb5baa659270b", size = 40344, upload-time = "2026-03-05T15:55:14.026Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/b3/71c8c775807606e8fd8acc5c69016e1caf3200d50b50b6dd4b40ce10b76c/mmh3-5.2.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:368625fb01666655985391dbad3860dc0ba7c0d6b9125819f3121ee7292b4ac8", size = 56291, upload-time = "2026-03-05T15:55:15.137Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/75/2c24517d4b2ce9e4917362d24f274d3d541346af764430249ddcc4cb3a08/mmh3-5.2.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:72d1cc63bcc91e14933f77d51b3df899d6a07d184ec515ea7f56bff659e124d7", size = 40575, upload-time = "2026-03-05T15:55:16.518Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/b9/e4a360164365ac9f07a25f0f7928e3a66eb9ecc989384060747aa170e6aa/mmh3-5.2.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:e8b4b5580280b9265af3e0409974fb79c64cf7523632d03fbf11df18f8b0181e", size = 40052, upload-time = "2026-03-05T15:55:17.735Z" },
+    { url = "https://files.pythonhosted.org/packages/97/ca/120d92223a7546131bbbc31c9174168ee7a73b1366f5463ffe69d9e691fe/mmh3-5.2.1-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:4cbbde66f1183db040daede83dd86c06d663c5bb2af6de1142b7c8c37923dd74", size = 97311, upload-time = "2026-03-05T15:55:18.959Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/71/c1a60c1652b8813ef9de6d289784847355417ee0f2980bca002fe87f4ae5/mmh3-5.2.1-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:8ff038d52ef6aa0f309feeba00c5095c9118d0abf787e8e8454d6048db2037fc", size = 103279, upload-time = "2026-03-05T15:55:20.448Z" },
+    { url = "https://files.pythonhosted.org/packages/48/29/ad97f4be1509cdcb28ae32c15593ce7c415db47ace37f8fad35b493faa9a/mmh3-5.2.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a4130d0b9ce5fad6af07421b1aecc7e079519f70d6c05729ab871794eded8617", size = 106290, upload-time = "2026-03-05T15:55:21.6Z" },
+    { url = "https://files.pythonhosted.org/packages/77/29/1f86d22e281bd8827ba373600a4a8b0c0eae5ca6aa55b9a8c26d2a34decc/mmh3-5.2.1-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f6e0bfe77d238308839699944164b96a2eeccaf55f2af400f54dc20669d8d5f2", size = 113116, upload-time = "2026-03-05T15:55:22.826Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/7c/339971ea7ed4c12d98f421f13db3ea576a9114082ccb59d2d1a0f00ccac1/mmh3-5.2.1-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f963eafc0a77a6c0562397da004f5876a9bcf7265a7bcc3205e29636bc4a1312", size = 120740, upload-time = "2026-03-05T15:55:24.3Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/92/3c7c4bdb8e926bb3c972d1e2907d77960c1c4b250b41e8366cf20c6e4373/mmh3-5.2.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:92883836caf50d5255be03d988d75bc93e3f86ba247b7ca137347c323f731deb", size = 99143, upload-time = "2026-03-05T15:55:25.456Z" },
+    { url = "https://files.pythonhosted.org/packages/df/0a/33dd8706e732458c8375eae63c981292de07a406bad4ec03e5269654aa2c/mmh3-5.2.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:57b52603e89355ff318025dd55158f6e71396c0f1f609d548e9ea9c94cc6ce0a", size = 98703, upload-time = "2026-03-05T15:55:26.723Z" },
+    { url = "https://files.pythonhosted.org/packages/51/04/76bbce05df76cbc3d396f13b2ea5b1578ef02b6a5187e132c6c33f99d596/mmh3-5.2.1-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:f40a95186a72fa0b67d15fef0f157bfcda00b4f59c8a07cbe5530d41ac35d105", size = 106484, upload-time = "2026-03-05T15:55:28.214Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/8f/c6e204a2c70b719c1f62ffd9da27aef2dddcba875ea9c31ca0e87b975a46/mmh3-5.2.1-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:58370d05d033ee97224c81263af123dea3d931025030fd34b61227a768a8858a", size = 110012, upload-time = "2026-03-05T15:55:29.532Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/37/7181efd8e39db386c1ebc3e6b7d1f702a09d7c1197a6f2742ed6b5c16597/mmh3-5.2.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7be6dfb49e48fd0a7d91ff758a2b51336f1cd21f9d44b20f6801f072bd080cdd", size = 97508, upload-time = "2026-03-05T15:55:31.01Z" },
+    { url = "https://files.pythonhosted.org/packages/42/0f/afa7ca2615fd85e1469474bb860e381443d0b868c083b62b41cb1d7ca32f/mmh3-5.2.1-cp314-cp314-win32.whl", hash = "sha256:54fe8518abe06a4c3852754bfd498b30cc58e667f376c513eac89a244ce781a4", size = 41387, upload-time = "2026-03-05T15:55:32.403Z" },
+    { url = "https://files.pythonhosted.org/packages/71/0d/46d42a260ee1357db3d486e6c7a692e303c017968e14865e00efa10d09fc/mmh3-5.2.1-cp314-cp314-win_amd64.whl", hash = "sha256:3f796b535008708846044c43302719c6956f39ca2d93f2edda5319e79a29efbb", size = 42101, upload-time = "2026-03-05T15:55:33.646Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/7b/848a8378059d96501a41159fca90d6a99e89736b0afbe8e8edffeac8c74b/mmh3-5.2.1-cp314-cp314-win_arm64.whl", hash = "sha256:cd471ede0d802dd936b6fab28188302b2d497f68436025857ca72cd3810423fe", size = 39836, upload-time = "2026-03-05T15:55:35.026Z" },
+    { url = "https://files.pythonhosted.org/packages/27/61/1dabea76c011ba8547c25d30c91c0ec22544487a8750997a27a0c9e1180b/mmh3-5.2.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:5174a697ce042fa77c407e05efe41e03aa56dae9ec67388055820fb48cf4c3ba", size = 57727, upload-time = "2026-03-05T15:55:36.162Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/32/731185950d1cf2d5e28979cc8593016ba1619a295faba10dda664a4931b5/mmh3-5.2.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:0a3984146e414684a6be2862d84fcb1035f4984851cb81b26d933bab6119bf00", size = 41308, upload-time = "2026-03-05T15:55:37.254Z" },
+    { url = "https://files.pythonhosted.org/packages/76/aa/66c76801c24b8c9418b4edde9b5e57c75e72c94e29c48f707e3962534f18/mmh3-5.2.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:bd6e7d363aa93bd3421b30b6af97064daf47bc96005bddba67c5ffbc6df426b8", size = 40758, upload-time = "2026-03-05T15:55:38.61Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/bb/79a1f638a02f0ae389f706d13891e2fbf7d8c0a22ecde67ba828951bb60a/mmh3-5.2.1-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:113f78e7463a36dbbcea05bfe688efd7fa759d0f0c56e73c974d60dcfec3dfcc", size = 109670, upload-time = "2026-03-05T15:55:40.13Z" },
+    { url = "https://files.pythonhosted.org/packages/26/94/8cd0e187a288985bcfc79bf5144d1d712df9dee74365f59d26e3a1865be6/mmh3-5.2.1-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:7e8ec5f606e0809426d2440e0683509fb605a8820a21ebd120dcdba61b74ef7f", size = 117399, upload-time = "2026-03-05T15:55:42.076Z" },
+    { url = "https://files.pythonhosted.org/packages/42/94/dfea6059bd5c5beda565f58a4096e43f4858fb6d2862806b8bbd12cbb284/mmh3-5.2.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:22b0f9971ec4e07e8223f2beebe96a6cfc779d940b6f27d26604040dd74d3a44", size = 120386, upload-time = "2026-03-05T15:55:43.481Z" },
+    { url = "https://files.pythonhosted.org/packages/47/cb/f9c45e62aaa67220179f487772461d891bb582bb2f9783c944832c60efd9/mmh3-5.2.1-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:85ffc9920ffc39c5eee1e3ac9100c913a0973996fbad5111f939bbda49204bb7", size = 125924, upload-time = "2026-03-05T15:55:44.638Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/83/fe54a4a7c11bc9f623dfc1707decd034245602b076dfc1dcc771a4163170/mmh3-5.2.1-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:7aec798c2b01aaa65a55f1124f3405804184373abb318a3091325aece235f67c", size = 135280, upload-time = "2026-03-05T15:55:45.866Z" },
+    { url = "https://files.pythonhosted.org/packages/97/67/fe7e9e9c143daddd210cd22aef89cbc425d58ecf238d2b7d9eb0da974105/mmh3-5.2.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:55dbbd8ffbc40d1697d5e2d0375b08599dae8746b0b08dea05eee4ce81648fac", size = 110050, upload-time = "2026-03-05T15:55:47.074Z" },
+    { url = "https://files.pythonhosted.org/packages/43/c4/6d4b09fcbef80794de447c9378e39eefc047156b290fa3dd2d5257ca8227/mmh3-5.2.1-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:6c85c38a279ca9295a69b9b088a2e48aa49737bb1b34e6a9dc6297c110e8d912", size = 111158, upload-time = "2026-03-05T15:55:48.239Z" },
+    { url = "https://files.pythonhosted.org/packages/81/a6/ca51c864bdb30524beb055a6d8826db3906af0834ec8c41d097a6e8573d5/mmh3-5.2.1-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:6290289fa5fb4c70fd7f72016e03633d60388185483ff3b162912c81205ae2cf", size = 116890, upload-time = "2026-03-05T15:55:49.405Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/04/5a1fe2e2ad843d03e89af25238cbc4f6840a8bb6c4329a98ab694c71deda/mmh3-5.2.1-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:4fc6cd65dc4d2fdb2625e288939a3566e36127a84811a4913f02f3d5931da52d", size = 123121, upload-time = "2026-03-05T15:55:50.61Z" },
+    { url = "https://files.pythonhosted.org/packages/af/4d/3c820c6f4897afd25905270a9f2330a23f77a207ea7356f7aadace7273c0/mmh3-5.2.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:623f938f6a039536cc02b7582a07a080f13fdfd48f87e63201d92d7e34d09a18", size = 110187, upload-time = "2026-03-05T15:55:52.143Z" },
+    { url = "https://files.pythonhosted.org/packages/21/54/1d71cd143752361c0aebef16ad3f55926a6faf7b112d355745c1f8a25f7f/mmh3-5.2.1-cp314-cp314t-win32.whl", hash = "sha256:29bc3973676ae334412efdd367fcd11d036b7be3efc1ce2407ef8676dabfeb82", size = 41934, upload-time = "2026-03-05T15:55:53.564Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/e4/63a2a88f31d93dea03947cccc2a076946857e799ea4f7acdecbf43b324aa/mmh3-5.2.1-cp314-cp314t-win_amd64.whl", hash = "sha256:28cfab66577000b9505a0d068c731aee7ca85cd26d4d63881fab17857e0fe1fb", size = 43036, upload-time = "2026-03-05T15:55:55.252Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/0f/59204bf136d1201f8d7884cfbaf7498c5b4674e87a4c693f9bde63741ce1/mmh3-5.2.1-cp314-cp314t-win_arm64.whl", hash = "sha256:dfd51b4c56b673dfbc43d7d27ef857dd91124801e2806c69bb45585ce0fa019b", size = 40391, upload-time = "2026-03-05T15:55:56.697Z" },
+]
+
 [[package]]
 name = "mpa"
 version = "0.11.0"
@@ -773,6 +852,7 @@ dependencies = [
     { name = "caldav" },
     { name = "edge-tts" },
     { name = "fastapi" },
+    { name = "fastembed" },
     { name = "faster-whisper" },
     { name = "httpx" },
     { name = "jinja2" },
@@ -804,6 +884,7 @@ requires-dist = [
     { name = "caldav" },
     { name = "edge-tts" },
     { name = "fastapi" },
+    { name = "fastembed" },
     { name = "faster-whisper" },
     { name = "httpx" },
     { name = "jinja2" },
@@ -827,15 +908,6 @@ dev = [
     { name = "ruff" },
 ]
 
-[[package]]
-name = "mpmath"
-version = "1.3.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" },
-]
-
 [[package]]
 name = "multidict"
 version = "6.7.1"
@@ -926,22 +998,22 @@ wheels = [
 
 [[package]]
 name = "onnxruntime"
-version = "1.24.1"
+version = "1.26.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "flatbuffers" },
     { name = "numpy" },
     { name = "packaging" },
     { name = "protobuf" },
-    { name = "sympy" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/87/23/167d964414cee2af9c72af323b28d2c4cb35beed855c830a23f198265c79/onnxruntime-1.24.1-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:890c503ca187bc883c3aa72c53f2a604ec8e8444bdd1bf6ac243ec6d5e085202", size = 17214004, upload-time = "2026-02-05T17:31:11.917Z" },
-    { url = "https://files.pythonhosted.org/packages/b4/24/6e5558fdd51027d6830cf411bc003ae12c64054826382e2fab89e99486a0/onnxruntime-1.24.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4da1b84b3bdeec543120df169e5e62a1445bf732fc2c7fb036c2f8a4090455e8", size = 15017034, upload-time = "2026-02-05T17:31:04.331Z" },
-    { url = "https://files.pythonhosted.org/packages/91/d4/3cb1c9eaae1103265ed7eb00a3eaeb0d9ba51dc88edc398b7071c9553bed/onnxruntime-1.24.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:557753ec345efa227c6a65139f3d29c76330fcbd54cc10dd1b64232ebb939c13", size = 17097531, upload-time = "2026-02-05T17:31:40.303Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/da/4522b199c12db7c5b46aaf265ee0d741abe65ea912f6c0aaa2cc18a4654d/onnxruntime-1.24.1-cp314-cp314-win_amd64.whl", hash = "sha256:ea4942104805e868f3ddddfa1fbb58b04503a534d489ab2d1452bbfa345c78c2", size = 12795556, upload-time = "2026-02-05T17:32:11.886Z" },
-    { url = "https://files.pythonhosted.org/packages/a1/53/3b8969417276b061ff04502ccdca9db4652d397abbeb06c9f6ae05cec9ca/onnxruntime-1.24.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ea8963a99e0f10489acdf00ef3383c3232b7e44aa497b063c63be140530d9f85", size = 15025434, upload-time = "2026-02-05T17:31:06.942Z" },
-    { url = "https://files.pythonhosted.org/packages/ab/a2/cfcf009eb38d90cc628c087b6506b3dfe1263387f3cbbf8d272af4fef957/onnxruntime-1.24.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:34488aa760fb5c2e6d06a7ca9241124eb914a6a06f70936a14c669d1b3df9598", size = 17099815, upload-time = "2026-02-05T17:31:43.092Z" },
+    { url = "https://files.pythonhosted.org/packages/40/89/17546c1c20f6bfc3ae41c22152378a26edfea918af3129e2139dcd7c99f3/onnxruntime-1.26.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:33a791f31432a3af1a96db5e54818b37aba5e5eefc2e6af5794c10a9118a9993", size = 18019724, upload-time = "2026-05-08T19:07:30.723Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/24/89457a35f6af29538a76647f2c18c3a28277e6c19234c847e7b4b7c19860/onnxruntime-1.26.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e90c00732c4553618103149d93f688e8c3063017938f8983e21a71d9f3b6d22e", size = 16054821, upload-time = "2026-05-08T19:07:22.348Z" },
+    { url = "https://files.pythonhosted.org/packages/12/f9/15b2e1815cf570d238e0135529f80d2dce64e8e8818a1489cae83823c5c6/onnxruntime-1.26.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01498e80ba8988428d08c2d51b1338f89e3de2a93e6ffe555f79c68f26a5c06b", size = 18185815, upload-time = "2026-05-08T19:07:44.179Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/65/2e11055faf015e4b07f45b513fa49b391baf2e19d92d77d73ebee13c1004/onnxruntime-1.26.0-cp314-cp314-win_amd64.whl", hash = "sha256:7ead61450d8405167c87dd3a31d8da1d576b490a57dab1aa8b82a7da6825f5aa", size = 13349887, upload-time = "2026-05-08T19:08:08.671Z" },
+    { url = "https://files.pythonhosted.org/packages/19/e4/0f9d1a5718b1781c610c1e354765a3820597081754277a6a9a2b50705702/onnxruntime-1.26.0-cp314-cp314-win_arm64.whl", hash = "sha256:31d71a53490e46910877d0902b5ad99c69a5955e5c7ea6c82863519410e1ba7c", size = 13140121, upload-time = "2026-05-08T19:07:57.804Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/42/3b8e635f067d06d9f45bede470b8d539d101a4166c272213158dfd08b6ce/onnxruntime-1.26.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d7b6d258fb78fdfcf049795bcfaa74dcb90ae7baa277afd21e6fd28b83f2c496", size = 16057240, upload-time = "2026-05-08T19:07:25.163Z" },
+    { url = "https://files.pythonhosted.org/packages/93/99/f2be40a31b908d96b861ae0ce98582fa376c18a7f816b9d5eb4cd6aa0a4c/onnxruntime-1.26.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4eefd386a45202aefb7a5132b94f32df9d506c9edcc7faf2fc60d65183f4b183", size = 18197382, upload-time = "2026-05-08T19:07:46.965Z" },
 ]
 
 [[package]]
@@ -972,6 +1044,39 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" },
 ]
 
+[[package]]
+name = "pillow"
+version = "12.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/8c/21/c2bcdd5906101a30244eaffc1b6e6ce71a31bd0742a01eb89e660ebfac2d/pillow-12.2.0.tar.gz", hash = "sha256:a830b1a40919539d07806aa58e1b114df53ddd43213d9c8b75847eee6c0182b5", size = 46987819, upload-time = "2026-04-01T14:46:17.687Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/bf/98/4595daa2365416a86cb0d495248a393dfc84e96d62ad080c8546256cb9c0/pillow-12.2.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:3adc9215e8be0448ed6e814966ecf3d9952f0ea40eb14e89a102b87f450660d8", size = 4100848, upload-time = "2026-04-01T14:44:48.48Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/79/40184d464cf89f6663e18dfcf7ca21aae2491fff1a16127681bf1fa9b8cf/pillow-12.2.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:6a9adfc6d24b10f89588096364cc726174118c62130c817c2837c60cf08a392b", size = 4176515, upload-time = "2026-04-01T14:44:51.353Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/63/703f86fd4c422a9cf722833670f4f71418fb116b2853ff7da722ea43f184/pillow-12.2.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:6a6e67ea2e6feda684ed370f9a1c52e7a243631c025ba42149a2cc5934dec295", size = 3640159, upload-time = "2026-04-01T14:44:53.588Z" },
+    { url = "https://files.pythonhosted.org/packages/71/e0/fb22f797187d0be2270f83500aab851536101b254bfa1eae10795709d283/pillow-12.2.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:2bb4a8d594eacdfc59d9e5ad972aa8afdd48d584ffd5f13a937a664c3e7db0ed", size = 5312185, upload-time = "2026-04-01T14:44:56.039Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/8c/1a9e46228571de18f8e28f16fabdfc20212a5d019f3e3303452b3f0a580d/pillow-12.2.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:80b2da48193b2f33ed0c32c38140f9d3186583ce7d516526d462645fd98660ae", size = 4695386, upload-time = "2026-04-01T14:44:58.663Z" },
+    { url = "https://files.pythonhosted.org/packages/70/62/98f6b7f0c88b9addd0e87c217ded307b36be024d4ff8869a812b241d1345/pillow-12.2.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22db17c68434de69d8ecfc2fe821569195c0c373b25cccb9cbdacf2c6e53c601", size = 6280384, upload-time = "2026-04-01T14:45:01.5Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/03/688747d2e91cfbe0e64f316cd2e8005698f76ada3130d0194664174fa5de/pillow-12.2.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7b14cc0106cd9aecda615dd6903840a058b4700fcb817687d0ee4fc8b6e389be", size = 8091599, upload-time = "2026-04-01T14:45:04.5Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/35/577e22b936fcdd66537329b33af0b4ccfefaeabd8aec04b266528cddb33c/pillow-12.2.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8cbeb542b2ebc6fcdacabf8aca8c1a97c9b3ad3927d46b8723f9d4f033288a0f", size = 6396021, upload-time = "2026-04-01T14:45:07.117Z" },
+    { url = "https://files.pythonhosted.org/packages/11/8d/d2532ad2a603ca2b93ad9f5135732124e57811d0168155852f37fbce2458/pillow-12.2.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4bfd07bc812fbd20395212969e41931001fd59eb55a60658b0e5710872e95286", size = 7083360, upload-time = "2026-04-01T14:45:09.763Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/26/d325f9f56c7e039034897e7380e9cc202b1e368bfd04d4cbe6a441f02885/pillow-12.2.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9aba9a17b623ef750a4d11b742cbafffeb48a869821252b30ee21b5e91392c50", size = 6507628, upload-time = "2026-04-01T14:45:12.378Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/f7/769d5632ffb0988f1c5e7660b3e731e30f7f8ec4318e94d0a5d674eb65a4/pillow-12.2.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:deede7c263feb25dba4e82ea23058a235dcc2fe1f6021025dc71f2b618e26104", size = 7209321, upload-time = "2026-04-01T14:45:15.122Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/7a/c253e3c645cd47f1aceea6a8bacdba9991bf45bb7dfe927f7c893e89c93c/pillow-12.2.0-cp314-cp314-win32.whl", hash = "sha256:632ff19b2778e43162304d50da0181ce24ac5bb8180122cbe1bf4673428328c7", size = 6479723, upload-time = "2026-04-01T14:45:17.797Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/8b/601e6566b957ca50e28725cb6c355c59c2c8609751efbecd980db44e0349/pillow-12.2.0-cp314-cp314-win_amd64.whl", hash = "sha256:4e6c62e9d237e9b65fac06857d511e90d8461a32adcc1b9065ea0c0fa3a28150", size = 7217400, upload-time = "2026-04-01T14:45:20.529Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/94/220e46c73065c3e2951bb91c11a1fb636c8c9ad427ac3ce7d7f3359b9b2f/pillow-12.2.0-cp314-cp314-win_arm64.whl", hash = "sha256:b1c1fbd8a5a1af3412a0810d060a78b5136ec0836c8a4ef9aa11807f2a22f4e1", size = 2554835, upload-time = "2026-04-01T14:45:23.162Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/ab/1b426a3974cb0e7da5c29ccff4807871d48110933a57207b5a676cccc155/pillow-12.2.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:57850958fe9c751670e49b2cecf6294acc99e562531f4bd317fa5ddee2068463", size = 5314225, upload-time = "2026-04-01T14:45:25.637Z" },
+    { url = "https://files.pythonhosted.org/packages/19/1e/dce46f371be2438eecfee2a1960ee2a243bbe5e961890146d2dee1ff0f12/pillow-12.2.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:d5d38f1411c0ed9f97bcb49b7bd59b6b7c314e0e27420e34d99d844b9ce3b6f3", size = 4698541, upload-time = "2026-04-01T14:45:28.355Z" },
+    { url = "https://files.pythonhosted.org/packages/55/c3/7fbecf70adb3a0c33b77a300dc52e424dc22ad8cdc06557a2e49523b703d/pillow-12.2.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5c0a9f29ca8e79f09de89293f82fc9b0270bb4af1d58bc98f540cc4aedf03166", size = 6322251, upload-time = "2026-04-01T14:45:30.924Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/3c/7fbc17cfb7e4fe0ef1642e0abc17fc6c94c9f7a16be41498e12e2ba60408/pillow-12.2.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1610dd6c61621ae1cf811bef44d77e149ce3f7b95afe66a4512f8c59f25d9ebe", size = 8127807, upload-time = "2026-04-01T14:45:33.908Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/c3/a8ae14d6defd2e448493ff512fae903b1e9bd40b72efb6ec55ce0048c8ce/pillow-12.2.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0a34329707af4f73cf1782a36cd2289c0368880654a2c11f027bcee9052d35dd", size = 6433935, upload-time = "2026-04-01T14:45:36.623Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/32/2880fb3a074847ac159d8f902cb43278a61e85f681661e7419e6596803ed/pillow-12.2.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8e9c4f5b3c546fa3458a29ab22646c1c6c787ea8f5ef51300e5a60300736905e", size = 7116720, upload-time = "2026-04-01T14:45:39.258Z" },
+    { url = "https://files.pythonhosted.org/packages/46/87/495cc9c30e0129501643f24d320076f4cc54f718341df18cc70ec94c44e1/pillow-12.2.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:fb043ee2f06b41473269765c2feae53fc2e2fbf96e5e22ca94fb5ad677856f06", size = 6540498, upload-time = "2026-04-01T14:45:41.879Z" },
+    { url = "https://files.pythonhosted.org/packages/18/53/773f5edca692009d883a72211b60fdaf8871cbef075eaa9d577f0a2f989e/pillow-12.2.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:f278f034eb75b4e8a13a54a876cc4a5ab39173d2cdd93a638e1b467fc545ac43", size = 7239413, upload-time = "2026-04-01T14:45:44.705Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/e4/4b64a97d71b2a83158134abbb2f5bd3f8a2ea691361282f010998f339ec7/pillow-12.2.0-cp314-cp314t-win32.whl", hash = "sha256:6bb77b2dcb06b20f9f4b4a8454caa581cd4dd0643a08bacf821216a16d9c8354", size = 6482084, upload-time = "2026-04-01T14:45:47.568Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/13/306d275efd3a3453f72114b7431c877d10b1154014c1ebbedd067770d629/pillow-12.2.0-cp314-cp314t-win_amd64.whl", hash = "sha256:6562ace0d3fb5f20ed7290f1f929cae41b25ae29528f2af1722966a0a02e2aa1", size = 7225152, upload-time = "2026-04-01T14:45:50.032Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/6e/cf826fae916b8658848d7b9f38d88da6396895c676e8086fc0988073aaf8/pillow-12.2.0-cp314-cp314t-win_arm64.whl", hash = "sha256:aa88ccfe4e32d362816319ed727a004423aab09c5cea43c01a4b435643fa34eb", size = 2556579, upload-time = "2026-04-01T14:45:52.529Z" },
+]
+
 [[package]]
 name = "pluggy"
 version = "1.6.0"
@@ -1035,6 +1140,24 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/57/bf/2086963c69bdac3d7cff1cc7ff79b8ce5ea0bec6797a017e1be338a46248/protobuf-6.33.5-py3-none-any.whl", hash = "sha256:69915a973dd0f60f31a08b8318b73eab2bd6a392c79184b3612226b0a3f8ec02", size = 170687, upload-time = "2026-01-29T21:51:32.557Z" },
 ]
 
+[[package]]
+name = "py-rust-stemmers"
+version = "0.1.8"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/6b/c1/9763f9fb1cd73f9c317a83feeed6e0d4af320c6bbddab47b4a94f3a47d0c/py_rust_stemmers-0.1.8.tar.gz", hash = "sha256:6b0f6f48bc54d607aed802de872fcd5a71bae969a6760976dc78ce55e8eaf3da", size = 9732, upload-time = "2026-05-22T11:00:24.358Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b0/7e/f4346adfd44acbd7eaedcbd7d21b7f40ec9712e6c699e71fddad8dae6f8d/py_rust_stemmers-0.1.8-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:526b58958c6ffa36c4a805326cfb624ecbd665d16ba435027dbed0bcbcaa09d2", size = 290379, upload-time = "2026-05-22T11:00:08.192Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/d8/988fc3f5dc0dbbd4bf5909f50ff953ab55ee8b5f79a835d00e57847d3123/py_rust_stemmers-0.1.8-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2b607f0b270951fb66479baf4b68716cc63a981585cbd898b0b6b5c359efde7e", size = 275458, upload-time = "2026-05-22T11:00:09.522Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/94/e04c8b6a8364bca1b368785cef143755dd2d1ffe74df8f8b47b075bb1043/py_rust_stemmers-0.1.8-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b0327b151ab8a338fb54fdac114ba34394327fc1e2c4c425ad1caf2013e5de3", size = 314711, upload-time = "2026-05-22T11:00:10.878Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/cb/f59f9a80caa099cb6625a46c9a8e6e7e80bb3ed284f17e80245c8240a66e/py_rust_stemmers-0.1.8-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:dadd0e369703817fc7026987b3093f461f9f58d8dde74e689d546184bc8f3451", size = 319370, upload-time = "2026-05-22T11:00:11.961Z" },
+    { url = "https://files.pythonhosted.org/packages/06/59/8211cd0f56e53f7770debd9a78de37985fb5662ae66e3b7b380f4c79888b/py_rust_stemmers-0.1.8-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:245e2c61c52e073341893a9682cd1396b61047154548aee30bb1af3d8ed4b4cc", size = 321373, upload-time = "2026-05-22T11:00:13.213Z" },
+    { url = "https://files.pythonhosted.org/packages/10/72/fe33e614c114264d1ba54d39da4b5a4abeb6aedd0d26e5a8fd0637d6ddba/py_rust_stemmers-0.1.8-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:451ee1c02a3f5cf1e161b46ba9032cdda4ba10a8b03ff9ee61c1d34d42a0bc81", size = 321707, upload-time = "2026-05-22T11:00:14.177Z" },
+    { url = "https://files.pythonhosted.org/packages/91/f9/3cd18902fe2fa54557d3fe9132552256372d381c7aca71346163055d78b1/py_rust_stemmers-0.1.8-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d396dd25c473c1bc4248c79cd223f4b36356b55a124652f015c6a001547f81ac", size = 492457, upload-time = "2026-05-22T11:00:15.245Z" },
+    { url = "https://files.pythonhosted.org/packages/90/d7/32c6d3995e7036b73683389de2771f4dbbf40de192b7efe73c2528ee1eb5/py_rust_stemmers-0.1.8-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:479c77c32d8be692f3cfcde7e19273f02ac81d6f45c6aef49887ef95cab7abbb", size = 596085, upload-time = "2026-05-22T11:00:16.404Z" },
+    { url = "https://files.pythonhosted.org/packages/00/8c/e68fa5d862ea6a27fced3535c25ea4eaa26ba1ce00dfef5841924c74b167/py_rust_stemmers-0.1.8-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:c786235275c5c2abb7f206b8236aee3ca0bc53c7497daf7fb7b01d3491469547", size = 539747, upload-time = "2026-05-22T11:00:17.414Z" },
+    { url = "https://files.pythonhosted.org/packages/44/48/aa584cf3772e01231641c95dc1aa73327a7d986c562639d78d0013733acf/py_rust_stemmers-0.1.8-cp314-cp314-win_amd64.whl", hash = "sha256:931d13570962b093417e5443a9d1bd63d73fa239ebb81e5b1d346663571403e4", size = 209636, upload-time = "2026-05-22T11:00:18.662Z" },
+]
+
 [[package]]
 name = "pydantic"
 version = "2.12.5"
@@ -1449,18 +1572,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/81/0d/13d1d239a25cbfb19e740db83143e95c772a1fe10202dda4b76792b114dd/starlette-0.52.1-py3-none-any.whl", hash = "sha256:0029d43eb3d273bc4f83a08720b4912ea4b071087a3b48db01b7c839f7954d74", size = 74272, upload-time = "2026-01-18T13:34:09.188Z" },
 ]
 
-[[package]]
-name = "sympy"
-version = "1.14.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "mpmath" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" },
-]
-
 [[package]]
 name = "tabulate"
 version = "0.9.0"
@@ -1768,6 +1879,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6f/28/258ebab549c2bf3e64d2b0217b973467394a9cea8c42f70418ca2c5d0d2e/websockets-16.0-py3-none-any.whl", hash = "sha256:1637db62fad1dc833276dded54215f2c7fa46912301a24bd94d45d46a011ceec", size = 171598, upload-time = "2026-01-10T09:23:45.395Z" },
 ]
 
+[[package]]
+name = "win32-setctime"
+version = "1.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/b3/8f/705086c9d734d3b663af0e9bb3d4de6578d08f46b1b101c2442fd9aecaa2/win32_setctime-1.2.0.tar.gz", hash = "sha256:ae1fdf948f5640aae05c511ade119313fb6a30d7eabe25fef9764dca5873c4c0", size = 4867, upload-time = "2024-12-07T15:28:28.314Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e1/07/c6fe3ad3e685340704d314d765b7912993bcb8dc198f0e7a89382d37974b/win32_setctime-1.2.0-py3-none-any.whl", hash = "sha256:95d644c4e708aba81dc3704a116d8cbc974d70b3bdb8be1d150e36be6e9d1390", size = 4083, upload-time = "2024-12-07T15:28:26.465Z" },
+]
+
 [[package]]
 name = "x-wr-timezone"
 version = "2.0.1"

From 237b19dd83933f322f00c59dab8de9a7bb37c388 Mon Sep 17 00:00:00 2001
From: Matteo Merola <mattmezza@gmail.com>
Date: Sun, 7 Jun 2026 23:04:31 +0200
Subject: [PATCH 6/7] feat(admin): Memory tab controls for embeddings +
 lifecycle, with explainers

Add a Semantic memory (embeddings) card and a Memory lifecycle (forgetting &
hygiene) card to the admin Memory tab: enable toggle, backend (local/openai/
google), model, top-k, importance/archive/hygiene knobs, plus collapsible
'how it works (the science)' sections. Wire status/prefetch/test endpoints
(/memory/embedding/*) and a Download-model button; changes apply live to the
running agent (embedder rebuilt lazily, lifecycle knobs refreshed in
patch_config). Refactor the memory partial render into a shared helper so the
post-delete refresh keeps the full config.

Tests: local backend lazy-load + prefetch (fastembed stubbed), embedding
status/prefetch/test endpoints, memory partial render, and a config_store
roundtrip proving UI-saved memory.embedding.* keys rebuild EmbeddingConfig.
Docs updated (pa.md). Full suite: 304 passed.
---
 api/admin.py                       | 166 ++++++++++++++++----
 api/templates/partials/memory.html | 240 ++++++++++++++++++++++++++---
 pa.md                              |  16 +-
 tests/test_config_store.py         |  24 +++
 tests/test_embeddings_local.py     |  71 +++++++++
 tests/test_memory_admin.py         | 116 ++++++++++++++
 6 files changed, 574 insertions(+), 59 deletions(-)
 create mode 100644 tests/test_embeddings_local.py
 create mode 100644 tests/test_memory_admin.py

diff --git a/api/admin.py b/api/admin.py
index b9774fa..e69d43e 100644
--- a/api/admin.py
+++ b/api/admin.py
@@ -906,25 +906,47 @@ async def partial_search() -> HTMLResponse:
             max_results=max_results,
         )
 
-    @app.get("/partials/memory", dependencies=[Depends(auth)])
-    async def partial_memory() -> HTMLResponse:
-        """Memory tab partial."""
-        # Memory config
-        memory_long_term_limit = await config_store.get("memory.long_term_limit") or "50"
+    async def _render_memory_partial() -> HTMLResponse:
+        """Build the Memory tab partial (config + stored memories).
 
-        # Memory data — read directly from DB (works even when agent is stopped)
+        Shared by the tab load and the post-delete refresh so both render the
+        full embedding/lifecycle config, not just the memory tables.
+        """
         import aiosqlite
 
+        async def _cfg(key: str, default: str) -> str:
+            val = await config_store.get(key)
+            return default if val is None or val == "" else str(val)
+
+        async def _bool(key: str, default: str) -> str:
+            val = await config_store.get(key)
+            return default if val is None else str(val).lower()
+
+        ctx: dict[str, object] = {
+            "memory_long_term_limit": await _cfg("memory.long_term_limit", "50"),
+            "emb_enabled": await _bool("memory.embedding.enabled", "true"),
+            "emb_provider": await _cfg("memory.embedding.provider", "local"),
+            "emb_model": await _cfg("memory.embedding.model", "BAAI/bge-small-en-v1.5"),
+            "emb_base_url": await _cfg("memory.embedding.base_url", ""),
+            "emb_top_k": await _cfg("memory.embedding.injection_top_k", "12"),
+            "hygiene_enabled": await _bool("memory.hygiene_enabled", "true"),
+            "default_importance": await _cfg("memory.default_importance", "5.0"),
+            "archive_after_days": await _cfg("memory.archive_after_days", "90"),
+            "archive_max_importance": await _cfg("memory.archive_max_importance", "4.0"),
+            "archive_min_idle_days": await _cfg("memory.archive_min_idle_days", "45"),
+            "hygiene_threshold": await _cfg("memory.hygiene_similarity_threshold", "0.45"),
+        }
+
+        # Memory data — read directly from DB (works even when agent is stopped)
         memory_db = await config_store.get("memory.db_path") or "data/memory.db"
-        long_term = []
-        short_term = []
+        long_term: list[dict] = []
+        short_term: list[dict] = []
         if Path(memory_db).exists():
             cols = "id, category, subject, content, source, confidence, created_at, updated_at"
             async with aiosqlite.connect(memory_db) as db:
                 db.row_factory = aiosqlite.Row
                 cursor = await db.execute(f"SELECT {cols} FROM long_term ORDER BY updated_at DESC")
                 long_term = [dict(row) for row in await cursor.fetchall()]
-
                 cursor = await db.execute(
                     "SELECT id, content, context, expires_at, created_at "
                     "FROM short_term WHERE expires_at > datetime('now') "
@@ -933,12 +955,14 @@ async def partial_memory() -> HTMLResponse:
                 short_term = [dict(row) for row in await cursor.fetchall()]
 
         return _render_partial(
-            "partials/memory.html",
-            long_term=long_term,
-            short_term=short_term,
-            memory_long_term_limit=memory_long_term_limit,
+            "partials/memory.html", long_term=long_term, short_term=short_term, **ctx
         )
 
+    @app.get("/partials/memory", dependencies=[Depends(auth)])
+    async def partial_memory() -> HTMLResponse:
+        """Memory tab partial."""
+        return await _render_memory_partial()
+
     @app.get("/partials/history", dependencies=[Depends(auth)])
     async def partial_history() -> HTMLResponse:
         """History tab partial."""
@@ -1243,7 +1267,18 @@ async def patch_config(body: ConfigPatchIn) -> dict:
                 agent.llm = LLMClient.from_agent_config(new_config.agent)
                 agent.executor.tool_env = tool_env(new_config)
                 agent.history_mode = new_config.history.mode
-                agent.memory.long_term_limit = new_config.memory.long_term_limit
+                mem_cfg = new_config.memory
+                agent.memory.long_term_limit = mem_cfg.long_term_limit
+                # Rebuild the embedder (lazy — no model load here) and refresh the
+                # Tier 3/4 lifecycle knobs so memory config changes apply live.
+                agent.memory.embedder = agent._build_embedder()
+                agent.memory.injection_top_k = mem_cfg.embedding.injection_top_k
+                agent.memory.default_importance = mem_cfg.default_importance
+                agent.memory.archive_after_days = mem_cfg.archive_after_days
+                agent.memory.archive_max_importance = mem_cfg.archive_max_importance
+                agent.memory.archive_min_idle_days = mem_cfg.archive_min_idle_days
+                agent.memory.hygiene_enabled = mem_cfg.hygiene_enabled
+                agent.memory.hygiene_similarity_threshold = mem_cfg.hygiene_similarity_threshold
                 agent.reflections.max_reflections = new_config.task_reflection.max_reflections
                 if new_config.search.enabled and new_config.search.api_key:
                     from tavily import TavilyClient
@@ -1965,28 +2000,91 @@ async def delete_memory(request: Request) -> HTMLResponse:
             if cursor.rowcount == 0:
                 raise HTTPException(404, f"Memory {memory_id} not found in {tier}")
 
-        # Return refreshed memory partial
-        memory_long_term_limit = await config_store.get("memory.long_term_limit") or "50"
-        long_term = []
-        short_term = []
-        cols = "id, category, subject, content, source, confidence, created_at, updated_at"
-        async with aiosqlite.connect(agent.memory.db_path) as db:
-            db.row_factory = aiosqlite.Row
-            cursor = await db.execute(f"SELECT {cols} FROM long_term ORDER BY updated_at DESC")
-            long_term = [dict(row) for row in await cursor.fetchall()]
-            cursor = await db.execute(
-                "SELECT id, content, context, expires_at, created_at "
-                "FROM short_term WHERE expires_at > datetime('now') "
-                "ORDER BY created_at DESC"
-            )
-            short_term = [dict(row) for row in await cursor.fetchall()]
-        return _render_partial(
-            "partials/memory.html",
-            long_term=long_term,
-            short_term=short_term,
-            memory_long_term_limit=memory_long_term_limit,
+        # Return refreshed memory partial (full config + tables)
+        return await _render_memory_partial()
+
+    @app.get("/memory/embedding/status", dependencies=[Depends(auth)])
+    async def embedding_status() -> dict:
+        """Report embedding config + whether a local model is already on disk."""
+        from core.embeddings import LOCAL_PROVIDERS
+
+        config = await config_store.export_to_config()
+        emb = config.memory.embedding
+        is_local = emb.provider in LOCAL_PROVIDERS
+        model_ready: bool | None = None
+        if is_local:
+            cache = Path(emb.cache_dir)
+            model_ready = cache.exists() and any(cache.rglob("*.onnx"))
+        return {
+            "enabled": emb.enabled,
+            "provider": emb.provider,
+            "model": emb.model,
+            "local": is_local,
+            "model_ready": model_ready,
+            "cache_dir": emb.cache_dir,
+        }
+
+    @app.post("/memory/embedding/prefetch", dependencies=[Depends(auth)])
+    async def embedding_prefetch() -> dict:
+        """Download the local embedding model now (also done at Docker build)."""
+        from core.embeddings import LOCAL_PROVIDERS, prefetch_local_model
+
+        config = await config_store.export_to_config()
+        emb = config.memory.embedding
+        if emb.provider not in LOCAL_PROVIDERS:
+            raise HTTPException(400, "Prefetch only applies to the local embedding provider")
+        try:
+            dim = await asyncio.to_thread(prefetch_local_model, emb.model, emb.cache_dir)
+        except Exception as exc:
+            log.exception("Embedding model prefetch failed")
+            raise HTTPException(500, f"Prefetch failed: {exc}") from exc
+        return {"ok": True, "model": emb.model, "dimensions": dim, "cache_dir": emb.cache_dir}
+
+    @app.post("/memory/embedding/test", dependencies=[Depends(auth)])
+    async def embedding_test() -> dict:
+        """Embed a few probe sentences and report dimension + a sanity cosine."""
+        from core.embeddings import (
+            LOCAL_PROVIDERS,
+            EmbeddingClient,
+            LocalEmbeddingClient,
+            cosine_similarity,
         )
 
+        config = await config_store.export_to_config()
+        emb = config.memory.embedding
+        try:
+            if emb.provider in LOCAL_PROVIDERS:
+                client: object = LocalEmbeddingClient(model=emb.model, cache_dir=emb.cache_dir)
+            else:
+                cfg = config.agent
+                api_key = emb.api_key or getattr(cfg, f"{emb.provider}_api_key", "")
+                base_url = emb.base_url or getattr(cfg, f"{emb.provider}_base_url", "") or None
+                if not api_key:
+                    raise HTTPException(400, f"No API key configured for provider {emb.provider}")
+                client = EmbeddingClient(
+                    provider=emb.provider,
+                    api_key=api_key,
+                    model=emb.model,
+                    base_url=base_url,
+                    dimensions=emb.dimensions,
+                )
+            probes = ["allergic to shellfish", "cannot eat prawns", "the weather is sunny today"]
+            vecs = await client.embed(probes)  # type: ignore[attr-defined]
+            if len(vecs) < 3 or not vecs[0]:
+                raise HTTPException(500, "Embedding returned no vectors")
+            return {
+                "ok": True,
+                "model": emb.model,
+                "dimensions": len(vecs[0]),
+                "similar_pair": round(cosine_similarity(vecs[0], vecs[1]), 3),
+                "unrelated_pair": round(cosine_similarity(vecs[0], vecs[2]), 3),
+            }
+        except HTTPException:
+            raise
+        except Exception as exc:
+            log.exception("Embedding test failed")
+            raise HTTPException(500, f"Test failed: {exc}") from exc
+
     @app.post("/memory/consolidate", dependencies=[Depends(auth)])
     async def trigger_consolidation() -> HTMLResponse:
         agent = agent_state.agent
diff --git a/api/templates/partials/memory.html b/api/templates/partials/memory.html
index 6cf9ede..ca23f24 100644
--- a/api/templates/partials/memory.html
+++ b/api/templates/partials/memory.html
@@ -2,8 +2,67 @@
   <div x-data="{
     longTermLimit: {{ memory_long_term_limit|default('50', true)|tojson|forceescape }},
     cfgResult: '',
-    cfgOk: false
-  }">
+    cfgOk: false,
+    // Embeddings (Tier 2)
+    embEnabled: {{ emb_enabled|default('true', true)|tojson|forceescape }} === 'true',
+    embProvider: {{ emb_provider|default('local', true)|tojson|forceescape }},
+    embModel: {{ emb_model|default('BAAI/bge-small-en-v1.5', true)|tojson|forceescape }},
+    embBaseUrl: {{ emb_base_url|default('', true)|tojson|forceescape }},
+    embTopK: {{ emb_top_k|default('12', true)|tojson|forceescape }},
+    embResult: '', embOk: false,
+    embStatus: null, embBusy: false, embTestResult: '',
+    // Lifecycle (Tier 3/4)
+    defaultImportance: {{ default_importance|default('5.0', true)|tojson|forceescape }},
+    archiveAfterDays: {{ archive_after_days|default('90', true)|tojson|forceescape }},
+    archiveMaxImportance: {{ archive_max_importance|default('4.0', true)|tojson|forceescape }},
+    archiveMinIdleDays: {{ archive_min_idle_days|default('45', true)|tojson|forceescape }},
+    hygieneEnabled: {{ hygiene_enabled|default('true', true)|tojson|forceescape }} === 'true',
+    hygieneThreshold: {{ hygiene_threshold|default('0.45', true)|tojson|forceescape }},
+    lifeResult: '', lifeOk: false,
+    get isLocal() { return this.embProvider === 'local' || this.embProvider === 'fastembed'; },
+    _hdrs() {
+      return {
+        'Content-Type': 'application/json',
+        'Authorization': 'Bearer ' + (localStorage.getItem('admin_api_key') || '')
+      };
+    },
+    saveCfg(vals, okMsg, okField, resField) {
+      return fetch('/config', { method: 'PATCH', headers: this._hdrs(), body: JSON.stringify({values: vals}) })
+        .then(r => { this[okField] = r.ok; return r.json(); })
+        .then(d => {
+          this[resField] = this[okField] ? okMsg : (d.detail || 'Error');
+          if (this[okField] && window.showToast) { window.showToast(okMsg); }
+        })
+        .catch(e => { this[okField] = false; this[resField] = e.message; });
+    },
+    loadEmbStatus() {
+      fetch('/memory/embedding/status', { headers: this._hdrs() })
+        .then(r => r.json()).then(d => { this.embStatus = d; }).catch(() => {});
+    },
+    downloadModel() {
+      this.embBusy = true; this.embTestResult = 'Downloading model… (first time may take a minute)';
+      fetch('/memory/embedding/prefetch', { method: 'POST', headers: this._hdrs() })
+        .then(r => r.json().then(d => ({ok: r.ok, d})))
+        .then(({ok, d}) => {
+          this.embTestResult = ok ? ('Model ready (dim ' + d.dimensions + ')') : ('Error: ' + (d.detail || 'failed'));
+          this.loadEmbStatus();
+        })
+        .catch(e => { this.embTestResult = 'Error: ' + e.message; })
+        .finally(() => { this.embBusy = false; });
+    },
+    testEmb() {
+      this.embBusy = true; this.embTestResult = 'Testing…';
+      fetch('/memory/embedding/test', { method: 'POST', headers: this._hdrs() })
+        .then(r => r.json().then(d => ({ok: r.ok, d})))
+        .then(({ok, d}) => {
+          this.embTestResult = ok
+            ? ('OK · dim ' + d.dimensions + ' · related=' + d.similar_pair + ' vs unrelated=' + d.unrelated_pair)
+            : ('Error: ' + (d.detail || 'failed'));
+        })
+        .catch(e => { this.embTestResult = 'Error: ' + e.message; })
+        .finally(() => { this.embBusy = false; });
+    }
+  }" x-init="loadEmbStatus()">
   {# Memory config section #}
   <div class="card mb-6">
     <h2 class="text-base mb-1">Memory Settings</h2>
@@ -20,25 +79,7 @@ <h2 class="text-base mb-1">Memory Settings</h2>
 
     <div class="flex justify-end mt-3">
       <button class="btn-primary btn-sm"
-              @click="
-                const vals = {
-                  'memory.long_term_limit': String(longTermLimit)
-                };
-                fetch('/config', {
-                  method: 'PATCH',
-                  headers: {
-                    'Content-Type': 'application/json',
-                    'Authorization': 'Bearer ' + (localStorage.getItem('admin_api_key') || '')
-                  },
-                  body: JSON.stringify({values: vals})
-                })
-                .then(r => { cfgOk = r.ok; return r.json(); })
-                .then(d => {
-                  cfgResult = cfgOk ? 'Memory settings saved' : (d.detail || 'Error');
-                  if (cfgOk && window.showToast) { window.showToast('Memory settings saved'); }
-                })
-                .catch(e => { cfgOk = false; cfgResult = e.message; })
-              ">
+              @click="saveCfg({'memory.long_term_limit': String(longTermLimit)}, 'Memory settings saved', 'cfgOk', 'cfgResult')">
         Save settings
       </button>
     </div>
@@ -52,6 +93,163 @@ <h2 class="text-base mb-1">Memory Settings</h2>
     </p>
   </div>
 
+  {# Semantic memory (embeddings) — Tier 2 #}
+  <div class="card mb-6">
+    <h2 class="text-base mb-1">Semantic memory (embeddings)</h2>
+    <p class="text-muted text-xs mb-3">
+      Match memories by <em>meaning</em>, not just words — so “allergic to shellfish” surfaces when you ask about eating prawns.
+    </p>
+
+    <details class="mb-4 rounded border border-border p-2">
+      <summary class="text-xs cursor-pointer">How it works (the science)</summary>
+      <div class="text-muted text-xs mt-2 space-y-2">
+        <p>An <strong>embedding model</strong> turns each memory and your current message into a vector — a list of numbers positioning the text in a “meaning space”. Texts with similar meaning land close together, even with no shared words.</p>
+        <p>Closeness is measured by <strong>cosine similarity</strong> (1.0 = same direction/meaning, 0 = unrelated). On every message we embed it, compare against stored memory vectors, and inject only the most relevant ones — ranked by <em>relevance + importance + recency</em> (a Generative-Agents-style score) instead of dumping everything.</p>
+        <p>The same similarity drives dedup (does this new fact match an existing one?) and the hygiene pass (cluster &amp; merge near-duplicates).</p>
+        <p><strong>Local</strong> runs the model on this machine (private, free, no key; the model file is bundled in the Docker image). <strong>API</strong> providers call a remote endpoint (needs a key; a few cents/year at most, but your memory text is sent to them). When disabled, memory falls back to fast word-overlap matching — still works, just less “fuzzy”.</p>
+      </div>
+    </details>
+
+    <div class="space-y-3">
+      <div class="flex items-center gap-2">
+        <label class="label mb-0">Enabled</label>
+        <input type="checkbox" x-model="embEnabled" class="rounded border-border">
+        <span class="text-muted text-xs">Off → lexical (word-overlap) retrieval, no model needed.</span>
+      </div>
+
+      <div>
+        <label class="label">Backend</label>
+        <select class="input-sm" style="max-width:320px" x-model="embProvider" :disabled="!embEnabled">
+          <option value="local">Local (on-device, private, no key)</option>
+          <option value="openai">OpenAI API</option>
+          <option value="google">Google API</option>
+        </select>
+        <p class="text-muted text-xs mt-1" x-show="!isLocal">
+          API key falls back to the matching provider key set in the
+          <a href="#" class="text-link" @click.prevent="$dispatch('switch-tab', 'llm')">LLM tab</a>.
+          (Note: DeepSeek has no embeddings endpoint.)
+        </p>
+      </div>
+
+      <div>
+        <label class="label">Model</label>
+        <input type="text" class="input-sm" style="max-width:420px" x-model="embModel" :disabled="!embEnabled"
+               placeholder="BAAI/bge-small-en-v1.5">
+        <p class="text-muted text-xs mt-1" x-show="isLocal">Small CPU model (384-dim, ~130MB). Bundled in Docker; use “Download model” if missing.</p>
+        <p class="text-muted text-xs mt-1" x-show="!isLocal">e.g. <code>text-embedding-3-small</code> (OpenAI) or <code>text-embedding-004</code> (Google).</p>
+      </div>
+
+      <div x-show="!isLocal">
+        <label class="label">Base URL (optional)</label>
+        <input type="text" class="input-sm" style="max-width:420px" x-model="embBaseUrl" :disabled="!embEnabled"
+               placeholder="https://… (OpenAI-compatible /embeddings)">
+      </div>
+
+      <div>
+        <label class="label">Injected memories per turn (top-k)</label>
+        <input type="number" class="input-sm" style="max-width:120px" x-model="embTopK" :disabled="!embEnabled" min="1" max="100">
+        <p class="text-muted text-xs mt-1">How many of the most relevant long-term memories to put in the prompt each message.</p>
+      </div>
+
+      <div class="text-xs" x-show="isLocal && embStatus">
+        Model on disk:
+        <span x-show="embStatus && embStatus.model_ready" class="text-success">yes ✓</span>
+        <span x-show="embStatus && embStatus.model_ready === false" class="text-error">not downloaded yet</span>
+      </div>
+    </div>
+
+    <div class="flex items-center gap-2 mt-4 flex-wrap">
+      <button class="btn-primary btn-sm"
+              @click="saveCfg({
+                'memory.embedding.enabled': embEnabled ? 'true' : 'false',
+                'memory.embedding.provider': embProvider,
+                'memory.embedding.model': embModel,
+                'memory.embedding.base_url': embBaseUrl,
+                'memory.embedding.injection_top_k': String(embTopK)
+              }, 'Embedding settings saved', 'embOk', 'embResult').then(() => loadEmbStatus())">
+        Save embedding settings
+      </button>
+      <button class="btn btn-sm" type="button" x-show="isLocal" :disabled="embBusy" @click="downloadModel()">
+        <span x-show="!embBusy">Download model</span>
+        <span x-show="embBusy">Working…</span>
+      </button>
+      <button class="btn btn-sm" type="button" :disabled="embBusy || !embEnabled" @click="testEmb()">Test</button>
+    </div>
+    <template x-if="embResult">
+      <div :class="embOk ? 'alert-success' : 'alert-error'" class="mt-2" x-text="embResult"></div>
+    </template>
+    <template x-if="embTestResult">
+      <div class="text-muted text-xs mt-2" x-text="embTestResult"></div>
+    </template>
+    <p class="text-muted text-xs mt-3">Changes apply live to the running agent (the local model loads on first use).</p>
+  </div>
+
+  {# Memory lifecycle (forgetting + hygiene) — Tier 3/4 #}
+  <div class="card mb-6">
+    <h2 class="text-base mb-1">Memory lifecycle (forgetting &amp; hygiene)</h2>
+    <p class="text-muted text-xs mb-3">Keep long-term memory from growing forever: reinforce what matters, forget cold trivia, merge duplicates.</p>
+
+    <details class="mb-4 rounded border border-border p-2">
+      <summary class="text-xs cursor-pointer">How it works (the science)</summary>
+      <div class="text-muted text-xs mt-2 space-y-2">
+        <p>Each memory has an <strong>importance</strong> (1–10). Recall <strong>reinforces</strong> it (access count + recency), and re-stating a fact bumps its importance — mirroring how human memory strengthens with use.</p>
+        <p><strong>Forgetting:</strong> during consolidation, memories that are old, low-importance, and not accessed for a while are <em>archived</em> (soft-deleted, recoverable) rather than injected forever. This is decay, like the “use it or lose it” curve.</p>
+        <p><strong>Hygiene:</strong> the same run clusters near-duplicate memories (by similarity) and asks the model to merge them and drop contradictions, keeping the most recent — so the store stays compact and consistent.</p>
+      </div>
+    </details>
+
+    <div class="space-y-3">
+      <div>
+        <label class="label">Default importance (1–10)</label>
+        <input type="number" step="0.5" class="input-sm" style="max-width:120px" x-model="defaultImportance" min="1" max="10">
+        <p class="text-muted text-xs mt-1">Starting importance for a new memory.</p>
+      </div>
+      <div>
+        <label class="label">Archive after (days old)</label>
+        <input type="number" class="input-sm" style="max-width:120px" x-model="archiveAfterDays" min="1">
+        <p class="text-muted text-xs mt-1">A memory must be at least this old before it can be archived.</p>
+      </div>
+      <div>
+        <label class="label">Archive only if importance ≤</label>
+        <input type="number" step="0.5" class="input-sm" style="max-width:120px" x-model="archiveMaxImportance" min="1" max="10">
+        <p class="text-muted text-xs mt-1">Important memories are never auto-archived.</p>
+      </div>
+      <div>
+        <label class="label">Archive only if idle (days)</label>
+        <input type="number" class="input-sm" style="max-width:120px" x-model="archiveMinIdleDays" min="1">
+        <p class="text-muted text-xs mt-1">Require this long since last access/creation before archiving.</p>
+      </div>
+      <hr class="border-border">
+      <div class="flex items-center gap-2">
+        <label class="label mb-0">Hygiene pass enabled</label>
+        <input type="checkbox" x-model="hygieneEnabled" class="rounded border-border">
+        <span class="text-muted text-xs">Cluster &amp; merge near-duplicates during consolidation.</span>
+      </div>
+      <div>
+        <label class="label">Hygiene similarity threshold</label>
+        <input type="number" step="0.05" class="input-sm" style="max-width:120px" x-model="hygieneThreshold" min="0" max="1" :disabled="!hygieneEnabled">
+        <p class="text-muted text-xs mt-1">Higher = stricter (only very-similar memories merge). 0.45 is a sensible default.</p>
+      </div>
+    </div>
+
+    <div class="flex justify-end mt-4">
+      <button class="btn-primary btn-sm"
+              @click="saveCfg({
+                'memory.default_importance': String(defaultImportance),
+                'memory.archive_after_days': String(archiveAfterDays),
+                'memory.archive_max_importance': String(archiveMaxImportance),
+                'memory.archive_min_idle_days': String(archiveMinIdleDays),
+                'memory.hygiene_enabled': hygieneEnabled ? 'true' : 'false',
+                'memory.hygiene_similarity_threshold': String(hygieneThreshold)
+              }, 'Lifecycle settings saved', 'lifeOk', 'lifeResult')">
+        Save lifecycle settings
+      </button>
+    </div>
+    <template x-if="lifeResult">
+      <div :class="lifeOk ? 'alert-success' : 'alert-error'" class="mt-2" x-text="lifeResult"></div>
+    </template>
+  </div>
+
   {# Memory data #}
   <div class="flex items-center gap-2 mb-4">
     <button class="btn btn-sm" hx-get="/partials/memory" hx-target="#tab-content" hx-swap="innerHTML">
diff --git a/pa.md b/pa.md
index 85ec652..13a3872 100644
--- a/pa.md
+++ b/pa.md
@@ -1098,7 +1098,14 @@ You can also trigger consolidation manually via the admin API: `POST /memory/con
 
 #### Semantic retrieval & relevance-ranked injection (Tier 2)
 
-Embeddings are **optional and off by default** — the pipeline runs on Tier-1 lexical retrieval with no extra dependency or network call. When `memory.embedding.enabled` is set, each long-term memory gets a vector from an OpenAI-compatible `/embeddings` endpoint, stored as a packed float32 blob in the `embedding` column. Similarity is brute-force cosine in Python (no native SQLite extension, so it behaves identically locally and in the container; trivial at <1k rows).
+Each long-term memory gets a vector embedding, stored as a packed float32 blob in the `embedding` column. Similarity is brute-force cosine in Python (no native SQLite extension, so it behaves identically locally and in the container; trivial at <1k rows).
+
+**Backends** (`memory.embedding.provider`):
+
+- `local` (default) — runs a small on-device model via `fastembed` (`BAAI/bge-small-en-v1.5`, 384-dim, ~130MB ONNX/CPU). Private (memory never leaves the box), no API key, free. The model is **prefetched at Docker build** into `/app/models` (outside the data volume) so the first call has no download latency and the container works offline. The model loads lazily on first use, in a worker thread. A "Download model" button in the admin Memory tab (and `python -m core.embeddings prefetch`) fetches it on demand.
+- `openai` / `google` (or any OpenAI-compatible `/embeddings` endpoint via `base_url`) — calls a remote API. Needs a key (falls back to the matching agent provider key); a few cents/year at typical volume, but memory text is sent to the provider. Note: DeepSeek has no embeddings endpoint.
+
+Set `memory.embedding.enabled: false` to fall back to Tier-1 lexical (word-overlap) retrieval — still works, no model needed. All of this is configurable from the **admin Memory tab** (enable toggle, backend, model, top-k, download/test buttons) and applies live to the running agent.
 
 With embeddings on:
 
@@ -1118,9 +1125,10 @@ These behaviours are configured in the `memory` section (see `config.yml.example
 ```yaml
 memory:
   embedding:
-    enabled: false
-    provider: "openai"
-    model: "text-embedding-3-small"
+    enabled: true
+    provider: "local"                # "local" | "openai" | "google"
+    model: "BAAI/bge-small-en-v1.5"  # local model; API e.g. text-embedding-3-small
+    cache_dir: "models"              # local model store (bundled in the image)
     injection_top_k: 12
   default_importance: 5.0
   archive_after_days: 90
diff --git a/tests/test_config_store.py b/tests/test_config_store.py
index ae2908d..5a68aca 100644
--- a/tests/test_config_store.py
+++ b/tests/test_config_store.py
@@ -30,6 +30,30 @@ def test_parse_value_handles_int_bool_json() -> None:
     assert _parse_value("[1, 2]") == [1, 2]
 
 
+@pytest.mark.asyncio
+async def test_embedding_config_roundtrips_to_nested_model(tmp_path) -> None:
+    """UI-saved flat memory.embedding.* keys reconstruct EmbeddingConfig."""
+    store = ConfigStore(db_path=str(tmp_path / "config.db"))
+    await store.set_many(
+        {
+            "memory.embedding.enabled": "false",
+            "memory.embedding.provider": "openai",
+            "memory.embedding.model": "text-embedding-3-small",
+            "memory.embedding.injection_top_k": "20",
+            "memory.hygiene_enabled": "false",
+            "memory.default_importance": "7.5",
+        }
+    )
+    config = await store.export_to_config()
+    emb = config.memory.embedding
+    assert emb.enabled is False
+    assert emb.provider == "openai"
+    assert emb.model == "text-embedding-3-small"
+    assert emb.injection_top_k == 20
+    assert config.memory.hygiene_enabled is False
+    assert config.memory.default_importance == 7.5
+
+
 @pytest.mark.asyncio
 async def test_set_get_delete(tmp_path) -> None:
     store = ConfigStore(db_path=str(tmp_path / "config.db"))
diff --git a/tests/test_embeddings_local.py b/tests/test_embeddings_local.py
new file mode 100644
index 0000000..9c67af4
--- /dev/null
+++ b/tests/test_embeddings_local.py
@@ -0,0 +1,71 @@
+"""Tests for the local (fastembed) embedding backend + prefetch helper.
+
+fastembed is stubbed via importlib so no model is downloaded.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from core.embeddings import LocalEmbeddingClient, prefetch_local_model
+
+
+class _FakeTextEmbedding:
+    def __init__(self, model_name, cache_dir=None):
+        self.model_name = model_name
+        self.cache_dir = cache_dir
+
+    def embed(self, texts):
+        for t in texts:
+            yield [float(len(t)), 1.0, 0.0]
+
+
+class _FakeFastembed:
+    TextEmbedding = _FakeTextEmbedding
+
+
+@pytest.fixture
+def fake_fastembed(monkeypatch):
+    calls = {"imports": 0}
+
+    def fake_import(name):
+        calls["imports"] += 1
+        if name == "fastembed":
+            return _FakeFastembed
+        raise ImportError(name)
+
+    monkeypatch.setattr("core.embeddings.importlib.import_module", fake_import)
+    return calls
+
+
+class TestLocalEmbeddingClient:
+    async def test_not_loaded_at_construction(self, fake_fastembed):
+        LocalEmbeddingClient(model="m", cache_dir="c")
+        assert fake_fastembed["imports"] == 0  # lazy — nothing imported/loaded yet
+
+    async def test_embed_one(self, fake_fastembed):
+        client = LocalEmbeddingClient(model="m", cache_dir="c")
+        vec = await client.embed_one("hello")
+        assert vec == [5.0, 1.0, 0.0]
+
+    async def test_model_loaded_once(self, fake_fastembed):
+        client = LocalEmbeddingClient(model="m", cache_dir="c")
+        await client.embed_one("a")
+        await client.embed_one("bb")
+        await client.embed(["ccc", "dddd"])
+        assert fake_fastembed["imports"] == 1  # loaded a single time, then cached
+
+    async def test_embed_empty(self, fake_fastembed):
+        client = LocalEmbeddingClient()
+        assert await client.embed([]) == []
+
+    async def test_defaults(self):
+        client = LocalEmbeddingClient()
+        assert client.model == "BAAI/bge-small-en-v1.5"
+        assert client.cache_dir == "models"
+
+
+class TestPrefetch:
+    def test_prefetch_returns_dim(self, fake_fastembed):
+        dim = prefetch_local_model("some-model", "some-cache")
+        assert dim == 3  # _FakeTextEmbedding yields 3-dim vectors
diff --git a/tests/test_memory_admin.py b/tests/test_memory_admin.py
new file mode 100644
index 0000000..c5efb97
--- /dev/null
+++ b/tests/test_memory_admin.py
@@ -0,0 +1,116 @@
+"""Admin API tests for the embedding status / prefetch / test endpoints."""
+
+from __future__ import annotations
+
+from typing import cast
+
+from fastapi.testclient import TestClient
+
+from api.admin import AgentState, create_admin_app
+from core.config import Config
+from core.config_store import ConfigStore
+
+HEADERS = {"Authorization": "Bearer secret"}
+
+
+class _StoreStub:
+    """Minimal config store: auth + export_to_config(Config defaults)."""
+
+    def __init__(self, overrides: dict | None = None):
+        self._overrides = overrides or {}
+
+    async def is_setup_complete(self) -> bool:
+        return True
+
+    async def get(self, key: str):
+        if key == "admin.password_hash":
+            return "hash"
+        if key == "admin.password_salt":
+            return "salt"
+        return self._overrides.get(key)
+
+    async def verify_admin_password(self, password: str) -> bool:
+        return password == "secret"
+
+    async def export_to_config(self) -> Config:
+        cfg = Config()
+        emb = cfg.memory.embedding
+        for key, val in self._overrides.items():
+            if key == "memory.embedding.provider":
+                emb.provider = val
+            elif key == "memory.embedding.model":
+                emb.model = val
+        return cfg
+
+
+def _client(overrides: dict | None = None) -> TestClient:
+    agent_state = AgentState(agent=None)
+    app, _auth = create_admin_app(agent_state, cast(ConfigStore, _StoreStub(overrides)))
+    return TestClient(app)
+
+
+def test_embedding_status_local_default() -> None:
+    resp = _client().get("/memory/embedding/status", headers=HEADERS)
+    assert resp.status_code == 200
+    data = resp.json()
+    assert data["enabled"] is True
+    assert data["provider"] == "local"
+    assert data["local"] is True
+    assert "model_ready" in data  # bool (true/false) for local
+
+
+def test_embedding_prefetch_invokes_helper(monkeypatch) -> None:
+    seen = {}
+
+    def fake_prefetch(model, cache_dir):
+        seen["model"] = model
+        seen["cache_dir"] = cache_dir
+        return 384
+
+    monkeypatch.setattr("core.embeddings.prefetch_local_model", fake_prefetch)
+
+    resp = _client().post("/memory/embedding/prefetch", headers=HEADERS)
+    assert resp.status_code == 200
+    data = resp.json()
+    assert data["ok"] is True
+    assert data["dimensions"] == 384
+    assert seen["model"] == "BAAI/bge-small-en-v1.5"
+
+
+def test_embedding_prefetch_rejects_remote(monkeypatch) -> None:
+    resp = _client({"memory.embedding.provider": "openai"}).post(
+        "/memory/embedding/prefetch", headers=HEADERS
+    )
+    assert resp.status_code == 400
+
+
+def test_embedding_test_endpoint(monkeypatch) -> None:
+    class _FakeClient:
+        def __init__(self, *a, **k):
+            pass
+
+        async def embed(self, texts):
+            # First two related (close), third unrelated (orthogonal).
+            return [[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]][: len(texts)]
+
+    monkeypatch.setattr("core.embeddings.LocalEmbeddingClient", _FakeClient)
+
+    resp = _client().post("/memory/embedding/test", headers=HEADERS)
+    assert resp.status_code == 200
+    data = resp.json()
+    assert data["ok"] is True
+    assert data["dimensions"] == 2
+    assert data["similar_pair"] > data["unrelated_pair"]
+
+
+def test_endpoints_require_auth() -> None:
+    assert _client().get("/memory/embedding/status").status_code in (401, 403)
+
+
+def test_memory_partial_renders() -> None:
+    resp = _client().get("/partials/memory", headers=HEADERS)
+    assert resp.status_code == 200
+    body = resp.text
+    assert "Semantic memory (embeddings)" in body
+    assert "Memory lifecycle" in body
+    assert "Download model" in body

From 0061acef41574ce5e5dbe705b601895029736922 Mon Sep 17 00:00:00 2001
From: Matteo Merola <mattmezza@gmail.com>
Date: Sun, 7 Jun 2026 23:13:11 +0200
Subject: [PATCH 7/7] perf(memory): vectorise cosine similarity with numpy

Replace the pure-Python cosine loop with numpy. unpack_vector returns an
ndarray (np.frombuffer), and a new cosine_to_matrix() scores a query against
all candidate vectors in one normalised matmul. get_relevant_long_term and
_retrieve_similar_long_term now batch via _batch_relevance: rows whose stored
vector matches the query dim use cosine; the rest score 0.0 relevance in
get_relevant_long_term and fall back to lexical overlap in
_retrieve_similar_long_term.

Measured at 384-dim: per-message retrieval over 1k memories 65ms -> 2.4ms
(~28x), 5k 330ms -> 17ms. The hygiene pass's pairwise comparisons are
similarly faster. numpy is already present (via fastembed/onnxruntime) and
is now a declared dependency.

Also fixes ndarray truthiness in _pair_similarity (is not None + shape check).
---
 core/embeddings.py         | 57 +++++++++++++++++++++++++-------------
 core/memory.py             | 46 ++++++++++++++++++++----------
 pyproject.toml             |  1 +
 tests/test_memory_tiers.py | 21 ++++++++++++--
 uv.lock                    |  2 ++
 5 files changed, 91 insertions(+), 36 deletions(-)

diff --git a/core/embeddings.py b/core/embeddings.py
index ee722f0..7df7597 100644
--- a/core/embeddings.py
+++ b/core/embeddings.py
@@ -8,13 +8,13 @@
 
 from __future__ import annotations
 
-import array
 import asyncio
 import importlib
 import logging
-import math
 from typing import Any, cast
 
+import numpy as np
+
 log = logging.getLogger(__name__)
 
 # Default local model: small, CPU-friendly, 384-dim (~130MB ONNX). Good balance
@@ -32,34 +32,51 @@
 }
 
 
-def pack_vector(vector: list[float]) -> bytes:
-    """Pack a float vector into a compact float32 blob for storage."""
-    return array.array("f", vector).tobytes()
+def pack_vector(vector) -> bytes:
+    """Pack a float vector (list or ndarray) into a compact float32 blob."""
+    return np.asarray(vector, dtype=np.float32).tobytes()
 
 
-def unpack_vector(blob: bytes | None) -> list[float] | None:
-    """Unpack a float32 blob back into a list of floats (None if empty)."""
+def unpack_vector(blob: bytes | None) -> np.ndarray | None:
+    """Unpack a float32 blob back into a 1-D ndarray (None if empty)."""
     if not blob:
         return None
-    arr = array.array("f")
-    arr.frombytes(blob)
-    return list(arr)
+    return np.frombuffer(blob, dtype=np.float32)
 
 
-def cosine_similarity(a: list[float], b: list[float]) -> float:
+def cosine_similarity(a, b) -> float:
     """Cosine similarity between two equal-length vectors (0.0 on degenerate input)."""
-    if not a or not b or len(a) != len(b):
+    va = np.asarray(a, dtype=np.float32)
+    vb = np.asarray(b, dtype=np.float32)
+    if va.size == 0 or vb.size == 0 or va.shape != vb.shape:
         return 0.0
-    dot = 0.0
-    na = 0.0
-    nb = 0.0
-    for x, y in zip(a, b, strict=False):
-        dot += x * y
-        na += x * x
-        nb += y * y
+    na = float(np.linalg.norm(va))
+    nb = float(np.linalg.norm(vb))
     if na == 0.0 or nb == 0.0:
         return 0.0
-    return dot / (math.sqrt(na) * math.sqrt(nb))
+    return float(np.dot(va, vb) / (na * nb))
+
+
+def cosine_to_matrix(query, vectors: list[np.ndarray]) -> np.ndarray:
+    """Cosine of *query* against every row in *vectors* in one vectorised pass.
+
+    All vectors must share the query's dimension (callers filter mismatches).
+    Returns a 1-D array of similarities (empty array when there are no vectors).
+    Rows with a zero norm score 0.0.
+    """
+    if not vectors:
+        return np.empty(0, dtype=np.float32)
+    q = np.asarray(query, dtype=np.float32)
+    qn = float(np.linalg.norm(q))
+    if qn == 0.0:
+        return np.zeros(len(vectors), dtype=np.float32)
+    matrix = np.vstack(vectors).astype(np.float32, copy=False)
+    dots = matrix @ q
+    norms = np.linalg.norm(matrix, axis=1) * qn
+    out = np.zeros(len(vectors), dtype=np.float32)
+    nonzero = norms > 0
+    out[nonzero] = dots[nonzero] / norms[nonzero]
+    return out
 
 
 class EmbeddingClient:
diff --git a/core/memory.py b/core/memory.py
index 2d271c8..d6d4b52 100644
--- a/core/memory.py
+++ b/core/memory.py
@@ -14,6 +14,7 @@
 from core.embeddings import (
     EmbeddingClient,
     cosine_similarity,
+    cosine_to_matrix,
     pack_vector,
     unpack_vector,
 )
@@ -396,12 +397,33 @@ def _recency_score(ts: str | None) -> float:
     return 0.5 ** (age_days / _RECENCY_HALF_LIFE_DAYS)
 
 
+def _batch_relevance(query_vec, rows: list[dict]) -> dict[int, float]:
+    """Map row index -> cosine similarity to *query_vec*, computed in one
+    vectorised pass. Only rows whose stored embedding matches the query
+    dimension are included; the caller picks the fallback for the rest.
+    Returns an empty map when there is no query vector or no row qualifies."""
+    if query_vec is None:
+        return {}
+    dim = len(query_vec)
+    idxs: list[int] = []
+    vecs: list = []
+    for i, row in enumerate(rows):
+        vec = unpack_vector(row.get("embedding"))
+        if vec is not None and vec.shape[0] == dim:
+            idxs.append(i)
+            vecs.append(vec)
+    if not vecs:
+        return {}
+    sims = cosine_to_matrix(query_vec, vecs)
+    return {idx: float(sims[k]) for k, idx in enumerate(idxs)}
+
+
 def _pair_similarity(a: dict, b: dict) -> float:
     """Similarity between two long-term rows: embedding cosine when both have a
     stored vector, otherwise token overlap on subject + content."""
     va = unpack_vector(a.get("embedding"))
     vb = unpack_vector(b.get("embedding"))
-    if va and vb:
+    if va is not None and vb is not None and va.shape == vb.shape:
         return cosine_similarity(va, vb)
     return _similarity(
         _tokens(f"{a['subject']} {a['content']}"),
@@ -522,10 +544,10 @@ async def get_relevant_long_term(self, query: str) -> list[dict]:
             )
             rows = [dict(r) for r in await cursor.fetchall()]
 
+        rel_map = _batch_relevance(query_vec, rows)
         scored: list[tuple[float, dict]] = []
-        for row in rows:
-            vec = unpack_vector(row.get("embedding"))
-            relevance = cosine_similarity(query_vec, vec) if vec else 0.0
+        for i, row in enumerate(rows):
+            relevance = rel_map.get(i, 0.0)
             importance = (row.get("importance") or self.default_importance) / 10.0
             recency = _recency_score(row.get("last_accessed") or row.get("updated_at"))
             score = relevance + 0.5 * importance + 0.3 * recency
@@ -836,19 +858,15 @@ async def _retrieve_similar_long_term(self, subject: str, content: str) -> list[
         subject_norm = _normalize_subject(subject)
         cand_tokens = _tokens(f"{subject} {content}")
         cand_vec = await self._safe_embed(f"{subject}: {content}")
+        # Embedding cosine for rows with a matching-dim vector; lexical for the rest.
+        rel_map = _batch_relevance(cand_vec, rows)
 
         scored: list[tuple[float, dict]] = []
-        for row in rows:
-            row_tokens = _tokens(f"{row['subject']} {row['content']}")
-            if cand_vec:
-                vec = unpack_vector(row.get("embedding"))
-                base = (
-                    cosine_similarity(cand_vec, vec)
-                    if vec
-                    else _similarity(cand_tokens, row_tokens)
-                )
+        for i, row in enumerate(rows):
+            if i in rel_map:
+                base = rel_map[i]
             else:
-                base = _similarity(cand_tokens, row_tokens)
+                base = _similarity(cand_tokens, _tokens(f"{row['subject']} {row['content']}"))
             score = base
             if subject_norm and _normalize_subject(row["subject"]) == subject_norm:
                 score += 0.5
diff --git a/pyproject.toml b/pyproject.toml
index 13bc089..c15c557 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,6 +24,7 @@ dependencies = [
     "python-multipart>=0.0.22",
     "vobject>=0.9.9",
     "fastembed",
+    "numpy",
 ]
 
 [dependency-groups]
diff --git a/tests/test_memory_tiers.py b/tests/test_memory_tiers.py
index bac8547..ded15f1 100644
--- a/tests/test_memory_tiers.py
+++ b/tests/test_memory_tiers.py
@@ -8,9 +8,10 @@
 import re
 
 import aiosqlite
+import numpy as np
 import pytest
 
-from core.embeddings import cosine_similarity, pack_vector, unpack_vector
+from core.embeddings import cosine_similarity, cosine_to_matrix, pack_vector, unpack_vector
 from core.memory import MemoryStore
 
 
@@ -102,7 +103,7 @@ class TestVectorHelpers:
     def test_pack_unpack_roundtrip(self):
         vec = [0.1, -2.0, 3.5, 0.0]
         out = unpack_vector(pack_vector(vec))
-        assert out == pytest.approx(vec, abs=1e-6)
+        assert out.tolist() == pytest.approx(vec, abs=1e-6)
 
     def test_unpack_none(self):
         assert unpack_vector(None) is None
@@ -113,6 +114,22 @@ def test_cosine(self):
         assert cosine_similarity([1, 0], [0, 1]) == pytest.approx(0.0)
         assert cosine_similarity([], [1]) == 0.0
         assert cosine_similarity([0, 0], [1, 1]) == 0.0
+        assert cosine_similarity([1, 2, 3], [1, 2]) == 0.0  # shape mismatch
+
+    def test_cosine_to_matrix(self):
+        q = [1.0, 0.0]
+        rows = [np.array([1.0, 0.0]), np.array([0.0, 1.0]), np.array([1.0, 1.0])]
+        out = cosine_to_matrix(q, rows)
+        assert out.shape == (3,)
+        assert out[0] == pytest.approx(1.0)
+        assert out[1] == pytest.approx(0.0)
+        assert out[2] == pytest.approx(0.7071, abs=1e-3)
+        # Matches the scalar implementation row-by-row.
+        for i, r in enumerate(rows):
+            assert out[i] == pytest.approx(cosine_similarity(q, r), abs=1e-5)
+
+    def test_cosine_to_matrix_empty(self):
+        assert cosine_to_matrix([1.0, 0.0], []).shape == (0,)
 
 
 # -- Tier 2: embeddings --
diff --git a/uv.lock b/uv.lock
index b7186ea..38640f2 100644
--- a/uv.lock
+++ b/uv.lock
@@ -856,6 +856,7 @@ dependencies = [
     { name = "faster-whisper" },
     { name = "httpx" },
     { name = "jinja2" },
+    { name = "numpy" },
     { name = "openai" },
     { name = "pydantic-settings" },
     { name = "python-dotenv" },
@@ -888,6 +889,7 @@ requires-dist = [
     { name = "faster-whisper" },
     { name = "httpx" },
     { name = "jinja2" },
+    { name = "numpy" },
     { name = "openai" },
     { name = "pydantic-settings" },
     { name = "python-dotenv" },