diff --git a/.gitignore b/.gitignore index 49f47b0..b3efad1 100644 --- a/.gitignore +++ b/.gitignore @@ -38,3 +38,6 @@ tailwindcss # Tailwind CSS build output api/static/style.css node_modules + +# Local embedding models (fastembed cache) +models/ diff --git a/Dockerfile b/Dockerfile index 6f5d29e..c18bff4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,6 +50,13 @@ COPY tools/ tools/ COPY voice/ voice/ COPY api/ api/ +# Prefetch the local embedding model (semantic memory, Tier 2) so it is bundled +# in the image — no runtime download, works offline. Stored in /app/models, +# OUTSIDE the /app/data volume so the mounted volume cannot shadow it. Keep the +# default in sync with EmbeddingConfig (core/config.py). +ARG EMBED_MODEL=BAAI/bge-small-en-v1.5 +RUN uv run python -m core.embeddings prefetch "${EMBED_MODEL}" /app/models + # Build CSS with Tailwind CSS v4 standalone CLI RUN ARCH=$(dpkg --print-architecture) && \ if [ "$ARCH" = "arm64" ]; then TW_ARCH="linux-arm64"; else TW_ARCH="linux-x64"; fi && \ diff --git a/api/admin.py b/api/admin.py index 5cf7346..e69d43e 100644 --- a/api/admin.py +++ b/api/admin.py @@ -906,25 +906,47 @@ async def partial_search() -> HTMLResponse: max_results=max_results, ) - @app.get("/partials/memory", dependencies=[Depends(auth)]) - async def partial_memory() -> HTMLResponse: - """Memory tab partial.""" - # Memory config - memory_long_term_limit = await config_store.get("memory.long_term_limit") or "50" + async def _render_memory_partial() -> HTMLResponse: + """Build the Memory tab partial (config + stored memories). - # Memory data — read directly from DB (works even when agent is stopped) + Shared by the tab load and the post-delete refresh so both render the + full embedding/lifecycle config, not just the memory tables. 
+ """ import aiosqlite + async def _cfg(key: str, default: str) -> str: + val = await config_store.get(key) + return default if val is None or val == "" else str(val) + + async def _bool(key: str, default: str) -> str: + val = await config_store.get(key) + return default if val is None else str(val).lower() + + ctx: dict[str, object] = { + "memory_long_term_limit": await _cfg("memory.long_term_limit", "50"), + "emb_enabled": await _bool("memory.embedding.enabled", "true"), + "emb_provider": await _cfg("memory.embedding.provider", "local"), + "emb_model": await _cfg("memory.embedding.model", "BAAI/bge-small-en-v1.5"), + "emb_base_url": await _cfg("memory.embedding.base_url", ""), + "emb_top_k": await _cfg("memory.embedding.injection_top_k", "12"), + "hygiene_enabled": await _bool("memory.hygiene_enabled", "true"), + "default_importance": await _cfg("memory.default_importance", "5.0"), + "archive_after_days": await _cfg("memory.archive_after_days", "90"), + "archive_max_importance": await _cfg("memory.archive_max_importance", "4.0"), + "archive_min_idle_days": await _cfg("memory.archive_min_idle_days", "45"), + "hygiene_threshold": await _cfg("memory.hygiene_similarity_threshold", "0.45"), + } + + # Memory data — read directly from DB (works even when agent is stopped) memory_db = await config_store.get("memory.db_path") or "data/memory.db" - long_term = [] - short_term = [] + long_term: list[dict] = [] + short_term: list[dict] = [] if Path(memory_db).exists(): cols = "id, category, subject, content, source, confidence, created_at, updated_at" async with aiosqlite.connect(memory_db) as db: db.row_factory = aiosqlite.Row cursor = await db.execute(f"SELECT {cols} FROM long_term ORDER BY updated_at DESC") long_term = [dict(row) for row in await cursor.fetchall()] - cursor = await db.execute( "SELECT id, content, context, expires_at, created_at " "FROM short_term WHERE expires_at > datetime('now') " @@ -933,12 +955,14 @@ async def partial_memory() -> HTMLResponse: 
short_term = [dict(row) for row in await cursor.fetchall()] return _render_partial( - "partials/memory.html", - long_term=long_term, - short_term=short_term, - memory_long_term_limit=memory_long_term_limit, + "partials/memory.html", long_term=long_term, short_term=short_term, **ctx ) + @app.get("/partials/memory", dependencies=[Depends(auth)]) + async def partial_memory() -> HTMLResponse: + """Memory tab partial.""" + return await _render_memory_partial() + @app.get("/partials/history", dependencies=[Depends(auth)]) async def partial_history() -> HTMLResponse: """History tab partial.""" @@ -1243,7 +1267,18 @@ async def patch_config(body: ConfigPatchIn) -> dict: agent.llm = LLMClient.from_agent_config(new_config.agent) agent.executor.tool_env = tool_env(new_config) agent.history_mode = new_config.history.mode - agent.memory.long_term_limit = new_config.memory.long_term_limit + mem_cfg = new_config.memory + agent.memory.long_term_limit = mem_cfg.long_term_limit + # Rebuild the embedder (lazy — no model load here) and refresh the + # Tier 3/4 lifecycle knobs so memory config changes apply live. 
+ agent.memory.embedder = agent._build_embedder() + agent.memory.injection_top_k = mem_cfg.embedding.injection_top_k + agent.memory.default_importance = mem_cfg.default_importance + agent.memory.archive_after_days = mem_cfg.archive_after_days + agent.memory.archive_max_importance = mem_cfg.archive_max_importance + agent.memory.archive_min_idle_days = mem_cfg.archive_min_idle_days + agent.memory.hygiene_enabled = mem_cfg.hygiene_enabled + agent.memory.hygiene_similarity_threshold = mem_cfg.hygiene_similarity_threshold agent.reflections.max_reflections = new_config.task_reflection.max_reflections if new_config.search.enabled and new_config.search.api_key: from tavily import TavilyClient @@ -1277,15 +1312,16 @@ async def system_prompt_preview(body: PromptPreviewIn) -> dict: memories = "" if body.include_memories: + query = message or None if agent_state.agent: - memories = await agent_state.agent.memory.format_for_prompt() + memories = await agent_state.agent.memory.format_for_prompt(query=query) else: from core.memory import MemoryStore memories = await MemoryStore( db_path=config.memory.db_path, long_term_limit=config.memory.long_term_limit, - ).format_for_prompt() + ).format_for_prompt(query=query) reflections = "" if body.include_reflections and config.task_reflection.enabled: @@ -1964,28 +2000,91 @@ async def delete_memory(request: Request) -> HTMLResponse: if cursor.rowcount == 0: raise HTTPException(404, f"Memory {memory_id} not found in {tier}") - # Return refreshed memory partial - memory_long_term_limit = await config_store.get("memory.long_term_limit") or "50" - long_term = [] - short_term = [] - cols = "id, category, subject, content, source, confidence, created_at, updated_at" - async with aiosqlite.connect(agent.memory.db_path) as db: - db.row_factory = aiosqlite.Row - cursor = await db.execute(f"SELECT {cols} FROM long_term ORDER BY updated_at DESC") - long_term = [dict(row) for row in await cursor.fetchall()] - cursor = await db.execute( - "SELECT id, 
content, context, expires_at, created_at " - "FROM short_term WHERE expires_at > datetime('now') " - "ORDER BY created_at DESC" - ) - short_term = [dict(row) for row in await cursor.fetchall()] - return _render_partial( - "partials/memory.html", - long_term=long_term, - short_term=short_term, - memory_long_term_limit=memory_long_term_limit, + # Return refreshed memory partial (full config + tables) + return await _render_memory_partial() + + @app.get("/memory/embedding/status", dependencies=[Depends(auth)]) + async def embedding_status() -> dict: + """Report embedding config + whether a local model is already on disk.""" + from core.embeddings import LOCAL_PROVIDERS + + config = await config_store.export_to_config() + emb = config.memory.embedding + is_local = emb.provider in LOCAL_PROVIDERS + model_ready: bool | None = None + if is_local: + cache = Path(emb.cache_dir) + model_ready = cache.exists() and any(cache.rglob("*.onnx")) + return { + "enabled": emb.enabled, + "provider": emb.provider, + "model": emb.model, + "local": is_local, + "model_ready": model_ready, + "cache_dir": emb.cache_dir, + } + + @app.post("/memory/embedding/prefetch", dependencies=[Depends(auth)]) + async def embedding_prefetch() -> dict: + """Download the local embedding model now (also done at Docker build).""" + from core.embeddings import LOCAL_PROVIDERS, prefetch_local_model + + config = await config_store.export_to_config() + emb = config.memory.embedding + if emb.provider not in LOCAL_PROVIDERS: + raise HTTPException(400, "Prefetch only applies to the local embedding provider") + try: + dim = await asyncio.to_thread(prefetch_local_model, emb.model, emb.cache_dir) + except Exception as exc: + log.exception("Embedding model prefetch failed") + raise HTTPException(500, f"Prefetch failed: {exc}") from exc + return {"ok": True, "model": emb.model, "dimensions": dim, "cache_dir": emb.cache_dir} + + @app.post("/memory/embedding/test", dependencies=[Depends(auth)]) + async def 
embedding_test() -> dict: + """Embed a few probe sentences and report dimension + a sanity cosine.""" + from core.embeddings import ( + LOCAL_PROVIDERS, + EmbeddingClient, + LocalEmbeddingClient, + cosine_similarity, ) + config = await config_store.export_to_config() + emb = config.memory.embedding + try: + if emb.provider in LOCAL_PROVIDERS: + client: object = LocalEmbeddingClient(model=emb.model, cache_dir=emb.cache_dir) + else: + cfg = config.agent + api_key = emb.api_key or getattr(cfg, f"{emb.provider}_api_key", "") + base_url = emb.base_url or getattr(cfg, f"{emb.provider}_base_url", "") or None + if not api_key: + raise HTTPException(400, f"No API key configured for provider {emb.provider}") + client = EmbeddingClient( + provider=emb.provider, + api_key=api_key, + model=emb.model, + base_url=base_url, + dimensions=emb.dimensions, + ) + probes = ["allergic to shellfish", "cannot eat prawns", "the weather is sunny today"] + vecs = await client.embed(probes) # type: ignore[attr-defined] + if len(vecs) < 3 or not vecs[0]: + raise HTTPException(500, "Embedding returned no vectors") + return { + "ok": True, + "model": emb.model, + "dimensions": len(vecs[0]), + "similar_pair": round(cosine_similarity(vecs[0], vecs[1]), 3), + "unrelated_pair": round(cosine_similarity(vecs[0], vecs[2]), 3), + } + except HTTPException: + raise + except Exception as exc: + log.exception("Embedding test failed") + raise HTTPException(500, f"Test failed: {exc}") from exc + @app.post("/memory/consolidate", dependencies=[Depends(auth)]) async def trigger_consolidation() -> HTMLResponse: agent = agent_state.agent diff --git a/api/templates/partials/memory.html b/api/templates/partials/memory.html index 6cf9ede..ca23f24 100644 --- a/api/templates/partials/memory.html +++ b/api/templates/partials/memory.html @@ -2,8 +2,67 @@
+ cfgOk: false, + // Embeddings (Tier 2) + embEnabled: {{ emb_enabled|default('true', true)|tojson|forceescape }} === 'true', + embProvider: {{ emb_provider|default('local', true)|tojson|forceescape }}, + embModel: {{ emb_model|default('BAAI/bge-small-en-v1.5', true)|tojson|forceescape }}, + embBaseUrl: {{ emb_base_url|default('', true)|tojson|forceescape }}, + embTopK: {{ emb_top_k|default('12', true)|tojson|forceescape }}, + embResult: '', embOk: false, + embStatus: null, embBusy: false, embTestResult: '', + // Lifecycle (Tier 3/4) + defaultImportance: {{ default_importance|default('5.0', true)|tojson|forceescape }}, + archiveAfterDays: {{ archive_after_days|default('90', true)|tojson|forceescape }}, + archiveMaxImportance: {{ archive_max_importance|default('4.0', true)|tojson|forceescape }}, + archiveMinIdleDays: {{ archive_min_idle_days|default('45', true)|tojson|forceescape }}, + hygieneEnabled: {{ hygiene_enabled|default('true', true)|tojson|forceescape }} === 'true', + hygieneThreshold: {{ hygiene_threshold|default('0.45', true)|tojson|forceescape }}, + lifeResult: '', lifeOk: false, + get isLocal() { return this.embProvider === 'local' || this.embProvider === 'fastembed'; }, + _hdrs() { + return { + 'Content-Type': 'application/json', + 'Authorization': 'Bearer ' + (localStorage.getItem('admin_api_key') || '') + }; + }, + saveCfg(vals, okMsg, okField, resField) { + return fetch('/config', { method: 'PATCH', headers: this._hdrs(), body: JSON.stringify({values: vals}) }) + .then(r => { this[okField] = r.ok; return r.json(); }) + .then(d => { + this[resField] = this[okField] ? 
okMsg : (d.detail || 'Error'); + if (this[okField] && window.showToast) { window.showToast(okMsg); } + }) + .catch(e => { this[okField] = false; this[resField] = e.message; }); + }, + loadEmbStatus() { + fetch('/memory/embedding/status', { headers: this._hdrs() }) + .then(r => r.json()).then(d => { this.embStatus = d; }).catch(() => {}); + }, + downloadModel() { + this.embBusy = true; this.embTestResult = 'Downloading model… (first time may take a minute)'; + fetch('/memory/embedding/prefetch', { method: 'POST', headers: this._hdrs() }) + .then(r => r.json().then(d => ({ok: r.ok, d}))) + .then(({ok, d}) => { + this.embTestResult = ok ? ('Model ready (dim ' + d.dimensions + ')') : ('Error: ' + (d.detail || 'failed')); + this.loadEmbStatus(); + }) + .catch(e => { this.embTestResult = 'Error: ' + e.message; }) + .finally(() => { this.embBusy = false; }); + }, + testEmb() { + this.embBusy = true; this.embTestResult = 'Testing…'; + fetch('/memory/embedding/test', { method: 'POST', headers: this._hdrs() }) + .then(r => r.json().then(d => ({ok: r.ok, d}))) + .then(({ok, d}) => { + this.embTestResult = ok + ? ('OK · dim ' + d.dimensions + ' · related=' + d.similar_pair + ' vs unrelated=' + d.unrelated_pair) + : ('Error: ' + (d.detail || 'failed')); + }) + .catch(e => { this.embTestResult = 'Error: ' + e.message; }) + .finally(() => { this.embBusy = false; }); + } + }" x-init="loadEmbStatus()"> {# Memory config section #}

Memory Settings

@@ -20,25 +79,7 @@

Memory Settings

@@ -52,6 +93,163 @@

Memory Settings

+ {# Semantic memory (embeddings) — Tier 2 #} +
+

Semantic memory (embeddings)

+

+ Match memories by meaning, not just words — so “allergic to shellfish” surfaces when you ask about eating prawns. +

+ +
+ How it works (the science) +
+

An embedding model turns each memory and your current message into a vector — a list of numbers positioning the text in a “meaning space”. Texts with similar meaning land close together, even with no shared words.

+

Closeness is measured by cosine similarity (1.0 = same direction/meaning, 0 = unrelated). For every incoming message, we embed it, compare it against the stored memory vectors, and inject only the most relevant ones — ranked by relevance + importance + recency (a Generative-Agents-style score) — instead of dumping everything into the prompt.

+

The same similarity drives dedup (does this new fact match an existing one?) and the hygiene pass (cluster & merge near-duplicates).

+

Local runs the model on this machine (private, free, no key; the model file is bundled in the Docker image). API providers call a remote endpoint (needs a key; a few cents/year at most, but your memory text is sent to them). When disabled, memory falls back to fast word-overlap matching — still works, just less “fuzzy”.

+
+
+ +
+
+ + + Off → lexical (word-overlap) retrieval, no model needed. +
+ +
+ + +

+ API key falls back to the matching provider key set in the + LLM tab. + (Note: DeepSeek has no embeddings endpoint.) +

+
+ +
+ + +

Small CPU model (384-dim, ~130MB). Bundled in Docker; use “Download model” if missing.

+

e.g. text-embedding-3-small (OpenAI) or text-embedding-004 (Google).

+
+ +
+ + +
+ +
+ + +

How many of the most relevant long-term memories to put in the prompt each message.

+
+ +
+ Model on disk: + yes ✓ + not downloaded yet +
+
+ +
+ + + +
+ + +

Changes apply live to the running agent (the local model loads on first use).

+
+ + {# Memory lifecycle (forgetting + hygiene) — Tier 3/4 #} +
+

Memory lifecycle (forgetting & hygiene)

+

Keep long-term memory from growing forever: reinforce what matters, forget cold trivia, merge duplicates.

+ +
+ How it works (the science) +
+

Each memory has an importance (1–10). Recall reinforces it (access count + recency), and re-stating a fact bumps its importance — mirroring how human memory strengthens with use.

+

Forgetting: during consolidation, memories that are old, low-importance, and not accessed for a while are archived (soft-deleted, recoverable) rather than being injected forever. This is decay, like the “use it or lose it” curve.

+

Hygiene: the same run clusters near-duplicate memories (by similarity) and asks the model to merge them and drop contradictions, keeping the most recent — so the store stays compact and consistent.

+
+
+ +
+
+ + +

Starting importance for a new memory.

+
+
+ + +

A memory must be at least this old before it can be archived.

+
+
+ + +

Important memories are never auto-archived.

+
+
+ + +

Minimum number of days since a memory's last access or creation before it can be archived.

+
+
+
+ + + Cluster & merge near-duplicates during consolidation. +
+
+ + +

Higher = stricter (only very similar memories merge). 0.45 is a sensible default.

+
+
+ +
+ +
+ +
+ {# Memory data #}