diff --git a/README.md b/README.md
index cc19188f..27ad2e16 100644
--- a/README.md
+++ b/README.md
@@ -109,6 +109,7 @@ wiki/ │ ← the foundation
├── sources/ Full-text conversions
├── summaries/ Per-document summaries
├── concepts/ Cross-document synthesis ← the good stuff
+ ├── entities/ Specific named things (people, orgs, places, products)
├── explorations/ Saved query results
└── reports/ Lint reports
│
@@ -136,9 +137,10 @@ Short docs are read in full by the LLM. Long PDFs are indexed by PageIndex into
When you add a document, the LLM:
1. Generates a **summary** page
-2. Reads existing **concept** pages
+2. Reads existing **concept** and **entity** pages
3. Creates or updates concepts with cross-document synthesis
-4. Updates the **index** and **log**
+4. Creates or updates **entity** pages (people, orgs, places, products)
+5. Updates the **index** and **log**
A single source might touch 10-15 wiki pages. Knowledge accumulates: each document enriches the existing wiki rather than sitting in isolation.
@@ -153,6 +155,7 @@ OpenKB commands fall into two layers: the **wiki foundation** (compile + manage
| `openkb init` | Initialize a new knowledge base (interactive) |
| openkb add <file_or_dir_or_URL> | Add documents and compile to wiki. URL ingest auto-detects PDF (saved as `.pdf` → PageIndex / markitdown) vs HTML (trafilatura main-content extract → `.md`) |
| openkb remove <doc> | Remove a document and clean up its wiki pages, images, registry, and PageIndex state (use `--dry-run` to preview, `--keep-raw` / `--keep-empty-concepts` to retain artifacts) |
+| openkb recompile [<doc>] [--all] | Re-run the current compile pipeline on already-indexed docs (e.g. to backfill the `entities/` layer) without re-indexing. Regenerates summaries and rewrites concept pages — manual edits are overwritten. Use `--dry-run` to preview, `--refresh-schema` to also update `wiki/AGENTS.md` |
| `openkb watch` | Watch `raw/` and auto-compile new files |
| `openkb lint` | Run structural + knowledge health checks |
| `openkb list` | List indexed documents and concepts |
diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py
index 87d96652..6751ae49 100644
--- a/openkb/agent/compiler.py
+++ b/openkb/agent/compiler.py
@@ -30,7 +30,7 @@
import yaml
from openkb.lint import list_existing_wiki_targets, strip_ghost_wikilinks
-from openkb.schema import get_agents_md
+from openkb.schema import INDEX_SEED, get_agents_md
logger = logging.getLogger(__name__)
@@ -68,29 +68,53 @@
"""
+# Canonical entity-type enum — the single source of truth shared by the
+# plan prompt, the entity-page prompts, and create/update validation. The
+# prompt templates carry an ``__ENTITY_TYPES__`` token that is substituted
+# with this list once at import time (see below), so adding a type here
+# updates every place at once.
+_ENTITY_TYPE_LIST = ("person", "organization", "place", "product", "work", "event", "other")
+_ENTITY_TYPES = frozenset(_ENTITY_TYPE_LIST)
+_ENTITY_TYPES_STR = ", ".join(_ENTITY_TYPE_LIST)
+
+
_CONCEPTS_PLAN_USER = """\
-Based on the summary above, decide how to update the wiki's concept pages.
+Based on the summary above, decide how to update the wiki's CONCEPT pages and
+ENTITY pages.
+
+A CONCEPT is an abstract, recurring idea/pattern/mechanism (e.g. "agentic
+systems"). An ENTITY is a specific named thing — a person, organization,
+place, product, named work, or event (e.g. "Anthropic"). Each name goes in
+exactly ONE group. A topic may have both (entity "NVIDIA" and concept
+"ai-infrastructure-demand"); they cross-link, they do not merge.
Existing concept pages:
{concept_briefs}
-Return a JSON object with three keys:
+Existing entity pages (with source counts = how many docs already cite them):
+{entity_briefs}
-1. "create" — new concepts not covered by any existing page. Array of objects:
- {{"name": "concept-slug", "title": "Human-Readable Title"}}
+Return a JSON object with two top-level keys, "concepts" and "entities".
-2. "update" — existing concepts that have significant new information from \
-this document worth integrating. Array of objects:
- {{"name": "existing-slug", "title": "Existing Title"}}
+"concepts" is an object with:
+1. "create" — new concepts. Array of {{"name": "concept-slug", "title": "Title"}}
+2. "update" — existing concepts with significant new info. Same shape.
+3. "related" — existing concept slugs to cross-link only. Array of strings.
-3. "related" — existing concepts tangentially related to this document but \
-not needing content changes, just a cross-reference link. Array of slug strings.
+"entities" is an object with the same three keys, but create/update objects
+add a "type" field, one of: __ENTITY_TYPES__. Example:
+ {{"name": "anthropic", "title": "Anthropic", "type": "organization"}}
Rules:
- For the first few documents, create 2-3 foundational concepts at most.
-- Do NOT create a concept that overlaps with an existing one — use "update".
+- Create an ENTITY page only when the entity is (a) central to this document
+ or (b) likely to recur across sources. Do NOT page proper nouns mentioned
+ only in passing. Roughly 5-15 entities per document is typical; fewer for
+ sparse documents.
+- Prefer "update" over "create" for any concept or entity already listed above.
+- Do NOT create a concept/entity that overlaps an existing one — use "update".
- Do NOT create concepts that are just the document topic itself.
-- "related" is for lightweight cross-linking only, no content rewrite needed.
+- "related" is lightweight cross-linking only, no content rewrite.
Return ONLY valid JSON, no fences, no explanation.
"""
@@ -104,8 +128,9 @@
Rules for [[wikilinks]] in all subsequent responses:
- For [[concepts/X]]: X must appear in the whitelist above.
- For [[summaries/Y]]: Y must appear in the whitelist above.
+- For [[entities/Z]]: Z must appear in the whitelist above.
- Do NOT invent new wikilink targets. If you want to mention a concept \
-that is not in the whitelist, write it as plain text without brackets.
+or entity that is not in the whitelist, write it as plain text without brackets.
"""
_CONCEPT_PAGE_USER = """\
@@ -147,6 +172,47 @@
Return ONLY valid JSON, no fences.
"""
+_ENTITY_PAGE_USER = """\
+Write the entity page for: {title} (type: {type})
+
+This entity relates to the document "{doc_name}" summarized above.
+
+Return a JSON object with three keys:
+- "brief": A single sentence (under 100 chars) identifying this entity
+- "type": one of __ENTITY_TYPES__
+- "content": The full entity page in Markdown — what this entity is, the key
+ facts about it from this document, and [[wikilinks]] to related concepts,
+ other [[entities/...]], and [[summaries/{doc_name}]] — subject to the
+ whitelist rules from the message above.
+
+Return ONLY valid JSON, no fences.
+"""
+
+_ENTITY_UPDATE_USER = """\
+Update the entity page for: {title} (type: {type})
+
+Current content of this page:
+{existing_content}
+
+Integrate the new facts about this entity from document "{doc_name}"
+(summarized above). Rewrite the full page — do not just append. Preserve the
+existing structure and intent. Follow the whitelist rules from the message
+above for all [[wikilinks]].
+
+Return a JSON object with three keys:
+- "brief": A single sentence (under 100 chars) identifying this entity
+- "type": one of __ENTITY_TYPES__
+- "content": The rewritten full entity page in Markdown
+
+Return ONLY valid JSON, no fences.
+"""
+
+# Substitute the canonical entity-type list into every prompt that advertises
+# it, so the prompt text can never drift from ``_ENTITY_TYPES`` validation.
+_CONCEPTS_PLAN_USER = _CONCEPTS_PLAN_USER.replace("__ENTITY_TYPES__", _ENTITY_TYPES_STR)
+_ENTITY_PAGE_USER = _ENTITY_PAGE_USER.replace("__ENTITY_TYPES__", _ENTITY_TYPES_STR)
+_ENTITY_UPDATE_USER = _ENTITY_UPDATE_USER.replace("__ENTITY_TYPES__", _ENTITY_TYPES_STR)
+
_SUMMARY_REWRITE_USER = """\
Task: Rewrite the summary you wrote above into a final version that is \
consistent with the concept pages now in the wiki (per the whitelist message \
@@ -366,6 +432,50 @@ def _filter_related_slugs(items: list) -> list[str]:
return valid
+def _filter_entity_items(items: object) -> list[dict]:
+ """Validate entity create/update objects: require name, coerce title/type.
+
+ Each kept item is normalized to ``{"name", "title", "type"}`` where
+ ``type`` falls back to ``"other"`` when missing or outside the entity
+ enum and ``title`` falls back to ``name`` when it is not a string.
+ """
+ out: list[dict] = []
+ if not isinstance(items, list):
+ return out
+ for it in items:
+ if not isinstance(it, dict):
+ continue
+ name = it.get("name")
+ if not isinstance(name, str) or not name.strip():
+ continue
+ title = it.get("title") if isinstance(it.get("title"), str) else name
+ etype = it.get("type")
+ if not isinstance(etype, str) or etype not in _ENTITY_TYPES:
+ etype = "other"
+ out.append({"name": name, "title": title, "type": etype})
+ return out
+
+
+def _parse_entities_plan(parsed: object) -> dict:
+ """Extract the entities group from a plan dict, with graceful fallback.
+
+ Returns ``{"create": [...], "update": [...], "related": [...]}``. A
+ missing/malformed ``entities`` key yields empty lists, so older or
+ partial LLM responses never raise.
+ """
+ empty = {"create": [], "update": [], "related": []}
+ if not isinstance(parsed, dict):
+ return empty
+ group = parsed.get("entities")
+ if not isinstance(group, dict):
+ return empty
+ return {
+ "create": _filter_entity_items(group.get("create", [])),
+ "update": _filter_entity_items(group.get("update", [])),
+ "related": _filter_related_slugs(group.get("related", [])),
+ }
+
+
# ---------------------------------------------------------------------------
# File I/O helpers
# ---------------------------------------------------------------------------
@@ -422,6 +532,52 @@ def _read_concept_briefs(wiki_dir: Path) -> str:
return "\n".join(lines) or "(none yet)"
+def _read_entity_briefs(wiki_dir: Path) -> str:
+ """Read existing entity pages as compact lines for the plan call.
+
+ Formats each as ``- {slug} ({type}, {n} sources) — {brief}``. The source
+ count is the cross-document recurrence signal the LLM uses to decide
+ create-vs-update and salience. Returns "(none yet)" when empty.
+ """
+ entities_dir = wiki_dir / "entities"
+ if not entities_dir.exists():
+ return "(none yet)"
+
+ md_files = sorted(entities_dir.glob("*.md"))
+ if not md_files:
+ return "(none yet)"
+
+ lines: list[str] = []
+ for path in md_files:
+ text = path.read_text(encoding="utf-8")
+ brief = ""
+ etype = "other"
+ n_sources = 0
+ body = text
+ if text.startswith("---"):
+ end = text.find("---", 3)
+ if end != -1:
+ fm_text = text[3:end].strip("\n")
+ body = text[end + 3:]
+ try:
+ fm = yaml.safe_load(fm_text)
+ except yaml.YAMLError:
+ fm = None
+ if isinstance(fm, dict):
+ if isinstance(fm.get("brief"), str):
+ brief = fm["brief"].strip()
+ if isinstance(fm.get("type"), str):
+ etype = fm["type"].strip() or "other"
+ if isinstance(fm.get("sources"), list):
+ n_sources = len(fm["sources"])
+ if not brief:
+ brief = body.strip().replace("\n", " ")[:150]
+ suffix = f" — {brief}" if brief else ""
+ lines.append(f"- {path.stem} ({etype}, {n_sources} sources){suffix}")
+
+ return "\n".join(lines) or "(none yet)"
+
+
def _iter_h2_headings(lines: list[str]) -> list[tuple[int, str]]:
"""Return ``[(line_index, normalized_heading), ...]`` for every ATX H2.
@@ -457,20 +613,26 @@ def _get_section_bounds(lines: list[str], heading: str) -> tuple[int, int] | Non
return None
-def _ensure_h2_section(lines: list[str], heading: str) -> None:
+def _ensure_h2_section(lines: list[str], heading: str, *, quiet: bool = False) -> None:
"""Ensure an H2 section ``heading`` exists in ``lines``; append if missing.
Recovers from hand-edited or drifted index.md files where the expected
section was removed or renamed — without this, downstream inserts would
silently no-op and entries would be dropped.
+
+ ``quiet=True`` suppresses the drift warning. Use it when adding a section
+ is the normal, expected operation (e.g. a backlink helper creating a
+ ``## Related Documents`` / ``## Entities`` section on a page for the first
+ time), as opposed to repairing a drifted index.
"""
if _get_section_bounds(lines, heading) is not None:
return
- logger.warning(
- "Wiki page is missing %r section; appending it. "
- "Check whether the file was hand-edited away from the canonical layout.",
- heading,
- )
+ if not quiet:
+ logger.warning(
+ "Wiki page is missing %r section; appending it. "
+ "Check whether the file was hand-edited away from the canonical layout.",
+ heading,
+ )
while lines and lines[-1] == "":
lines.pop()
if lines:
@@ -479,6 +641,33 @@ def _ensure_h2_section(lines: list[str], heading: str) -> None:
lines.append("")
+def _ensure_h2_section_before(
+ lines: list[str], heading: str, before: str,
+) -> None:
+ """Ensure H2 ``heading`` exists, inserting it just before ``before``.
+
+ If ``heading`` is already present, no-op. If ``before`` is absent, fall
+ back to :func:`_ensure_h2_section` (append at end). This keeps the
+ canonical index order (e.g. ``## Entities`` ahead of ``## Explorations``)
+ when recovering an older index.md that predates the section.
+ """
+ if _get_section_bounds(lines, heading) is not None:
+ return
+ before_bounds = _get_section_bounds(lines, before)
+ if before_bounds is None:
+ _ensure_h2_section(lines, heading)
+ return
+ # ``before_bounds[0]`` is the line after the ``before`` heading, so the heading
+ # itself is at index - 1; insert the new section (heading + blank line) there.
+ insert_at = before_bounds[0] - 1
+ logger.warning(
+ "Wiki index is missing %r section; inserting it before %r. "
+ "Check whether the file was hand-edited away from the canonical layout.",
+ heading, before,
+ )
+ lines[insert_at:insert_at] = [heading, ""]
+
+
def _section_contains_link(lines: list[str], heading: str, link: str) -> bool:
"""Check whether an index entry already exists inside the named section."""
bounds = _get_section_bounds(lines, heading)
@@ -655,6 +844,78 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
path.write_text(frontmatter + content, encoding="utf-8")
+def _write_entity(
+ wiki_dir: Path, name: str, content: str, source_file: str,
+ is_update: bool, brief: str = "", type_: str = "other",
+ aliases: list[str] | None = None,
+) -> None:
+ """Write or update an entity page in entities/, managing frontmatter.
+
+ Frontmatter fields: ``sources`` (list), ``type`` (one of the entity
+ enum), ``brief`` (one-liner), and optional ``aliases`` (list, omitted
+ when empty). On update the new source is prepended, the body is replaced
+ with the LLM rewrite; ``type`` and a non-empty ``brief`` are overwritten.
+ """
+ entities_dir = wiki_dir / "entities"
+ entities_dir.mkdir(parents=True, exist_ok=True)
+ safe_name = _sanitize_concept_name(name)
+ path = (entities_dir / f"{safe_name}.md").resolve()
+ if not path.is_relative_to(entities_dir.resolve()):
+ logger.warning("Entity name escapes entities dir: %s", name)
+ return
+
+ # Strip any frontmatter the LLM body may carry.
+ clean = content
+ if clean.startswith("---"):
+ end = clean.find("---", 3)
+ if end != -1:
+ clean = clean[end + 3:].lstrip("\n")
+
+ def _build_frontmatter(sources: list[str]) -> str:
+ fm_lines = [_yaml_list_line("sources", sources)]
+ fm_lines.append(_yaml_kv_line("type", type_ or "other"))
+ if brief:
+ fm_lines.append(_yaml_kv_line("brief", brief))
+ if aliases:
+ fm_lines.append(_yaml_list_line("aliases", aliases))
+ return "---\n" + "\n".join(fm_lines) + "\n---\n\n"
+
+ if is_update and path.exists():
+ existing = path.read_text(encoding="utf-8")
+ if source_file not in existing:
+ existing = _prepend_source_to_frontmatter(existing, source_file)
+ end = existing.find("---", 3) if existing.startswith("---") else -1
+ if end != -1:
+ fm = existing[:end + 3]
+ fm = _set_fm_line(fm, "brief", brief) if brief else fm
+ fm = _set_fm_line(fm, "type", type_) if type_ else fm
+ existing = fm + "\n\n" + clean
+ else:
+ # Malformed/absent frontmatter (opening ``---`` with no closing
+ # delimiter, or no frontmatter at all): rebuild valid frontmatter
+ # rather than writing a body-only page and dropping sources/type/
+ # brief. The old block is discarded, so the rebuilt ``sources`` list
+ # holds only the new source; any sources the malformed block recorded
+ # are not carried over.
+ existing = _build_frontmatter([source_file]) + clean
+ path.write_text(existing, encoding="utf-8")
+ return
+
+ path.write_text(_build_frontmatter([source_file]) + clean, encoding="utf-8")
+
+
+def _set_fm_line(fm: str, key: str, value: str) -> str:
+ """Set or replace a single scalar ``key:`` line inside a frontmatter block.
+
+ ``fm`` includes the opening and closing ``---`` markers. Uses a lambda
+ replacement so backslashes or group references in ``value`` stay literal.
+ """
+ line = _yaml_kv_line(key, value)
+ if re.search(rf"^{re.escape(key)}:", fm, flags=re.MULTILINE):
+ return re.sub(rf"^{re.escape(key)}:.*", lambda _m: line, fm, count=1, flags=re.MULTILINE)
+ return fm.replace("---\n", f"---\n{line}\n", 1)
+
+
def _prepend_source_to_frontmatter(text: str, source_file: str) -> str:
"""Prepend ``source_file`` to the inline ``sources:`` list in YAML frontmatter.
@@ -725,85 +986,117 @@ def _remove_source_from_frontmatter(text: str, source_file: str) -> tuple[str, b
return text, False
-def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_file: str) -> None:
- """Add a cross-reference link to an existing concept page (no LLM call)."""
- concepts_dir = wiki_dir / "concepts"
- path = concepts_dir / f"{concept_slug}.md"
+def _add_related_link(
+ wiki_dir: Path, slug: str, doc_name: str, source_file: str,
+ page_dir: str = "concepts",
+) -> bool:
+ """Add a cross-reference link to an existing page (no LLM call).
+
+ Works for any page directory (``concepts`` or ``entities``). Returns True
+ when the page exists (whether or not a link was added), so callers can
+ track which related slugs are real pages. The standalone ``See also:``
+ paragraph it writes is the one ``_remove_doc_from_pages`` strips on removal.
+ """
+ path = wiki_dir / page_dir / f"{slug}.md"
if not path.exists():
- return
+ return False
text = path.read_text(encoding="utf-8")
link = f"[[summaries/{doc_name}]]"
if link in text:
- return
+ return True
if source_file not in text:
text = _prepend_source_to_frontmatter(text, source_file)
text += f"\n\nSee also: {link}"
path.write_text(text, encoding="utf-8")
+ return True
-def _backlink_summary(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -> None:
- """Append missing concept wikilinks to the summary page (no LLM call).
-
- After all concepts are generated, this ensures the summary page links
- back to every related concept — closing the bidirectional link that
- concept pages already have toward the summary.
+def _backlink_summary_pages(
+ wiki_dir: Path, doc_name: str, slugs: list[str],
+ *, page_dir: str, section: str,
+) -> None:
+ """Append missing ``[[{page_dir}/slug]]`` wikilinks to the summary page.
- If a ``## Related Concepts`` section already exists, new links are
- appended into it rather than creating a duplicate section.
+ Closes the bidirectional link the pages already hold toward the summary,
+ inserting them under ``section`` (created if absent). Shared by the
+ concept and entity summary-backlink wrappers below.
"""
summary_path = wiki_dir / "summaries" / f"{doc_name}.md"
if not summary_path.exists():
return
text = summary_path.read_text(encoding="utf-8")
- missing = [slug for slug in concept_slugs if f"[[concepts/{slug}]]" not in text]
+ missing = [slug for slug in slugs if f"[[{page_dir}/{slug}]]" not in text]
if not missing:
return
lines = text.split("\n")
- _ensure_h2_section(lines, "## Related Concepts")
+ _ensure_h2_section(lines, section, quiet=True)
for slug in reversed(missing):
- _insert_section_entry(lines, "## Related Concepts", f"- [[concepts/{slug}]]")
+ _insert_section_entry(lines, section, f"- [[{page_dir}/{slug}]]")
summary_path.write_text("\n".join(lines), encoding="utf-8")
-def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -> None:
- """Append missing summary wikilink to each concept page (no LLM call).
-
- Ensures every concept page links back to the source document's summary,
- regardless of whether the LLM included the link in its output.
-
- If a ``## Related Documents`` section already exists, the link is
- appended into it rather than creating a duplicate section.
- """
+def _backlink_pages(
+ wiki_dir: Path, doc_name: str, slugs: list[str], *, page_dir: str,
+) -> None:
+ """Append the source summary wikilink to each page under '## Related
+ Documents'. Shared by the concept and entity page-backlink wrappers."""
link = f"[[summaries/{doc_name}]]"
- concepts_dir = wiki_dir / "concepts"
+ pages_dir = wiki_dir / page_dir
- for slug in concept_slugs:
- path = concepts_dir / f"{slug}.md"
+ for slug in slugs:
+ path = pages_dir / f"{slug}.md"
if not path.exists():
continue
text = path.read_text(encoding="utf-8")
if link in text:
continue
lines = text.split("\n")
- _ensure_h2_section(lines, "## Related Documents")
+ _ensure_h2_section(lines, "## Related Documents", quiet=True)
_insert_section_entry(lines, "## Related Documents", f"- {link}")
path.write_text("\n".join(lines), encoding="utf-8")
-def remove_doc_from_concept_pages(
+def _backlink_summary(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -> None:
+ """Link the summary page back to every related concept (no LLM call)."""
+ _backlink_summary_pages(
+ wiki_dir, doc_name, concept_slugs,
+ page_dir="concepts", section="## Related Concepts",
+ )
+
+
+def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -> None:
+ """Link every related concept page back to the source summary (no LLM call)."""
+ _backlink_pages(wiki_dir, doc_name, concept_slugs, page_dir="concepts")
+
+
+def _backlink_summary_entities(wiki_dir: Path, doc_name: str, entity_slugs: list[str]) -> None:
+ """Link the summary page back to every related entity under '## Entities'."""
+ _backlink_summary_pages(
+ wiki_dir, doc_name, entity_slugs,
+ page_dir="entities", section="## Entities",
+ )
+
+
+def _backlink_entities(wiki_dir: Path, doc_name: str, entity_slugs: list[str]) -> None:
+ """Link every related entity page back to the source summary (no LLM call)."""
+ _backlink_pages(wiki_dir, doc_name, entity_slugs, page_dir="entities")
+
+
+def _remove_doc_from_pages(
wiki_dir: Path,
doc_name: str,
*,
+ page_dir: str,
keep_empty: bool = False,
) -> dict[str, list[str]]:
- """Update or delete concept pages affected by removing a document.
+ """Update or delete pages in ``page_dir`` affected by removing a document.
- For each ``concepts/*.md`` whose frontmatter ``sources:`` lists
+ For each ``{page_dir}/*.md`` whose frontmatter ``sources:`` lists
``summaries/{doc_name}``:
- Remove that source from the frontmatter list.
@@ -812,24 +1105,16 @@ def remove_doc_from_concept_pages(
- Remove any standalone ``See also: [[summaries/{doc_name}]]`` lines
(left by ``_add_related_link``).
- If the ``sources:`` list becomes empty AND ``keep_empty`` is False,
- delete the concept page entirely.
-
- Args:
- wiki_dir: Path to the wiki root directory.
- doc_name: The summary slug being removed (e.g.
- ``"attention-is-all-you-need"``).
- keep_empty: When True, retains concept pages whose only source
- was the removed doc — leaves their frontmatter with an empty
- ``sources: []`` list. Useful when the doc is being replaced
- by a newer version that will repopulate the source on the
- next ``openkb add``.
-
- Returns:
- ``{"modified": [slugs...], "deleted": [slugs...]}`` — concept
- slugs whose pages were edited vs. deleted.
+ delete the page entirely.
+
+ Shared by the concept and entity removal wrappers so the cleanup (in
+ particular the standalone ``See also:`` strip) can never drift between
+ the two page types.
+
+ Returns ``{"modified": [slugs...], "deleted": [slugs...]}``.
"""
- concepts_dir = wiki_dir / "concepts"
- if not concepts_dir.is_dir():
+ pages_dir = wiki_dir / page_dir
+ if not pages_dir.is_dir():
return {"modified": [], "deleted": []}
source_file = f"summaries/{doc_name}.md"
@@ -839,7 +1124,7 @@ def remove_doc_from_concept_pages(
modified: list[str] = []
deleted: list[str] = []
- for path in sorted(concepts_dir.glob("*.md")):
+ for path in sorted(pages_dir.glob("*.md")):
text = path.read_text(encoding="utf-8")
# Cheap filter: skip pages that don't reference the doc at all.
if source_file not in text and bare_source not in text:
@@ -885,9 +1170,44 @@ def remove_doc_from_concept_pages(
return {"modified": modified, "deleted": deleted}
-def remove_doc_from_index(wiki_dir: Path, doc_name: str, concept_slugs_deleted: list[str]) -> None:
+def remove_doc_from_concept_pages(
+ wiki_dir: Path,
+ doc_name: str,
+ *,
+ keep_empty: bool = False,
+) -> dict[str, list[str]]:
+ """Update or delete concept pages affected by removing a document.
+
+ ``keep_empty`` retains concept pages whose only source was the removed
+ doc (leaving ``sources: []``) — useful when the doc is being replaced by
+ a newer version that will repopulate the source on the next ``openkb
+ add``. Returns ``{"modified": [slugs...], "deleted": [slugs...]}``.
+ """
+ return _remove_doc_from_pages(
+ wiki_dir, doc_name, page_dir="concepts", keep_empty=keep_empty,
+ )
+
+
+def remove_doc_from_entity_pages(
+ wiki_dir: Path,
+ doc_name: str,
+ *,
+ keep_empty: bool = False,
+) -> dict[str, list[str]]:
+ """Update or delete entity pages affected by removing a document.
+
+ Mirrors ``remove_doc_from_concept_pages`` for the entities/ directory.
+ Returns ``{"modified": [...], "deleted": [...]}``.
+ """
+ return _remove_doc_from_pages(
+ wiki_dir, doc_name, page_dir="entities", keep_empty=keep_empty,
+ )
+
+
+def remove_doc_from_index(wiki_dir: Path, doc_name: str, concept_slugs_deleted: list[str],
+ entity_slugs_deleted: list[str] | None = None) -> None:
"""Remove the document's entry from ``index.md`` along with any concept
- entries for concepts that were deleted as a side effect.
+ and entity entries for pages that were deleted as a side effect.
No-op when ``index.md`` doesn't exist. Section headings are kept even
when their last entry is removed — adding a new doc later repopulates
@@ -908,6 +1228,11 @@ def remove_doc_from_index(wiki_dir: Path, doc_name: str, concept_slugs_deleted:
while _remove_section_entry(lines, "## Concepts", concept_link):
pass
+ for slug in (entity_slugs_deleted or []):
+ entity_link = f"[[entities/{slug}]]"
+ while _remove_section_entry(lines, "## Entities", entity_link):
+ pass
+
index_path.write_text("\n".join(lines), encoding="utf-8")
@@ -915,6 +1240,8 @@ def _update_index(
wiki_dir: Path, doc_name: str, concept_names: list[str],
doc_brief: str = "", concept_briefs: dict[str, str] | None = None,
doc_type: str = "short",
+ entity_names: list[str] | None = None,
+ entity_meta: dict[str, tuple[str, str]] | None = None,
) -> None:
"""Append document and concept entries to index.md.
@@ -930,10 +1257,7 @@ def _update_index(
index_path = wiki_dir / "index.md"
if not index_path.exists():
- index_path.write_text(
- "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
- encoding="utf-8",
- )
+ index_path.write_text(INDEX_SEED, encoding="utf-8")
lines = index_path.read_text(encoding="utf-8").split("\n")
@@ -959,6 +1283,26 @@ def _update_index(
else:
_insert_section_entry(lines, "## Concepts", concept_entry)
+ entity_names = entity_names or []
+ entity_meta = entity_meta or {}
+ if entity_names:
+ # Keep canonical order: Entities sits before Explorations. On an older
+ # index.md that predates the Entities section, plain ``_ensure_h2_section``
+ # would append it after Explorations.
+ _ensure_h2_section_before(lines, "## Entities", "## Explorations")
+ for name in entity_names:
+ link = f"[[entities/{name}]]"
+ # Callers always populate entity_meta alongside entity_names; the
+ # default is a defensive fallback, never hit in practice.
+ etype, brief = entity_meta.get(name, ("other", ""))
+ entry = f"- {link} ({etype})"
+ if brief:
+ entry += f" — {brief}"
+ if _section_contains_link(lines, "## Entities", link):
+ _replace_section_entry(lines, "## Entities", link, entry)
+ else:
+ _insert_section_entry(lines, "## Entities", entry)
+
index_path.write_text("\n".join(lines), encoding="utf-8")
@@ -1002,6 +1346,7 @@ async def _compile_concepts(
# --- Step 2: Get concepts plan (A cached) ---
concept_briefs = _read_concept_briefs(wiki_dir)
+ entity_briefs = _read_entity_briefs(wiki_dir)
# Second cache breakpoint: end of the assistant summary message. Covers
# (system + doc + summary) for the plan call and every concept call.
@@ -1013,6 +1358,7 @@ async def _compile_concepts(
summary_msg,
{"role": "user", "content": _CONCEPTS_PLAN_USER.format(
concept_briefs=concept_briefs,
+ entity_briefs=entity_briefs,
)},
], "concepts-plan", max_tokens=2048, response_format=_JSON_RESPONSE_FORMAT)
@@ -1056,37 +1402,76 @@ def _write_v1_summary_stripped() -> None:
return
# Fallback: if LLM returns a flat list, treat all items as "create".
+ # The new plan contract nests concepts under a "concepts" key alongside
+ # an "entities" key; the legacy flat shape (create/update/related at top
+ # level) is still honored by falling back to ``parsed`` itself.
+ if not isinstance(parsed, (list, dict)):
+ # A JSON scalar (int/str/None/bool) is valid JSON but not a usable
+ # plan. ``_parse_json`` normally rejects scalars, but guard here too
+ # so ``parsed.get(...)`` can never raise AttributeError and abort the
+ # compile — treat it as an empty/unparseable plan.
+ logger.warning(
+ "Concepts plan parsed to a %s scalar, not an object/array — "
+ "treating as empty plan for %s.",
+ type(parsed).__name__, doc_name,
+ )
+ if rewrite_summary:
+ _write_v1_summary_stripped()
+ _update_index(wiki_dir, doc_name, [], doc_brief=doc_brief, doc_type=doc_type)
+ return
+
if isinstance(parsed, list):
plan = {"create": _filter_concept_items(parsed, "list"),
"update": [], "related": []}
+ entities_plan = {"create": [], "update": [], "related": []}
else:
+ concepts_group = (
+ parsed.get("concepts")
+ if isinstance(parsed.get("concepts"), dict)
+ else parsed
+ )
plan = {
- "create": _filter_concept_items(parsed.get("create", []), "create"),
- "update": _filter_concept_items(parsed.get("update", []), "update"),
- "related": _filter_related_slugs(parsed.get("related", [])),
+ "create": _filter_concept_items(concepts_group.get("create", []), "create"),
+ "update": _filter_concept_items(concepts_group.get("update", []), "update"),
+ "related": _filter_related_slugs(concepts_group.get("related", [])),
}
+ entities_plan = _parse_entities_plan(parsed)
create_items = plan["create"]
update_items = plan["update"]
related_items = plan["related"]
+ entity_create = entities_plan["create"]
+ entity_update = entities_plan["update"]
+ entity_related = entities_plan["related"]
# Distinguish "filters dropped everything" from "LLM emitted an empty plan".
+ # Count entity items too, so a plan that emitted only entities — all of
+ # which were dropped as malformed — still surfaces the warning.
+ def _raw_group_count(group: object) -> int:
+ if not isinstance(group, dict):
+ return 0
+ return sum(
+ len(group.get(k, [])) if isinstance(group.get(k), list) else 0
+ for k in ("create", "update", "related")
+ )
+
if isinstance(parsed, list):
original_total = len(parsed)
else:
- original_total = sum(
- len(parsed.get(k, [])) if isinstance(parsed.get(k), list) else 0
- for k in ("create", "update", "related")
- )
- post_filter_total = len(create_items) + len(update_items) + len(related_items)
+ original_total = _raw_group_count(concepts_group) + _raw_group_count(parsed.get("entities"))
+ post_filter_total = (
+ len(create_items) + len(update_items) + len(related_items)
+ + len(entity_create) + len(entity_update) + len(entity_related)
+ )
if original_total > 0 and post_filter_total == 0:
sys.stdout.write(
- f" [WARN] concepts plan for {doc_name} had {original_total} "
+ f" [WARN] plan for {doc_name} had {original_total} "
f"item(s), all dropped as malformed — see log (stderr).\n"
)
sys.stdout.flush()
- if not create_items and not update_items and not related_items:
+ if (not create_items and not update_items and not related_items
+ and not entity_create and not entity_update and not entity_related):
if rewrite_summary:
_write_v1_summary_stripped()
_update_index(wiki_dir, doc_name, [], doc_brief=doc_brief, doc_type=doc_type)
@@ -1101,9 +1486,15 @@ def _write_v1_summary_stripped() -> None:
} | {
_sanitize_concept_name(s) for s in related_items
}
+ entity_planned = {
+ _sanitize_concept_name(e["name"]) for e in entity_create + entity_update
+ } | {
+ _sanitize_concept_name(s) for s in entity_related
+ }
known_targets: set[str] = (
list_existing_wiki_targets(wiki_dir)
| {f"concepts/{s}" for s in planned_slugs}
+ | {f"entities/{s}" for s in entity_planned}
| {f"summaries/{doc_name}"}
)
known_targets_str = _format_known_targets(known_targets)
@@ -1144,10 +1535,13 @@ async def _gen_create(concept: dict) -> tuple[str, str, bool, str]:
try:
parsed = _parse_json(raw)
brief = parsed.get("brief", "")
- # ``or raw``: ``.get("content", raw)`` returns None for
- # ``{"content": null}`` (legal under json_object mode).
- content = parsed.get("content") or raw
+ # Parse succeeded: do NOT fall back to ``raw`` (the JSON string).
+ # An empty/None ``content`` field yields "" so
+ # ``_require_nonempty_content`` raises and the page is skipped,
+ # rather than writing the raw JSON as the markdown body.
+ content = parsed.get("content") or ""
except (json.JSONDecodeError, ValueError):
+ # Parse FAILED: ``raw`` is the legitimate non-JSON body fallback.
brief, content = "", raw
_require_nonempty_content(content, name)
return name, content, False, brief
@@ -1179,27 +1573,118 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool, str]:
try:
parsed = _parse_json(raw)
brief = parsed.get("brief", "")
- content = parsed.get("content") or raw
+ # Parse succeeded: do NOT fall back to ``raw`` (the JSON string).
+ content = parsed.get("content") or ""
except (json.JSONDecodeError, ValueError):
+ # Parse FAILED: ``raw`` is the legitimate non-JSON body fallback.
brief, content = "", raw
_require_nonempty_content(content, name)
return name, content, True, brief
+ async def _gen_entity_create(ent: dict) -> tuple[str, str, str, str]:
+     """Generate the body of a new entity page for the current document.
+
+     Builds the entity-page prompt from the planned title/type and the
+     document name, calls the model under the shared ``semaphore``, and
+     returns ``(name, content, brief, type)``. The body is passed through
+     ``_require_nonempty_content`` before returning.
+     """
+     name = ent["name"]
+     planned_type = ent.get("type", "other")
+     prompt = _ENTITY_PAGE_USER.format(
+         title=ent.get("title", name), type=planned_type, doc_name=doc_name,
+     )
+     async with semaphore:
+         reply = await _llm_call_async(model, [
+             system_msg,
+             doc_msg,  # cached (BP1)
+             summary_msg,  # cached (BP2)
+             known_targets_msg,  # cached (BP3) — whitelist
+             {"role": "user", "content": prompt},
+         ], f"entity: {name}", response_format=_JSON_RESPONSE_FORMAT)
+     try:
+         payload = _parse_json(reply)
+     except (json.JSONDecodeError, ValueError):
+         # Parse FAILED: the raw reply is the legitimate non-JSON body fallback.
+         brief, final_type, content = "", planned_type, reply
+     else:
+         # Parse succeeded: never fall back to the raw JSON string. A missing
+         # or empty ``content`` becomes "" and goes to the non-empty check.
+         brief = payload.get("brief", "")
+         reported_type = payload.get("type")
+         # Keep the model's type only when it is a recognised entity type.
+         final_type = reported_type if reported_type in _ENTITY_TYPES else planned_type
+         content = payload.get("content") or ""
+     _require_nonempty_content(content, name)
+     return name, content, brief, final_type
+
+ async def _gen_entity_update(ent: dict) -> tuple[str, str, str, str]:
+     """Regenerate an existing entity page using the current document.
+
+     Reads the page at ``entities/<sanitized name>.md`` (dropping its
+     frontmatter when present), sends it with the entity-update prompt to
+     the model under the shared ``semaphore``, and returns
+     ``(name, content, brief, type)``. When the page does not exist, a
+     placeholder note is sent in its place.
+     """
+     name = ent["name"]
+     planned_type = ent.get("type", "other")
+     page_path = wiki_dir / "entities" / f"{_sanitize_concept_name(name)}.md"
+
+     existing_content = "(page not found — create from scratch)"
+     if page_path.exists():
+         page_text = page_path.read_text(encoding="utf-8")
+         existing_content = page_text
+         if page_text.startswith("---"):
+             # Split on the first two ``---`` markers; when the closing
+             # marker is missing the whole file is kept as the body.
+             pieces = page_text.split("---", 2)
+             if len(pieces) >= 3:
+                 existing_content = pieces[2].strip()
+
+     prompt = _ENTITY_UPDATE_USER.format(
+         title=ent.get("title", name), type=planned_type, doc_name=doc_name,
+         existing_content=existing_content,
+     )
+     async with semaphore:
+         reply = await _llm_call_async(model, [
+             system_msg,
+             doc_msg,  # cached (BP1)
+             summary_msg,  # cached (BP2)
+             known_targets_msg,  # cached (BP3) — whitelist
+             {"role": "user", "content": prompt},
+         ], f"entity-update: {name}", response_format=_JSON_RESPONSE_FORMAT)
+     try:
+         payload = _parse_json(reply)
+     except (json.JSONDecodeError, ValueError):
+         # Parse FAILED: the raw reply is the legitimate non-JSON body fallback.
+         brief, final_type, content = "", planned_type, reply
+     else:
+         # Parse succeeded: never fall back to the raw JSON string.
+         brief = payload.get("brief", "")
+         reported_type = payload.get("type")
+         # Keep the model's type only when it is a recognised entity type.
+         final_type = reported_type if reported_type in _ENTITY_TYPES else planned_type
+         content = payload.get("content") or ""
+     _require_nonempty_content(content, name)
+     return name, content, brief, final_type
+
tasks = []
tasks.extend(_gen_create(c) for c in create_items)
tasks.extend(_gen_update(c) for c in update_items)
+ # --- Step 3 (entities): build the entity task list up front so it can be
+ # gathered concurrently with the concept tasks below. Entity coroutines
+ # return 4-arity tuples (name, content, brief, type), so their results are
+ # processed in their own loop rather than mixed with the concept tuples.
+ entity_tasks = []
+ entity_tasks.extend(_gen_entity_create(e) for e in entity_create)
+ entity_tasks.extend(_gen_entity_update(e) for e in entity_update)
+
concept_names: list[str] = []
concept_briefs_map: dict[str, str] = {}
pending_writes: list[tuple[str, str, bool, str]] = []
-
+ entity_names: list[str] = []
+ entity_meta: dict[str, tuple[str, str]] = {}
+ entity_pending: list[tuple[str, str, str, str]] = []
+
+ # Concepts and entities are independent and share the cached prompt
+ # context + the same concurrency ``semaphore``, so overlap them in one
+ # outer gather instead of running entities only after concepts finish.
+ total = len(tasks)
+ etotal = len(entity_tasks)
if tasks:
- total = len(tasks)
sys.stdout.write(f" Generating {total} concept(s) (concurrency={max_concurrency})...\n")
sys.stdout.flush()
+ if entity_tasks:
+ sys.stdout.write(
+ f" Generating {etotal} entity(ies) (concurrency={max_concurrency})...\n"
+ )
+ sys.stdout.flush()
- results = await asyncio.gather(*tasks, return_exceptions=True)
+ results, entity_results = ([], [])
+ if tasks or entity_tasks:
+ results, entity_results = await asyncio.gather(
+ asyncio.gather(*tasks, return_exceptions=True),
+ asyncio.gather(*entity_tasks, return_exceptions=True),
+ )
+ if tasks:
failure_types: list[str] = []
for r in results:
if isinstance(r, Exception):
@@ -1227,6 +1712,43 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool, str]:
)
sys.stdout.flush()
+ if entity_tasks:
+ entity_failure_types: list[str] = []
+ for r in entity_results:
+ if isinstance(r, Exception):
+ logger.warning("Entity generation failed: %s", r)
+ entity_failure_types.append(type(r).__name__)
+ continue
+ name, page_content, brief, etype = r
+ entity_pending.append((name, page_content, brief, etype))
+
+ ewritten = len(entity_pending)
+ if ewritten < etotal:
+ reason = (
+ ", ".join(sorted(set(entity_failure_types)))
+ if entity_failure_types else "see log (stderr)"
+ )
+ sys.stdout.write(
+ f" [WARN] {etotal} entity(ies) planned but only {ewritten} written "
+ f"for {doc_name} ({reason}).\n"
+ )
+ sys.stdout.flush()
+
+ # Strip ghost wikilinks from entity bodies and write each page.
+ for name, page_content, brief, etype in entity_pending:
+ cleaned, ghosts = strip_ghost_wikilinks(page_content, known_targets)
+ if ghosts:
+ logger.info(
+ "stripped %d ghost wikilink(s) from entity %s: %s",
+ len(ghosts), name, ghosts[:5],
+ )
+ safe = _sanitize_concept_name(name)
+ is_update = (wiki_dir / "entities" / f"{safe}.md").exists()
+ _write_entity(wiki_dir, name, cleaned, source_file, is_update,
+ brief=brief, type_=etype)
+ entity_names.append(safe)
+ entity_meta[safe] = (etype, brief)
+
# Strip unresolved wikilinks from concept bodies before writing. The
# whitelist includes existing files + this round's planned slugs +
# the summary for this document.
@@ -1319,10 +1841,25 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool, str]:
_backlink_summary(wiki_dir, doc_name, all_concept_slugs)
_backlink_concepts(wiki_dir, doc_name, all_concept_slugs)
+ # --- Step 3d: Process entity related items + backlinks (code only) ---
+ # Reuse _add_related_link (page_dir="entities") so related-entity
+ # cross-refs are written in the same "See also:" form the concept path
+ # uses — and torn down symmetrically by _remove_doc_from_pages.
+ entity_related_slugs = [
+ slug for slug in (_sanitize_concept_name(s) for s in entity_related)
+ if _add_related_link(wiki_dir, slug, doc_name, source_file, page_dir="entities")
+ ]
+
+ entity_backlink_slugs = entity_names + entity_related_slugs
+ if entity_backlink_slugs:
+ _backlink_summary_entities(wiki_dir, doc_name, entity_backlink_slugs)
+ _backlink_entities(wiki_dir, doc_name, entity_backlink_slugs)
+
# --- Step 4: Update index (code only) ---
_update_index(wiki_dir, doc_name, concept_names,
doc_brief=doc_brief, concept_briefs=concept_briefs_map,
- doc_type=doc_type)
+ doc_type=doc_type, entity_names=entity_names,
+ entity_meta=entity_meta)
async def compile_short_doc(
diff --git a/openkb/agent/linter.py b/openkb/agent/linter.py
index 969a2cfa..7e790299 100644
--- a/openkb/agent/linter.py
+++ b/openkb/agent/linter.py
@@ -24,12 +24,16 @@
4. **Redundancy** — Are there multiple pages that cover the same content and
could be merged?
5. **Concept coverage** — Are important themes in the summaries missing concept pages?
+6. **Entity coverage** — Are important named things (people, organizations, places,
+ products, works, events) in the summaries missing entity pages, or are existing
+ entity pages contradictory, redundant, or orphaned (unlinked from any source)?
## Process
1. Start with index.md to understand scope.
2. Read summary pages to understand document content.
3. Read concept pages to check for contradictions and gaps.
-4. Produce a structured Markdown report listing issues found with references
+4. Read entity pages to check for contradictions, redundancy, coverage, and orphans.
+5. Produce a structured Markdown report listing issues found with references
to the specific pages where each issue occurs.
Be thorough but concise. If the wiki is small or sparse, say so.
@@ -99,9 +103,9 @@ async def run_knowledge_lint(kb_dir: Path, model: str) -> str:
prompt = (
"Please audit this knowledge base wiki for semantic quality issues: "
- "contradictions, gaps, staleness, redundancy, and missing concept pages. "
- "Start with index.md, then read summaries and concepts as needed. "
- "Produce a structured Markdown report."
+ "contradictions, gaps, staleness, redundancy, and missing concept and "
+ "entity pages. Start with index.md, then read summaries, concepts, and "
+ "entities as needed. Produce a structured Markdown report."
)
result = await Runner.run(agent, prompt, max_turns=MAX_TURNS)
diff --git a/openkb/agent/query.py b/openkb/agent/query.py
index 790a186c..b545e9af 100644
--- a/openkb/agent/query.py
+++ b/openkb/agent/query.py
@@ -28,15 +28,17 @@
Summaries may omit details — if you need more, follow the summary's
`full_text` frontmatter field to the source (see step 4).
3. Read concept pages (concepts/) for cross-document synthesis.
-4. When you need detailed source document content, each summary page has a
+4. For "who/what is X" questions about a specific named person, organization,
+ place, or product, read the matching page in entities/ first.
+5. When you need detailed source document content, each summary page has a
`full_text` frontmatter field with the path to the original document content:
- Short documents (doc_type: short): read_file with that path.
- PageIndex documents (doc_type: pageindex): use get_page_content(doc_name, pages)
with tight page ranges. The summary shows document tree structure with page
ranges to help you target. Never fetch the whole document.
-5. Source content may reference images (e.g. ).
+6. Source content may reference images (e.g. ).
Use the get_image tool to view them when needed.
-6. Synthesize a clear, concise, well-cited answer grounded in wiki content.
+7. Synthesize a clear, concise, well-cited answer grounded in wiki content.
Answer based only on wiki content. Be concise.
Before each tool call, output one short sentence explaining the reason.
diff --git a/openkb/cli.py b/openkb/cli.py
index 68a3e807..0ad10602 100644
--- a/openkb/cli.py
+++ b/openkb/cli.py
@@ -43,7 +43,7 @@ def filter(self, record: logging.LogRecord) -> bool:
from openkb.config import DEFAULT_CONFIG, load_config, save_config, load_global_config, register_kb
from openkb.converter import convert_document
from openkb.log import append_log
-from openkb.schema import AGENTS_MD
+from openkb.schema import AGENTS_MD, INDEX_SEED, PAGE_CONTENT_DIRS
# Suppress warnings after all imports — markitdown overrides filters at import time
import warnings
@@ -217,7 +217,7 @@ def _preflight_skill_new(kb_dir: Path, name: str) -> str | None:
Checks (in order):
* skill name is a valid kebab-case slug
* ``/wiki`` exists
- * ``/wiki/concepts`` or ``/wiki/summaries`` has at least
+ * any of ``/wiki/{summaries,concepts,entities}`` has at least
one file (i.e. some document has been ingested + compiled)
Returns ``None`` if all gates pass, else a single-line error message
@@ -239,7 +239,7 @@ def _preflight_skill_new(kb_dir: Path, name: str) -> str | None:
has_content = any(
(wiki / sub).is_dir() and any((wiki / sub).iterdir())
- for sub in ("concepts", "summaries")
+ for sub in PAGE_CONTENT_DIRS
)
if not has_content:
return (
@@ -538,13 +538,11 @@ def init(model, language):
Path("wiki/sources/images").mkdir(parents=True, exist_ok=True)
Path("wiki/summaries").mkdir(parents=True, exist_ok=True)
Path("wiki/concepts").mkdir(parents=True, exist_ok=True)
+ Path("wiki/entities").mkdir(parents=True, exist_ok=True)
# Write wiki files
Path("wiki/AGENTS.md").write_text(AGENTS_MD, encoding="utf-8")
- Path("wiki/index.md").write_text(
- "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
- encoding="utf-8",
- )
+ Path("wiki/index.md").write_text(INDEX_SEED, encoding="utf-8")
Path("wiki/log.md").write_text("# Operations Log\n\n", encoding="utf-8")
# Create .openkb/ state directory
@@ -800,6 +798,7 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes):
"""
from openkb.agent.compiler import (
remove_doc_from_concept_pages,
+ remove_doc_from_entity_pages,
remove_doc_from_index,
)
from openkb.lint import fix_broken_links
@@ -895,6 +894,42 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes):
for slug in concept_edits:
actions.append(("MODIFY", f"wiki/concepts/{slug}.md (drop this doc from sources)"))
+ # Scan entity pages with the same frontmatter logic as concepts. The
+ # executor calls ``remove_doc_from_entity_pages``; this only makes the
+ # preview/summary truthful about what it will delete vs. edit.
+ affected_entities: list[tuple[str, int]] = [] # (slug, remaining_sources)
+ entities_dir = wiki_dir / "entities"
+ if entities_dir.is_dir():
+ for path in sorted(entities_dir.glob("*.md")):
+ text = path.read_text(encoding="utf-8")
+ if not text.startswith("---"):
+ continue
+ fm_end = text.find("---", 3)
+ if fm_end == -1:
+ continue
+ sources_count = 0
+ source_in_frontmatter = False
+ for line in text[:fm_end].split("\n"):
+ if line.lstrip().startswith("sources:"):
+ lb = line.find("[")
+ rb = line.rfind("]")
+ if lb != -1 and rb != -1 and rb > lb:
+ items = [s.strip() for s in line[lb + 1:rb].split(",") if s.strip()]
+ sources_count = len(items)
+ source_in_frontmatter = source_file_marker in items
+ break
+ if not source_in_frontmatter:
+ continue
+ remaining = max(sources_count - 1, 0)
+ affected_entities.append((path.stem, remaining))
+
+ entity_deletes = [s for s, r in affected_entities if r == 0 and not keep_empty_concepts]
+ entity_edits = [s for s, r in affected_entities if r > 0 or keep_empty_concepts]
+ for slug in entity_deletes:
+ actions.append(("DELETE", f"wiki/entities/{slug}.md (only source: this doc)"))
+ for slug in entity_edits:
+ actions.append(("MODIFY", f"wiki/entities/{slug}.md (drop this doc from sources)"))
+
if (wiki_dir / "index.md").exists():
actions.append(("MODIFY", "wiki/index.md (remove Documents entry)"))
@@ -936,6 +971,12 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes):
f" {len(concept_deletes)} concept(s) will be DELETED because this is their only source."
)
click.echo(" Pass --keep-empty-concepts to retain them instead.")
+ if entity_deletes:
+ click.echo("")
+ click.echo(
+ f" {len(entity_deletes)} entity(s) will be DELETED because this is their only source."
+ )
+ click.echo(" Pass --keep-empty-concepts to retain them instead.")
click.echo("")
if dry_run:
@@ -967,15 +1008,20 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes):
wiki_dir, doc_name, keep_empty=keep_empty_concepts,
)
- remove_doc_from_index(wiki_dir, doc_name, concept_result["deleted"])
+ entity_result = remove_doc_from_entity_pages(
+ wiki_dir, doc_name, keep_empty=keep_empty_concepts,
+ )
+
+ remove_doc_from_index(wiki_dir, doc_name, concept_result["deleted"],
+ entity_slugs_deleted=entity_result["deleted"])
# Strip dangling wikilinks now so a retry (after a PageIndex
# failure below) finds a clean wiki — no point in re-running this
# on every attempt.
#
# Scope: only the pages this remove actually touched (modified
- # concept pages ∪ index.md). Previously this swept the whole wiki
- # via ``fix_broken_links(wiki_dir)``, which silently stripped
+ # concept + entity pages ∪ index.md). Previously this swept the whole
+ # wiki via ``fix_broken_links(wiki_dir)``, which silently stripped
# pre-existing dangling links in unrelated pages — see issue #58
# (Bug 2). Users who want a wiki-wide sweep can still run
# ``openkb lint --fix`` explicitly.
@@ -983,6 +1029,10 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes):
wiki_dir / "concepts" / f"{slug}.md"
for slug in concept_result["modified"]
]
+ lint_scope += [
+ wiki_dir / "entities" / f"{slug}.md"
+ for slug in entity_result["modified"]
+ ]
index_md = wiki_dir / "index.md"
if index_md.exists():
lint_scope.append(index_md)
@@ -1025,6 +1075,197 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes):
click.echo(f" [OK] {name} removed from knowledge base.")
+def _refresh_schema(wiki_dir: Path) -> bool:
+    """Replace ``wiki/AGENTS.md`` with the bundled ``AGENTS_MD``, keeping a backup.
+
+    When the on-disk schema exists and differs from ``AGENTS_MD``, its current
+    text is first saved to ``wiki/AGENTS.md.bak`` and the file is then
+    rewritten with ``AGENTS_MD``. A missing or already-identical file is left
+    untouched.
+
+    Returns:
+        True if the schema file was overwritten, False otherwise.
+    """
+    schema_path = wiki_dir / "AGENTS.md"
+    # A missing file needs no refresh: get_agents_md() already falls back to
+    # the bundled AGENTS_MD default at runtime.
+    on_disk = schema_path.read_text(encoding="utf-8") if schema_path.exists() else None
+    if on_disk is None or on_disk == AGENTS_MD:
+        return False
+
+    bak_path = wiki_dir / "AGENTS.md.bak"
+    bak_path.write_text(on_disk, encoding="utf-8")
+    click.echo(f" Backed up existing schema to {bak_path.relative_to(wiki_dir.parent)}")
+    schema_path.write_text(AGENTS_MD, encoding="utf-8")
+    click.echo(" Refreshed wiki/AGENTS.md to the current schema.")
+    return True
+
+
+@cli.command()
+@click.argument("doc_name", required=False)
+@click.option("--all", "all_docs", is_flag=True, default=False,
+              help="Recompile every indexed document.")
+@click.option("--dry-run", is_flag=True, default=False,
+              help="List the docs that would be recompiled; no LLM calls, no writes.")
+@click.option("--yes", "-y", is_flag=True, default=False,
+              help="Skip the --all confirmation prompt.")
+@click.option("--refresh-schema", "refresh_schema", is_flag=True, default=False,
+              help="Overwrite wiki/AGENTS.md with the bundled schema (backs up "
+                   "the old one to AGENTS.md.bak) if it differs.")
+@click.pass_context
+def recompile(ctx, doc_name, all_docs, dry_run, yes, refresh_schema):
+    """Re-run the current compile pipeline on already-indexed documents.
+
+    Recompiling re-runs the same ``compile_short_doc`` / ``compile_long_doc``
+    that ``openkb add`` uses, so pre-feature KBs gain the ``entities/`` layer
+    and pages refresh to the current format. It does NOT re-run PageIndex or
+    re-convert raw files — it reuses the on-disk ``wiki/sources/`` and
+    ``wiki/summaries/`` content (and the registry's PageIndex ``doc_id``).
+
+    DOC_NAME recompiles one doc (resolved like ``openkb remove`` — filename,
+    slug, or unique substring). ``--all`` recompiles every indexed doc.
+    Exactly one of DOC_NAME or ``--all`` is required.
+
+    Side effect: this regenerates summaries (short docs) and rewrites concept
+    pages with the current logic — manual edits to those pages are overwritten.
+    """
+    from openkb.state import HashRegistry
+
+    kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override"))
+    if kb_dir is None:
+        click.echo("No knowledge base found. Run `openkb init` first.")
+        return
+
+    if all_docs and doc_name:
+        click.echo("Specify either a DOC_NAME or --all, not both.")
+        return
+    if not all_docs and not doc_name:
+        click.echo("Specify a document name or pass --all to recompile every doc.")
+        return
+
+    openkb_dir = kb_dir / ".openkb"
+    wiki_dir = kb_dir / "wiki"
+    registry = HashRegistry(openkb_dir / "hashes.json")
+
+    # Resolve the set of docs to recompile.
+    if all_docs:
+        targets = list(registry.all_entries().values())
+        if not targets:
+            click.echo("No documents indexed yet. Run `openkb add` first.")
+            return
+    else:
+        matches = _resolve_doc_identifier(registry, doc_name)
+        if not matches:
+            click.echo(f"No document matching '{doc_name}' found in the KB.")
+            click.echo("Try `openkb list` to see indexed documents.")
+            return
+        if len(matches) > 1:
+            click.echo(f"'{doc_name}' matches multiple documents:")
+            for _, m in matches:
+                click.echo(f" - {m.get('name', '?')} (doc_name: {m.get('doc_name', '?')})")
+            click.echo("Use a more specific name or the exact doc_name slug.")
+            return
+        targets = [matches[0][1]]
+
+    def _classify(meta: dict) -> str:
+        """Return "long" for ``long_pdf`` registry entries, "short" otherwise."""
+        return "long" if meta.get("type") == "long_pdf" else "short"
+
+    def _doc_slug(meta: dict) -> str:
+        """Return the slug used for wiki paths: ``doc_name``, else the stem of ``name``.
+
+        Shared by the dry-run preview and the real run so both report the
+        same name. ``or ""`` guards against a registry entry that stores
+        ``"name": null``, which would otherwise make ``Path(None)`` raise.
+        """
+        return meta.get("doc_name") or Path(meta.get("name") or "").stem
+
+    # --dry-run: enumerate only, no LLM calls, no writes.
+    if dry_run:
+        click.echo(f"Would recompile {len(targets)} document(s):")
+        for meta in targets:
+            name = _doc_slug(meta) or "?"
+            click.echo(f" - {name} ({_classify(meta)})")
+        click.echo(
+            "\nNote: recompiling regenerates summaries (short docs) and rewrites "
+            "concept pages — manual edits would be overwritten."
+        )
+        click.echo("(dry-run — nothing modified)")
+        return
+
+    # --all confirmation (the summary/concept-regeneration side effect).
+    if all_docs and not yes:
+        click.echo(
+            f"This will recompile {len(targets)} document(s), regenerating "
+            "summaries and rewriting concept pages with the current logic.\n"
+            "Manual edits to those pages will be overwritten."
+        )
+        if not click.confirm("Proceed?", default=False):
+            click.echo("Aborted.")
+            return
+
+    if refresh_schema:
+        _refresh_schema(wiki_dir)
+
+    _setup_llm_key(kb_dir)
+    config = load_config(openkb_dir / "config.yaml")
+    model: str = config.get("model", DEFAULT_CONFIG["model"])
+
+    # Import lazily and reference via the module so tests can patch
+    # ``openkb.agent.compiler.compile_*`` and see the call.
+    from openkb.agent import compiler
+
+    recompiled = 0
+    skipped = 0
+    total = len(targets)
+    for i, meta in enumerate(targets, 1):
+        name = _doc_slug(meta)
+        if not name:
+            click.echo(f"[{i}/{total}] [SKIP] registry entry has no doc_name.")
+            skipped += 1
+            continue
+
+        # Per-kind preconditions; each branch only selects the compile
+        # function and its arguments, the run/report tail below is shared.
+        kind = _classify(meta)
+        if kind == "long":
+            summary_path = wiki_dir / "summaries" / f"{name}.md"
+            doc_id = meta.get("doc_id")
+            if not doc_id:
+                click.echo(
+                    f"[{i}/{total}] [SKIP] {name}: legacy long-doc entry without a "
+                    "doc_id — re-add to refresh."
+                )
+                skipped += 1
+                continue
+            if not summary_path.exists():
+                click.echo(
+                    f"[{i}/{total}] [SKIP] {name}: missing summary at "
+                    f"{summary_path.relative_to(kb_dir)}."
+                )
+                skipped += 1
+                continue
+            compile_fn = compiler.compile_long_doc
+            compile_args = (name, summary_path, doc_id, kb_dir, model)
+        else:
+            source_path = wiki_dir / "sources" / f"{name}.md"
+            if not source_path.exists():
+                click.echo(
+                    f"[{i}/{total}] [SKIP] {name}: missing source at "
+                    f"{source_path.relative_to(kb_dir)}."
+                )
+                skipped += 1
+                continue
+            compile_fn = compiler.compile_short_doc
+            compile_args = (name, source_path, kb_dir, model)
+
+        click.echo(f"[{i}/{total}] Recompiling {kind} doc {name}...")
+        start = time.time()
+        try:
+            # The call itself stays inside the ``try`` so an error raised
+            # while creating the coroutine is reported like a run failure.
+            asyncio.run(compile_fn(*compile_args))
+        except Exception as exc:
+            # Best-effort batch: report, count as skipped, move to the next doc.
+            click.echo(f" [ERROR] Compilation failed: {exc}")
+            logging.getLogger(__name__).debug("Recompile traceback:", exc_info=True)
+            skipped += 1
+            continue
+        click.echo(f" [OK] {name} ({time.time() - start:.1f}s)")
+        recompiled += 1
+
+    click.echo(f"\nDone: recompiled {recompiled}, skipped {skipped}.")
+    append_log(wiki_dir, "recompile", f"recompiled {recompiled}, skipped {skipped}")
+
+
@cli.command()
@click.option(
"--resume", "-r", "resume",
@@ -1277,6 +1518,15 @@ def print_list(kb_dir: Path) -> None:
for c in concepts:
click.echo(f" - {c}")
+ # Display entities
+ entities_dir = kb_dir / "wiki" / "entities"
+ if entities_dir.exists():
+ entities = sorted(p.stem for p in entities_dir.glob("*.md"))
+ if entities:
+ click.echo(f"\nEntities ({len(entities)}):")
+ for e in entities:
+ click.echo(f" - {e}")
+
# Display reports
reports_dir = kb_dir / "wiki" / "reports"
if reports_dir.exists():
@@ -1301,7 +1551,7 @@ def list_cmd(ctx):
def print_status(kb_dir: Path) -> None:
"""Print knowledge base status. Usable from CLI and chat REPL."""
wiki_dir = kb_dir / "wiki"
- subdirs = ["sources", "summaries", "concepts", "reports"]
+ subdirs = ["sources", "summaries", "concepts", "entities", "reports"]
# Print the active KB path as the first line. Agents and scripts
# parse this to locate the wiki without assuming cwd == KB root.
@@ -1332,15 +1582,19 @@ def print_status(kb_dir: Path) -> None:
hashes = json.loads(hashes_file.read_text(encoding="utf-8"))
click.echo(f"\n Total indexed: {len(hashes)} document(s)")
- # Last compile time: newest file in wiki/summaries/
- summaries_dir = wiki_dir / "summaries"
- if summaries_dir.exists():
- summaries = list(summaries_dir.glob("*.md"))
- if summaries:
- newest_summary = max(summaries, key=lambda p: p.stat().st_mtime)
- import datetime
- mtime = datetime.datetime.fromtimestamp(newest_summary.stat().st_mtime)
- click.echo(f" Last compile: {mtime.strftime('%Y-%m-%d %H:%M:%S')}")
+ # Last compile time: newest compiled page across summaries/, concepts/,
+ # and entities/ (an entity-only compile must still bump the shown time).
+ compiled_pages = [
+ p
+ for sub in PAGE_CONTENT_DIRS
+ for p in (wiki_dir / sub).glob("*.md")
+ if (wiki_dir / sub).exists()
+ ]
+ if compiled_pages:
+ newest_page = max(compiled_pages, key=lambda p: p.stat().st_mtime)
+ import datetime
+ mtime = datetime.datetime.fromtimestamp(newest_page.stat().st_mtime)
+ click.echo(f" Last compile: {mtime.strftime('%Y-%m-%d %H:%M:%S')}")
# Last lint time: newest file in wiki/reports/
reports_dir = wiki_dir / "reports"
diff --git a/openkb/lint.py b/openkb/lint.py
index fa6df76a..2ac6af1d 100644
--- a/openkb/lint.py
+++ b/openkb/lint.py
@@ -15,6 +15,8 @@
import yaml
+from openkb.schema import PAGE_CONTENT_DIRS
+
# Matches [[wikilink]] or [[subdir/link]]
_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
@@ -171,6 +173,9 @@ def list_existing_wiki_targets(wiki_dir: Path) -> set[str]:
targets.update(f"concepts/{p.stem}" for p in concepts_dir.glob("*.md"))
if summaries_dir.is_dir():
targets.update(f"summaries/{p.stem}" for p in summaries_dir.glob("*.md"))
+ entities_dir = wiki_dir / "entities"
+ if entities_dir.is_dir():
+ targets.update(f"entities/{p.stem}" for p in entities_dir.glob("*.md"))
if (wiki_dir / "index.md").exists():
targets.add("index")
return targets
@@ -365,7 +370,7 @@ def check_index_sync(wiki: Path) -> list[str]:
Returns issues for:
- Links in index.md pointing to non-existent pages
- - Pages in summaries/ or concepts/ not mentioned in index.md
+ - Pages in summaries/, concepts/, or entities/ not mentioned in index.md
Args:
wiki: Path to the wiki root directory.
@@ -389,11 +394,11 @@ def check_index_sync(wiki: Path) -> list[str]:
if lnk_norm not in pages:
issues.append(f"index.md links to missing page: [[{lnk}]]")
- # Check that summaries and concepts pages are mentioned in index
+ # Check that summaries, concepts, and entities pages are mentioned in index
index_stems = {Path(lnk.strip()).stem for lnk in index_links}
index_text_lower = index_text.lower()
- for subdir in ("summaries", "concepts"):
+ for subdir in PAGE_CONTENT_DIRS:
subdir_path = wiki / subdir
if not subdir_path.exists():
continue
diff --git a/openkb/schema.py b/openkb/schema.py
index b2c8cf07..605fa8eb 100644
--- a/openkb/schema.py
+++ b/openkb/schema.py
@@ -2,6 +2,14 @@
from pathlib import Path
+# Subdirectories of wiki/ that hold compiled pages. Single source of truth
+# for surfaces that enumerate page content (list, lint, status, skill gate).
+PAGE_CONTENT_DIRS = ("summaries", "concepts", "entities")
+
+# Canonical seed for an empty index.md: the title followed by one empty
+# section per page kind. Shared by `openkb init` and the compiler's
+# lazy-create path so they never drift.
+INDEX_SEED = (
+    "# Knowledge Base Index\n\n"
+    "## Documents\n\n"
+    "## Concepts\n\n"
+    "## Entities\n\n"
+    "## Explorations\n"
+)
+
AGENTS_MD = """\
# Wiki Schema
@@ -10,6 +18,7 @@
- sources/images/ — Extracted images from documents, referenced by sources.
- summaries/ — One per source document. Summary of key content.
- concepts/ — Cross-document topic synthesis. Created when a theme spans multiple documents.
+- entities/ — Specific named things: people, organizations, places, products, named works, events. One page per entity, accumulated across documents.
- explorations/ — Saved query results, analyses, and comparisons worth keeping.
- reports/ — Lint health check reports. Auto-generated.
@@ -20,13 +29,15 @@
## Page Types
- **Summary Page** (summaries/): Key content of a single source document.
- **Concept Page** (concepts/): Cross-document topic synthesis with [[wikilinks]].
+- **Entity Page** (entities/): A specific named thing (proper noun). Frontmatter `type:` is one of: person, organization, place, product, work, event, other. An entity differs from a concept: a concept is an abstract recurring idea; an entity is a specific named thing. Create an entity page only when the entity is central to a document or recurs across sources — do not page passing mentions.
- **Exploration Page** (explorations/): Saved query results — analyses, comparisons, syntheses.
- **Index Page** (index.md): One-liner summary of every page in the wiki. Auto-maintained.
## Index Page Format
-index.md lists all documents, concepts, and explorations with metadata:
+index.md lists all documents, concepts, entities, and explorations with metadata:
- Documents: name, one-liner description, type (short|pageindex), detail access path
- Concepts: name, one-liner description
+- Entities: name, type, one-liner description
- Explorations: name, one-liner description
## Log Format
diff --git a/skills/openkb/SKILL.md b/skills/openkb/SKILL.md
index b515774f..f5c4b6f6 100644
--- a/skills/openkb/SKILL.md
+++ b/skills/openkb/SKILL.md
@@ -14,12 +14,17 @@ description: |
The user has compiled their documents into a Markdown wiki at `wiki/`.
-The wiki holds three kinds of pages:
+The wiki holds these kinds of pages:
- **Concept pages** at `wiki/concepts/*.md` — cross-document synthesis
on specific topics. This is where OpenKB's value compounds: a
concept with multiple sources represents knowledge merged across
documents the user has ingested.
+- **Entity pages** at `wiki/entities/*.md` — one per specific named
+ thing (people, organizations, places, products, named works,
+ events), accumulated across documents. Each has a `type:`
+ frontmatter field. For "who is X" / "what is X" questions about a
+ named thing, read the matching `entities/` page first.
- **Summary pages** at `wiki/summaries/*.md` — one per ingested
document, linking to the concepts that document touches.
- **Source files** at `wiki/sources/*.{md,json}` — full text for short
@@ -76,8 +81,9 @@ After capturing the KB path from `openkb status`, drill in via:
- `openkb list` — table of ingested documents (name, type, page count)
plus the concept list.
-- Read `/wiki/index.md` — the compiled table of contents. Every
- document and concept has a one-line `brief`. Scan this and pick the
+- Read `/wiki/index.md` — the compiled table of contents. It has
+ `## Documents`, `## Concepts`, `## Entities`, and `## Explorations`
+ sections; every entry has a one-line `brief`. Scan this and pick the
slugs that semantically match the user's question.
## Read content
@@ -90,6 +96,7 @@ calls these `Read` / `Grep` / `Bash`; Gemini CLI uses `read_file` /
| Goal | Action |
|---|---|
| Read a concept page | read the file at `/wiki/concepts/.md` |
+| Answer "who/what is X" about a named thing | read `/wiki/entities/.md` |
| Read a document's summary | read `/wiki/summaries/.md` |
| Read a short doc's full text | read `/wiki/sources/.md` |
| Read a long doc's specific page | shell: `jq '.[N-1]' /wiki/sources/.json` (N = 1-indexed PDF page; `.[0]` is page 1) |
diff --git a/skills/openkb/references/wiki-schema.md b/skills/openkb/references/wiki-schema.md
index 6b1a4e7f..ca95026f 100644
--- a/skills/openkb/references/wiki-schema.md
+++ b/skills/openkb/references/wiki-schema.md
@@ -14,6 +14,7 @@ long-PDF JSON shape, wikilink resolution rules.
├── log.md Chronological ingest/edit log
├── summaries/.md One per ingested document
├── concepts/.md Cross-document synthesis pages
+ ├── entities/.md Named-thing pages (people/orgs/places/...)
├── sources/ Converted source content
│ ├── .md Short-doc full text
│ ├── .json Long-doc paginated content
@@ -28,7 +29,7 @@ registry, PageIndex DB). **Do not read these directly** — use
## `wiki/index.md`
-Three top-level sections, each entry has a one-line brief:
+Four top-level sections; each entry has a one-line brief:
```markdown
## Documents
@@ -38,6 +39,9 @@ Three top-level sections, each entry has a one-line brief:
## Concepts
- [[concepts/attention]] — brief from frontmatter
+## Entities
+- [[entities/ada-lovelace]] (person) — brief from frontmatter
+
## Explorations
- [[explorations/some-saved-query]] — saved query answer
```
@@ -76,6 +80,23 @@ Body: free-form sections + `## Related Documents` listing
contributing summaries. **Multi-source = cross-document synthesis**
— this is the high-value output of OpenKB's compile pipeline.
+## `wiki/entities/.md`
+
+Frontmatter:
+
+```yaml
+---
+sources: [summaries/paper.md, summaries/notes.md]
+brief: One-line description.
+type: person # person | organization | place | product | work | event | other
+---
+```
+
+Body: free-form sections about the named thing, plus a
+`## Related Documents` section. One page per entity, accumulated as
+more documents mention it. For "who/what is X" questions about a
+named thing, read the matching entity page first.
+
## `wiki/sources/.md` (short docs)
The markitdown-converted full text. Image refs appear as
diff --git a/tests/test_cli.py b/tests/test_cli.py
index ab3378b1..65463566 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -24,6 +24,7 @@ def test_init_creates_structure(tmp_path):
assert (cwd / "wiki" / "sources" / "images").is_dir()
assert (cwd / "wiki" / "summaries").is_dir()
assert (cwd / "wiki" / "concepts").is_dir()
+ assert (cwd / "wiki" / "entities").is_dir()
assert (cwd / ".openkb").is_dir()
# Files
@@ -39,7 +40,7 @@ def test_init_creates_structure(tmp_path):
# index.md header
index_content = (cwd / "wiki" / "index.md").read_text()
- assert index_content == "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n"
+ assert index_content == "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Entities\n\n## Explorations\n"
def test_init_schema_content(tmp_path):
diff --git a/tests/test_compiler.py b/tests/test_compiler.py
index de5f0afa..83f94f0c 100644
--- a/tests/test_compiler.py
+++ b/tests/test_compiler.py
@@ -14,12 +14,18 @@
_sanitize_concept_name,
_write_summary,
_write_concept,
+ _write_entity,
_update_index,
_read_wiki_context,
_read_concept_briefs,
+ _read_entity_briefs,
_add_related_link,
_backlink_summary,
_backlink_concepts,
+ _backlink_summary_entities,
+ _backlink_entities,
+ _parse_entities_plan,
+ remove_doc_from_entity_pages,
)
@@ -61,6 +67,31 @@ def test_fenced_dict(self):
assert parsed["create"] == []
+class TestParseEntitiesPlan:
+ def test_extracts_entities_group(self):
+ parsed = {
+ "concepts": {"create": [{"name": "x", "title": "X"}], "update": [], "related": []},
+ "entities": {
+ "create": [{"name": "anthropic", "title": "Anthropic", "type": "organization"}],
+ "update": [],
+ "related": ["nvidia"],
+ },
+ }
+ ents = _parse_entities_plan(parsed)
+ assert ents["create"] == [{"name": "anthropic", "title": "Anthropic", "type": "organization"}]
+ assert ents["related"] == ["nvidia"]
+
+ def test_missing_entities_key_is_empty(self):
+ ents = _parse_entities_plan({"create": [], "update": [], "related": []})
+ assert ents == {"create": [], "update": [], "related": []}
+
+ def test_bad_type_falls_back_to_other(self):
+ parsed = {"entities": {"create": [{"name": "x", "title": "X", "type": "alien"}],
+ "update": [], "related": []}}
+ ents = _parse_entities_plan(parsed)
+ assert ents["create"][0]["type"] == "other"
+
+
class TestParseBriefContent:
def test_dict_with_brief_and_content(self):
text = json.dumps({"brief": "A short desc", "content": "# Full page\n\nDetails."})
@@ -330,6 +361,27 @@ def test_recovers_when_concepts_section_missing(self, tmp_path):
assert "[[concepts/attention]] — Focus" in text
assert "[[summaries/my-doc]]" in text
+ def test_entities_inserted_before_explorations(self, tmp_path):
+ """#8: an old index.md predating ## Entities must get it inserted
+ before ## Explorations, not appended after it (canonical order)."""
+ wiki = tmp_path / "wiki"
+ wiki.mkdir()
+ # Old order: no ## Entities section yet.
+ (wiki / "index.md").write_text(
+ "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n",
+ encoding="utf-8",
+ )
+ _update_index(
+ wiki, "my-doc", [],
+ entity_names=["anthropic"],
+ entity_meta={"anthropic": ("organization", "AI lab.")},
+ )
+ text = (wiki / "index.md").read_text()
+ assert "## Entities" in text
+ # Canonical order: Entities before Explorations.
+ assert text.index("## Entities") < text.index("## Explorations")
+ assert "[[entities/anthropic]] (organization) — AI lab." in text
+
class TestReadWikiContext:
def test_empty_wiki(self, tmp_path):
@@ -437,6 +489,125 @@ def test_falls_back_to_body_truncation(self, tmp_path):
assert "- old: Old concept without brief field." in result
+class TestReadEntityBriefs:
+ def test_none_when_missing(self, tmp_path):
+ assert _read_entity_briefs(tmp_path) == "(none yet)"
+
+ def test_brief_type_and_source_count(self, tmp_path):
+ ent = tmp_path / "entities"
+ ent.mkdir()
+ (ent / "anthropic.md").write_text(
+ "---\n"
+ "sources: [summaries/a.md, summaries/b.md]\n"
+ "type: organization\n"
+ "brief: AI lab behind Claude.\n"
+ "---\n\n# Anthropic\n",
+ encoding="utf-8",
+ )
+ out = _read_entity_briefs(tmp_path)
+ assert out == "- anthropic (organization, 2 sources) — AI lab behind Claude."
+
+ def test_empty_dir_returns_none(self, tmp_path):
+ ent = tmp_path / "entities"
+ ent.mkdir()
+ assert _read_entity_briefs(tmp_path) == "(none yet)"
+
+ def test_falls_back_to_body_when_no_brief(self, tmp_path):
+ ent = tmp_path / "entities"
+ ent.mkdir()
+ body_text = "OpenAI is a research lab focused on artificial general intelligence."
+ (ent / "openai.md").write_text(
+ "---\n"
+ "type: organization\n"
+ "sources: [summaries/a.md, summaries/b.md, summaries/c.md]\n"
+ "---\n\n" + body_text,
+ encoding="utf-8",
+ )
+ out = _read_entity_briefs(tmp_path)
+ # Should use truncated body (first 150 chars) as the brief
+ expected_brief = body_text[:150]
+ assert f" — {expected_brief}" in out
+ # Should still include type and source count
+ assert "(organization, 3 sources)" in out
+
+ def test_sorted_alphabetically(self, tmp_path):
+ ent = tmp_path / "entities"
+ ent.mkdir()
+ (ent / "zeta.md").write_text(
+ "---\ntype: person\nsources: [summaries/a.md]\nbrief: Last letter of Greek alphabet.\n---\n",
+ encoding="utf-8",
+ )
+ (ent / "alpha.md").write_text(
+ "---\ntype: concept\nsources: [summaries/b.md]\nbrief: First letter of Greek alphabet.\n---\n",
+ encoding="utf-8",
+ )
+ out = _read_entity_briefs(tmp_path)
+ lines = out.strip().splitlines()
+ assert lines[0].startswith("- alpha ")
+ assert lines[1].startswith("- zeta ")
+
+
+class TestWriteEntity:
+ def test_new_entity_frontmatter(self, tmp_path):
+ _write_entity(
+ tmp_path, "anthropic", "# Anthropic\n\nAI lab.",
+ "summaries/a.md", is_update=False,
+ brief="AI lab behind Claude.", type_="organization",
+ aliases=["Anthropic PBC"],
+ )
+ text = (tmp_path / "entities" / "anthropic.md").read_text(encoding="utf-8")
+ assert "type:" in text and "organization" in text
+ assert "brief:" in text and "AI lab behind Claude." in text
+ assert "sources:" in text and "summaries/a.md" in text
+ assert "Anthropic PBC" in text
+ assert text.count("---") == 2 # exactly one frontmatter block
+
+ def test_update_prepends_source_keeps_type(self, tmp_path):
+ _write_entity(
+ tmp_path, "anthropic", "# Anthropic\n\nv1.",
+ "summaries/a.md", is_update=False,
+ brief="b1", type_="organization", aliases=None,
+ )
+ _write_entity(
+ tmp_path, "anthropic", "# Anthropic\n\nv2 richer.",
+ "summaries/b.md", is_update=True,
+ brief="b2", type_="organization", aliases=None,
+ )
+ text = (tmp_path / "entities" / "anthropic.md").read_text(encoding="utf-8")
+ assert "summaries/b.md" in text and "summaries/a.md" in text
+ # _yaml_list_line uses json.dumps: b prepended before a, double-quoted
+ assert '"summaries/b.md", "summaries/a.md"' in text
+ assert "type:" in text and "organization" in text
+ assert "v2 richer." in text
+ assert "v1." not in text
+ assert "brief:" in text and "b2" in text
+
+ def test_update_rebuilds_frontmatter_when_no_closing_delim(self, tmp_path):
+ """#11: malformed existing file (opening --- but no closing ---) must
+ not drop frontmatter; rebuild valid sources/type/brief on update."""
+ entities = tmp_path / "entities"
+ entities.mkdir(parents=True)
+ # Opening delimiter, NO closing delimiter — find("---", 3) == -1.
+ (entities / "anthropic.md").write_text(
+ "---\nsources: [\"summaries/a.md\"]\ntype: organization\n"
+ "# Anthropic (no closing fence)\n\nOld body.",
+ encoding="utf-8",
+ )
+ _write_entity(
+ tmp_path, "anthropic", "# Anthropic\n\nv2 rewritten.",
+ "summaries/b.md", is_update=True,
+ brief="AI lab.", type_="organization", aliases=None,
+ )
+ text = (entities / "anthropic.md").read_text(encoding="utf-8")
+ # Frontmatter rebuilt with a proper closing delimiter, not body-only.
+ assert text.startswith("---\n")
+ assert text.count("---") == 2
+ assert "sources:" in text and "summaries/b.md" in text
+ assert "type:" in text and "organization" in text
+ assert "brief:" in text and "AI lab." in text
+ assert "v2 rewritten." in text
+
+
class TestBacklinkSummary:
def test_adds_missing_concept_links(self, tmp_path):
wiki = tmp_path / "wiki"
@@ -926,6 +1097,33 @@ async def test_empty_plan_strips_v1_summary_ghosts(self, tmp_path):
assert "[[concepts/imaginary]]" not in text
assert "imaginary" in text # plain text preserved
+ @pytest.mark.asyncio
+ async def test_scalar_plan_handled_gracefully(self, tmp_path):
+ """#10: a JSON scalar plan (valid JSON, not object/array) must not
+ crash with AttributeError; it takes the graceful empty-plan path —
+ v1 summary written, index updated, no concept/entity pages."""
+ wiki, source_path = self._setup_kb(tmp_path)
+
+ summary_response = json.dumps({
+ "brief": "B", "content": "# Summary\n\nPlain body, no links.",
+ })
+ # Plan call returns a bare JSON scalar (an integer).
+ scalar_plan_response = "42"
+
+ with patch("openkb.agent.compiler.litellm") as mock_litellm:
+ mock_litellm.completion = MagicMock(
+ side_effect=_mock_completion([summary_response, scalar_plan_response])
+ )
+ # Must not raise (AttributeError) and must complete.
+ await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini")
+
+ # Summary still written, index updated with the document.
+ assert (wiki / "summaries" / "doc.md").exists()
+ index_text = (wiki / "index.md").read_text()
+ assert "[[summaries/doc]]" in index_text
+ # No concept pages produced from the unusable plan.
+ assert not list((wiki / "concepts").glob("*.md"))
+
class TestCacheControl:
"""Verify cache_control breakpoints are emitted on the right messages
@@ -1221,6 +1419,47 @@ async def ordered_acompletion(*args, **kwargs):
assert "[[concepts/flash-attention]]" in index_text
assert "[[concepts/attention]]" in index_text
+ @pytest.mark.asyncio
+ async def test_empty_content_skips_page_no_json_body(self, tmp_path):
+ """#9: when the page LLM returns parseable JSON with empty content
+ ({"content": ""}), the page is skipped (not written as raw JSON)."""
+ wiki = self._setup_wiki(tmp_path)
+
+ plan_response = json.dumps({
+ "create": [{"name": "ghost-concept", "title": "Ghost Concept"}],
+ "update": [],
+ "related": [],
+ })
+ # Parseable JSON, but empty content — old code fell back to raw JSON.
+ empty_content_response = json.dumps({"brief": "B", "content": ""})
+
+ system_msg = {"role": "system", "content": "You are a wiki agent."}
+ doc_msg = {"role": "user", "content": "Document content."}
+
+ with patch("openkb.agent.compiler.litellm") as mock_litellm:
+ mock_litellm.completion = MagicMock(
+ side_effect=_mock_completion([plan_response])
+ )
+ mock_litellm.acompletion = AsyncMock(
+ side_effect=_mock_completion([empty_content_response])
+ )
+ await _compile_concepts(
+ wiki, tmp_path, "gpt-4o-mini", system_msg, doc_msg,
+ "Summary.", "test-doc", 5,
+ )
+
+ # The concept page must NOT be written (generation raised + dropped).
+ page = wiki / "concepts" / "ghost-concept.md"
+ assert not page.exists()
+ # And no concept index entry either.
+ index_text = (wiki / "index.md").read_text()
+ assert "[[concepts/ghost-concept]]" not in index_text
+ # Definitely no raw JSON written anywhere as a body.
+ assert not any(
+ '"content":' in p.read_text()
+ for p in (wiki / "concepts").glob("*.md")
+ )
+
@pytest.mark.asyncio
async def test_related_adds_link_no_llm(self, tmp_path):
"""Plan has only related items. No acompletion calls should be made."""
@@ -1345,3 +1584,240 @@ async def test_short_doc_briefs_in_index_and_frontmatter(self, tmp_path):
index_text = (wiki / "index.md").read_text()
assert "— A paper about transformers" in index_text
assert "— NN architecture using self-attention" in index_text
+
+
+class TestIndexEntities:
+ def test_entities_section_written(self, tmp_path):
+ _update_index(
+ tmp_path, "doc", [], doc_brief="d",
+ entity_names=["anthropic"],
+ entity_meta={"anthropic": ("organization", "AI lab behind Claude.")},
+ )
+ text = (tmp_path / "index.md").read_text(encoding="utf-8")
+ assert "## Entities" in text
+ assert "- [[entities/anthropic]] (organization) — AI lab behind Claude." in text
+
+ def test_entity_entry_replaced_on_update(self, tmp_path):
+ _update_index(tmp_path, "doc", [], entity_names=["anthropic"],
+ entity_meta={"anthropic": ("organization", "old")})
+ _update_index(tmp_path, "doc2", [], entity_names=["anthropic"],
+ entity_meta={"anthropic": ("organization", "new")})
+ text = (tmp_path / "index.md").read_text(encoding="utf-8")
+ assert text.count("[[entities/anthropic]]") == 1
+ assert "new" in text and "old" not in text
+
+
+class TestEntityBacklinks:
+ def _seed(self, tmp_path):
+ (tmp_path / "summaries").mkdir()
+ (tmp_path / "summaries" / "doc.md").write_text(
+ "---\nsources: []\n---\n\n# Doc\n", encoding="utf-8")
+ (tmp_path / "entities").mkdir()
+ (tmp_path / "entities" / "anthropic.md").write_text(
+ "---\ntype: organization\nsources: [summaries/doc.md]\n---\n\n# Anthropic\n",
+ encoding="utf-8")
+
+ def test_summary_gets_entities_section(self, tmp_path):
+ self._seed(tmp_path)
+ _backlink_summary_entities(tmp_path, "doc", ["anthropic"])
+ text = (tmp_path / "summaries" / "doc.md").read_text(encoding="utf-8")
+ assert "## Entities" in text
+ assert "[[entities/anthropic]]" in text
+
+ def test_entity_gets_related_documents(self, tmp_path):
+ self._seed(tmp_path)
+ _backlink_entities(tmp_path, "doc", ["anthropic"])
+ text = (tmp_path / "entities" / "anthropic.md").read_text(encoding="utf-8")
+ assert "## Related Documents" in text
+ assert "[[summaries/doc]]" in text
+
+ def test_idempotent(self, tmp_path):
+ self._seed(tmp_path)
+ _backlink_summary_entities(tmp_path, "doc", ["anthropic"])
+ _backlink_summary_entities(tmp_path, "doc", ["anthropic"])
+ text = (tmp_path / "summaries" / "doc.md").read_text(encoding="utf-8")
+ assert text.count("[[entities/anthropic]]") == 1
+
+
+class TestRemoveEntityPages:
+ def test_strip_source_and_delete_when_empty(self, tmp_path):
+ ent = tmp_path / "entities"
+ ent.mkdir()
+ (ent / "solo.md").write_text(
+ "---\ntype: organization\nsources: [summaries/doc.md]\n---\n\n"
+ "# Solo\n\n## Related Documents\n- [[summaries/doc]]\n",
+ encoding="utf-8")
+ (ent / "shared.md").write_text(
+ "---\ntype: organization\nsources: [summaries/doc.md, summaries/other.md]\n---\n\n"
+ "# Shared\n\n## Related Documents\n- [[summaries/doc]]\n- [[summaries/other]]\n",
+ encoding="utf-8")
+ result = remove_doc_from_entity_pages(tmp_path, "doc")
+ assert result == {"modified": ["shared"], "deleted": ["solo"]}
+ assert not (ent / "solo.md").exists()
+ shared = (ent / "shared.md").read_text(encoding="utf-8")
+ assert "summaries/doc" not in shared
+ assert "summaries/other" in shared
+
+ def test_strips_standalone_see_also_line(self, tmp_path):
+ # A related entity (linked via _add_related_link) carries a
+ # standalone "See also:" paragraph, not a "## Related Documents"
+ # section. Removing the doc must strip it so no dangling wikilink
+ # survives on an entity that has other sources.
+ ent = tmp_path / "entities"
+ ent.mkdir()
+ (ent / "shared.md").write_text(
+ "---\ntype: organization\nsources: [summaries/doc.md, summaries/other.md]\n---\n\n"
+ "# Shared\n\nSee also: [[summaries/doc]]",
+ encoding="utf-8")
+ result = remove_doc_from_entity_pages(tmp_path, "doc")
+ assert result == {"modified": ["shared"], "deleted": []}
+ shared = (ent / "shared.md").read_text(encoding="utf-8")
+ assert "summaries/doc" not in shared
+ assert "See also" not in shared
+ assert "summaries/other" in shared
+
+
+class TestCompileEntitiesEndToEnd:
+ @pytest.mark.asyncio
+ async def test_entity_and_concept_split(self, tmp_path, monkeypatch):
+ wiki = tmp_path / "wiki"
+ (wiki / "summaries").mkdir(parents=True)
+ (wiki / "summaries" / "doc.md").write_text(
+ "---\nsources: []\n---\n\n# Doc\n", encoding="utf-8")
+
+ # Mocked LLM: plan call returns one concept + one entity; each
+ # generation call returns a tiny page.
+ def fake_llm(model, messages, label, **kw):
+ if label == "concepts-plan":
+ return json.dumps({
+ "concepts": {"create": [{"name": "ai-demand", "title": "AI Demand"}],
+ "update": [], "related": []},
+ "entities": {"create": [{"name": "nvidia", "title": "NVIDIA",
+ "type": "organization"}],
+ "update": [], "related": []},
+ })
+ return json.dumps({"brief": "b", "type": "organization", "content": "# Page\n"})
+
+ async def fake_llm_async(model, messages, label, **kw):
+ return fake_llm(model, messages, label, **kw)
+
+ monkeypatch.setattr("openkb.agent.compiler._llm_call", fake_llm)
+ monkeypatch.setattr("openkb.agent.compiler._llm_call_async", fake_llm_async)
+
+ from openkb.agent.compiler import _compile_concepts
+ sys_msg = {"role": "system", "content": "x"}
+ doc_msg = {"role": "user", "content": "x"}
+ await _compile_concepts(wiki, tmp_path, "m", sys_msg, doc_msg,
+ "summary text", "doc", max_concurrency=2,
+ doc_type="short", rewrite_summary=False)
+
+ assert (wiki / "concepts" / "ai-demand.md").exists()
+ assert (wiki / "entities" / "nvidia.md").exists()
+ ent = (wiki / "entities" / "nvidia.md").read_text(encoding="utf-8")
+ # Frontmatter values are JSON-quoted by _yaml_kv_line (see _write_entity,
+ # Task 2), matching the tolerant assertion style in TestWriteEntity.
+ assert "type:" in ent and "organization" in ent
+ index = (wiki / "index.md").read_text(encoding="utf-8")
+ assert "[[entities/nvidia]]" in index
+ summary = (wiki / "summaries" / "doc.md").read_text(encoding="utf-8")
+ assert "[[entities/nvidia]]" in summary # backlink
+
+ @pytest.mark.asyncio
+ async def test_related_entity_does_not_downgrade_index_label(self, tmp_path, monkeypatch):
+ """Related-only entities must not overwrite a correct index entry with (other)."""
+ wiki = tmp_path / "wiki"
+ (wiki / "summaries").mkdir(parents=True)
+ (wiki / "entities").mkdir(parents=True)
+
+ # Pre-seed summaries/doc.md
+ (wiki / "summaries" / "doc.md").write_text(
+ "---\nsources: []\n---\n\n# Doc\n", encoding="utf-8")
+
+ # Pre-seed index.md with a correct entry for anthropic
+ (wiki / "index.md").write_text(
+ "## Documents\n\n## Concepts\n\n## Entities\n\n"
+ "- [[entities/anthropic]] (organization) — AI safety lab\n",
+ encoding="utf-8",
+ )
+
+ # Pre-seed entities/anthropic.md with type frontmatter and an empty sources list
+ (wiki / "entities" / "anthropic.md").write_text(
+ "---\ntype: organization\nsources: []\n---\n\n# Anthropic\n",
+ encoding="utf-8",
+ )
+
+ # LLM plan: anthropic is ONLY under entities.related, not create/update
+ def fake_llm(model, messages, label, **kw):
+ if label == "concepts-plan":
+ return json.dumps({
+ "concepts": {"create": [], "update": [], "related": []},
+ "entities": {"create": [], "update": [], "related": ["anthropic"]},
+ })
+ return json.dumps({"brief": "b", "type": "organization", "content": "# Page\n"})
+
+ async def fake_llm_async(model, messages, label, **kw):
+ return fake_llm(model, messages, label, **kw)
+
+ monkeypatch.setattr("openkb.agent.compiler._llm_call", fake_llm)
+ monkeypatch.setattr("openkb.agent.compiler._llm_call_async", fake_llm_async)
+
+ from openkb.agent.compiler import _compile_concepts
+ sys_msg = {"role": "system", "content": "x"}
+ doc_msg = {"role": "user", "content": "x"}
+ await _compile_concepts(wiki, tmp_path, "m", sys_msg, doc_msg,
+ "summary text", "doc", max_concurrency=2,
+ doc_type="short", rewrite_summary=False)
+
+ index = (wiki / "index.md").read_text(encoding="utf-8")
+ # The pre-existing correct line must NOT have been downgraded to (other)
+ assert "(organization)" in index, "index entry was downgraded from (organization) to (other)"
+ assert "AI safety lab" in index, "index brief was stripped from the entry"
+
+
+# ---------------------------------------------------------------------------
+# Task 9: schema declares entities
+# ---------------------------------------------------------------------------
+
+from openkb.schema import AGENTS_MD
+
+
+def test_schema_declares_entities():
+ assert "entities/" in AGENTS_MD
+ assert "Entity Page" in AGENTS_MD
+ for t in ("person", "organization", "place", "product", "work", "event", "other"):
+ assert t in AGENTS_MD
+
+
+def test_ensure_h2_section_quiet_suppresses_drift_warning(caplog):
+ """Backlink helpers create sections as a normal operation, so quiet=True
+ must not emit the 'hand-edited' drift warning; default still warns."""
+ import logging
+
+ from openkb.agent.compiler import _ensure_h2_section
+
+ with caplog.at_level(logging.WARNING, logger="openkb.agent.compiler"):
+ lines = ["# Doc", ""]
+ _ensure_h2_section(lines, "## Entities", quiet=True)
+ assert "## Entities" in lines
+ assert caplog.records == []
+
+ _ensure_h2_section(["# Doc", ""], "## Entities") # default warns
+ assert any("missing" in r.getMessage() for r in caplog.records)
+
+
+def test_known_targets_prompt_has_entities_rule():
+ """The whitelist message must tell the LLM the [[entities/X]] rule, since
+ entity-page prompts instruct writing such links; otherwise entity links
+ are generated freely and then stripped as ghosts."""
+ from openkb.agent.compiler import _KNOWN_TARGETS_USER
+
+ assert "[[entities/" in _KNOWN_TARGETS_USER
+
+
+def test_plan_prompt_keeps_topic_itself_guard():
+ """The concept-plan prompt must retain the guard against creating a concept
+ that merely mirrors the document's own topic."""
+ from openkb.agent.compiler import _CONCEPTS_PLAN_USER
+
+ assert "just the document topic itself" in _CONCEPTS_PLAN_USER
+
diff --git a/tests/test_lint.py b/tests/test_lint.py
index fe6a3e6a..99dca51d 100644
--- a/tests/test_lint.py
+++ b/tests/test_lint.py
@@ -12,6 +12,7 @@
find_missing_entries,
find_orphans,
fix_broken_links,
+ list_existing_wiki_targets,
run_structural_lint,
strip_ghost_wikilinks,
)
@@ -184,6 +185,22 @@ def test_page_not_in_index(self, tmp_path):
assert any("unlisted" in issue for issue in result)
+ def test_entity_page_not_in_index(self, tmp_path):
+ wiki = _make_wiki(tmp_path)
+ (wiki / "entities").mkdir()
+ (wiki / "entities" / "ada-lovelace.md").write_text("# Ada Lovelace")
+ # index.md has no mention of the entity
+ (wiki / "index.md").write_text(
+ "# Index\n\n## Documents\n\n## Concepts\n\n## Entities\n"
+ )
+
+ result = check_index_sync(wiki)
+
+ assert any(
+ "entities/ada-lovelace.md not mentioned in index.md" in issue
+ for issue in result
+ )
+
def test_missing_index_md(self, tmp_path):
wiki = tmp_path / "wiki"
wiki.mkdir()
@@ -501,3 +518,10 @@ def test_restrict_to_uses_global_known_targets(self, tmp_path):
assert "[[concepts/sibling]]" in text
# Ghost link gets demoted.
assert "[[concepts/ghost]]" not in text
+
+
+def test_whitelist_includes_entities(tmp_path):
+ (tmp_path / "entities").mkdir()
+ (tmp_path / "entities" / "anthropic.md").write_text("# A", encoding="utf-8")
+ targets = list_existing_wiki_targets(tmp_path)
+ assert "entities/anthropic" in targets
diff --git a/tests/test_list_status.py b/tests/test_list_status.py
index babffb06..76365b08 100644
--- a/tests/test_list_status.py
+++ b/tests/test_list_status.py
@@ -17,6 +17,7 @@ def _setup_kb(tmp_path: Path) -> Path:
(kb_dir / "wiki" / "sources" / "images").mkdir(parents=True)
(kb_dir / "wiki" / "summaries").mkdir(parents=True)
(kb_dir / "wiki" / "concepts").mkdir(parents=True)
+ (kb_dir / "wiki" / "entities").mkdir(parents=True)
(kb_dir / "wiki" / "reports").mkdir(parents=True)
openkb_dir = kb_dir / ".openkb"
openkb_dir.mkdir()
@@ -87,6 +88,34 @@ def test_list_no_concepts_section_when_empty(self, tmp_path):
# No concepts in output since none exist
assert "Concepts:" not in result.output
+ def test_list_shows_entities(self, tmp_path):
+ kb_dir = _setup_kb(tmp_path)
+ hashes = {"abc": {"name": "paper.pdf", "type": "pdf"}}
+ (kb_dir / ".openkb" / "hashes.json").write_text(json.dumps(hashes))
+ (kb_dir / "wiki" / "entities" / "ada-lovelace.md").write_text("# Ada")
+ (kb_dir / "wiki" / "entities" / "openai.md").write_text("# OpenAI")
+
+ runner = CliRunner()
+ with patch("openkb.cli._find_kb_dir", return_value=kb_dir):
+ result = runner.invoke(cli, ["list"])
+
+ assert "Entities (2):" in result.output
+ assert "ada-lovelace" in result.output
+ assert "openai" in result.output
+
+ def test_list_no_entities_section_when_empty(self, tmp_path):
+ kb_dir = _setup_kb(tmp_path)
+ hashes = {"abc": {"name": "paper.pdf", "type": "pdf"}}
+ (kb_dir / ".openkb" / "hashes.json").write_text(json.dumps(hashes))
+
+ runner = CliRunner()
+ with patch("openkb.cli._find_kb_dir", return_value=kb_dir):
+ result = runner.invoke(cli, ["list"])
+
+ assert result.exit_code == 0
+ assert "Entities:" not in result.output
+ assert "Entities (" not in result.output
+
class TestStatusCommand:
def test_status_no_kb(self, tmp_path):
@@ -111,6 +140,7 @@ def test_status_shows_directory_counts(self, tmp_path):
assert "sources" in result.output
assert "summaries" in result.output
assert "concepts" in result.output
+ assert "entities" in result.output
assert "reports" in result.output
def test_status_shows_total_indexed(self, tmp_path):
diff --git a/tests/test_query.py b/tests/test_query.py
index 4fe421c8..e9585d32 100644
--- a/tests/test_query.py
+++ b/tests/test_query.py
@@ -73,6 +73,14 @@ async def fake_run(agent, message, **kwargs):
assert "How does attention work?" in captured["message"]
+def test_query_strategy_mentions_entities():
+ """Task 10: query agent must direct who/what questions to entities/."""
+ from openkb.agent import query as query_mod
+
+ text = query_mod._QUERY_INSTRUCTIONS_TEMPLATE
+ assert "entities/" in text
+
+
class TestFmtFallback:
"""Regression tests for issue #34.
diff --git a/tests/test_recompile.py b/tests/test_recompile.py
new file mode 100644
index 00000000..29d06137
--- /dev/null
+++ b/tests/test_recompile.py
@@ -0,0 +1,314 @@
+"""Tests for the `openkb recompile` CLI command.
+
+`recompile` re-runs the current compile pipeline (compile_short_doc /
+compile_long_doc) on already-indexed docs so pre-feature KBs gain the
+entities/ layer and refresh to the current format. It does NOT re-run
+PageIndex or re-convert raw files.
+
+Covers:
+- short-doc dispatch (compile_short_doc called with the right args)
+- long-doc dispatch (compile_long_doc called with doc_id; PageIndex not invoked)
+- --all confirmation + --yes bypass
+- --dry-run: no compile calls, no writes
+- skip+warn paths (missing source, missing summary/doc_id) with others
+ still processed
+- usage errors (no target; doc plus --all), unknown doc / empty registry friendly errors
+- --refresh-schema backs up + overwrites only when AGENTS.md exists and differs; off by default
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from unittest.mock import AsyncMock, patch
+
+from click.testing import CliRunner
+
+from openkb.cli import cli
+from openkb.schema import AGENTS_MD
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _invoke(kb_dir, args, input_text=None):
+    """Run the CLI as ``--kb-dir <kb_dir> <args...>``, passing ``input_text`` as the runner's input."""
+    argv = ["--kb-dir", str(kb_dir)] + list(args)
+    return CliRunner().invoke(cli, argv, input=input_text)
+
+
+def _seed_short(kb_dir: Path) -> None:
+    """Seed one short doc: an ``md`` registry entry, its source page, and a log stub."""
+    wiki = kb_dir / "wiki"
+    registry = {
+        "h_s": {"name": "notes.md", "doc_name": "notes-h_s", "type": "md"},
+    }
+    (kb_dir / ".openkb" / "hashes.json").write_text(json.dumps(registry))
+    (wiki / "sources" / "notes-h_s.md").write_text("# Notes\n\nbody\n", encoding="utf-8")
+    (wiki / "log.md").write_text("# Log\n\n", encoding="utf-8")
+
+
+def _seed_long(kb_dir: Path) -> None:
+    """Seed one long (PageIndex) doc: a ``long_pdf`` entry with a ``doc_id``, its summary, a log stub."""
+    wiki = kb_dir / "wiki"
+    entry = {
+        "name": "paper.pdf",
+        "doc_name": "paper-h_l",
+        "type": "long_pdf",
+        "doc_id": "doc-abc123",
+    }
+    summary_text = "---\nsources: [raw/paper.pdf]\nbrief: P\n---\n# Paper\n"
+    (kb_dir / ".openkb" / "hashes.json").write_text(json.dumps({"h_l": entry}))
+    (wiki / "summaries" / "paper-h_l.md").write_text(summary_text, encoding="utf-8")
+    (wiki / "log.md").write_text("# Log\n\n", encoding="utf-8")
+
+
+# ---------------------------------------------------------------------------
+# short-doc dispatch
+# ---------------------------------------------------------------------------
+
+
+def test_recompile_short_dispatches_compile_short_doc(kb_dir):
+    """A short doc dispatches to compile_short_doc(doc_name, source_path, kb_dir), not the long path."""
+    _seed_short(kb_dir)
+    with patch("openkb.agent.compiler.compile_long_doc", new_callable=AsyncMock) as long_mock, \
+         patch("openkb.agent.compiler.compile_short_doc", new_callable=AsyncMock) as short_mock:
+        outcome = _invoke(kb_dir, ["recompile", "notes.md"])
+
+    assert outcome.exit_code == 0, outcome.output
+    assert "recompiled 1" in outcome.output
+    long_mock.assert_not_called()
+    short_mock.assert_called_once()
+    doc_name, source_path, kb_arg = short_mock.call_args.args[:3]
+    assert doc_name == "notes-h_s"
+    assert (source_path, kb_arg) == (kb_dir / "wiki" / "sources" / "notes-h_s.md", kb_dir)
+
+
+# ---------------------------------------------------------------------------
+# long-doc dispatch
+# ---------------------------------------------------------------------------
+
+
+def test_recompile_long_dispatches_compile_long_doc_with_doc_id(kb_dir):
+    """A long doc dispatches to compile_long_doc with its stored doc_id, without re-indexing."""
+    _seed_long(kb_dir)
+    summary_path = kb_dir / "wiki" / "summaries" / "paper-h_l.md"
+    with patch("openkb.indexer.index_long_document") as indexer_mock, \
+         patch("openkb.agent.compiler.compile_short_doc", new_callable=AsyncMock) as short_mock, \
+         patch("openkb.agent.compiler.compile_long_doc", new_callable=AsyncMock) as long_mock:
+        outcome = _invoke(kb_dir, ["recompile", "paper.pdf"])
+
+    assert outcome.exit_code == 0, outcome.output
+    assert "recompiled 1" in outcome.output
+    long_mock.assert_called_once()
+    # Positional order checked: doc_name, summary path, doc_id, kb_dir.
+    expected_prefix = ("paper-h_l", summary_path, "doc-abc123", kb_dir)
+    assert long_mock.call_args.args[:4] == expected_prefix
+    short_mock.assert_not_called()
+    # PageIndex must NOT be re-run: the indexer entry point stays untouched.
+    indexer_mock.assert_not_called()
+
+
+# ---------------------------------------------------------------------------
+# --all confirmation + --yes
+# ---------------------------------------------------------------------------
+
+
+def test_recompile_all_requires_confirmation(kb_dir):
+    """Answering "n" to the --all prompt prints "Aborted", exits 0, and compiles nothing."""
+    _seed_short(kb_dir)
+    with patch("openkb.agent.compiler.compile_short_doc") as short_mock:
+        outcome = _invoke(kb_dir, ["recompile", "--all"], input_text="n\n")
+    short_mock.assert_not_called()
+    assert outcome.exit_code == 0, outcome.output
+    assert "Aborted" in outcome.output
+
+
+def test_recompile_all_yes_bypasses_confirmation(kb_dir):
+    """--all --yes runs with no input supplied and compiles the single seeded doc."""
+    _seed_short(kb_dir)
+    with patch("openkb.agent.compiler.compile_short_doc", new_callable=AsyncMock) as short_mock:
+        outcome = _invoke(kb_dir, ["recompile", "--all", "--yes"])
+    assert outcome.exit_code == 0, outcome.output
+    assert "recompiled 1" in outcome.output
+    short_mock.assert_called_once()
+
+
+# ---------------------------------------------------------------------------
+# --dry-run
+# ---------------------------------------------------------------------------
+
+
+def test_recompile_dry_run_no_calls_no_writes(kb_dir):
+    """--dry-run prints the doc name and "short", calls no compiler, and leaves log.md unchanged."""
+    _seed_short(kb_dir)
+    log_path = kb_dir / "wiki" / "log.md"
+    original_log = log_path.read_text()
+    with patch("openkb.agent.compiler.compile_long_doc") as long_mock, \
+         patch("openkb.agent.compiler.compile_short_doc") as short_mock:
+        outcome = _invoke(kb_dir, ["recompile", "--all", "--dry-run"])
+
+    assert outcome.exit_code == 0, outcome.output
+    for mock in (short_mock, long_mock):
+        mock.assert_not_called()
+    assert all(token in outcome.output for token in ("notes-h_s", "short"))
+    assert log_path.read_text() == original_log  # log.md is the only write checked here
+
+
+# ---------------------------------------------------------------------------
+# skip + warn paths
+# ---------------------------------------------------------------------------
+
+
+def test_recompile_skips_short_missing_source(kb_dir):
+    """Two short docs registered, one source file present: one is compiled, one is skipped."""
+    registry = {
+        "h_ok": {"name": "ok.md", "doc_name": "ok-h_ok", "type": "md"},
+        "h_miss": {"name": "gone.md", "doc_name": "gone-h_miss", "type": "md"},
+    }
+    (kb_dir / ".openkb" / "hashes.json").write_text(json.dumps(registry))
+    wiki = kb_dir / "wiki"
+    (wiki / "sources" / "ok-h_ok.md").write_text("# ok\n")
+    (wiki / "log.md").write_text("# Log\n\n", encoding="utf-8")
+    with patch("openkb.agent.compiler.compile_short_doc", new_callable=AsyncMock) as short_mock:
+        outcome = _invoke(kb_dir, ["recompile", "--all", "--yes"])
+
+    assert outcome.exit_code == 0, outcome.output
+    # This test writes no source file for gone-h_miss, so only ok-h_ok reaches the compiler.
+    assert short_mock.call_count == 1
+    assert short_mock.call_args.args[0] == "ok-h_ok"
+    assert "recompiled 1" in outcome.output and "skipped 1" in outcome.output
+
+
+def test_recompile_skips_long_missing_doc_id(kb_dir):
+    """A long_pdf entry with a summary on disk but no doc_id is skipped, not compiled."""
+    legacy_entry = {"name": "legacy.pdf", "doc_name": "legacy-h_l", "type": "long_pdf"}
+    (kb_dir / ".openkb" / "hashes.json").write_text(json.dumps({"h_l": legacy_entry}))
+    wiki = kb_dir / "wiki"
+    (wiki / "summaries" / "legacy-h_l.md").write_text("# legacy\n")
+    (wiki / "log.md").write_text("# Log\n\n", encoding="utf-8")
+
+    with patch("openkb.agent.compiler.compile_long_doc") as long_mock:
+        outcome = _invoke(kb_dir, ["recompile", "--all", "--yes"])
+
+    assert outcome.exit_code == 0, outcome.output
+    long_mock.assert_not_called()
+    # Output reports the skip and a zero recompile count.
+    assert "skipped 1" in outcome.output and "recompiled 0" in outcome.output
+
+
+def test_recompile_skips_long_missing_summary(kb_dir):
+    """A long_pdf entry that has a doc_id but no summary page on disk is skipped, not compiled."""
+    entry = {
+        "name": "paper.pdf",
+        "doc_name": "paper-h_l",
+        "type": "long_pdf",
+        "doc_id": "doc-x",
+    }
+    (kb_dir / ".openkb" / "hashes.json").write_text(json.dumps({"h_l": entry}))
+    (kb_dir / "wiki" / "log.md").write_text("# Log\n\n", encoding="utf-8")
+    with patch("openkb.agent.compiler.compile_long_doc") as long_mock:
+        outcome = _invoke(kb_dir, ["recompile", "--all", "--yes"])
+
+    assert outcome.exit_code == 0, outcome.output
+    assert "skipped 1" in outcome.output
+    long_mock.assert_not_called()
+
+
+# ---------------------------------------------------------------------------
+# error paths
+# ---------------------------------------------------------------------------
+
+
+def test_recompile_requires_doc_or_all(kb_dir):
+    """No doc name and no --all: the usage guard prints a hint, exits 0, and compiles nothing."""
+    _seed_short(kb_dir)
+    with patch("openkb.agent.compiler.compile_short_doc", new_callable=AsyncMock) as short:
+        result = _invoke(kb_dir, ["recompile"])
+    assert result.exit_code == 0, result.output  # guard echoes a message and returns (exit 0)
+    assert "Specify a document name or pass --all" in result.output and not short.called
+
+
+def test_recompile_doc_and_all_conflict(kb_dir):
+    """A doc name combined with --all prints a "not both" message (any case) and compiles nothing."""
+    _seed_short(kb_dir)
+    with patch("openkb.agent.compiler.compile_short_doc", new_callable=AsyncMock) as short_mock:
+        lowered = _invoke(kb_dir, ["recompile", "notes.md", "--all"]).output.lower()
+    assert "not both" in lowered and not short_mock.called
+
+
+def test_recompile_unknown_doc_friendly_error(kb_dir):
+    """An unregistered doc name exits 0, echoes the name back, and compiles nothing."""
+    _seed_short(kb_dir)
+    with patch("openkb.agent.compiler.compile_short_doc") as short_mock:
+        outcome = _invoke(kb_dir, ["recompile", "no-such-doc"])
+    assert outcome.exit_code == 0 and "no-such-doc" in outcome.output, outcome.output
+    short_mock.assert_not_called()
+
+
+def test_recompile_empty_registry_friendly_error(kb_dir):
+    """--all on an empty registry exits 0, reports that there are no documents, compiles nothing."""
+    (kb_dir / ".openkb" / "hashes.json").write_text(json.dumps({}))
+    with patch("openkb.agent.compiler.compile_short_doc") as short_mock:
+        outcome = _invoke(kb_dir, ["recompile", "--all"], input_text="y\n")
+    assert outcome.exit_code == 0 and not short_mock.called, outcome.output
+    assert any(phrase in outcome.output for phrase in ("No documents", "no documents"))
+
+
+# ---------------------------------------------------------------------------
+# --refresh-schema
+# ---------------------------------------------------------------------------
+
+
+def test_recompile_refresh_schema_overwrites_when_differing(kb_dir):
+    """--refresh-schema on a differing AGENTS.md leaves the old text in .bak and AGENTS_MD in place."""
+    _seed_short(kb_dir)
+    agents = kb_dir / "wiki" / "AGENTS.md"
+    agents.write_text("OLD CUSTOM SCHEMA\n", encoding="utf-8")
+    with patch("openkb.agent.compiler.compile_short_doc", new_callable=AsyncMock):
+        result = _invoke(kb_dir, ["recompile", "notes.md", "--refresh-schema"])
+    assert result.exit_code == 0, result.output
+    bak = kb_dir / "wiki" / "AGENTS.md.bak"
+    assert bak.exists()
+    assert bak.read_text(encoding="utf-8") == "OLD CUSTOM SCHEMA\n"
+    assert agents.read_text(encoding="utf-8") == AGENTS_MD
+
+
+def test_recompile_refresh_schema_noop_when_identical(kb_dir):
+    """--refresh-schema creates no AGENTS.md.bak when AGENTS.md already equals AGENTS_MD."""
+    _seed_short(kb_dir)
+    agents = kb_dir / "wiki" / "AGENTS.md"
+    agents.write_text(AGENTS_MD, encoding="utf-8")
+    with patch("openkb.agent.compiler.compile_short_doc", new_callable=AsyncMock):
+        result = _invoke(kb_dir, ["recompile", "notes.md", "--refresh-schema"])
+    assert result.exit_code == 0, result.output
+    assert not (kb_dir / "wiki" / "AGENTS.md.bak").exists()
+
+
+def test_recompile_no_refresh_schema_by_default(kb_dir):
+    """Without --refresh-schema, recompile leaves a custom AGENTS.md as is and writes no backup."""
+    _seed_short(kb_dir)
+    agents = kb_dir / "wiki" / "AGENTS.md"
+    agents.write_text("OLD CUSTOM SCHEMA\n", encoding="utf-8")
+    with patch("openkb.agent.compiler.compile_short_doc", new_callable=AsyncMock):
+        result = _invoke(kb_dir, ["recompile", "notes.md"])
+    assert result.exit_code == 0, result.output
+    # Untouched without the flag
+    assert agents.read_text(encoding="utf-8") == "OLD CUSTOM SCHEMA\n"
+    assert not (kb_dir / "wiki" / "AGENTS.md.bak").exists()
+
+
+def test_recompile_refresh_schema_noop_when_agents_missing(kb_dir):
+    """Spec: --refresh-schema is a no-op when AGENTS.md is absent (runtime
+    already falls back to the bundled default), so nothing is written."""
+    _seed_short(kb_dir)
+    agents = kb_dir / "wiki" / "AGENTS.md"
+    agents.unlink(missing_ok=True)
+    with patch("openkb.agent.compiler.compile_short_doc", new_callable=AsyncMock):
+        result = _invoke(kb_dir, ["recompile", "notes.md", "--refresh-schema"])
+
+    assert result.exit_code == 0, result.output
+    assert not agents.exists()  # not materialized
+    assert not (kb_dir / "wiki" / "AGENTS.md.bak").exists()
diff --git a/tests/test_remove.py b/tests/test_remove.py
index 8518d639..a915ee56 100644
--- a/tests/test_remove.py
+++ b/tests/test_remove.py
@@ -390,6 +390,34 @@ def test_cli_remove_dry_run_does_nothing(kb_dir):
assert "h_a" in hashes
+def test_cli_remove_preview_lists_entity_actions(kb_dir):
+    """The dry-run preview must enumerate entity-page DELETE/MODIFY actions
+    and report an 'N entity(s) will be DELETED' summary line."""
+    _seed_two_doc_kb(kb_dir)
+    (kb_dir / "wiki" / "entities").mkdir(parents=True, exist_ok=True)  # tolerate a pre-created dir
+    # Single-source entity (only attention) -> will be DELETED
+    (kb_dir / "wiki" / "entities" / "vaswani.md").write_text(
+        "---\nsources: [summaries/attention-h_a.md]\ntype: person\nbrief: V\n---\n"
+        "# Vaswani\n\n## Related Documents\n- [[summaries/attention-h_a]]\n",
+        encoding="utf-8",
+    )
+    # Multi-source entity (both) -> will be MODIFIED
+    (kb_dir / "wiki" / "entities" / "google.md").write_text(
+        "---\nsources: [summaries/attention-h_a.md, summaries/llm-h_l.md]\n"
+        "type: organization\nbrief: G\n---\n# Google\n",
+        encoding="utf-8",
+    )
+
+    result = _invoke(kb_dir, ["remove", "attention.pdf", "--dry-run"])
+
+    assert result.exit_code == 0, result.output
+    assert "DELETE wiki/entities/vaswani.md" in result.output
+    assert "MODIFY wiki/entities/google.md" in result.output
+    assert "1 entity(s) will be DELETED" in result.output
+    # Nothing actually removed in dry-run.
+    assert (kb_dir / "wiki" / "entities" / "vaswani.md").exists()
+
+
def test_cli_remove_yes_executes_full_plan(kb_dir):
_seed_two_doc_kb(kb_dir)
result = _invoke(kb_dir, ["remove", "attention.pdf", "--yes"])
diff --git a/tests/test_skill_chat_slash.py b/tests/test_skill_chat_slash.py
index d99dd152..faf13725 100644
--- a/tests/test_skill_chat_slash.py
+++ b/tests/test_skill_chat_slash.py
@@ -87,6 +87,19 @@ async def test_slash_skill_new_rejects_empty_wiki(tmp_path):
assert not (kb / "output").exists()
+def test_preflight_gate_counts_entities(tmp_path):
+    """A KB holding nothing but one page under wiki/entities/ must pass the
+    skill-new preflight check, i.e. ``_preflight_skill_new`` returns ``None``."""
+    from openkb.cli import _preflight_skill_new
+
+    entities_dir = tmp_path / "wiki" / "entities"
+    entities_dir.mkdir(parents=True)
+    (entities_dir / "ada.md").write_text("# Ada\n")
+
+    gate_result = _preflight_skill_new(tmp_path, "demo")
+    assert gate_result is None  # no error returned means the gate passed
+
+
@pytest.mark.asyncio
async def test_slash_skill_new_rejects_when_target_exists(tmp_path):
"""Chat / slash command must not silently overwrite an existing skill."""