From 5504541b31e2126cf019ff4a3b0b51fc13149835 Mon Sep 17 00:00:00 2001 From: mountain Date: Sat, 30 May 2026 09:52:09 +0800 Subject: [PATCH 01/29] feat(compiler): _read_entity_briefs for entity plan context --- openkb/agent/compiler.py | 46 ++++++++++++++++++++++++++++++++++++++++ tests/test_compiler.py | 20 +++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 87d96652..65e7cc3d 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -422,6 +422,52 @@ def _read_concept_briefs(wiki_dir: Path) -> str: return "\n".join(lines) or "(none yet)" +def _read_entity_briefs(wiki_dir: Path) -> str: + """Read existing entity pages as compact lines for the plan call. + + Formats each as ``- {slug} ({type}, {n} sources) — {brief}``. The source + count is the cross-document recurrence signal the LLM uses to decide + create-vs-update and salience. Returns "(none yet)" when empty. + """ + entities_dir = wiki_dir / "entities" + if not entities_dir.exists(): + return "(none yet)" + + md_files = sorted(entities_dir.glob("*.md")) + if not md_files: + return "(none yet)" + + lines: list[str] = [] + for path in md_files: + text = path.read_text(encoding="utf-8") + brief = "" + etype = "other" + n_sources = 0 + body = text + if text.startswith("---"): + end = text.find("---", 3) + if end != -1: + fm_text = text[3:end].strip("\n") + body = text[end + 3:] + try: + fm = yaml.safe_load(fm_text) + except yaml.YAMLError: + fm = None + if isinstance(fm, dict): + if isinstance(fm.get("brief"), str): + brief = fm["brief"].strip() + if isinstance(fm.get("type"), str): + etype = fm["type"].strip() or "other" + if isinstance(fm.get("sources"), list): + n_sources = len(fm["sources"]) + if not brief: + brief = body.strip().replace("\n", " ")[:150] + suffix = f" — {brief}" if brief else "" + lines.append(f"- {path.stem} ({etype}, {n_sources} sources){suffix}") + + return "\n".join(lines) or "(none yet)" + + def 
_iter_h2_headings(lines: list[str]) -> list[tuple[int, str]]: """Return ``[(line_index, normalized_heading), ...]`` for every ATX H2. diff --git a/tests/test_compiler.py b/tests/test_compiler.py index de5f0afa..aa32730a 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -17,6 +17,7 @@ _update_index, _read_wiki_context, _read_concept_briefs, + _read_entity_briefs, _add_related_link, _backlink_summary, _backlink_concepts, @@ -437,6 +438,25 @@ def test_falls_back_to_body_truncation(self, tmp_path): assert "- old: Old concept without brief field." in result +class TestReadEntityBriefs: + def test_none_when_missing(self, tmp_path): + assert _read_entity_briefs(tmp_path) == "(none yet)" + + def test_brief_type_and_source_count(self, tmp_path): + ent = tmp_path / "entities" + ent.mkdir() + (ent / "anthropic.md").write_text( + "---\n" + "sources: [summaries/a.md, summaries/b.md]\n" + "type: organization\n" + "brief: AI lab behind Claude.\n" + "---\n\n# Anthropic\n", + encoding="utf-8", + ) + out = _read_entity_briefs(tmp_path) + assert out == "- anthropic (organization, 2 sources) — AI lab behind Claude." + + class TestBacklinkSummary: def test_adds_missing_concept_links(self, tmp_path): wiki = tmp_path / "wiki" From 7181d575d7f24daa74e841183635da134bd235bf Mon Sep 17 00:00:00 2001 From: mountain Date: Sat, 30 May 2026 09:54:52 +0800 Subject: [PATCH 02/29] test(compiler): parity tests for _read_entity_briefs --- tests/test_compiler.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/test_compiler.py b/tests/test_compiler.py index aa32730a..24bddf12 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -456,6 +456,45 @@ def test_brief_type_and_source_count(self, tmp_path): out = _read_entity_briefs(tmp_path) assert out == "- anthropic (organization, 2 sources) — AI lab behind Claude." 
+ def test_empty_dir_returns_none(self, tmp_path): + ent = tmp_path / "entities" + ent.mkdir() + assert _read_entity_briefs(tmp_path) == "(none yet)" + + def test_falls_back_to_body_when_no_brief(self, tmp_path): + ent = tmp_path / "entities" + ent.mkdir() + body_text = "OpenAI is a research lab focused on artificial general intelligence." + (ent / "openai.md").write_text( + "---\n" + "type: organization\n" + "sources: [summaries/a.md, summaries/b.md, summaries/c.md]\n" + "---\n\n" + body_text, + encoding="utf-8", + ) + out = _read_entity_briefs(tmp_path) + # Should use truncated body (first 150 chars) as the brief + expected_brief = body_text[:150] + assert f" — {expected_brief}" in out + # Should still include type and source count + assert "(organization, 3 sources)" in out + + def test_sorted_alphabetically(self, tmp_path): + ent = tmp_path / "entities" + ent.mkdir() + (ent / "zeta.md").write_text( + "---\ntype: person\nsources: [summaries/a.md]\nbrief: Last letter of Greek alphabet.\n---\n", + encoding="utf-8", + ) + (ent / "alpha.md").write_text( + "---\ntype: concept\nsources: [summaries/b.md]\nbrief: First letter of Greek alphabet.\n---\n", + encoding="utf-8", + ) + out = _read_entity_briefs(tmp_path) + lines = out.strip().splitlines() + assert lines[0].startswith("- alpha ") + assert lines[1].startswith("- zeta ") + class TestBacklinkSummary: def test_adds_missing_concept_links(self, tmp_path): From efacb6f3266b6a45469cc2928ee72af8f94deac9 Mon Sep 17 00:00:00 2001 From: mountain Date: Sat, 30 May 2026 09:57:26 +0800 Subject: [PATCH 03/29] feat(compiler): _write_entity with type/aliases frontmatter --- openkb/agent/compiler.py | 67 ++++++++++++++++++++++++++++++++++++++++ tests/test_compiler.py | 35 +++++++++++++++++++++ 2 files changed, 102 insertions(+) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 65e7cc3d..8321ad4c 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -701,6 +701,73 @@ def 
_write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is path.write_text(frontmatter + content, encoding="utf-8") +def _write_entity( + wiki_dir: Path, name: str, content: str, source_file: str, + is_update: bool, brief: str = "", type_: str = "other", + aliases: list[str] | None = None, +) -> None: + """Write or update an entity page in entities/, managing frontmatter. + + Frontmatter fields: ``sources`` (list), ``type`` (one of the entity + enum), ``brief`` (one-liner), and optional ``aliases`` (list, omitted + when empty). On update the new source is prepended and the body replaced + with the LLM rewrite; ``type`` is preserved from the new write. + """ + entities_dir = wiki_dir / "entities" + entities_dir.mkdir(parents=True, exist_ok=True) + safe_name = _sanitize_concept_name(name) + path = (entities_dir / f"{safe_name}.md").resolve() + if not path.is_relative_to(entities_dir.resolve()): + logger.warning("Entity name escapes entities dir: %s", name) + return + + # Strip any frontmatter the LLM body may carry. 
+ clean = content + if clean.startswith("---"): + end = clean.find("---", 3) + if end != -1: + clean = clean[end + 3:].lstrip("\n") + + if is_update and path.exists(): + existing = path.read_text(encoding="utf-8") + if source_file not in existing: + existing = _prepend_source_to_frontmatter(existing, source_file) + if existing.startswith("---"): + end = existing.find("---", 3) + if end != -1: + fm = existing[:end + 3] + fm = _set_fm_line(fm, "brief", brief) if brief else fm + fm = _set_fm_line(fm, "type", type_) if type_ else fm + existing = fm + "\n\n" + clean + else: + existing = clean + else: + existing = clean + path.write_text(existing, encoding="utf-8") + return + + fm_lines = [_yaml_list_line("sources", [source_file])] + fm_lines.append(_yaml_kv_line("type", type_ or "other")) + if brief: + fm_lines.append(_yaml_kv_line("brief", brief)) + if aliases: + fm_lines.append(_yaml_list_line("aliases", aliases)) + frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n" + path.write_text(frontmatter + clean, encoding="utf-8") + + +def _set_fm_line(fm: str, key: str, value: str) -> str: + """Set or replace a single scalar ``key:`` line inside a frontmatter block. + + ``fm`` includes the opening and closing ``---`` markers. Uses a lambda + replacement so values containing regex backrefs are inserted literally. + """ + line = _yaml_kv_line(key, value) + if re.search(rf"^{re.escape(key)}:", fm, flags=re.MULTILINE): + return re.sub(rf"^{re.escape(key)}:.*", lambda _m: line, fm, flags=re.MULTILINE) + return fm.replace("---\n", f"---\n{line}\n", 1) + + def _prepend_source_to_frontmatter(text: str, source_file: str) -> str: """Prepend ``source_file`` to the inline ``sources:`` list in YAML frontmatter. 
diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 24bddf12..db1de2aa 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -14,6 +14,7 @@ _sanitize_concept_name, _write_summary, _write_concept, + _write_entity, _update_index, _read_wiki_context, _read_concept_briefs, @@ -496,6 +497,40 @@ def test_sorted_alphabetically(self, tmp_path): assert lines[1].startswith("- zeta ") +class TestWriteEntity: + def test_new_entity_frontmatter(self, tmp_path): + _write_entity( + tmp_path, "anthropic", "# Anthropic\n\nAI lab.", + "summaries/a.md", is_update=False, + brief="AI lab behind Claude.", type_="organization", + aliases=["Anthropic PBC"], + ) + text = (tmp_path / "entities" / "anthropic.md").read_text(encoding="utf-8") + assert "type:" in text and "organization" in text + assert "brief:" in text and "AI lab behind Claude." in text + assert "sources:" in text and "summaries/a.md" in text + assert "Anthropic PBC" in text + assert text.count("---") == 2 # exactly one frontmatter block + + def test_update_prepends_source_keeps_type(self, tmp_path): + _write_entity( + tmp_path, "anthropic", "# Anthropic\n\nv1.", + "summaries/a.md", is_update=False, + brief="b1", type_="organization", aliases=None, + ) + _write_entity( + tmp_path, "anthropic", "# Anthropic\n\nv2 richer.", + "summaries/b.md", is_update=True, + brief="b2", type_="organization", aliases=None, + ) + text = (tmp_path / "entities" / "anthropic.md").read_text(encoding="utf-8") + assert "summaries/b.md" in text and "summaries/a.md" in text + assert "type:" in text and "organization" in text + assert "v2 richer." in text + assert "v1." 
not in text + assert "brief:" in text and "b2" in text + + class TestBacklinkSummary: def test_adds_missing_concept_links(self, tmp_path): wiki = tmp_path / "wiki" From 71a4a14b3c860f90ea7701ac6210d0c4091c8077 Mon Sep 17 00:00:00 2001 From: mountain Date: Sat, 30 May 2026 10:05:10 +0800 Subject: [PATCH 04/29] test(compiler): assert source ordering in _write_entity; count=1 in _set_fm_line Add explicit ordering assertion in test_update_prepends_source_keeps_type verifying the deterministic json.dumps form ("summaries/b.md", "summaries/a.md"). Pass count=1 to re.sub in _set_fm_line to make first-occurrence intent explicit. --- openkb/agent/compiler.py | 2 +- tests/test_compiler.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 8321ad4c..61e06452 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -764,7 +764,7 @@ def _set_fm_line(fm: str, key: str, value: str) -> str: """ line = _yaml_kv_line(key, value) if re.search(rf"^{re.escape(key)}:", fm, flags=re.MULTILINE): - return re.sub(rf"^{re.escape(key)}:.*", lambda _m: line, fm, flags=re.MULTILINE) + return re.sub(rf"^{re.escape(key)}:.*", lambda _m: line, fm, count=1, flags=re.MULTILINE) return fm.replace("---\n", f"---\n{line}\n", 1) diff --git a/tests/test_compiler.py b/tests/test_compiler.py index db1de2aa..9d8f0100 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -525,6 +525,8 @@ def test_update_prepends_source_keeps_type(self, tmp_path): ) text = (tmp_path / "entities" / "anthropic.md").read_text(encoding="utf-8") assert "summaries/b.md" in text and "summaries/a.md" in text + # _yaml_list_line uses json.dumps: b prepended before a, double-quoted + assert '"summaries/b.md", "summaries/a.md"' in text assert "type:" in text and "organization" in text assert "v2 richer." in text assert "v1." 
not in text From 97f1c51ddbf57a631d855cc44e81d088bdbe9ce3 Mon Sep 17 00:00:00 2001 From: mountain Date: Sat, 30 May 2026 10:07:14 +0800 Subject: [PATCH 05/29] feat(lint): include entities/ in wikilink whitelist --- openkb/lint.py | 3 +++ tests/test_lint.py | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/openkb/lint.py b/openkb/lint.py index fa6df76a..2f345659 100644 --- a/openkb/lint.py +++ b/openkb/lint.py @@ -171,6 +171,9 @@ def list_existing_wiki_targets(wiki_dir: Path) -> set[str]: targets.update(f"concepts/{p.stem}" for p in concepts_dir.glob("*.md")) if summaries_dir.is_dir(): targets.update(f"summaries/{p.stem}" for p in summaries_dir.glob("*.md")) + entities_dir = wiki_dir / "entities" + if entities_dir.is_dir(): + targets.update(f"entities/{p.stem}" for p in entities_dir.glob("*.md")) if (wiki_dir / "index.md").exists(): targets.add("index") return targets diff --git a/tests/test_lint.py b/tests/test_lint.py index fe6a3e6a..8600e63d 100644 --- a/tests/test_lint.py +++ b/tests/test_lint.py @@ -12,6 +12,7 @@ find_missing_entries, find_orphans, fix_broken_links, + list_existing_wiki_targets, run_structural_lint, strip_ghost_wikilinks, ) @@ -501,3 +502,10 @@ def test_restrict_to_uses_global_known_targets(self, tmp_path): assert "[[concepts/sibling]]" in text # Ghost link gets demoted. 
assert "[[concepts/ghost]]" not in text + + +def test_whitelist_includes_entities(tmp_path): + (tmp_path / "entities").mkdir() + (tmp_path / "entities" / "anthropic.md").write_text("# A", encoding="utf-8") + targets = list_existing_wiki_targets(tmp_path) + assert "entities/anthropic" in targets From 3c8aa93d129893d250ce4216b8fadcf8539bd0e9 Mon Sep 17 00:00:00 2001 From: mountain Date: Sat, 30 May 2026 10:09:31 +0800 Subject: [PATCH 06/29] feat(compiler): summary<->entity backlinks --- openkb/agent/compiler.py | 34 ++++++++++++++++++++++++++++++++++ tests/test_compiler.py | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 61e06452..17f07073 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -908,6 +908,40 @@ def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) path.write_text("\n".join(lines), encoding="utf-8") +def _backlink_summary_entities(wiki_dir: Path, doc_name: str, entity_slugs: list[str]) -> None: + """Append missing entity wikilinks to the summary page under '## Entities'.""" + summary_path = wiki_dir / "summaries" / f"{doc_name}.md" + if not summary_path.exists(): + return + text = summary_path.read_text(encoding="utf-8") + missing = [s for s in entity_slugs if f"[[entities/{s}]]" not in text] + if not missing: + return + lines = text.split("\n") + _ensure_h2_section(lines, "## Entities") + for slug in reversed(missing): + _insert_section_entry(lines, "## Entities", f"- [[entities/{slug}]]") + summary_path.write_text("\n".join(lines), encoding="utf-8") + + +def _backlink_entities(wiki_dir: Path, doc_name: str, entity_slugs: list[str]) -> None: + """Append the source summary wikilink to each entity page under + '## Related Documents' (mirrors _backlink_concepts).""" + link = f"[[summaries/{doc_name}]]" + entities_dir = wiki_dir / "entities" + for slug in entity_slugs: + path = entities_dir / 
f"{slug}.md" + if not path.exists(): + continue + text = path.read_text(encoding="utf-8") + if link in text: + continue + lines = text.split("\n") + _ensure_h2_section(lines, "## Related Documents") + _insert_section_entry(lines, "## Related Documents", f"- {link}") + path.write_text("\n".join(lines), encoding="utf-8") + + def remove_doc_from_concept_pages( wiki_dir: Path, doc_name: str, diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 9d8f0100..2dc11cce 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -22,6 +22,8 @@ _add_related_link, _backlink_summary, _backlink_concepts, + _backlink_summary_entities, + _backlink_entities, ) @@ -1440,4 +1442,35 @@ async def test_short_doc_briefs_in_index_and_frontmatter(self, tmp_path): # Index has briefs index_text = (wiki / "index.md").read_text() assert "— A paper about transformers" in index_text - assert "— NN architecture using self-attention" in index_text + + +class TestEntityBacklinks: + def _seed(self, tmp_path): + (tmp_path / "summaries").mkdir() + (tmp_path / "summaries" / "doc.md").write_text( + "---\nsources: []\n---\n\n# Doc\n", encoding="utf-8") + (tmp_path / "entities").mkdir() + (tmp_path / "entities" / "anthropic.md").write_text( + "---\ntype: organization\nsources: [summaries/doc.md]\n---\n\n# Anthropic\n", + encoding="utf-8") + + def test_summary_gets_entities_section(self, tmp_path): + self._seed(tmp_path) + _backlink_summary_entities(tmp_path, "doc", ["anthropic"]) + text = (tmp_path / "summaries" / "doc.md").read_text(encoding="utf-8") + assert "## Entities" in text + assert "[[entities/anthropic]]" in text + + def test_entity_gets_related_documents(self, tmp_path): + self._seed(tmp_path) + _backlink_entities(tmp_path, "doc", ["anthropic"]) + text = (tmp_path / "entities" / "anthropic.md").read_text(encoding="utf-8") + assert "## Related Documents" in text + assert "[[summaries/doc]]" in text + + def test_idempotent(self, tmp_path): + self._seed(tmp_path) + 
_backlink_summary_entities(tmp_path, "doc", ["anthropic"]) + _backlink_summary_entities(tmp_path, "doc", ["anthropic"]) + text = (tmp_path / "summaries" / "doc.md").read_text(encoding="utf-8") + assert text.count("[[entities/anthropic]]") == 1 From ff1345e7499d773969fb625d2c0bde293877dceb Mon Sep 17 00:00:00 2001 From: mountain Date: Sat, 30 May 2026 10:10:55 +0800 Subject: [PATCH 07/29] test(compiler): restore assertion erroneously deleted in 3c8aa93 --- tests/test_compiler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 2dc11cce..a68d58db 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -1442,6 +1442,7 @@ async def test_short_doc_briefs_in_index_and_frontmatter(self, tmp_path): # Index has briefs index_text = (wiki / "index.md").read_text() assert "— A paper about transformers" in index_text + assert "— NN architecture using self-attention" in index_text class TestEntityBacklinks: From 385defd70677da63bafc4a5153896ff418a5c253 Mon Sep 17 00:00:00 2001 From: mountain Date: Sat, 30 May 2026 10:13:01 +0800 Subject: [PATCH 08/29] feat(compiler): index.md Entities section --- openkb/agent/compiler.py | 20 +++++++++++++++++++- tests/test_compiler.py | 21 +++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 17f07073..d30064cb 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -1062,6 +1062,8 @@ def _update_index( wiki_dir: Path, doc_name: str, concept_names: list[str], doc_brief: str = "", concept_briefs: dict[str, str] | None = None, doc_type: str = "short", + entity_names: list[str] | None = None, + entity_meta: dict[str, tuple[str, str]] | None = None, ) -> None: """Append document and concept entries to index.md. 
@@ -1078,7 +1080,8 @@ def _update_index( index_path = wiki_dir / "index.md" if not index_path.exists(): index_path.write_text( - "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n" + "## Entities\n\n## Explorations\n", encoding="utf-8", ) @@ -1106,6 +1109,21 @@ def _update_index( else: _insert_section_entry(lines, "## Concepts", concept_entry) + entity_names = entity_names or [] + entity_meta = entity_meta or {} + if entity_names: + _ensure_h2_section(lines, "## Entities") + for name in entity_names: + link = f"[[entities/{name}]]" + etype, brief = entity_meta.get(name, ("other", "")) + entry = f"- {link} ({etype})" + if brief: + entry += f" — {brief}" + if _section_contains_link(lines, "## Entities", link): + _replace_section_entry(lines, "## Entities", link, entry) + else: + _insert_section_entry(lines, "## Entities", entry) + index_path.write_text("\n".join(lines), encoding="utf-8") diff --git a/tests/test_compiler.py b/tests/test_compiler.py index a68d58db..3ed7ad83 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -1445,6 +1445,27 @@ async def test_short_doc_briefs_in_index_and_frontmatter(self, tmp_path): assert "— NN architecture using self-attention" in index_text +class TestIndexEntities: + def test_entities_section_written(self, tmp_path): + _update_index( + tmp_path, "doc", [], doc_brief="d", + entity_names=["anthropic"], + entity_meta={"anthropic": ("organization", "AI lab behind Claude.")}, + ) + text = (tmp_path / "index.md").read_text(encoding="utf-8") + assert "## Entities" in text + assert "- [[entities/anthropic]] (organization) — AI lab behind Claude." 
in text + + def test_entity_entry_replaced_on_update(self, tmp_path): + _update_index(tmp_path, "doc", [], entity_names=["anthropic"], + entity_meta={"anthropic": ("organization", "old")}) + _update_index(tmp_path, "doc2", [], entity_names=["anthropic"], + entity_meta={"anthropic": ("organization", "new")}) + text = (tmp_path / "index.md").read_text(encoding="utf-8") + assert text.count("[[entities/anthropic]]") == 1 + assert "new" in text and "old" not in text + + class TestEntityBacklinks: def _seed(self, tmp_path): (tmp_path / "summaries").mkdir() From 41cda0f71ae8d20088aadc79cc5e02ce2712734d Mon Sep 17 00:00:00 2001 From: mountain Date: Sat, 30 May 2026 10:15:48 +0800 Subject: [PATCH 09/29] feat(compiler): remove_doc_from_entity_pages + index cleanup --- openkb/agent/compiler.py | 55 ++++++++++++++++++++++++++++++++++++++-- openkb/cli.py | 8 +++++- tests/test_compiler.py | 21 +++++++++++++++ 3 files changed, 81 insertions(+), 3 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index d30064cb..13dbfc29 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -1032,9 +1032,55 @@ def remove_doc_from_concept_pages( return {"modified": modified, "deleted": deleted} -def remove_doc_from_index(wiki_dir: Path, doc_name: str, concept_slugs_deleted: list[str]) -> None: +def remove_doc_from_entity_pages( + wiki_dir: Path, + doc_name: str, + *, + keep_empty: bool = False, +) -> dict[str, list[str]]: + """Update or delete entity pages affected by removing a document. + + Mirrors ``remove_doc_from_concept_pages`` for the entities/ directory: + strips ``summaries/{doc_name}`` from each entity's ``sources:`` and from + its ``## Related Documents`` section; deletes the page when its sources + list empties (unless ``keep_empty``). Returns + ``{"modified": [...], "deleted": [...]}``. 
+ """ + entities_dir = wiki_dir / "entities" + if not entities_dir.is_dir(): + return {"modified": [], "deleted": []} + + source_file = f"summaries/{doc_name}.md" + bare_source = f"summaries/{doc_name}" + link = f"[[{bare_source}]]" + + modified: list[str] = [] + deleted: list[str] = [] + + for path in sorted(entities_dir.glob("*.md")): + text = path.read_text(encoding="utf-8") + if source_file not in text and bare_source not in text: + continue + new_text, sources_empty = _remove_source_from_frontmatter(text, source_file) + if link in new_text: + lines = new_text.split("\n") + while _remove_section_entry(lines, "## Related Documents", link): + pass + new_text = "\n".join(lines) + if sources_empty and not keep_empty: + path.unlink() + deleted.append(path.stem) + elif new_text != text: + path.write_text(new_text, encoding="utf-8") + modified.append(path.stem) + + return {"modified": modified, "deleted": deleted} + + +def remove_doc_from_index(wiki_dir: Path, doc_name: str, concept_slugs_deleted: list[str], + entity_slugs_deleted: list[str] | None = None) -> None: """Remove the document's entry from ``index.md`` along with any concept - entries for concepts that were deleted as a side effect. + and entity entries for pages that were deleted as a side effect. No-op when ``index.md`` doesn't exist. 
Section headings are kept even when their last entry is removed — adding a new doc later repopulates @@ -1055,6 +1101,11 @@ def remove_doc_from_index(wiki_dir: Path, doc_name: str, concept_slugs_deleted: while _remove_section_entry(lines, "## Concepts", concept_link): pass + for slug in (entity_slugs_deleted or []): + entity_link = f"[[entities/{slug}]]" + while _remove_section_entry(lines, "## Entities", entity_link): + pass + index_path.write_text("\n".join(lines), encoding="utf-8") diff --git a/openkb/cli.py b/openkb/cli.py index 68a3e807..7cae8a10 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -800,6 +800,7 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes): """ from openkb.agent.compiler import ( remove_doc_from_concept_pages, + remove_doc_from_entity_pages, remove_doc_from_index, ) from openkb.lint import fix_broken_links @@ -967,7 +968,12 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes): wiki_dir, doc_name, keep_empty=keep_empty_concepts, ) - remove_doc_from_index(wiki_dir, doc_name, concept_result["deleted"]) + entity_result = remove_doc_from_entity_pages( + wiki_dir, doc_name, keep_empty=keep_empty_concepts, + ) + + remove_doc_from_index(wiki_dir, doc_name, concept_result["deleted"], + entity_slugs_deleted=entity_result["deleted"]) # Strip dangling wikilinks now so a retry (after a PageIndex # failure below) finds a clean wiki — no point in re-running this diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 3ed7ad83..3f8ad1fc 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -24,6 +24,7 @@ _backlink_concepts, _backlink_summary_entities, _backlink_entities, + remove_doc_from_entity_pages, ) @@ -1496,3 +1497,23 @@ def test_idempotent(self, tmp_path): _backlink_summary_entities(tmp_path, "doc", ["anthropic"]) text = (tmp_path / "summaries" / "doc.md").read_text(encoding="utf-8") assert text.count("[[entities/anthropic]]") == 1 + + +class TestRemoveEntityPages: + def 
test_strip_source_and_delete_when_empty(self, tmp_path): + ent = tmp_path / "entities" + ent.mkdir() + (ent / "solo.md").write_text( + "---\ntype: organization\nsources: [summaries/doc.md]\n---\n\n" + "# Solo\n\n## Related Documents\n- [[summaries/doc]]\n", + encoding="utf-8") + (ent / "shared.md").write_text( + "---\ntype: organization\nsources: [summaries/doc.md, summaries/other.md]\n---\n\n" + "# Shared\n\n## Related Documents\n- [[summaries/doc]]\n- [[summaries/other]]\n", + encoding="utf-8") + result = remove_doc_from_entity_pages(tmp_path, "doc") + assert result == {"modified": ["shared"], "deleted": ["solo"]} + assert not (ent / "solo.md").exists() + shared = (ent / "shared.md").read_text(encoding="utf-8") + assert "summaries/doc" not in shared + assert "summaries/other" in shared From 04d2bc9054dcd3f14c9eb2f2bac330539d71cfda Mon Sep 17 00:00:00 2001 From: mountain Date: Sat, 30 May 2026 10:22:24 +0800 Subject: [PATCH 10/29] feat(compiler): plan prompt + parser for entities group MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also wires the entity track into _compile_concepts (Tasks 7 + 8 combined, since the {entity_briefs} placeholder and the _CONCEPTS_PLAN_USER.format call are co-dependent — splitting would leave an intermediate red state). 
- add _ENTITY_TYPES, _filter_entity_items, _parse_entities_plan - rewrite _CONCEPTS_PLAN_USER to request nested concepts+entities groups - add _ENTITY_PAGE_USER / _ENTITY_UPDATE_USER prompts - read entity briefs and pass both briefs to the plan prompt - parse nested 'concepts' group with legacy flat-list/flat-dict fallbacks - generate entities in their own asyncio.gather (4-arity tuples) - strip ghost links + _write_entity each; handle entity related cross-links - backlink summary<->entities; pass entity_names/entity_meta to _update_index --- openkb/agent/compiler.py | 286 ++++++++++++++++++++++++++++++++++++--- tests/test_compiler.py | 72 ++++++++++ 2 files changed, 340 insertions(+), 18 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 13dbfc29..4d619322 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -69,28 +69,42 @@ _CONCEPTS_PLAN_USER = """\ -Based on the summary above, decide how to update the wiki's concept pages. +Based on the summary above, decide how to update the wiki's CONCEPT pages and +ENTITY pages. + +A CONCEPT is an abstract, recurring idea/pattern/mechanism (e.g. "agentic +systems"). An ENTITY is a specific named thing — a person, organization, +place, product, named work, or event (e.g. "Anthropic"). Each name goes in +exactly ONE group. A topic may have both (entity "NVIDIA" and concept +"ai-infrastructure-demand"); they cross-link, they do not merge. Existing concept pages: {concept_briefs} -Return a JSON object with three keys: +Existing entity pages (with source counts = how many docs already cite them): +{entity_briefs} -1. "create" — new concepts not covered by any existing page. Array of objects: - {{"name": "concept-slug", "title": "Human-Readable Title"}} +Return a JSON object with two top-level keys, "concepts" and "entities". -2. "update" — existing concepts that have significant new information from \ -this document worth integrating. 
Array of objects: - {{"name": "existing-slug", "title": "Existing Title"}} +"concepts" is an object with: +1. "create" — new concepts. Array of {{"name": "concept-slug", "title": "Title"}} +2. "update" — existing concepts with significant new info. Same shape. +3. "related" — existing concept slugs to cross-link only. Array of strings. -3. "related" — existing concepts tangentially related to this document but \ -not needing content changes, just a cross-reference link. Array of slug strings. +"entities" is an object with the same three keys, but create/update objects +add a "type" field, one of: person, organization, place, product, work, +event, other. Example: + {{"name": "anthropic", "title": "Anthropic", "type": "organization"}} Rules: - For the first few documents, create 2-3 foundational concepts at most. -- Do NOT create a concept that overlaps with an existing one — use "update". -- Do NOT create concepts that are just the document topic itself. -- "related" is for lightweight cross-linking only, no content rewrite needed. +- Create an ENTITY page only when the entity is (a) central to this document + or (b) likely to recur across sources. Do NOT page proper nouns mentioned + only in passing. Roughly 5-15 entities per document is typical; fewer for + sparse documents. +- Prefer "update" over "create" for any concept or entity already listed above. +- Do NOT create a concept/entity that overlaps an existing one — use "update". +- "related" is lightweight cross-linking only, no content rewrite. Return ONLY valid JSON, no fences, no explanation. """ @@ -147,6 +161,41 @@ Return ONLY valid JSON, no fences. """ +_ENTITY_PAGE_USER = """\ +Write the entity page for: {title} (type: {type}) + +This entity relates to the document "{doc_name}" summarized above. 
+ +Return a JSON object with three keys: +- "brief": A single sentence (under 100 chars) identifying this entity +- "type": one of person, organization, place, product, work, event, other +- "content": The full entity page in Markdown — what this entity is, the key + facts about it from this document, and [[wikilinks]] to related concepts, + other [[entities/...]], and [[summaries/{doc_name}]] — subject to the + whitelist rules from the message above. + +Return ONLY valid JSON, no fences. +""" + +_ENTITY_UPDATE_USER = """\ +Update the entity page for: {title} (type: {type}) + +Current content of this page: +{existing_content} + +Integrate the new facts about this entity from document "{doc_name}" +(summarized above). Rewrite the full page — do not just append. Preserve the +existing structure and intent. Follow the whitelist rules from the message +above for all [[wikilinks]]. + +Return a JSON object with three keys: +- "brief": A single sentence (under 100 chars) identifying this entity +- "type": one of person, organization, place, product, work, event, other +- "content": The rewritten full entity page in Markdown + +Return ONLY valid JSON, no fences. +""" + _SUMMARY_REWRITE_USER = """\ Task: Rewrite the summary you wrote above into a final version that is \ consistent with the concept pages now in the wiki (per the whitelist message \ @@ -366,6 +415,53 @@ def _filter_related_slugs(items: list) -> list[str]: return valid +_ENTITY_TYPES = {"person", "organization", "place", "product", "work", "event", "other"} + + +def _filter_entity_items(items: object, label: str) -> list[dict]: + """Validate entity create/update objects: require name+title, coerce type. + + Each kept item is normalized to ``{"name", "title", "type"}`` where + ``type`` falls back to ``"other"`` when missing or outside the entity + enum and ``title`` falls back to ``name``. 
+ """ + out: list[dict] = [] + if not isinstance(items, list): + return out + for it in items: + if not isinstance(it, dict): + continue + name = it.get("name") + if not isinstance(name, str) or not name.strip(): + continue + title = it.get("title") if isinstance(it.get("title"), str) else name + etype = it.get("type") + if not isinstance(etype, str) or etype not in _ENTITY_TYPES: + etype = "other" + out.append({"name": name, "title": title, "type": etype}) + return out + + +def _parse_entities_plan(parsed: object) -> dict: + """Extract the entities group from a plan dict, with graceful fallback. + + Returns ``{"create": [...], "update": [...], "related": [...]}``. A + missing/malformed ``entities`` key yields empty lists, so older or + partial LLM responses never raise. + """ + empty = {"create": [], "update": [], "related": []} + if not isinstance(parsed, dict): + return empty + group = parsed.get("entities") + if not isinstance(group, dict): + return empty + return { + "create": _filter_entity_items(group.get("create", []), "create"), + "update": _filter_entity_items(group.get("update", []), "update"), + "related": _filter_related_slugs(group.get("related", [])), + } + + # --------------------------------------------------------------------------- # File I/O helpers # --------------------------------------------------------------------------- @@ -1218,6 +1314,7 @@ async def _compile_concepts( # --- Step 2: Get concepts plan (A cached) --- concept_briefs = _read_concept_briefs(wiki_dir) + entity_briefs = _read_entity_briefs(wiki_dir) # Second cache breakpoint: end of the assistant summary message. Covers # (system + doc + summary) for the plan call and every concept call. 
@@ -1229,6 +1326,7 @@ async def _compile_concepts( summary_msg, {"role": "user", "content": _CONCEPTS_PLAN_USER.format( concept_briefs=concept_briefs, + entity_briefs=entity_briefs, )}, ], "concepts-plan", max_tokens=2048, response_format=_JSON_RESPONSE_FORMAT) @@ -1272,26 +1370,39 @@ def _write_v1_summary_stripped() -> None: return # Fallback: if LLM returns a flat list, treat all items as "create". + # The new plan contract nests concepts under a "concepts" key alongside + # an "entities" key; the legacy flat shape (create/update/related at top + # level) is still honored by falling back to ``parsed`` itself. if isinstance(parsed, list): plan = {"create": _filter_concept_items(parsed, "list"), "update": [], "related": []} + entities_plan = {"create": [], "update": [], "related": []} else: + concepts_group = ( + parsed.get("concepts") + if isinstance(parsed.get("concepts"), dict) + else parsed + ) plan = { - "create": _filter_concept_items(parsed.get("create", []), "create"), - "update": _filter_concept_items(parsed.get("update", []), "update"), - "related": _filter_related_slugs(parsed.get("related", [])), + "create": _filter_concept_items(concepts_group.get("create", []), "create"), + "update": _filter_concept_items(concepts_group.get("update", []), "update"), + "related": _filter_related_slugs(concepts_group.get("related", [])), } + entities_plan = _parse_entities_plan(parsed) create_items = plan["create"] update_items = plan["update"] related_items = plan["related"] + entity_create = entities_plan["create"] + entity_update = entities_plan["update"] + entity_related = entities_plan["related"] # Distinguish "filters dropped everything" from "LLM emitted an empty plan". 
if isinstance(parsed, list): original_total = len(parsed) else: original_total = sum( - len(parsed.get(k, [])) if isinstance(parsed.get(k), list) else 0 + len(concepts_group.get(k, [])) if isinstance(concepts_group.get(k), list) else 0 for k in ("create", "update", "related") ) post_filter_total = len(create_items) + len(update_items) + len(related_items) @@ -1302,7 +1413,8 @@ def _write_v1_summary_stripped() -> None: ) sys.stdout.flush() - if not create_items and not update_items and not related_items: + if (not create_items and not update_items and not related_items + and not entity_create and not entity_update and not entity_related): if rewrite_summary: _write_v1_summary_stripped() _update_index(wiki_dir, doc_name, [], doc_brief=doc_brief, doc_type=doc_type) @@ -1317,9 +1429,15 @@ def _write_v1_summary_stripped() -> None: } | { _sanitize_concept_name(s) for s in related_items } + entity_planned = { + _sanitize_concept_name(e["name"]) for e in entity_create + entity_update + } | { + _sanitize_concept_name(s) for s in entity_related + } known_targets: set[str] = ( list_existing_wiki_targets(wiki_dir) | {f"concepts/{s}" for s in planned_slugs} + | {f"entities/{s}" for s in entity_planned} | {f"summaries/{doc_name}"} ) known_targets_str = _format_known_targets(known_targets) @@ -1401,6 +1519,65 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool, str]: _require_nonempty_content(content, name) return name, content, True, brief + async def _gen_entity_create(ent: dict) -> tuple[str, str, str, str]: + name = ent["name"] + title = ent.get("title", name) + etype = ent.get("type", "other") + async with semaphore: + raw = await _llm_call_async(model, [ + system_msg, + doc_msg, # cached (BP1) + summary_msg, # cached (BP2) + known_targets_msg, # cached (BP3) — whitelist + {"role": "user", "content": _ENTITY_PAGE_USER.format( + title=title, type=etype, doc_name=doc_name, + )}, + ], f"entity: {name}", response_format=_JSON_RESPONSE_FORMAT) + try: + parsed = 
_parse_json(raw) + brief = parsed.get("brief", "") + etype_out = parsed.get("type") if parsed.get("type") in _ENTITY_TYPES else etype + content = parsed.get("content") or raw + except (json.JSONDecodeError, ValueError): + brief, etype_out, content = "", etype, raw + _require_nonempty_content(content, name) + return name, content, brief, etype_out + + async def _gen_entity_update(ent: dict) -> tuple[str, str, str, str]: + name = ent["name"] + title = ent.get("title", name) + etype = ent.get("type", "other") + epath = wiki_dir / "entities" / f"{_sanitize_concept_name(name)}.md" + if epath.exists(): + raw_text = epath.read_text(encoding="utf-8") + if raw_text.startswith("---"): + parts = raw_text.split("---", 2) + existing_content = parts[2].strip() if len(parts) >= 3 else raw_text + else: + existing_content = raw_text + else: + existing_content = "(page not found — create from scratch)" + async with semaphore: + raw = await _llm_call_async(model, [ + system_msg, + doc_msg, # cached (BP1) + summary_msg, # cached (BP2) + known_targets_msg, # cached (BP3) — whitelist + {"role": "user", "content": _ENTITY_UPDATE_USER.format( + title=title, type=etype, doc_name=doc_name, + existing_content=existing_content, + )}, + ], f"entity-update: {name}", response_format=_JSON_RESPONSE_FORMAT) + try: + parsed = _parse_json(raw) + brief = parsed.get("brief", "") + etype_out = parsed.get("type") if parsed.get("type") in _ENTITY_TYPES else etype + content = parsed.get("content") or raw + except (json.JSONDecodeError, ValueError): + brief, etype_out, content = "", etype, raw + _require_nonempty_content(content, name) + return name, content, brief, etype_out + tasks = [] tasks.extend(_gen_create(c) for c in create_items) tasks.extend(_gen_update(c) for c in update_items) @@ -1408,6 +1585,9 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool, str]: concept_names: list[str] = [] concept_briefs_map: dict[str, str] = {} pending_writes: list[tuple[str, str, bool, str]] = [] + 
entity_names: list[str] = [] + entity_meta: dict[str, tuple[str, str]] = {} + entity_pending: list[tuple[str, str, str, str]] = [] if tasks: total = len(tasks) @@ -1443,6 +1623,59 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool, str]: ) sys.stdout.flush() + # --- Step 3 (entities): generate entity pages in their OWN gather --- + # Entity coroutines return 4-arity tuples (name, content, brief, type), + # so they are gathered separately from the concept tuples rather than + # mixed into one list with differing arities. + entity_tasks = [] + entity_tasks.extend(_gen_entity_create(e) for e in entity_create) + entity_tasks.extend(_gen_entity_update(e) for e in entity_update) + + if entity_tasks: + etotal = len(entity_tasks) + sys.stdout.write( + f" Generating {etotal} entity(ies) (concurrency={max_concurrency})...\n" + ) + sys.stdout.flush() + + entity_results = await asyncio.gather(*entity_tasks, return_exceptions=True) + + entity_failure_types: list[str] = [] + for r in entity_results: + if isinstance(r, Exception): + logger.warning("Entity generation failed: %s", r) + entity_failure_types.append(type(r).__name__) + continue + name, page_content, brief, etype = r + entity_pending.append((name, page_content, brief, etype)) + + ewritten = len(entity_pending) + if ewritten < etotal: + reason = ( + ", ".join(sorted(set(entity_failure_types))) + if entity_failure_types else "see log (stderr)" + ) + sys.stdout.write( + f" [WARN] {etotal} entity(ies) planned but only {ewritten} written " + f"for {doc_name} ({reason}).\n" + ) + sys.stdout.flush() + + # Strip ghost wikilinks from entity bodies and write each page. 
+ for name, page_content, brief, etype in entity_pending: + cleaned, ghosts = strip_ghost_wikilinks(page_content, known_targets) + if ghosts: + logger.info( + "stripped %d ghost wikilink(s) from entity %s: %s", + len(ghosts), name, ghosts[:5], + ) + safe = _sanitize_concept_name(name) + is_update = (wiki_dir / "entities" / f"{safe}.md").exists() + _write_entity(wiki_dir, name, cleaned, source_file, is_update, + brief=brief, type_=etype) + entity_names.append(safe) + entity_meta[safe] = (etype, brief) + # Strip unresolved wikilinks from concept bodies before writing. The # whitelist includes existing files + this round's planned slugs + # the summary for this document. @@ -1535,10 +1768,27 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool, str]: _backlink_summary(wiki_dir, doc_name, all_concept_slugs) _backlink_concepts(wiki_dir, doc_name, all_concept_slugs) + # --- Step 3d: Process entity related items + backlinks (code only) --- + for slug in [_sanitize_concept_name(s) for s in entity_related]: + epath = wiki_dir / "entities" / f"{slug}.md" + if epath.exists(): + etext = epath.read_text(encoding="utf-8") + if f"[[summaries/{doc_name}]]" not in etext: + if source_file not in etext: + etext = _prepend_source_to_frontmatter(etext, source_file) + etext += f"\n\nSee also: [[summaries/{doc_name}]]" + epath.write_text(etext, encoding="utf-8") + entity_names.append(slug) + + if entity_names: + _backlink_summary_entities(wiki_dir, doc_name, entity_names) + _backlink_entities(wiki_dir, doc_name, entity_names) + # --- Step 4: Update index (code only) --- _update_index(wiki_dir, doc_name, concept_names, doc_brief=doc_brief, concept_briefs=concept_briefs_map, - doc_type=doc_type) + doc_type=doc_type, entity_names=entity_names, + entity_meta=entity_meta) async def compile_short_doc( diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 3f8ad1fc..2c830eaf 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -24,6 +24,7 @@ 
_backlink_concepts, _backlink_summary_entities, _backlink_entities, + _parse_entities_plan, remove_doc_from_entity_pages, ) @@ -66,6 +67,31 @@ def test_fenced_dict(self): assert parsed["create"] == [] +class TestParseEntitiesPlan: + def test_extracts_entities_group(self): + parsed = { + "concepts": {"create": [{"name": "x", "title": "X"}], "update": [], "related": []}, + "entities": { + "create": [{"name": "anthropic", "title": "Anthropic", "type": "organization"}], + "update": [], + "related": ["nvidia"], + }, + } + ents = _parse_entities_plan(parsed) + assert ents["create"] == [{"name": "anthropic", "title": "Anthropic", "type": "organization"}] + assert ents["related"] == ["nvidia"] + + def test_missing_entities_key_is_empty(self): + ents = _parse_entities_plan({"create": [], "update": [], "related": []}) + assert ents == {"create": [], "update": [], "related": []} + + def test_bad_type_falls_back_to_other(self): + parsed = {"entities": {"create": [{"name": "x", "title": "X", "type": "alien"}], + "update": [], "related": []}} + ents = _parse_entities_plan(parsed) + assert ents["create"][0]["type"] == "other" + + class TestParseBriefContent: def test_dict_with_brief_and_content(self): text = json.dumps({"brief": "A short desc", "content": "# Full page\n\nDetails."}) @@ -1517,3 +1543,49 @@ def test_strip_source_and_delete_when_empty(self, tmp_path): shared = (ent / "shared.md").read_text(encoding="utf-8") assert "summaries/doc" not in shared assert "summaries/other" in shared + + +class TestCompileEntitiesEndToEnd: + @pytest.mark.asyncio + async def test_entity_and_concept_split(self, tmp_path, monkeypatch): + wiki = tmp_path / "wiki" + (wiki / "summaries").mkdir(parents=True) + (wiki / "summaries" / "doc.md").write_text( + "---\nsources: []\n---\n\n# Doc\n", encoding="utf-8") + + # Mocked LLM: plan call returns one concept + one entity; each + # generation call returns a tiny page. 
+ def fake_llm(model, messages, label, **kw): + if label == "concepts-plan": + return json.dumps({ + "concepts": {"create": [{"name": "ai-demand", "title": "AI Demand"}], + "update": [], "related": []}, + "entities": {"create": [{"name": "nvidia", "title": "NVIDIA", + "type": "organization"}], + "update": [], "related": []}, + }) + return json.dumps({"brief": "b", "type": "organization", "content": "# Page\n"}) + + async def fake_llm_async(model, messages, label, **kw): + return fake_llm(model, messages, label, **kw) + + monkeypatch.setattr("openkb.agent.compiler._llm_call", fake_llm) + monkeypatch.setattr("openkb.agent.compiler._llm_call_async", fake_llm_async) + + from openkb.agent.compiler import _compile_concepts + sys_msg = {"role": "system", "content": "x"} + doc_msg = {"role": "user", "content": "x"} + await _compile_concepts(wiki, tmp_path, "m", sys_msg, doc_msg, + "summary text", "doc", max_concurrency=2, + doc_type="short", rewrite_summary=False) + + assert (wiki / "concepts" / "ai-demand.md").exists() + assert (wiki / "entities" / "nvidia.md").exists() + ent = (wiki / "entities" / "nvidia.md").read_text(encoding="utf-8") + # Frontmatter values are JSON-quoted by _yaml_kv_line (see _write_entity, + # Task 2), matching the tolerant assertion style in TestWriteEntity. + assert "type:" in ent and "organization" in ent + index = (wiki / "index.md").read_text(encoding="utf-8") + assert "[[entities/nvidia]]" in index + summary = (wiki / "summaries" / "doc.md").read_text(encoding="utf-8") + assert "[[entities/nvidia]]" in summary # backlink From ad45439e441a65e852c0e83b581801d6c1437094 Mon Sep 17 00:00:00 2001 From: mountain Date: Sat, 30 May 2026 10:28:22 +0800 Subject: [PATCH 11/29] fix(compiler): related entities must not downgrade index labels Mirror the concept track: collect related-entity slugs into a separate local list used only for backlinks; pass only created/updated entity_names (+entity_meta) to _update_index. 
Defense-in-depth in _update_index: only _replace_section_entry when name is in entity_meta, otherwise only insert if the link is absent, so a related-only entity can never clobber a pre-existing correct (type + brief) index line with "(other)". Adds regression test test_related_entity_does_not_downgrade_index_label. --- openkb/agent/compiler.py | 29 ++++++++++++++--------- tests/test_compiler.py | 51 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 11 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 4d619322..3d984670 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -1262,14 +1262,19 @@ def _update_index( _ensure_h2_section(lines, "## Entities") for name in entity_names: link = f"[[entities/{name}]]" - etype, brief = entity_meta.get(name, ("other", "")) - entry = f"- {link} ({etype})" - if brief: - entry += f" — {brief}" - if _section_contains_link(lines, "## Entities", link): - _replace_section_entry(lines, "## Entities", link, entry) + if name in entity_meta: + etype, brief = entity_meta[name] + entry = f"- {link} ({etype})" + if brief: + entry += f" — {brief}" + if _section_contains_link(lines, "## Entities", link): + _replace_section_entry(lines, "## Entities", link, entry) + else: + _insert_section_entry(lines, "## Entities", entry) else: - _insert_section_entry(lines, "## Entities", entry) + if not _section_contains_link(lines, "## Entities", link): + entry = f"- {link} (other)" + _insert_section_entry(lines, "## Entities", entry) index_path.write_text("\n".join(lines), encoding="utf-8") @@ -1769,6 +1774,7 @@ async def _gen_entity_update(ent: dict) -> tuple[str, str, str, str]: _backlink_concepts(wiki_dir, doc_name, all_concept_slugs) # --- Step 3d: Process entity related items + backlinks (code only) --- + entity_related_slugs = [] for slug in [_sanitize_concept_name(s) for s in entity_related]: epath = wiki_dir / "entities" / f"{slug}.md" if epath.exists(): @@ -1778,11 
+1784,12 @@ async def _gen_entity_update(ent: dict) -> tuple[str, str, str, str]: etext = _prepend_source_to_frontmatter(etext, source_file) etext += f"\n\nSee also: [[summaries/{doc_name}]]" epath.write_text(etext, encoding="utf-8") - entity_names.append(slug) + entity_related_slugs.append(slug) - if entity_names: - _backlink_summary_entities(wiki_dir, doc_name, entity_names) - _backlink_entities(wiki_dir, doc_name, entity_names) + entity_backlink_slugs = entity_names + entity_related_slugs + if entity_backlink_slugs: + _backlink_summary_entities(wiki_dir, doc_name, entity_backlink_slugs) + _backlink_entities(wiki_dir, doc_name, entity_backlink_slugs) # --- Step 4: Update index (code only) --- _update_index(wiki_dir, doc_name, concept_names, diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 2c830eaf..5c410198 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -1589,3 +1589,54 @@ async def fake_llm_async(model, messages, label, **kw): assert "[[entities/nvidia]]" in index summary = (wiki / "summaries" / "doc.md").read_text(encoding="utf-8") assert "[[entities/nvidia]]" in summary # backlink + + @pytest.mark.asyncio + async def test_related_entity_does_not_downgrade_index_label(self, tmp_path, monkeypatch): + """Related-only entities must not overwrite a correct index entry with (other).""" + wiki = tmp_path / "wiki" + (wiki / "summaries").mkdir(parents=True) + (wiki / "entities").mkdir(parents=True) + + # Pre-seed summaries/doc.md + (wiki / "summaries" / "doc.md").write_text( + "---\nsources: []\n---\n\n# Doc\n", encoding="utf-8") + + # Pre-seed index.md with a correct entry for anthropic + (wiki / "index.md").write_text( + "## Documents\n\n## Concepts\n\n## Entities\n\n" + "- [[entities/anthropic]] (organization) — AI safety lab\n", + encoding="utf-8", + ) + + # Pre-seed entities/anthropic.md with type frontmatter and a source + (wiki / "entities" / "anthropic.md").write_text( + "---\ntype: organization\nsources: []\n---\n\n# 
Anthropic\n", + encoding="utf-8", + ) + + # LLM plan: anthropic is ONLY under entities.related, not create/update + def fake_llm(model, messages, label, **kw): + if label == "concepts-plan": + return json.dumps({ + "concepts": {"create": [], "update": [], "related": []}, + "entities": {"create": [], "update": [], "related": ["anthropic"]}, + }) + return json.dumps({"brief": "b", "type": "organization", "content": "# Page\n"}) + + async def fake_llm_async(model, messages, label, **kw): + return fake_llm(model, messages, label, **kw) + + monkeypatch.setattr("openkb.agent.compiler._llm_call", fake_llm) + monkeypatch.setattr("openkb.agent.compiler._llm_call_async", fake_llm_async) + + from openkb.agent.compiler import _compile_concepts + sys_msg = {"role": "system", "content": "x"} + doc_msg = {"role": "user", "content": "x"} + await _compile_concepts(wiki, tmp_path, "m", sys_msg, doc_msg, + "summary text", "doc", max_concurrency=2, + doc_type="short", rewrite_summary=False) + + index = (wiki / "index.md").read_text(encoding="utf-8") + # The pre-existing correct line must NOT have been downgraded to (other) + assert "(organization)" in index, "index entry was downgraded from (organization) to (other)" + assert "AI safety lab" in index, "index brief was stripped from the entry" From 5008a14b866e527f6e0d7229543a83d640664fdd Mon Sep 17 00:00:00 2001 From: mountain Date: Sat, 30 May 2026 10:29:50 +0800 Subject: [PATCH 12/29] feat(schema): declare entities/ page type and taxonomy --- openkb/schema.py | 2 ++ tests/test_compiler.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/openkb/schema.py b/openkb/schema.py index b2c8cf07..9f12b88d 100644 --- a/openkb/schema.py +++ b/openkb/schema.py @@ -10,6 +10,7 @@ - sources/images/ — Extracted images from documents, referenced by sources. - summaries/ — One per source document. Summary of key content. - concepts/ — Cross-document topic synthesis. Created when a theme spans multiple documents. 
+- entities/ — Specific named things: people, organizations, places, products, named works, events. One page per entity, accumulated across documents. - explorations/ — Saved query results, analyses, and comparisons worth keeping. - reports/ — Lint health check reports. Auto-generated. @@ -20,6 +21,7 @@ ## Page Types - **Summary Page** (summaries/): Key content of a single source document. - **Concept Page** (concepts/): Cross-document topic synthesis with [[wikilinks]]. +- **Entity Page** (entities/): A specific named thing (proper noun). Frontmatter `type:` is one of: person, organization, place, product, work, event, other. An entity differs from a concept: a concept is an abstract recurring idea; an entity is a specific named thing. Create an entity page only when the entity is central to a document or recurs across sources — do not page passing mentions. - **Exploration Page** (explorations/): Saved query results — analyses, comparisons, syntheses. - **Index Page** (index.md): One-liner summary of every page in the wiki. Auto-maintained. 
diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 5c410198..b84bacee 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -1640,3 +1640,17 @@ async def fake_llm_async(model, messages, label, **kw): # The pre-existing correct line must NOT have been downgraded to (other) assert "(organization)" in index, "index entry was downgraded from (organization) to (other)" assert "AI safety lab" in index, "index brief was stripped from the entry" + + +# --------------------------------------------------------------------------- +# Task 9: schema declares entities +# --------------------------------------------------------------------------- + +from openkb.schema import AGENTS_MD + + +def test_schema_declares_entities(): + assert "entities/" in AGENTS_MD + assert "Entity Page" in AGENTS_MD + for t in ("person", "organization", "place", "product", "work", "event", "other"): + assert t in AGENTS_MD From 1e8221447fdfc02db995f74578410b78bba192e1 Mon Sep 17 00:00:00 2001 From: mountain Date: Sat, 30 May 2026 10:32:57 +0800 Subject: [PATCH 13/29] feat(query): point who/what questions at entities/ --- openkb/agent/query.py | 8 +++++--- tests/test_query.py | 8 ++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/openkb/agent/query.py b/openkb/agent/query.py index 790a186c..b545e9af 100644 --- a/openkb/agent/query.py +++ b/openkb/agent/query.py @@ -28,15 +28,17 @@ Summaries may omit details — if you need more, follow the summary's `full_text` frontmatter field to the source (see step 4). 3. Read concept pages (concepts/) for cross-document synthesis. -4. When you need detailed source document content, each summary page has a +4. For "who/what is X" questions about a specific named person, organization, + place, or product, read the matching page in entities/ first. +5. 
When you need detailed source document content, each summary page has a `full_text` frontmatter field with the path to the original document content: - Short documents (doc_type: short): read_file with that path. - PageIndex documents (doc_type: pageindex): use get_page_content(doc_name, pages) with tight page ranges. The summary shows document tree structure with page ranges to help you target. Never fetch the whole document. -5. Source content may reference images (e.g. ![image](sources/images/doc/file.png)). +6. Source content may reference images (e.g. ![image](sources/images/doc/file.png)). Use the get_image tool to view them when needed. -6. Synthesize a clear, concise, well-cited answer grounded in wiki content. +7. Synthesize a clear, concise, well-cited answer grounded in wiki content. Answer based only on wiki content. Be concise. Before each tool call, output one short sentence explaining the reason. diff --git a/tests/test_query.py b/tests/test_query.py index 4fe421c8..e9585d32 100644 --- a/tests/test_query.py +++ b/tests/test_query.py @@ -73,6 +73,14 @@ async def fake_run(agent, message, **kwargs): assert "How does attention work?" in captured["message"] +def test_query_strategy_mentions_entities(): + """Task 10: query agent must direct who/what questions to entities/.""" + from openkb.agent import query as query_mod + + text = query_mod._QUERY_INSTRUCTIONS_TEMPLATE + assert "entities/" in text + + class TestFmtFallback: """Regression tests for issue #34. 
From 324284493b39b573cdb22ae7c833709d161f3659 Mon Sep 17 00:00:00 2001 From: mountain Date: Sat, 30 May 2026 10:35:40 +0800 Subject: [PATCH 14/29] docs(readme): document entities/ page type --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index cc19188f..8e915456 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,7 @@ wiki/ │ ← the foundation ├── sources/ Full-text conversions ├── summaries/ Per-document summaries ├── concepts/ Cross-document synthesis ← the good stuff + ├── entities/ Specific named things (people, orgs, places, products) ├── explorations/ Saved query results └── reports/ Lint reports │ @@ -136,9 +137,10 @@ Short docs are read in full by the LLM. Long PDFs are indexed by PageIndex into When you add a document, the LLM: 1. Generates a **summary** page -2. Reads existing **concept** pages +2. Reads existing **concept** and **entity** pages 3. Creates or updates concepts with cross-document synthesis -4. Updates the **index** and **log** +4. Creates or updates **entity** pages (people, orgs, places, products) +5. Updates the **index** and **log** A single source might touch 10-15 wiki pages. Knowledge accumulates: each document enriches the existing wiki rather than sitting in isolation. 
From ff3fafb3a022e6aac0e1cda20b2886ad9b14c7cd Mon Sep 17 00:00:00 2001 From: mountain Date: Sat, 30 May 2026 10:42:24 +0800 Subject: [PATCH 15/29] feat(cli): scaffold entities/ in init and count it in status - `openkb init` now creates wiki/entities/ alongside wiki/concepts/ - init seed index.md gains ## Entities between ## Concepts and ## Explorations, matching the _update_index template in compiler.py - print_status subdirs list gains "entities" after "concepts" - Tests updated: assert wiki/entities/ exists and index.md contains ## Entities; status test asserts "entities" appears in output --- openkb/cli.py | 5 +++-- tests/test_cli.py | 3 ++- tests/test_list_status.py | 2 ++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/openkb/cli.py b/openkb/cli.py index 7cae8a10..867a8f95 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -538,11 +538,12 @@ def init(model, language): Path("wiki/sources/images").mkdir(parents=True, exist_ok=True) Path("wiki/summaries").mkdir(parents=True, exist_ok=True) Path("wiki/concepts").mkdir(parents=True, exist_ok=True) + Path("wiki/entities").mkdir(parents=True, exist_ok=True) # Write wiki files Path("wiki/AGENTS.md").write_text(AGENTS_MD, encoding="utf-8") Path("wiki/index.md").write_text( - "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Entities\n\n## Explorations\n", encoding="utf-8", ) Path("wiki/log.md").write_text("# Operations Log\n\n", encoding="utf-8") @@ -1307,7 +1308,7 @@ def list_cmd(ctx): def print_status(kb_dir: Path) -> None: """Print knowledge base status. Usable from CLI and chat REPL.""" wiki_dir = kb_dir / "wiki" - subdirs = ["sources", "summaries", "concepts", "reports"] + subdirs = ["sources", "summaries", "concepts", "entities", "reports"] # Print the active KB path as the first line. Agents and scripts # parse this to locate the wiki without assuming cwd == KB root. 
diff --git a/tests/test_cli.py b/tests/test_cli.py index ab3378b1..65463566 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -24,6 +24,7 @@ def test_init_creates_structure(tmp_path): assert (cwd / "wiki" / "sources" / "images").is_dir() assert (cwd / "wiki" / "summaries").is_dir() assert (cwd / "wiki" / "concepts").is_dir() + assert (cwd / "wiki" / "entities").is_dir() assert (cwd / ".openkb").is_dir() # Files @@ -39,7 +40,7 @@ def test_init_creates_structure(tmp_path): # index.md header index_content = (cwd / "wiki" / "index.md").read_text() - assert index_content == "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n" + assert index_content == "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Entities\n\n## Explorations\n" def test_init_schema_content(tmp_path): diff --git a/tests/test_list_status.py b/tests/test_list_status.py index babffb06..9d2249fc 100644 --- a/tests/test_list_status.py +++ b/tests/test_list_status.py @@ -17,6 +17,7 @@ def _setup_kb(tmp_path: Path) -> Path: (kb_dir / "wiki" / "sources" / "images").mkdir(parents=True) (kb_dir / "wiki" / "summaries").mkdir(parents=True) (kb_dir / "wiki" / "concepts").mkdir(parents=True) + (kb_dir / "wiki" / "entities").mkdir(parents=True) (kb_dir / "wiki" / "reports").mkdir(parents=True) openkb_dir = kb_dir / ".openkb" openkb_dir.mkdir() @@ -111,6 +112,7 @@ def test_status_shows_directory_counts(self, tmp_path): assert "sources" in result.output assert "summaries" in result.output assert "concepts" in result.output + assert "entities" in result.output assert "reports" in result.output def test_status_shows_total_indexed(self, tmp_path): From a7a06ed51440ae58a33e15a44b5665c20cd670d8 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 30 May 2026 05:36:06 +0000 Subject: [PATCH 16/29] fix(compiler): resolve entity-page review findings (dangling links + dedup) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses code-review 
findings on the entity-pages feature: - Fix dangling wikilink after `openkb remove`: entity removal now strips standalone `See also: [[summaries/{doc}]]` lines (the related-entity backlink form), matching the concept path, and cli.py adds modified entity pages to the lint sweep scope so surviving pages are cleaned. - Unify the parallel concept/entity helpers into shared cores (_backlink_summary_pages, _backlink_pages, _remove_doc_from_pages) with thin per-type wrappers, so cleanup logic can no longer drift between the two page types (this is what caused the dangling-link bug). - Route related-entity cross-refs through _add_related_link (now page-type aware) instead of an inline reimplementation — removes a duplicate file read/write and keeps backlink creation symmetric with teardown. - Centralize the entity-type enum: prompts derive their type list from a single _ENTITY_TYPE_LIST source via import-time substitution. - Count entity items in the "all dropped as malformed" plan warning. - Drop the unreachable else branch in _update_index's entity loop. - Add regression test for the See-also strip on a surviving entity page. All 542 tests pass. --- openkb/agent/compiler.py | 288 +++++++++++++++++++-------------------- openkb/cli.py | 8 +- tests/test_compiler.py | 18 +++ 3 files changed, 165 insertions(+), 149 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 3d984670..4776e05a 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -68,6 +68,16 @@ """ +# Canonical entity-type enum — the single source of truth shared by the +# plan prompt, the entity-page prompts, and create/update validation. The +# prompt templates carry an ``__ENTITY_TYPES__`` token that is substituted +# with this list once at import time (see below), so adding a type here +# updates every place at once. 
+_ENTITY_TYPE_LIST = ("person", "organization", "place", "product", "work", "event", "other") +_ENTITY_TYPES = frozenset(_ENTITY_TYPE_LIST) +_ENTITY_TYPES_STR = ", ".join(_ENTITY_TYPE_LIST) + + _CONCEPTS_PLAN_USER = """\ Based on the summary above, decide how to update the wiki's CONCEPT pages and ENTITY pages. @@ -92,8 +102,7 @@ 3. "related" — existing concept slugs to cross-link only. Array of strings. "entities" is an object with the same three keys, but create/update objects -add a "type" field, one of: person, organization, place, product, work, -event, other. Example: +add a "type" field, one of: __ENTITY_TYPES__. Example: {{"name": "anthropic", "title": "Anthropic", "type": "organization"}} Rules: @@ -168,7 +177,7 @@ Return a JSON object with three keys: - "brief": A single sentence (under 100 chars) identifying this entity -- "type": one of person, organization, place, product, work, event, other +- "type": one of __ENTITY_TYPES__ - "content": The full entity page in Markdown — what this entity is, the key facts about it from this document, and [[wikilinks]] to related concepts, other [[entities/...]], and [[summaries/{doc_name}]] — subject to the @@ -190,12 +199,18 @@ Return a JSON object with three keys: - "brief": A single sentence (under 100 chars) identifying this entity -- "type": one of person, organization, place, product, work, event, other +- "type": one of __ENTITY_TYPES__ - "content": The rewritten full entity page in Markdown Return ONLY valid JSON, no fences. """ +# Substitute the canonical entity-type list into every prompt that advertises +# it, so the prompt text can never drift from ``_ENTITY_TYPES`` validation. 
+_CONCEPTS_PLAN_USER = _CONCEPTS_PLAN_USER.replace("__ENTITY_TYPES__", _ENTITY_TYPES_STR) +_ENTITY_PAGE_USER = _ENTITY_PAGE_USER.replace("__ENTITY_TYPES__", _ENTITY_TYPES_STR) +_ENTITY_UPDATE_USER = _ENTITY_UPDATE_USER.replace("__ENTITY_TYPES__", _ENTITY_TYPES_STR) + _SUMMARY_REWRITE_USER = """\ Task: Rewrite the summary you wrote above into a final version that is \ consistent with the concept pages now in the wiki (per the whitelist message \ @@ -415,9 +430,6 @@ def _filter_related_slugs(items: list) -> list[str]: return valid -_ENTITY_TYPES = {"person", "organization", "place", "product", "work", "event", "other"} - - def _filter_entity_items(items: object, label: str) -> list[dict]: """Validate entity create/update objects: require name+title, coerce type. @@ -934,65 +946,70 @@ def _remove_source_from_frontmatter(text: str, source_file: str) -> tuple[str, b return text, False -def _add_related_link(wiki_dir: Path, concept_slug: str, doc_name: str, source_file: str) -> None: - """Add a cross-reference link to an existing concept page (no LLM call).""" - concepts_dir = wiki_dir / "concepts" - path = concepts_dir / f"{concept_slug}.md" +def _add_related_link( + wiki_dir: Path, slug: str, doc_name: str, source_file: str, + page_dir: str = "concepts", +) -> bool: + """Add a cross-reference link to an existing page (no LLM call). + + Works for any page directory (``concepts`` or ``entities``). Returns True + when the page exists (whether or not a link was added), so callers can + track which related slugs are real pages. The standalone ``See also:`` + paragraph it writes is symmetric with ``remove_doc_from_pages``' cleanup. 
+ """ + path = wiki_dir / page_dir / f"{slug}.md" if not path.exists(): - return + return False text = path.read_text(encoding="utf-8") link = f"[[summaries/{doc_name}]]" if link in text: - return + return True if source_file not in text: text = _prepend_source_to_frontmatter(text, source_file) text += f"\n\nSee also: {link}" path.write_text(text, encoding="utf-8") + return True -def _backlink_summary(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -> None: - """Append missing concept wikilinks to the summary page (no LLM call). - - After all concepts are generated, this ensures the summary page links - back to every related concept — closing the bidirectional link that - concept pages already have toward the summary. +def _backlink_summary_pages( + wiki_dir: Path, doc_name: str, slugs: list[str], + *, page_dir: str, section: str, +) -> None: + """Append missing ``[[{page_dir}/slug]]`` wikilinks to the summary page. - If a ``## Related Concepts`` section already exists, new links are - appended into it rather than creating a duplicate section. + Closes the bidirectional link the pages already hold toward the summary, + inserting them under ``section`` (created if absent). Shared by the + concept and entity summary-backlink wrappers below. 
""" summary_path = wiki_dir / "summaries" / f"{doc_name}.md" if not summary_path.exists(): return text = summary_path.read_text(encoding="utf-8") - missing = [slug for slug in concept_slugs if f"[[concepts/{slug}]]" not in text] + missing = [slug for slug in slugs if f"[[{page_dir}/{slug}]]" not in text] if not missing: return lines = text.split("\n") - _ensure_h2_section(lines, "## Related Concepts") + _ensure_h2_section(lines, section) for slug in reversed(missing): - _insert_section_entry(lines, "## Related Concepts", f"- [[concepts/{slug}]]") + _insert_section_entry(lines, section, f"- [[{page_dir}/{slug}]]") summary_path.write_text("\n".join(lines), encoding="utf-8") -def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -> None: - """Append missing summary wikilink to each concept page (no LLM call). - - Ensures every concept page links back to the source document's summary, - regardless of whether the LLM included the link in its output. - - If a ``## Related Documents`` section already exists, the link is - appended into it rather than creating a duplicate section. - """ +def _backlink_pages( + wiki_dir: Path, doc_name: str, slugs: list[str], *, page_dir: str, +) -> None: + """Append the source summary wikilink to each page under '## Related + Documents'. 
Shared by the concept and entity page-backlink wrappers.""" link = f"[[summaries/{doc_name}]]" - concepts_dir = wiki_dir / "concepts" + pages_dir = wiki_dir / page_dir - for slug in concept_slugs: - path = concepts_dir / f"{slug}.md" + for slug in slugs: + path = pages_dir / f"{slug}.md" if not path.exists(): continue text = path.read_text(encoding="utf-8") @@ -1004,49 +1021,42 @@ def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) path.write_text("\n".join(lines), encoding="utf-8") +def _backlink_summary(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -> None: + """Link the summary page back to every related concept (no LLM call).""" + _backlink_summary_pages( + wiki_dir, doc_name, concept_slugs, + page_dir="concepts", section="## Related Concepts", + ) + + +def _backlink_concepts(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -> None: + """Link every related concept page back to the source summary (no LLM call).""" + _backlink_pages(wiki_dir, doc_name, concept_slugs, page_dir="concepts") + + def _backlink_summary_entities(wiki_dir: Path, doc_name: str, entity_slugs: list[str]) -> None: - """Append missing entity wikilinks to the summary page under '## Entities'.""" - summary_path = wiki_dir / "summaries" / f"{doc_name}.md" - if not summary_path.exists(): - return - text = summary_path.read_text(encoding="utf-8") - missing = [s for s in entity_slugs if f"[[entities/{s}]]" not in text] - if not missing: - return - lines = text.split("\n") - _ensure_h2_section(lines, "## Entities") - for slug in reversed(missing): - _insert_section_entry(lines, "## Entities", f"- [[entities/{slug}]]") - summary_path.write_text("\n".join(lines), encoding="utf-8") + """Link the summary page back to every related entity under '## Entities'.""" + _backlink_summary_pages( + wiki_dir, doc_name, entity_slugs, + page_dir="entities", section="## Entities", + ) def _backlink_entities(wiki_dir: Path, doc_name: str, entity_slugs: list[str]) -> 
None: - """Append the source summary wikilink to each entity page under - '## Related Documents' (mirrors _backlink_concepts).""" - link = f"[[summaries/{doc_name}]]" - entities_dir = wiki_dir / "entities" - for slug in entity_slugs: - path = entities_dir / f"{slug}.md" - if not path.exists(): - continue - text = path.read_text(encoding="utf-8") - if link in text: - continue - lines = text.split("\n") - _ensure_h2_section(lines, "## Related Documents") - _insert_section_entry(lines, "## Related Documents", f"- {link}") - path.write_text("\n".join(lines), encoding="utf-8") + """Link every related entity page back to the source summary (no LLM call).""" + _backlink_pages(wiki_dir, doc_name, entity_slugs, page_dir="entities") -def remove_doc_from_concept_pages( +def _remove_doc_from_pages( wiki_dir: Path, doc_name: str, *, + page_dir: str, keep_empty: bool = False, ) -> dict[str, list[str]]: - """Update or delete concept pages affected by removing a document. + """Update or delete pages in ``page_dir`` affected by removing a document. - For each ``concepts/*.md`` whose frontmatter ``sources:`` lists + For each ``{page_dir}/*.md`` whose frontmatter ``sources:`` lists ``summaries/{doc_name}``: - Remove that source from the frontmatter list. @@ -1055,24 +1065,16 @@ def remove_doc_from_concept_pages( - Remove any standalone ``See also: [[summaries/{doc_name}]]`` lines (left by ``_add_related_link``). - If the ``sources:`` list becomes empty AND ``keep_empty`` is False, - delete the concept page entirely. - - Args: - wiki_dir: Path to the wiki root directory. - doc_name: The summary slug being removed (e.g. - ``"attention-is-all-you-need"``). - keep_empty: When True, retains concept pages whose only source - was the removed doc — leaves their frontmatter with an empty - ``sources: []`` list. Useful when the doc is being replaced - by a newer version that will repopulate the source on the - next ``openkb add``. 
- - Returns: - ``{"modified": [slugs...], "deleted": [slugs...]}`` — concept - slugs whose pages were edited vs. deleted. + delete the page entirely. + + Shared by the concept and entity removal wrappers so the cleanup (in + particular the standalone ``See also:`` strip) can never drift between + the two page types. + + Returns ``{"modified": [slugs...], "deleted": [slugs...]}``. """ - concepts_dir = wiki_dir / "concepts" - if not concepts_dir.is_dir(): + pages_dir = wiki_dir / page_dir + if not pages_dir.is_dir(): return {"modified": [], "deleted": []} source_file = f"summaries/{doc_name}.md" @@ -1082,7 +1084,7 @@ def remove_doc_from_concept_pages( modified: list[str] = [] deleted: list[str] = [] - for path in sorted(concepts_dir.glob("*.md")): + for path in sorted(pages_dir.glob("*.md")): text = path.read_text(encoding="utf-8") # Cheap filter: skip pages that don't reference the doc at all. if source_file not in text and bare_source not in text: @@ -1128,49 +1130,38 @@ def remove_doc_from_concept_pages( return {"modified": modified, "deleted": deleted} -def remove_doc_from_entity_pages( +def remove_doc_from_concept_pages( wiki_dir: Path, doc_name: str, *, keep_empty: bool = False, ) -> dict[str, list[str]]: - """Update or delete entity pages affected by removing a document. + """Update or delete concept pages affected by removing a document. - Mirrors ``remove_doc_from_concept_pages`` for the entities/ directory: - strips ``summaries/{doc_name}`` from each entity's ``sources:`` and from - its ``## Related Documents`` section; deletes the page when its sources - list empties (unless ``keep_empty``). Returns - ``{"modified": [...], "deleted": [...]}``. + ``keep_empty`` retains concept pages whose only source was the removed + doc (leaving ``sources: []``) — useful when the doc is being replaced by + a newer version that will repopulate the source on the next ``openkb + add``. Returns ``{"modified": [slugs...], "deleted": [slugs...]}``. 
""" - entities_dir = wiki_dir / "entities" - if not entities_dir.is_dir(): - return {"modified": [], "deleted": []} - - source_file = f"summaries/{doc_name}.md" - bare_source = f"summaries/{doc_name}" - link = f"[[{bare_source}]]" + return _remove_doc_from_pages( + wiki_dir, doc_name, page_dir="concepts", keep_empty=keep_empty, + ) - modified: list[str] = [] - deleted: list[str] = [] - for path in sorted(entities_dir.glob("*.md")): - text = path.read_text(encoding="utf-8") - if source_file not in text and bare_source not in text: - continue - new_text, sources_empty = _remove_source_from_frontmatter(text, source_file) - if link in new_text: - lines = new_text.split("\n") - while _remove_section_entry(lines, "## Related Documents", link): - pass - new_text = "\n".join(lines) - if sources_empty and not keep_empty: - path.unlink() - deleted.append(path.stem) - elif new_text != text: - path.write_text(new_text, encoding="utf-8") - modified.append(path.stem) +def remove_doc_from_entity_pages( + wiki_dir: Path, + doc_name: str, + *, + keep_empty: bool = False, +) -> dict[str, list[str]]: + """Update or delete entity pages affected by removing a document. - return {"modified": modified, "deleted": deleted} + Mirrors ``remove_doc_from_concept_pages`` for the entities/ directory. + Returns ``{"modified": [...], "deleted": [...]}``. 
+ """ + return _remove_doc_from_pages( + wiki_dir, doc_name, page_dir="entities", keep_empty=keep_empty, + ) def remove_doc_from_index(wiki_dir: Path, doc_name: str, concept_slugs_deleted: list[str], @@ -1262,19 +1253,16 @@ def _update_index( _ensure_h2_section(lines, "## Entities") for name in entity_names: link = f"[[entities/{name}]]" - if name in entity_meta: - etype, brief = entity_meta[name] - entry = f"- {link} ({etype})" - if brief: - entry += f" — {brief}" - if _section_contains_link(lines, "## Entities", link): - _replace_section_entry(lines, "## Entities", link, entry) - else: - _insert_section_entry(lines, "## Entities", entry) + # Callers always populate entity_meta alongside entity_names; the + # default is a defensive fallback, never hit in practice. + etype, brief = entity_meta.get(name, ("other", "")) + entry = f"- {link} ({etype})" + if brief: + entry += f" — {brief}" + if _section_contains_link(lines, "## Entities", link): + _replace_section_entry(lines, "## Entities", link, entry) else: - if not _section_contains_link(lines, "## Entities", link): - entry = f"- {link} (other)" - _insert_section_entry(lines, "## Entities", entry) + _insert_section_entry(lines, "## Entities", entry) index_path.write_text("\n".join(lines), encoding="utf-8") @@ -1403,17 +1391,27 @@ def _write_v1_summary_stripped() -> None: entity_related = entities_plan["related"] # Distinguish "filters dropped everything" from "LLM emitted an empty plan". + # Count entity items too, so a plan that emitted only entities — all of + # which were dropped as malformed — still surfaces the warning. 
+ def _raw_group_count(group: object) -> int: + if not isinstance(group, dict): + return 0 + return sum( + len(group.get(k, [])) if isinstance(group.get(k), list) else 0 + for k in ("create", "update", "related") + ) + if isinstance(parsed, list): original_total = len(parsed) else: - original_total = sum( - len(concepts_group.get(k, [])) if isinstance(concepts_group.get(k), list) else 0 - for k in ("create", "update", "related") - ) - post_filter_total = len(create_items) + len(update_items) + len(related_items) + original_total = _raw_group_count(concepts_group) + _raw_group_count(parsed.get("entities")) + post_filter_total = ( + len(create_items) + len(update_items) + len(related_items) + + len(entity_create) + len(entity_update) + len(entity_related) + ) if original_total > 0 and post_filter_total == 0: sys.stdout.write( - f" [WARN] concepts plan for {doc_name} had {original_total} " + f" [WARN] plan for {doc_name} had {original_total} " f"item(s), all dropped as malformed — see log (stderr).\n" ) sys.stdout.flush() @@ -1774,17 +1772,13 @@ async def _gen_entity_update(ent: dict) -> tuple[str, str, str, str]: _backlink_concepts(wiki_dir, doc_name, all_concept_slugs) # --- Step 3d: Process entity related items + backlinks (code only) --- - entity_related_slugs = [] - for slug in [_sanitize_concept_name(s) for s in entity_related]: - epath = wiki_dir / "entities" / f"{slug}.md" - if epath.exists(): - etext = epath.read_text(encoding="utf-8") - if f"[[summaries/{doc_name}]]" not in etext: - if source_file not in etext: - etext = _prepend_source_to_frontmatter(etext, source_file) - etext += f"\n\nSee also: [[summaries/{doc_name}]]" - epath.write_text(etext, encoding="utf-8") - entity_related_slugs.append(slug) + # Reuse _add_related_link (page_dir="entities") so related-entity + # cross-refs are written in the same "See also:" form the concept path + # uses — and torn down symmetrically by _remove_doc_from_pages. 
+ entity_related_slugs = [ + slug for slug in (_sanitize_concept_name(s) for s in entity_related) + if _add_related_link(wiki_dir, slug, doc_name, source_file, page_dir="entities") + ] entity_backlink_slugs = entity_names + entity_related_slugs if entity_backlink_slugs: diff --git a/openkb/cli.py b/openkb/cli.py index 867a8f95..0d5f1dff 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -981,8 +981,8 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes): # on every attempt. # # Scope: only the pages this remove actually touched (modified - # concept pages ∪ index.md). Previously this swept the whole wiki - # via ``fix_broken_links(wiki_dir)``, which silently stripped + # concept + entity pages ∪ index.md). Previously this swept the whole + # wiki via ``fix_broken_links(wiki_dir)``, which silently stripped # pre-existing dangling links in unrelated pages — see issue #58 # (Bug 2). Users who want a wiki-wide sweep can still run # ``openkb lint --fix`` explicitly. @@ -990,6 +990,10 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes): wiki_dir / "concepts" / f"{slug}.md" for slug in concept_result["modified"] ] + lint_scope += [ + wiki_dir / "entities" / f"{slug}.md" + for slug in entity_result["modified"] + ] index_md = wiki_dir / "index.md" if index_md.exists(): lint_scope.append(index_md) diff --git a/tests/test_compiler.py b/tests/test_compiler.py index b84bacee..fa24ab6f 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -1544,6 +1544,24 @@ def test_strip_source_and_delete_when_empty(self, tmp_path): assert "summaries/doc" not in shared assert "summaries/other" in shared + def test_strips_standalone_see_also_line(self, tmp_path): + # A related entity (linked via _add_related_link) carries a + # standalone "See also:" paragraph, not a "## Related Documents" + # section. Removing the doc must strip it so no dangling wikilink + # survives on an entity that has other sources. 
+ ent = tmp_path / "entities" + ent.mkdir() + (ent / "shared.md").write_text( + "---\ntype: organization\nsources: [summaries/doc.md, summaries/other.md]\n---\n\n" + "# Shared\n\nSee also: [[summaries/doc]]", + encoding="utf-8") + result = remove_doc_from_entity_pages(tmp_path, "doc") + assert result == {"modified": ["shared"], "deleted": []} + shared = (ent / "shared.md").read_text(encoding="utf-8") + assert "summaries/doc" not in shared + assert "See also" not in shared + assert "summaries/other" in shared + class TestCompileEntitiesEndToEnd: @pytest.mark.asyncio From b882ee9209b56089a287266922365a1c9c70be60 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 08:58:39 +0800 Subject: [PATCH 17/29] fix(compiler): add [[entities/X]] whitelist rule + restore concept-topic guard Remaining review findings after a7a06ed: - _KNOWN_TARGETS_USER now states the [[entities/Z]] rule, so entity links the LLM is told to write aren't silently stripped as ghosts. - Restore the dropped 'Do NOT create concepts that are just the document topic itself' plan rule to prevent redundant title-mirror concepts. --- openkb/agent/compiler.py | 4 +++- tests/test_compiler.py | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 4776e05a..4b6a6b04 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -113,6 +113,7 @@ sparse documents. - Prefer "update" over "create" for any concept or entity already listed above. - Do NOT create a concept/entity that overlaps an existing one — use "update". +- Do NOT create concepts that are just the document topic itself. - "related" is lightweight cross-linking only, no content rewrite. Return ONLY valid JSON, no fences, no explanation. @@ -127,8 +128,9 @@ Rules for [[wikilinks]] in all subsequent responses: - For [[concepts/X]]: X must appear in the whitelist above. - For [[summaries/Y]]: Y must appear in the whitelist above. 
+- For [[entities/Z]]: Z must appear in the whitelist above. - Do NOT invent new wikilink targets. If you want to mention a concept \ -that is not in the whitelist, write it as plain text without brackets. +or entity that is not in the whitelist, write it as plain text without brackets. """ _CONCEPT_PAGE_USER = """\ diff --git a/tests/test_compiler.py b/tests/test_compiler.py index fa24ab6f..73fbe009 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -1672,3 +1672,21 @@ def test_schema_declares_entities(): assert "Entity Page" in AGENTS_MD for t in ("person", "organization", "place", "product", "work", "event", "other"): assert t in AGENTS_MD + + +def test_known_targets_prompt_has_entities_rule(): + """The whitelist message must tell the LLM the [[entities/X]] rule, since + entity-page prompts instruct writing such links; otherwise entity links + are generated freely and then stripped as ghosts.""" + from openkb.agent.compiler import _KNOWN_TARGETS_USER + + assert "[[entities/" in _KNOWN_TARGETS_USER + + +def test_plan_prompt_keeps_topic_itself_guard(): + """The concept-plan prompt must retain the guard against creating a concept + that merely mirrors the document's own topic.""" + from openkb.agent.compiler import _CONCEPTS_PLAN_USER + + assert "just the document topic itself" in _CONCEPTS_PLAN_USER + From d1dc6375174f873c23e53c863b86e1ccf0928eae Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 09:27:47 +0800 Subject: [PATCH 18/29] feat(entities): shared page-dir constants + surface entities in list/lint/status/skill-gate/linter Add PAGE_CONTENT_DIRS and INDEX_SEED to openkb/schema.py as the single source of truth; replace duplicated index-seed literals in cli init and compiler._update_index with INDEX_SEED. 
- openkb list / chat /list: add an Entities section (#2) - lint.check_index_sync: iterate PAGE_CONTENT_DIRS so entities/ pages missing from index.md are flagged (#4) - skill-new gate: count entities/ as compiled content (#5) - status last-compile: derive from summaries/concepts/entities mtimes (#12) - semantic linter: read entities/, check contradictions/redundancy/ coverage/orphans (#3) --- openkb/agent/compiler.py | 8 ++----- openkb/agent/linter.py | 12 ++++++---- openkb/cli.py | 42 +++++++++++++++++++++------------- openkb/lint.py | 8 ++++--- openkb/schema.py | 11 ++++++++- tests/test_lint.py | 16 +++++++++++++ tests/test_list_status.py | 28 +++++++++++++++++++++++ tests/test_skill_chat_slash.py | 13 +++++++++++ 8 files changed, 108 insertions(+), 30 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 4b6a6b04..f90e94af 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -30,7 +30,7 @@ import yaml from openkb.lint import list_existing_wiki_targets, strip_ghost_wikilinks -from openkb.schema import get_agents_md +from openkb.schema import INDEX_SEED, get_agents_md logger = logging.getLogger(__name__) @@ -1219,11 +1219,7 @@ def _update_index( index_path = wiki_dir / "index.md" if not index_path.exists(): - index_path.write_text( - "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n" - "## Entities\n\n## Explorations\n", - encoding="utf-8", - ) + index_path.write_text(INDEX_SEED, encoding="utf-8") lines = index_path.read_text(encoding="utf-8").split("\n") diff --git a/openkb/agent/linter.py b/openkb/agent/linter.py index 969a2cfa..7e790299 100644 --- a/openkb/agent/linter.py +++ b/openkb/agent/linter.py @@ -24,12 +24,16 @@ 4. **Redundancy** — Are there multiple pages that cover the same content and could be merged? 5. **Concept coverage** — Are important themes in the summaries missing concept pages? +6. 
**Entity coverage** — Are important named things (people, organizations, places, + products, works, events) in the summaries missing entity pages, or are existing + entity pages contradictory, redundant, or orphaned (unlinked from any source)? ## Process 1. Start with index.md to understand scope. 2. Read summary pages to understand document content. 3. Read concept pages to check for contradictions and gaps. -4. Produce a structured Markdown report listing issues found with references +4. Read entity pages to check for contradictions, redundancy, coverage, and orphans. +5. Produce a structured Markdown report listing issues found with references to the specific pages where each issue occurs. Be thorough but concise. If the wiki is small or sparse, say so. @@ -99,9 +103,9 @@ async def run_knowledge_lint(kb_dir: Path, model: str) -> str: prompt = ( "Please audit this knowledge base wiki for semantic quality issues: " - "contradictions, gaps, staleness, redundancy, and missing concept pages. " - "Start with index.md, then read summaries and concepts as needed. " - "Produce a structured Markdown report." + "contradictions, gaps, staleness, redundancy, and missing concept and " + "entity pages. Start with index.md, then read summaries, concepts, and " + "entities as needed. Produce a structured Markdown report." 
) result = await Runner.run(agent, prompt, max_turns=MAX_TURNS) diff --git a/openkb/cli.py b/openkb/cli.py index 0d5f1dff..fe578e7d 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -43,7 +43,7 @@ def filter(self, record: logging.LogRecord) -> bool: from openkb.config import DEFAULT_CONFIG, load_config, save_config, load_global_config, register_kb from openkb.converter import convert_document from openkb.log import append_log -from openkb.schema import AGENTS_MD +from openkb.schema import AGENTS_MD, INDEX_SEED, PAGE_CONTENT_DIRS # Suppress warnings after all imports — markitdown overrides filters at import time import warnings @@ -217,7 +217,7 @@ def _preflight_skill_new(kb_dir: Path, name: str) -> str | None: Checks (in order): * skill name is a valid kebab-case slug * ``/wiki`` exists - * ``/wiki/concepts`` or ``/wiki/summaries`` has at least + * any of ``/wiki/{summaries,concepts,entities}`` has at least one file (i.e. some document has been ingested + compiled) Returns ``None`` if all gates pass, else a single-line error message @@ -239,7 +239,7 @@ def _preflight_skill_new(kb_dir: Path, name: str) -> str | None: has_content = any( (wiki / sub).is_dir() and any((wiki / sub).iterdir()) - for sub in ("concepts", "summaries") + for sub in PAGE_CONTENT_DIRS ) if not has_content: return ( @@ -542,10 +542,7 @@ def init(model, language): # Write wiki files Path("wiki/AGENTS.md").write_text(AGENTS_MD, encoding="utf-8") - Path("wiki/index.md").write_text( - "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Entities\n\n## Explorations\n", - encoding="utf-8", - ) + Path("wiki/index.md").write_text(INDEX_SEED, encoding="utf-8") Path("wiki/log.md").write_text("# Operations Log\n\n", encoding="utf-8") # Create .openkb/ state directory @@ -1288,6 +1285,15 @@ def print_list(kb_dir: Path) -> None: for c in concepts: click.echo(f" - {c}") + # Display entities + entities_dir = kb_dir / "wiki" / "entities" + if entities_dir.exists(): + entities = sorted(p.stem for p in 
entities_dir.glob("*.md")) + if entities: + click.echo(f"\nEntities ({len(entities)}):") + for e in entities: + click.echo(f" - {e}") + # Display reports reports_dir = kb_dir / "wiki" / "reports" if reports_dir.exists(): @@ -1343,15 +1349,19 @@ def print_status(kb_dir: Path) -> None: hashes = json.loads(hashes_file.read_text(encoding="utf-8")) click.echo(f"\n Total indexed: {len(hashes)} document(s)") - # Last compile time: newest file in wiki/summaries/ - summaries_dir = wiki_dir / "summaries" - if summaries_dir.exists(): - summaries = list(summaries_dir.glob("*.md")) - if summaries: - newest_summary = max(summaries, key=lambda p: p.stat().st_mtime) - import datetime - mtime = datetime.datetime.fromtimestamp(newest_summary.stat().st_mtime) - click.echo(f" Last compile: {mtime.strftime('%Y-%m-%d %H:%M:%S')}") + # Last compile time: newest compiled page across summaries/, concepts/, + # and entities/ (an entity-only compile must still bump the shown time). + compiled_pages = [ + p + for sub in PAGE_CONTENT_DIRS + for p in (wiki_dir / sub).glob("*.md") + if (wiki_dir / sub).exists() + ] + if compiled_pages: + newest_page = max(compiled_pages, key=lambda p: p.stat().st_mtime) + import datetime + mtime = datetime.datetime.fromtimestamp(newest_page.stat().st_mtime) + click.echo(f" Last compile: {mtime.strftime('%Y-%m-%d %H:%M:%S')}") # Last lint time: newest file in wiki/reports/ reports_dir = wiki_dir / "reports" diff --git a/openkb/lint.py b/openkb/lint.py index 2f345659..2ac6af1d 100644 --- a/openkb/lint.py +++ b/openkb/lint.py @@ -15,6 +15,8 @@ import yaml +from openkb.schema import PAGE_CONTENT_DIRS + # Matches [[wikilink]] or [[subdir/link]] _WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]") @@ -368,7 +370,7 @@ def check_index_sync(wiki: Path) -> list[str]: Returns issues for: - Links in index.md pointing to non-existent pages - - Pages in summaries/ or concepts/ not mentioned in index.md + - Pages in summaries/, concepts/, or entities/ not mentioned in index.md Args: 
wiki: Path to the wiki root directory. @@ -392,11 +394,11 @@ def check_index_sync(wiki: Path) -> list[str]: if lnk_norm not in pages: issues.append(f"index.md links to missing page: [[{lnk}]]") - # Check that summaries and concepts pages are mentioned in index + # Check that summaries, concepts, and entities pages are mentioned in index index_stems = {Path(lnk.strip()).stem for lnk in index_links} index_text_lower = index_text.lower() - for subdir in ("summaries", "concepts"): + for subdir in PAGE_CONTENT_DIRS: subdir_path = wiki / subdir if not subdir_path.exists(): continue diff --git a/openkb/schema.py b/openkb/schema.py index 9f12b88d..605fa8eb 100644 --- a/openkb/schema.py +++ b/openkb/schema.py @@ -2,6 +2,14 @@ from pathlib import Path +# The compiled page-type subdirectories under wiki/. Shared source of truth +# for surfaces that enumerate page content (list, lint, status, skill gate). +PAGE_CONTENT_DIRS = ("summaries", "concepts", "entities") + +# Canonical empty index.md seed. Used by `openkb init` and the compiler's +# lazy-create path so they never drift. +INDEX_SEED = "# Knowledge Base Index\n\n## Documents\n\n## Concepts\n\n## Entities\n\n## Explorations\n" + AGENTS_MD = """\ # Wiki Schema @@ -26,9 +34,10 @@ - **Index Page** (index.md): One-liner summary of every page in the wiki. Auto-maintained. 
## Index Page Format -index.md lists all documents, concepts, and explorations with metadata: +index.md lists all documents, concepts, entities, and explorations with metadata: - Documents: name, one-liner description, type (short|pageindex), detail access path - Concepts: name, one-liner description +- Entities: name, type, one-liner description - Explorations: name, one-liner description ## Log Format diff --git a/tests/test_lint.py b/tests/test_lint.py index 8600e63d..99dca51d 100644 --- a/tests/test_lint.py +++ b/tests/test_lint.py @@ -185,6 +185,22 @@ def test_page_not_in_index(self, tmp_path): assert any("unlisted" in issue for issue in result) + def test_entity_page_not_in_index(self, tmp_path): + wiki = _make_wiki(tmp_path) + (wiki / "entities").mkdir() + (wiki / "entities" / "ada-lovelace.md").write_text("# Ada Lovelace") + # index.md has no mention of the entity + (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Entities\n" + ) + + result = check_index_sync(wiki) + + assert any( + "entities/ada-lovelace.md not mentioned in index.md" in issue + for issue in result + ) + def test_missing_index_md(self, tmp_path): wiki = tmp_path / "wiki" wiki.mkdir() diff --git a/tests/test_list_status.py b/tests/test_list_status.py index 9d2249fc..76365b08 100644 --- a/tests/test_list_status.py +++ b/tests/test_list_status.py @@ -88,6 +88,34 @@ def test_list_no_concepts_section_when_empty(self, tmp_path): # No concepts in output since none exist assert "Concepts:" not in result.output + def test_list_shows_entities(self, tmp_path): + kb_dir = _setup_kb(tmp_path) + hashes = {"abc": {"name": "paper.pdf", "type": "pdf"}} + (kb_dir / ".openkb" / "hashes.json").write_text(json.dumps(hashes)) + (kb_dir / "wiki" / "entities" / "ada-lovelace.md").write_text("# Ada") + (kb_dir / "wiki" / "entities" / "openai.md").write_text("# OpenAI") + + runner = CliRunner() + with patch("openkb.cli._find_kb_dir", return_value=kb_dir): + result = runner.invoke(cli, 
["list"]) + + assert "Entities (2):" in result.output + assert "ada-lovelace" in result.output + assert "openai" in result.output + + def test_list_no_entities_section_when_empty(self, tmp_path): + kb_dir = _setup_kb(tmp_path) + hashes = {"abc": {"name": "paper.pdf", "type": "pdf"}} + (kb_dir / ".openkb" / "hashes.json").write_text(json.dumps(hashes)) + + runner = CliRunner() + with patch("openkb.cli._find_kb_dir", return_value=kb_dir): + result = runner.invoke(cli, ["list"]) + + assert result.exit_code == 0 + assert "Entities:" not in result.output + assert "Entities (" not in result.output + class TestStatusCommand: def test_status_no_kb(self, tmp_path): diff --git a/tests/test_skill_chat_slash.py b/tests/test_skill_chat_slash.py index d99dd152..faf13725 100644 --- a/tests/test_skill_chat_slash.py +++ b/tests/test_skill_chat_slash.py @@ -87,6 +87,19 @@ async def test_slash_skill_new_rejects_empty_wiki(tmp_path): assert not (kb / "output").exists() +def test_preflight_gate_counts_entities(tmp_path): + """The wiki-content gate must accept a KB whose only compiled content + lives in entities/ (no concept or summary pages yet).""" + from openkb.cli import _preflight_skill_new + + kb = tmp_path + (kb / "wiki" / "entities").mkdir(parents=True) + (kb / "wiki" / "entities" / "ada.md").write_text("# Ada\n") + + # No error means the gate passed. + assert _preflight_skill_new(kb, "demo") is None + + @pytest.mark.asyncio async def test_slash_skill_new_rejects_when_target_exists(tmp_path): """Chat / slash command must not silently overwrite an existing skill.""" From bd81f7e7b0777085472a38ad1b2908dc099af6fa Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 09:27:51 +0800 Subject: [PATCH 19/29] feat(entities): remove preview lists entity-page actions (#1) The dry-run/confirmation block now scans wiki/entities/ with the same frontmatter sources: logic as concepts, emits DELETE/MODIFY action lines per entity page, and prints an 'N entity(s) will be DELETED' summary. 
Execution path (remove_doc_from_entity_pages) unchanged. --- openkb/cli.py | 42 ++++++++++++++++++++++++++++++++++++++++++ tests/test_remove.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) diff --git a/openkb/cli.py b/openkb/cli.py index fe578e7d..6bf2b79e 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -894,6 +894,42 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes): for slug in concept_edits: actions.append(("MODIFY", f"wiki/concepts/{slug}.md (drop this doc from sources)")) + # Scan entity pages with the same frontmatter logic as concepts. The + # executor calls ``remove_doc_from_entity_pages``; this only makes the + # preview/summary truthful about what it will delete vs. edit. + affected_entities: list[tuple[str, int]] = [] # (slug, remaining_sources) + entities_dir = wiki_dir / "entities" + if entities_dir.is_dir(): + for path in sorted(entities_dir.glob("*.md")): + text = path.read_text(encoding="utf-8") + if not text.startswith("---"): + continue + fm_end = text.find("---", 3) + if fm_end == -1: + continue + sources_count = 0 + source_in_frontmatter = False + for line in text[:fm_end].split("\n"): + if line.lstrip().startswith("sources:"): + lb = line.find("[") + rb = line.rfind("]") + if lb != -1 and rb != -1 and rb > lb: + items = [s.strip() for s in line[lb + 1:rb].split(",") if s.strip()] + sources_count = len(items) + source_in_frontmatter = source_file_marker in items + break + if not source_in_frontmatter: + continue + remaining = max(sources_count - 1, 0) + affected_entities.append((path.stem, remaining)) + + entity_deletes = [s for s, r in affected_entities if r == 0 and not keep_empty_concepts] + entity_edits = [s for s, r in affected_entities if r > 0 or keep_empty_concepts] + for slug in entity_deletes: + actions.append(("DELETE", f"wiki/entities/{slug}.md (only source: this doc)")) + for slug in entity_edits: + actions.append(("MODIFY", f"wiki/entities/{slug}.md (drop this doc from sources)")) 
+ if (wiki_dir / "index.md").exists(): actions.append(("MODIFY", "wiki/index.md (remove Documents entry)")) @@ -935,6 +971,12 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes): f" {len(concept_deletes)} concept(s) will be DELETED because this is their only source." ) click.echo(" Pass --keep-empty-concepts to retain them instead.") + if entity_deletes: + click.echo("") + click.echo( + f" {len(entity_deletes)} entity(s) will be DELETED because this is their only source." + ) + click.echo(" Pass --keep-empty-concepts to retain them instead.") click.echo("") if dry_run: diff --git a/tests/test_remove.py b/tests/test_remove.py index 8518d639..a915ee56 100644 --- a/tests/test_remove.py +++ b/tests/test_remove.py @@ -390,6 +390,34 @@ def test_cli_remove_dry_run_does_nothing(kb_dir): assert "h_a" in hashes +def test_cli_remove_preview_lists_entity_actions(kb_dir): + """The dry-run preview must enumerate entity-page DELETE/MODIFY actions + and report an 'N entity(s) will be DELETED' summary line.""" + _seed_two_doc_kb(kb_dir) + (kb_dir / "wiki" / "entities").mkdir(parents=True) + # Single-source entity (only attention) -> will be DELETED + (kb_dir / "wiki" / "entities" / "vaswani.md").write_text( + "---\nsources: [summaries/attention-h_a.md]\ntype: person\nbrief: V\n---\n" + "# Vaswani\n\n## Related Documents\n- [[summaries/attention-h_a]]\n", + encoding="utf-8", + ) + # Multi-source entity (both) -> will be MODIFIED + (kb_dir / "wiki" / "entities" / "google.md").write_text( + "---\nsources: [summaries/attention-h_a.md, summaries/llm-h_l.md]\n" + "type: organization\nbrief: G\n---\n# Google\n", + encoding="utf-8", + ) + + result = _invoke(kb_dir, ["remove", "attention.pdf", "--dry-run"]) + + assert result.exit_code == 0, result.output + assert "DELETE wiki/entities/vaswani.md" in result.output + assert "MODIFY wiki/entities/google.md" in result.output + assert "1 entity(s) will be DELETED" in result.output + # Nothing actually removed in dry-run. 
+ assert (kb_dir / "wiki" / "entities" / "vaswani.md").exists() + + def test_cli_remove_yes_executes_full_plan(kb_dir): _seed_two_doc_kb(kb_dir) result = _invoke(kb_dir, ["remove", "attention.pdf", "--yes"]) From 3d7c842247a5ad9eca8f1b7b328ce3ce4e49b15c Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 09:27:57 +0800 Subject: [PATCH 20/29] docs(entities): document entity pages in shipped openkb skill (#8) Note wiki/entities/ holds named-thing pages (people/orgs/places/ products/works/events) with a type: frontmatter field, that index.md has a ## Entities section, and that 'who/what is X' questions should read the matching entities/ page first. --- skills/openkb/SKILL.md | 13 ++++++++++--- skills/openkb/references/wiki-schema.md | 23 ++++++++++++++++++++++- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/skills/openkb/SKILL.md b/skills/openkb/SKILL.md index b515774f..f5c4b6f6 100644 --- a/skills/openkb/SKILL.md +++ b/skills/openkb/SKILL.md @@ -14,12 +14,17 @@ description: | The user has compiled their documents into a Markdown wiki at `wiki/`. -The wiki holds three kinds of pages: +The wiki holds these kinds of pages: - **Concept pages** at `wiki/concepts/*.md` — cross-document synthesis on specific topics. This is where OpenKB's value compounds: a concept with multiple sources represents knowledge merged across documents the user has ingested. +- **Entity pages** at `wiki/entities/*.md` — one per specific named + thing (people, organizations, places, products, named works, + events), accumulated across documents. Each has a `type:` + frontmatter field. For "who is X" / "what is X" questions about a + named thing, read the matching `entities/` page first. - **Summary pages** at `wiki/summaries/*.md` — one per ingested document, linking to the concepts that document touches. 
- **Source files** at `wiki/sources/*.{md,json}` — full text for short @@ -76,8 +81,9 @@ After capturing the KB path from `openkb status`, drill in via: - `openkb list` — table of ingested documents (name, type, page count) plus the concept list. -- Read `/wiki/index.md` — the compiled table of contents. Every - document and concept has a one-line `brief`. Scan this and pick the +- Read `/wiki/index.md` — the compiled table of contents. It has + `## Documents`, `## Concepts`, `## Entities`, and `## Explorations` + sections; every entry has a one-line `brief`. Scan this and pick the slugs that semantically match the user's question. ## Read content @@ -90,6 +96,7 @@ calls these `Read` / `Grep` / `Bash`; Gemini CLI uses `read_file` / | Goal | Action | |---|---| | Read a concept page | read the file at `/wiki/concepts/.md` | +| Answer "who/what is X" about a named thing | read `/wiki/entities/.md` | | Read a document's summary | read `/wiki/summaries/.md` | | Read a short doc's full text | read `/wiki/sources/.md` | | Read a long doc's specific page | shell: `jq '.[N-1]' /wiki/sources/.json` (N = 1-indexed PDF page; `.[0]` is page 1) | diff --git a/skills/openkb/references/wiki-schema.md b/skills/openkb/references/wiki-schema.md index 6b1a4e7f..ca95026f 100644 --- a/skills/openkb/references/wiki-schema.md +++ b/skills/openkb/references/wiki-schema.md @@ -14,6 +14,7 @@ long-PDF JSON shape, wikilink resolution rules. ├── log.md Chronological ingest/edit log ├── summaries/.md One per ingested document ├── concepts/.md Cross-document synthesis pages + ├── entities/.md Named-thing pages (people/orgs/places/...) ├── sources/ Converted source content │ ├── .md Short-doc full text │ ├── .json Long-doc paginated content @@ -28,7 +29,7 @@ registry, PageIndex DB). 
**Do not read these directly** — use ## `wiki/index.md` -Three top-level sections, each entry has a one-line brief: +Four top-level sections, each entry has a one-line brief: ```markdown ## Documents @@ -38,6 +39,9 @@ Three top-level sections, each entry has a one-line brief: ## Concepts - [[concepts/attention]] — brief from frontmatter +## Entities +- [[entities/ada-lovelace]] (person) — brief from frontmatter + ## Explorations - [[explorations/some-saved-query]] — saved query answer ``` @@ -76,6 +80,23 @@ Body: free-form sections + `## Related Documents` listing contributing summaries. **Multi-source = cross-document synthesis** — this is the high-value output of OpenKB's compile pipeline. +## `wiki/entities/.md` + +Frontmatter: + +```yaml +--- +sources: [summaries/paper.md, summaries/notes.md] +brief: One-line description. +type: person # person | organization | place | product | work | event | other +--- +``` + +Body: free-form sections about the named thing + a `## Related +Documents` section. One page per entity, accumulated as more +documents mention it. For "who/what is X" questions about a named +thing, read the matching entity page first. + ## `wiki/sources/.md` (short docs) The markitdown-converted full text. Image refs appear as From 022aad4741071f6a0600b3f3b0f1cf86cab7962b Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 09:34:38 +0800 Subject: [PATCH 21/29] fix(compiler): don't write raw JSON body on empty LLM content In the parse-succeeded branch of _gen_create/_gen_update/_gen_entity_create/ _gen_entity_update, fall back to "" instead of the raw JSON string when the content field is empty/null. _require_nonempty_content then raises and the page is dropped, rather than writing the JSON envelope as the markdown body. The parse-FAILED (except) branch keeps content=raw as the legitimate non-JSON fallback. 
--- openkb/agent/compiler.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index f90e94af..038983b0 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -1479,10 +1479,13 @@ async def _gen_create(concept: dict) -> tuple[str, str, bool, str]: try: parsed = _parse_json(raw) brief = parsed.get("brief", "") - # ``or raw``: ``.get("content", raw)`` returns None for - # ``{"content": null}`` (legal under json_object mode). - content = parsed.get("content") or raw + # Parse succeeded: do NOT fall back to ``raw`` (the JSON string). + # An empty/None ``content`` field yields "" so + # ``_require_nonempty_content`` raises and the page is skipped, + # rather than writing the raw JSON as the markdown body. + content = parsed.get("content") or "" except (json.JSONDecodeError, ValueError): + # Parse FAILED: ``raw`` is the legitimate non-JSON body fallback. brief, content = "", raw _require_nonempty_content(content, name) return name, content, False, brief @@ -1514,8 +1517,10 @@ async def _gen_update(concept: dict) -> tuple[str, str, bool, str]: try: parsed = _parse_json(raw) brief = parsed.get("brief", "") - content = parsed.get("content") or raw + # Parse succeeded: do NOT fall back to ``raw`` (the JSON string). + content = parsed.get("content") or "" except (json.JSONDecodeError, ValueError): + # Parse FAILED: ``raw`` is the legitimate non-JSON body fallback. brief, content = "", raw _require_nonempty_content(content, name) return name, content, True, brief @@ -1538,8 +1543,10 @@ async def _gen_entity_create(ent: dict) -> tuple[str, str, str, str]: parsed = _parse_json(raw) brief = parsed.get("brief", "") etype_out = parsed.get("type") if parsed.get("type") in _ENTITY_TYPES else etype - content = parsed.get("content") or raw + # Parse succeeded: do NOT fall back to ``raw`` (the JSON string). 
+ content = parsed.get("content") or "" except (json.JSONDecodeError, ValueError): + # Parse FAILED: ``raw`` is the legitimate non-JSON body fallback. brief, etype_out, content = "", etype, raw _require_nonempty_content(content, name) return name, content, brief, etype_out @@ -1573,8 +1580,10 @@ async def _gen_entity_update(ent: dict) -> tuple[str, str, str, str]: parsed = _parse_json(raw) brief = parsed.get("brief", "") etype_out = parsed.get("type") if parsed.get("type") in _ENTITY_TYPES else etype - content = parsed.get("content") or raw + # Parse succeeded: do NOT fall back to ``raw`` (the JSON string). + content = parsed.get("content") or "" except (json.JSONDecodeError, ValueError): + # Parse FAILED: ``raw`` is the legitimate non-JSON body fallback. brief, etype_out, content = "", etype, raw _require_nonempty_content(content, name) return name, content, brief, etype_out From 1e2d5e04221f711b5bd79b37ece8c2a721eee9c6 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 09:34:43 +0800 Subject: [PATCH 22/29] fix(compiler): graceful scalar plan + rebuild malformed entity frontmatter - _compile_concepts: guard a non-dict/non-list parsed plan (JSON scalar) before calling .get(), taking the empty-plan path (write v1 summary if applicable + update index + return) instead of risking AttributeError. - _write_entity: when an existing page has an opening --- but no closing delimiter (or no frontmatter), rebuild valid sources/type/brief frontmatter rather than writing a body-only page that drops the metadata. 
--- openkb/agent/compiler.py | 56 +++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 038983b0..be0e167c 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -838,32 +838,37 @@ def _write_entity( if end != -1: clean = clean[end + 3:].lstrip("\n") + def _build_frontmatter(sources: list[str]) -> str: + fm_lines = [_yaml_list_line("sources", sources)] + fm_lines.append(_yaml_kv_line("type", type_ or "other")) + if brief: + fm_lines.append(_yaml_kv_line("brief", brief)) + if aliases: + fm_lines.append(_yaml_list_line("aliases", aliases)) + return "---\n" + "\n".join(fm_lines) + "\n---\n\n" + if is_update and path.exists(): existing = path.read_text(encoding="utf-8") if source_file not in existing: existing = _prepend_source_to_frontmatter(existing, source_file) - if existing.startswith("---"): - end = existing.find("---", 3) - if end != -1: - fm = existing[:end + 3] - fm = _set_fm_line(fm, "brief", brief) if brief else fm - fm = _set_fm_line(fm, "type", type_) if type_ else fm - existing = fm + "\n\n" + clean - else: - existing = clean + end = existing.find("---", 3) if existing.startswith("---") else -1 + if end != -1: + fm = existing[:end + 3] + fm = _set_fm_line(fm, "brief", brief) if brief else fm + fm = _set_fm_line(fm, "type", type_) if type_ else fm + existing = fm + "\n\n" + clean else: - existing = clean + # Malformed/absent frontmatter (opening ``---`` with no closing + # delimiter, or no frontmatter at all): rebuild valid frontmatter + # rather than writing a body-only page and dropping sources/type/ + # brief. ``_prepend_source_to_frontmatter`` already ensured the + # new source is present in the (still-malformed) block, so seed + # with it here. 
+ existing = _build_frontmatter([source_file]) + clean path.write_text(existing, encoding="utf-8") return - fm_lines = [_yaml_list_line("sources", [source_file])] - fm_lines.append(_yaml_kv_line("type", type_ or "other")) - if brief: - fm_lines.append(_yaml_kv_line("brief", brief)) - if aliases: - fm_lines.append(_yaml_list_line("aliases", aliases)) - frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n" - path.write_text(frontmatter + clean, encoding="utf-8") + path.write_text(_build_frontmatter([source_file]) + clean, encoding="utf-8") def _set_fm_line(fm: str, key: str, value: str) -> str: @@ -1364,6 +1369,21 @@ def _write_v1_summary_stripped() -> None: # The new plan contract nests concepts under a "concepts" key alongside # an "entities" key; the legacy flat shape (create/update/related at top # level) is still honored by falling back to ``parsed`` itself. + if not isinstance(parsed, (list, dict)): + # A JSON scalar (int/str/None/bool) is valid JSON but not a usable + # plan. ``_parse_json`` normally rejects scalars, but guard here too + # so ``parsed.get(...)`` can never raise AttributeError and abort the + # compile — treat it as an empty/unparseable plan. 
+ logger.warning( + "Concepts plan parsed to a %s scalar, not an object/array — " + "treating as empty plan for %s.", + type(parsed).__name__, doc_name, + ) + if rewrite_summary: + _write_v1_summary_stripped() + _update_index(wiki_dir, doc_name, [], doc_brief=doc_brief, doc_type=doc_type) + return + if isinstance(parsed, list): plan = {"create": _filter_concept_items(parsed, "list"), "update": [], "related": []} From b2451289dd357f47d24435037c656048fb257e54 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 09:34:49 +0800 Subject: [PATCH 23/29] fix(compiler): keep ## Entities before ## Explorations; drop dead param + overlap gathers - _update_index: insert ## Entities before ## Explorations on older index.md files that predate the section (new _ensure_h2_section_before helper), preserving canonical order instead of appending at EOF. - _filter_entity_items: drop the unused 'label' parameter and update call sites in _parse_entities_plan. - _compile_concepts: overlap concept and entity generation in one outer asyncio.gather (they share cached context and the same concurrency semaphore); result/error handling per list is unchanged. --- openkb/agent/compiler.py | 81 +++++++++++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 22 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index be0e167c..88328353 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -432,7 +432,7 @@ def _filter_related_slugs(items: list) -> list[str]: return valid -def _filter_entity_items(items: object, label: str) -> list[dict]: +def _filter_entity_items(items: object) -> list[dict]: """Validate entity create/update objects: require name+title, coerce type. 
Each kept item is normalized to ``{"name", "title", "type"}`` where @@ -470,8 +470,8 @@ def _parse_entities_plan(parsed: object) -> dict: if not isinstance(group, dict): return empty return { - "create": _filter_entity_items(group.get("create", []), "create"), - "update": _filter_entity_items(group.get("update", []), "update"), + "create": _filter_entity_items(group.get("create", [])), + "update": _filter_entity_items(group.get("update", [])), "related": _filter_related_slugs(group.get("related", [])), } @@ -635,6 +635,33 @@ def _ensure_h2_section(lines: list[str], heading: str) -> None: lines.append("") +def _ensure_h2_section_before( + lines: list[str], heading: str, before: str, +) -> None: + """Ensure H2 ``heading`` exists, inserting it just before ``before``. + + If ``heading`` is already present, no-op. If ``before`` is absent, fall + back to :func:`_ensure_h2_section` (append at end). This keeps the + canonical index order (e.g. ``## Entities`` ahead of ``## Explorations``) + when recovering an older index.md that predates the section. + """ + if _get_section_bounds(lines, heading) is not None: + return + before_bounds = _get_section_bounds(lines, before) + if before_bounds is None: + _ensure_h2_section(lines, heading) + return + # ``start`` is the line after the ``before`` heading; insert the new + # section (heading + blank line) right before that heading line. + insert_at = before_bounds[0] - 1 + logger.warning( + "Wiki index is missing %r section; inserting it before %r. 
" + "Check whether the file was hand-edited away from the canonical layout.", + heading, before, + ) + lines[insert_at:insert_at] = [heading, ""] + + def _section_contains_link(lines: list[str], heading: str, link: str) -> bool: """Check whether an index entry already exists inside the named section.""" bounds = _get_section_bounds(lines, heading) @@ -1253,7 +1280,10 @@ def _update_index( entity_names = entity_names or [] entity_meta = entity_meta or {} if entity_names: - _ensure_h2_section(lines, "## Entities") + # Keep canonical order: Entities sits before Explorations. On an older + # index.md that predates the Entities section, plain ``_ensure_h2_section`` + # would append it after Explorations. + _ensure_h2_section_before(lines, "## Entities", "## Explorations") for name in entity_names: link = f"[[entities/{name}]]" # Callers always populate entity_meta alongside entity_names; the @@ -1612,6 +1642,14 @@ async def _gen_entity_update(ent: dict) -> tuple[str, str, str, str]: tasks.extend(_gen_create(c) for c in create_items) tasks.extend(_gen_update(c) for c in update_items) + # --- Step 3 (entities): build the entity task list up front so it can be + # gathered concurrently with the concept tasks below. Entity coroutines + # return 4-arity tuples (name, content, brief, type), so their results are + # processed in their own loop rather than mixed with the concept tuples. 
+ entity_tasks = [] + entity_tasks.extend(_gen_entity_create(e) for e in entity_create) + entity_tasks.extend(_gen_entity_update(e) for e in entity_update) + concept_names: list[str] = [] concept_briefs_map: dict[str, str] = {} pending_writes: list[tuple[str, str, bool, str]] = [] @@ -1619,13 +1657,28 @@ async def _gen_entity_update(ent: dict) -> tuple[str, str, str, str]: entity_meta: dict[str, tuple[str, str]] = {} entity_pending: list[tuple[str, str, str, str]] = [] + # Concepts and entities are independent and share the cached prompt + # context + the same concurrency ``semaphore``, so overlap them in one + # outer gather instead of running entities only after concepts finish. + total = len(tasks) + etotal = len(entity_tasks) if tasks: - total = len(tasks) sys.stdout.write(f" Generating {total} concept(s) (concurrency={max_concurrency})...\n") sys.stdout.flush() + if entity_tasks: + sys.stdout.write( + f" Generating {etotal} entity(ies) (concurrency={max_concurrency})...\n" + ) + sys.stdout.flush() - results = await asyncio.gather(*tasks, return_exceptions=True) + results, entity_results = ([], []) + if tasks or entity_tasks: + results, entity_results = await asyncio.gather( + asyncio.gather(*tasks, return_exceptions=True), + asyncio.gather(*entity_tasks, return_exceptions=True), + ) + if tasks: failure_types: list[str] = [] for r in results: if isinstance(r, Exception): @@ -1653,23 +1706,7 @@ async def _gen_entity_update(ent: dict) -> tuple[str, str, str, str]: ) sys.stdout.flush() - # --- Step 3 (entities): generate entity pages in their OWN gather --- - # Entity coroutines return 4-arity tuples (name, content, brief, type), - # so they are gathered separately from the concept tuples rather than - # mixed into one list with differing arities. 
- entity_tasks = [] - entity_tasks.extend(_gen_entity_create(e) for e in entity_create) - entity_tasks.extend(_gen_entity_update(e) for e in entity_update) - if entity_tasks: - etotal = len(entity_tasks) - sys.stdout.write( - f" Generating {etotal} entity(ies) (concurrency={max_concurrency})...\n" - ) - sys.stdout.flush() - - entity_results = await asyncio.gather(*entity_tasks, return_exceptions=True) - entity_failure_types: list[str] = [] for r in entity_results: if isinstance(r, Exception): From 2f09fad2481dc9a1326e39e7f4e2685cd61851c7 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 09:34:57 +0800 Subject: [PATCH 24/29] test(compiler): cover empty-content skip, scalar plan, malformed entity FM, Entities order Add regression tests for the four compiler fixes: - empty {"content":""} response skips the page (no raw JSON body) - JSON scalar plan handled gracefully (no AttributeError) - _write_entity rebuilds frontmatter when closing --- is missing - _update_index inserts ## Entities before ## Explorations --- tests/test_compiler.py | 114 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 73fbe009..88d55272 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -361,6 +361,27 @@ def test_recovers_when_concepts_section_missing(self, tmp_path): assert "[[concepts/attention]] — Focus" in text assert "[[summaries/my-doc]]" in text + def test_entities_inserted_before_explorations(self, tmp_path): + """#8: an old index.md predating ## Entities must get it inserted + before ## Explorations, not appended after it (canonical order).""" + wiki = tmp_path / "wiki" + wiki.mkdir() + # Old order: no ## Entities section yet. 
+ (wiki / "index.md").write_text( + "# Index\n\n## Documents\n\n## Concepts\n\n## Explorations\n", + encoding="utf-8", + ) + _update_index( + wiki, "my-doc", [], + entity_names=["anthropic"], + entity_meta={"anthropic": ("organization", "AI lab.")}, + ) + text = (wiki / "index.md").read_text() + assert "## Entities" in text + # Canonical order: Entities before Explorations. + assert text.index("## Entities") < text.index("## Explorations") + assert "[[entities/anthropic]] (organization) — AI lab." in text + class TestReadWikiContext: def test_empty_wiki(self, tmp_path): @@ -561,6 +582,31 @@ def test_update_prepends_source_keeps_type(self, tmp_path): assert "v1." not in text assert "brief:" in text and "b2" in text + def test_update_rebuilds_frontmatter_when_no_closing_delim(self, tmp_path): + """#11: malformed existing file (opening --- but no closing ---) must + not drop frontmatter; rebuild valid sources/type/brief on update.""" + entities = tmp_path / "entities" + entities.mkdir(parents=True) + # Opening delimiter, NO closing delimiter — find("---", 3) == -1. + (entities / "anthropic.md").write_text( + "---\nsources: [\"summaries/a.md\"]\ntype: organization\n" + "# Anthropic (no closing fence)\n\nOld body.", + encoding="utf-8", + ) + _write_entity( + tmp_path, "anthropic", "# Anthropic\n\nv2 rewritten.", + "summaries/b.md", is_update=True, + brief="AI lab.", type_="organization", aliases=None, + ) + text = (entities / "anthropic.md").read_text(encoding="utf-8") + # Frontmatter rebuilt with a proper closing delimiter, not body-only. + assert text.startswith("---\n") + assert text.count("---") == 2 + assert "sources:" in text and "summaries/b.md" in text + assert "type:" in text and "organization" in text + assert "brief:" in text and "AI lab." in text + assert "v2 rewritten." 
in text + class TestBacklinkSummary: def test_adds_missing_concept_links(self, tmp_path): @@ -1051,6 +1097,33 @@ async def test_empty_plan_strips_v1_summary_ghosts(self, tmp_path): assert "[[concepts/imaginary]]" not in text assert "imaginary" in text # plain text preserved + @pytest.mark.asyncio + async def test_scalar_plan_handled_gracefully(self, tmp_path): + """#10: a JSON scalar plan (valid JSON, not object/array) must not + crash with AttributeError; it takes the graceful empty-plan path — + v1 summary written, index updated, no concept/entity pages.""" + wiki, source_path = self._setup_kb(tmp_path) + + summary_response = json.dumps({ + "brief": "B", "content": "# Summary\n\nPlain body, no links.", + }) + # Plan call returns a bare JSON scalar (an integer). + scalar_plan_response = "42" + + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([summary_response, scalar_plan_response]) + ) + # Must not raise (AttributeError) and must complete. + await compile_short_doc("doc", source_path, tmp_path, "gpt-4o-mini") + + # Summary still written, index updated with the document. + assert (wiki / "summaries" / "doc.md").exists() + index_text = (wiki / "index.md").read_text() + assert "[[summaries/doc]]" in index_text + # No concept pages produced from the unusable plan. 
+ assert not list((wiki / "concepts").glob("*.md")) + class TestCacheControl: """Verify cache_control breakpoints are emitted on the right messages @@ -1346,6 +1419,47 @@ async def ordered_acompletion(*args, **kwargs): assert "[[concepts/flash-attention]]" in index_text assert "[[concepts/attention]]" in index_text + @pytest.mark.asyncio + async def test_empty_content_skips_page_no_json_body(self, tmp_path): + """#9: when the page LLM returns parseable JSON with empty content + ({"content": ""}), the page is skipped (not written as raw JSON).""" + wiki = self._setup_wiki(tmp_path) + + plan_response = json.dumps({ + "create": [{"name": "ghost-concept", "title": "Ghost Concept"}], + "update": [], + "related": [], + }) + # Parseable JSON, but empty content — old code fell back to raw JSON. + empty_content_response = json.dumps({"brief": "B", "content": ""}) + + system_msg = {"role": "system", "content": "You are a wiki agent."} + doc_msg = {"role": "user", "content": "Document content."} + + with patch("openkb.agent.compiler.litellm") as mock_litellm: + mock_litellm.completion = MagicMock( + side_effect=_mock_completion([plan_response]) + ) + mock_litellm.acompletion = AsyncMock( + side_effect=_mock_completion([empty_content_response]) + ) + await _compile_concepts( + wiki, tmp_path, "gpt-4o-mini", system_msg, doc_msg, + "Summary.", "test-doc", 5, + ) + + # The concept page must NOT be written (generation raised + dropped). + page = wiki / "concepts" / "ghost-concept.md" + assert not page.exists() + # And no concept index entry either. + index_text = (wiki / "index.md").read_text() + assert "[[concepts/ghost-concept]]" not in index_text + # Definitely no raw JSON written anywhere as a body. + assert not any( + '"content":' in p.read_text() + for p in (wiki / "concepts").glob("*.md") + ) + @pytest.mark.asyncio async def test_related_adds_link_no_llm(self, tmp_path): """Plan has only related items. 
No acompletion calls should be made.""" From 0a27c04be477acf2cd92da7ce8ad017d7401c974 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 10:01:04 +0800 Subject: [PATCH 25/29] fix(compiler): silence spurious 'hand-edited' warning on backlink section creation _backlink_summary_pages / _backlink_pages create ## Entities / ## Related Documents sections as a normal first-time operation; pass quiet=True so _ensure_h2_section no longer logs the index-drift warning in that case. Index-repair callers keep the warning. --- openkb/agent/compiler.py | 22 ++++++++++++++-------- tests/test_compiler.py | 17 +++++++++++++++++ 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/openkb/agent/compiler.py b/openkb/agent/compiler.py index 88328353..6751ae49 100644 --- a/openkb/agent/compiler.py +++ b/openkb/agent/compiler.py @@ -613,20 +613,26 @@ def _get_section_bounds(lines: list[str], heading: str) -> tuple[int, int] | Non return None -def _ensure_h2_section(lines: list[str], heading: str) -> None: +def _ensure_h2_section(lines: list[str], heading: str, *, quiet: bool = False) -> None: """Ensure an H2 section ``heading`` exists in ``lines``; append if missing. Recovers from hand-edited or drifted index.md files where the expected section was removed or renamed — without this, downstream inserts would silently no-op and entries would be dropped. + + ``quiet=True`` suppresses the drift warning. Use it when adding a section + is the normal, expected operation (e.g. a backlink helper creating a + ``## Related Documents`` / ``## Entities`` section on a page for the first + time), as opposed to repairing a drifted index. """ if _get_section_bounds(lines, heading) is not None: return - logger.warning( - "Wiki page is missing %r section; appending it. " - "Check whether the file was hand-edited away from the canonical layout.", - heading, - ) + if not quiet: + logger.warning( + "Wiki page is missing %r section; appending it. 
" + "Check whether the file was hand-edited away from the canonical layout.", + heading, + ) while lines and lines[-1] == "": lines.pop() if lines: @@ -1028,7 +1034,7 @@ def _backlink_summary_pages( return lines = text.split("\n") - _ensure_h2_section(lines, section) + _ensure_h2_section(lines, section, quiet=True) for slug in reversed(missing): _insert_section_entry(lines, section, f"- [[{page_dir}/{slug}]]") summary_path.write_text("\n".join(lines), encoding="utf-8") @@ -1050,7 +1056,7 @@ def _backlink_pages( if link in text: continue lines = text.split("\n") - _ensure_h2_section(lines, "## Related Documents") + _ensure_h2_section(lines, "## Related Documents", quiet=True) _insert_section_entry(lines, "## Related Documents", f"- {link}") path.write_text("\n".join(lines), encoding="utf-8") diff --git a/tests/test_compiler.py b/tests/test_compiler.py index 88d55272..83f94f0c 100644 --- a/tests/test_compiler.py +++ b/tests/test_compiler.py @@ -1788,6 +1788,23 @@ def test_schema_declares_entities(): assert t in AGENTS_MD +def test_ensure_h2_section_quiet_suppresses_drift_warning(caplog): + """Backlink helpers create sections as a normal operation, so quiet=True + must not emit the 'hand-edited' drift warning; default still warns.""" + import logging + + from openkb.agent.compiler import _ensure_h2_section + + with caplog.at_level(logging.WARNING, logger="openkb.agent.compiler"): + lines = ["# Doc", ""] + _ensure_h2_section(lines, "## Entities", quiet=True) + assert "## Entities" in lines + assert caplog.records == [] + + _ensure_h2_section(["# Doc", ""], "## Entities") # default warns + assert any("missing" in r.getMessage() for r in caplog.records) + + def test_known_targets_prompt_has_entities_rule(): """The whitelist message must tell the LLM the [[entities/X]] rule, since entity-page prompts instruct writing such links; otherwise entity links From 58f8edc92a980f996a41623439e44a2540b41ed8 Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 10:26:45 
+0800 Subject: [PATCH 26/29] feat(cli): add `recompile` command to re-run compile on indexed docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-runs the current compile_short_doc/compile_long_doc pipeline on already-indexed docs so pre-feature KBs gain the entities/ layer and refresh to the current format. Reuses on-disk sources/summaries and the registry's PageIndex doc_id — does not re-index or re-convert. Supports a positional DOC_NAME (resolved via _resolve_doc_identifier) or --all (with a regeneration-warning confirmation, bypassed by --yes), --dry-run (enumerate only, no LLM calls/writes), and --refresh-schema (back up + overwrite wiki/AGENTS.md when it differs from AGENTS_MD). Processes docs sequentially with per-doc progress, skips+warns on missing sources / summaries / doc_id, prints a recompiled/skipped summary, and appends a recompile entry to log.md. --- openkb/cli.py | 188 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) diff --git a/openkb/cli.py b/openkb/cli.py index 6bf2b79e..c3ce3057 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -1075,6 +1075,194 @@ def remove(ctx, identifier, keep_raw, keep_empty_concepts, dry_run, yes): click.echo(f" [OK] {name} removed from knowledge base.") +def _refresh_schema(wiki_dir: Path) -> bool: + """Back up + overwrite ``wiki/AGENTS.md`` with the current ``AGENTS_MD``. + + If the on-disk schema differs from the bundled default, copy it to + ``wiki/AGENTS.md.bak`` then overwrite with ``AGENTS_MD``. No-op when the + file is missing or already identical. Returns True if it overwrote.
+ """ + agents_file = wiki_dir / "AGENTS.md" + current = agents_file.read_text(encoding="utf-8") if agents_file.exists() else "" + if current == AGENTS_MD: + return False + if agents_file.exists(): + backup = wiki_dir / "AGENTS.md.bak" + backup.write_text(current, encoding="utf-8") + click.echo(f" Backed up existing schema to {backup.relative_to(wiki_dir.parent)}") + agents_file.write_text(AGENTS_MD, encoding="utf-8") + click.echo(" Refreshed wiki/AGENTS.md to the current schema.") + return True + + +@cli.command() +@click.argument("doc_name", required=False) +@click.option("--all", "all_docs", is_flag=True, default=False, + help="Recompile every indexed document.") +@click.option("--dry-run", is_flag=True, default=False, + help="List the docs that would be recompiled; no LLM calls, no writes.") +@click.option("--yes", "-y", is_flag=True, default=False, + help="Skip the --all confirmation prompt.") +@click.option("--refresh-schema", "refresh_schema", is_flag=True, default=False, + help="Overwrite wiki/AGENTS.md with the bundled schema (backs up " + "the old one to AGENTS.md.bak) if it differs.") +@click.pass_context +def recompile(ctx, doc_name, all_docs, dry_run, yes, refresh_schema): + """Re-run the current compile pipeline on already-indexed documents. + + Recompiling re-runs the same ``compile_short_doc`` / ``compile_long_doc`` + that ``openkb add`` uses, so pre-feature KBs gain the ``entities/`` layer + and pages refresh to the current format. It does NOT re-run PageIndex or + re-convert raw files — it reuses the on-disk ``wiki/sources/`` and + ``wiki/summaries/`` content (and the registry's PageIndex ``doc_id``). + + DOC_NAME recompiles one doc (resolved like ``openkb remove`` — filename, + slug, or unique substring). ``--all`` recompiles every indexed doc. + Exactly one of DOC_NAME or ``--all`` is required. 
+ + Side effect: this regenerates summaries (short docs) and rewrites concept + pages with the current logic — manual edits to those pages are overwritten. + """ + from openkb.state import HashRegistry + + kb_dir = _find_kb_dir(ctx.obj.get("kb_dir_override")) + if kb_dir is None: + click.echo("No knowledge base found. Run `openkb init` first.") + return + + if all_docs and doc_name: + click.echo("Specify either a DOC_NAME or --all, not both.") + return + if not all_docs and not doc_name: + click.echo("Specify a document name or pass --all to recompile every doc.") + return + + openkb_dir = kb_dir / ".openkb" + wiki_dir = kb_dir / "wiki" + registry = HashRegistry(openkb_dir / "hashes.json") + + # Resolve the set of docs to recompile. + if all_docs: + entries = list(registry.all_entries().values()) + if not entries: + click.echo("No documents indexed yet. Run `openkb add` first.") + return + targets = entries + else: + matches = _resolve_doc_identifier(registry, doc_name) + if not matches: + click.echo(f"No document matching '{doc_name}' found in the KB.") + click.echo("Try `openkb list` to see indexed documents.") + return + if len(matches) > 1: + click.echo(f"'{doc_name}' matches multiple documents:") + for _, m in matches: + click.echo(f" - {m.get('name', '?')} (doc_name: {m.get('doc_name', '?')})") + click.echo("Use a more specific name or the exact doc_name slug.") + return + targets = [matches[0][1]] + + def _classify(meta: dict) -> str: + return "long" if meta.get("type") == "long_pdf" else "short" + + # --dry-run: enumerate only, no LLM calls, no writes. + if dry_run: + click.echo(f"Would recompile {len(targets)} document(s):") + for meta in targets: + name = meta.get("doc_name") or meta.get("name", "?") + click.echo(f" - {name} ({_classify(meta)})") + click.echo( + "\nNote: recompiling regenerates summaries (short docs) and rewrites " + "concept pages — manual edits would be overwritten." 
+ ) + click.echo("(dry-run — nothing modified)") + return + + # --all confirmation (the summary/concept-regeneration side effect). + if all_docs and not yes: + click.echo( + f"This will recompile {len(targets)} document(s), regenerating " + "summaries and rewriting concept pages with the current logic.\n" + "Manual edits to those pages will be overwritten." + ) + if not click.confirm("Proceed?", default=False): + click.echo("Aborted.") + return + + if refresh_schema: + _refresh_schema(wiki_dir) + + _setup_llm_key(kb_dir) + config = load_config(openkb_dir / "config.yaml") + model: str = config.get("model", DEFAULT_CONFIG["model"]) + + # Import lazily and reference via the module so tests can patch + # ``openkb.agent.compiler.compile_*`` and see the call. + from openkb.agent import compiler + + recompiled = 0 + skipped = 0 + total = len(targets) + for i, meta in enumerate(targets, 1): + name = meta.get("doc_name") or Path(meta.get("name", "")).stem + if not name: + click.echo(f"[{i}/{total}] [SKIP] registry entry has no doc_name.") + skipped += 1 + continue + + if meta.get("type") == "long_pdf": + summary_path = wiki_dir / "summaries" / f"{name}.md" + doc_id = meta.get("doc_id") + if not doc_id: + click.echo( + f"[{i}/{total}] [SKIP] {name}: legacy long-doc entry without a " + "doc_id — re-add to refresh." + ) + skipped += 1 + continue + if not summary_path.exists(): + click.echo( + f"[{i}/{total}] [SKIP] {name}: missing summary at " + f"{summary_path.relative_to(kb_dir)}." 
+ ) + skipped += 1 + continue + click.echo(f"[{i}/{total}] Recompiling long doc {name}...") + start = time.time() + try: + asyncio.run(compiler.compile_long_doc(name, summary_path, doc_id, kb_dir, model)) + except Exception as exc: + click.echo(f" [ERROR] Compilation failed: {exc}") + logging.getLogger(__name__).debug("Recompile traceback:", exc_info=True) + skipped += 1 + continue + click.echo(f" [OK] {name} ({time.time() - start:.1f}s)") + recompiled += 1 + else: + source_path = wiki_dir / "sources" / f"{name}.md" + if not source_path.exists(): + click.echo( + f"[{i}/{total}] [SKIP] {name}: missing source at " + f"{source_path.relative_to(kb_dir)}." + ) + skipped += 1 + continue + click.echo(f"[{i}/{total}] Recompiling short doc {name}...") + start = time.time() + try: + asyncio.run(compiler.compile_short_doc(name, source_path, kb_dir, model)) + except Exception as exc: + click.echo(f" [ERROR] Compilation failed: {exc}") + logging.getLogger(__name__).debug("Recompile traceback:", exc_info=True) + skipped += 1 + continue + click.echo(f" [OK] {name} ({time.time() - start:.1f}s)") + recompiled += 1 + + click.echo(f"\nDone: recompiled {recompiled}, skipped {skipped}.") + append_log(wiki_dir, "recompile", f"recompiled {recompiled}, skipped {skipped}") + + @cli.command() @click.option( "--resume", "-r", "resume", From d10055d29d0398181ee5493c7fdd0dc71db159aa Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 10:26:45 +0800 Subject: [PATCH 27/29] test(cli): recompile dispatch/dry-run/skip/refresh-schema --- tests/test_recompile.py | 296 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 296 insertions(+) create mode 100644 tests/test_recompile.py diff --git a/tests/test_recompile.py b/tests/test_recompile.py new file mode 100644 index 00000000..4a603fda --- /dev/null +++ b/tests/test_recompile.py @@ -0,0 +1,296 @@ +"""Tests for the `openkb recompile` CLI command. 
+ +`recompile` re-runs the current compile pipeline (compile_short_doc / +compile_long_doc) on already-indexed docs so pre-feature KBs gain the +entities/ layer and refresh to the current format. It does NOT re-run +PageIndex or re-convert raw files. + +Covers: +- short-doc dispatch (compile_short_doc called with the right args) +- long-doc dispatch (compile_long_doc called with doc_id; PageIndex not invoked) +- --all confirmation + --yes bypass +- --dry-run: no compile calls, no writes +- skip+warn paths (missing source, missing summary/doc_id) with others + still processed +- unknown / empty registry friendly error +- --refresh-schema backs up + overwrites only when AGENTS.md differs +""" + +from __future__ import annotations + +import json +from pathlib import Path +from unittest.mock import AsyncMock, patch + +from click.testing import CliRunner + +from openkb.cli import cli +from openkb.schema import AGENTS_MD + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _invoke(kb_dir, args, input_text=None): + return CliRunner().invoke( + cli, ["--kb-dir", str(kb_dir), *args], input=input_text, + ) + + +def _seed_short(kb_dir: Path) -> None: + """One short doc with a source file on disk.""" + (kb_dir / ".openkb" / "hashes.json").write_text(json.dumps({ + "h_s": {"name": "notes.md", "doc_name": "notes-h_s", "type": "md"}, + })) + (kb_dir / "wiki" / "sources" / "notes-h_s.md").write_text( + "# Notes\n\nbody\n", encoding="utf-8", + ) + (kb_dir / "wiki" / "log.md").write_text("# Log\n\n", encoding="utf-8") + + +def _seed_long(kb_dir: Path) -> None: + """One long (PageIndex) doc with a summary file + doc_id on disk.""" + (kb_dir / ".openkb" / "hashes.json").write_text(json.dumps({ + "h_l": { + "name": "paper.pdf", "doc_name": "paper-h_l", + "type": "long_pdf", "doc_id": "doc-abc123", + }, + })) + (kb_dir / "wiki" / "summaries" / 
"paper-h_l.md").write_text( + "---\nsources: [raw/paper.pdf]\nbrief: P\n---\n# Paper\n", + encoding="utf-8", + ) + (kb_dir / "wiki" / "log.md").write_text("# Log\n\n", encoding="utf-8") + + +# --------------------------------------------------------------------------- +# short-doc dispatch +# --------------------------------------------------------------------------- + + +def test_recompile_short_dispatches_compile_short_doc(kb_dir): + _seed_short(kb_dir) + with patch("openkb.agent.compiler.compile_short_doc", new_callable=AsyncMock) as short, \ + patch("openkb.agent.compiler.compile_long_doc", new_callable=AsyncMock) as long_: + result = _invoke(kb_dir, ["recompile", "notes.md"]) + + assert result.exit_code == 0, result.output + short.assert_called_once() + args = short.call_args.args + assert args[0] == "notes-h_s" # doc_name + assert args[1] == kb_dir / "wiki" / "sources" / "notes-h_s.md" # source_path + assert args[2] == kb_dir # kb_dir + long_.assert_not_called() + assert "recompiled 1" in result.output + + +# --------------------------------------------------------------------------- +# long-doc dispatch +# --------------------------------------------------------------------------- + + +def test_recompile_long_dispatches_compile_long_doc_with_doc_id(kb_dir): + _seed_long(kb_dir) + with patch("openkb.agent.compiler.compile_long_doc", new_callable=AsyncMock) as long_, \ + patch("openkb.agent.compiler.compile_short_doc", new_callable=AsyncMock) as short, \ + patch("openkb.indexer.index_long_document") as index: + result = _invoke(kb_dir, ["recompile", "paper.pdf"]) + + assert result.exit_code == 0, result.output + long_.assert_called_once() + args = long_.call_args.args + assert args[0] == "paper-h_l" # doc_name + assert args[1] == kb_dir / "wiki" / "summaries" / "paper-h_l.md" + assert args[2] == "doc-abc123" # doc_id + assert args[3] == kb_dir + short.assert_not_called() + # PageIndex must NOT be re-run + index.assert_not_called() + assert "recompiled 1" in 
result.output + + +# --------------------------------------------------------------------------- +# --all confirmation + --yes +# --------------------------------------------------------------------------- + + +def test_recompile_all_requires_confirmation(kb_dir): + _seed_short(kb_dir) + with patch("openkb.agent.compiler.compile_short_doc") as short: + result = _invoke(kb_dir, ["recompile", "--all"], input_text="n\n") + + assert result.exit_code == 0, result.output + assert "Aborted" in result.output + short.assert_not_called() + + +def test_recompile_all_yes_bypasses_confirmation(kb_dir): + _seed_short(kb_dir) + with patch("openkb.agent.compiler.compile_short_doc", new_callable=AsyncMock) as short: + result = _invoke(kb_dir, ["recompile", "--all", "--yes"]) + + assert result.exit_code == 0, result.output + short.assert_called_once() + assert "recompiled 1" in result.output + + +# --------------------------------------------------------------------------- +# --dry-run +# --------------------------------------------------------------------------- + + +def test_recompile_dry_run_no_calls_no_writes(kb_dir): + _seed_short(kb_dir) + log_before = (kb_dir / "wiki" / "log.md").read_text() + with patch("openkb.agent.compiler.compile_short_doc") as short, \ + patch("openkb.agent.compiler.compile_long_doc") as long_: + result = _invoke(kb_dir, ["recompile", "--all", "--dry-run"]) + + assert result.exit_code == 0, result.output + short.assert_not_called() + long_.assert_not_called() + assert "notes-h_s" in result.output + assert "short" in result.output + # No writes: log.md unchanged + assert (kb_dir / "wiki" / "log.md").read_text() == log_before + + +# --------------------------------------------------------------------------- +# skip + warn paths +# --------------------------------------------------------------------------- + + +def test_recompile_skips_short_missing_source(kb_dir): + """Short doc with no source on disk is warned + skipped; others run.""" + (kb_dir / 
".openkb" / "hashes.json").write_text(json.dumps({ + "h_ok": {"name": "ok.md", "doc_name": "ok-h_ok", "type": "md"}, + "h_miss": {"name": "gone.md", "doc_name": "gone-h_miss", "type": "md"}, + })) + (kb_dir / "wiki" / "sources" / "ok-h_ok.md").write_text("# ok\n") + (kb_dir / "wiki" / "log.md").write_text("# Log\n\n", encoding="utf-8") + + with patch("openkb.agent.compiler.compile_short_doc", new_callable=AsyncMock) as short: + result = _invoke(kb_dir, ["recompile", "--all", "--yes"]) + + assert result.exit_code == 0, result.output + # only the doc with a present source compiled + assert short.call_count == 1 + assert short.call_args.args[0] == "ok-h_ok" + assert "recompiled 1" in result.output + assert "skipped 1" in result.output + + +def test_recompile_skips_long_missing_doc_id(kb_dir): + """Long doc lacking doc_id is warned + skipped; others run.""" + (kb_dir / ".openkb" / "hashes.json").write_text(json.dumps({ + "h_l": {"name": "legacy.pdf", "doc_name": "legacy-h_l", "type": "long_pdf"}, + })) + (kb_dir / "wiki" / "summaries" / "legacy-h_l.md").write_text("# legacy\n") + (kb_dir / "wiki" / "log.md").write_text("# Log\n\n", encoding="utf-8") + + with patch("openkb.agent.compiler.compile_long_doc") as long_: + result = _invoke(kb_dir, ["recompile", "--all", "--yes"]) + + assert result.exit_code == 0, result.output + long_.assert_not_called() + assert "skipped 1" in result.output + assert "recompiled 0" in result.output + + +def test_recompile_skips_long_missing_summary(kb_dir): + """Long doc with doc_id but no summary on disk is warned + skipped.""" + (kb_dir / ".openkb" / "hashes.json").write_text(json.dumps({ + "h_l": { + "name": "paper.pdf", "doc_name": "paper-h_l", + "type": "long_pdf", "doc_id": "doc-x", + }, + })) + (kb_dir / "wiki" / "log.md").write_text("# Log\n\n", encoding="utf-8") + + with patch("openkb.agent.compiler.compile_long_doc") as long_: + result = _invoke(kb_dir, ["recompile", "--all", "--yes"]) + + assert result.exit_code == 0, 
result.output + long_.assert_not_called() + assert "skipped 1" in result.output + + +# --------------------------------------------------------------------------- +# error paths +# --------------------------------------------------------------------------- + + +def test_recompile_requires_doc_or_all(kb_dir): + _seed_short(kb_dir) + result = _invoke(kb_dir, ["recompile"]) + assert result.exit_code != 0 or "Specify" in result.output or "--all" in result.output + + +def test_recompile_doc_and_all_conflict(kb_dir): + _seed_short(kb_dir) + result = _invoke(kb_dir, ["recompile", "notes.md", "--all"]) + assert "both" in result.output.lower() or "either" in result.output.lower() \ + or result.exit_code != 0 + + +def test_recompile_unknown_doc_friendly_error(kb_dir): + _seed_short(kb_dir) + with patch("openkb.agent.compiler.compile_short_doc") as short: + result = _invoke(kb_dir, ["recompile", "no-such-doc"]) + assert result.exit_code == 0, result.output + assert "no-such-doc" in result.output + short.assert_not_called() + + +def test_recompile_empty_registry_friendly_error(kb_dir): + (kb_dir / ".openkb" / "hashes.json").write_text(json.dumps({})) + with patch("openkb.agent.compiler.compile_short_doc") as short: + result = _invoke(kb_dir, ["recompile", "--all"], input_text="y\n") + assert result.exit_code == 0, result.output + short.assert_not_called() + assert "No documents" in result.output or "no documents" in result.output + + +# --------------------------------------------------------------------------- +# --refresh-schema +# --------------------------------------------------------------------------- + + +def test_recompile_refresh_schema_overwrites_when_differing(kb_dir): + _seed_short(kb_dir) + agents = kb_dir / "wiki" / "AGENTS.md" + agents.write_text("OLD CUSTOM SCHEMA\n", encoding="utf-8") + with patch("openkb.agent.compiler.compile_short_doc", new_callable=AsyncMock) as short: + result = _invoke(kb_dir, ["recompile", "notes.md", "--refresh-schema"]) + + assert 
result.exit_code == 0, result.output + bak = kb_dir / "wiki" / "AGENTS.md.bak" + assert bak.exists() + assert bak.read_text(encoding="utf-8") == "OLD CUSTOM SCHEMA\n" + assert agents.read_text(encoding="utf-8") == AGENTS_MD + + +def test_recompile_refresh_schema_noop_when_identical(kb_dir): + _seed_short(kb_dir) + agents = kb_dir / "wiki" / "AGENTS.md" + agents.write_text(AGENTS_MD, encoding="utf-8") + with patch("openkb.agent.compiler.compile_short_doc", new_callable=AsyncMock) as short: + result = _invoke(kb_dir, ["recompile", "notes.md", "--refresh-schema"]) + + assert result.exit_code == 0, result.output + assert not (kb_dir / "wiki" / "AGENTS.md.bak").exists() + + +def test_recompile_no_refresh_schema_by_default(kb_dir): + _seed_short(kb_dir) + agents = kb_dir / "wiki" / "AGENTS.md" + agents.write_text("OLD CUSTOM SCHEMA\n", encoding="utf-8") + with patch("openkb.agent.compiler.compile_short_doc", new_callable=AsyncMock) as short: + result = _invoke(kb_dir, ["recompile", "notes.md"]) + + assert result.exit_code == 0, result.output + # Untouched without the flag + assert agents.read_text(encoding="utf-8") == "OLD CUSTOM SCHEMA\n" + assert not (kb_dir / "wiki" / "AGENTS.md.bak").exists() From 1416b1fe779e53579679e5522ebe8c2200774a7a Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 10:26:45 +0800 Subject: [PATCH 28/29] docs(readme): document openkb recompile --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 8e915456..27ad2e16 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,7 @@ OpenKB commands fall into two layers: the **wiki foundation** (compile + manage | `openkb init` | Initialize a new knowledge base (interactive) | | openkb add <file_or_dir_or_URL> | Add documents and compile to wiki. 
URL ingest auto-detects PDF (saved as `.pdf` → PageIndex / markitdown) vs HTML (trafilatura main-content extract → `.md`) | | openkb remove <doc> | Remove a document and clean up its wiki pages, images, registry, and PageIndex state (use `--dry-run` to preview, `--keep-raw` / `--keep-empty-concepts` to retain artifacts) | +| openkb recompile [<doc>] [--all] | Re-run the current compile pipeline on already-indexed docs (e.g. to backfill the `entities/` layer) without re-indexing. Regenerates summaries and rewrites concept pages — manual edits are overwritten. Use `--dry-run` to preview, `--refresh-schema` to also update `wiki/AGENTS.md` | | `openkb watch` | Watch `raw/` and auto-compile new files | | `openkb lint` | Run structural + knowledge health checks | | `openkb list` | List indexed documents and concepts | From c39baf00668f2427a91eaef4747911e57e88fcbf Mon Sep 17 00:00:00 2001 From: mountain Date: Sun, 31 May 2026 10:31:35 +0800 Subject: [PATCH 29/29] fix(cli): recompile --refresh-schema no-ops when AGENTS.md absent; tighten guard tests Match the spec (and the helper's own docstring): _refresh_schema returns early when wiki/AGENTS.md is missing rather than materializing the default (get_agents_md already falls back to it at runtime). Tighten the doc/--all guard tests to assert the exact message + that no compile runs, and add the missing-AGENTS.md no-op test. --- openkb/cli.py | 13 ++++++++----- tests/test_recompile.py | 28 +++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/openkb/cli.py b/openkb/cli.py index c3ce3057..0ad10602 100644 --- a/openkb/cli.py +++ b/openkb/cli.py @@ -1083,13 +1083,16 @@ def _refresh_schema(wiki_dir: Path) -> bool: file is missing or already identical. Returns True if it overwrote. 
""" agents_file = wiki_dir / "AGENTS.md" - current = agents_file.read_text(encoding="utf-8") if agents_file.exists() else "" + if not agents_file.exists(): + # No-op when missing: get_agents_md() already falls back to the + # bundled AGENTS_MD default at runtime, so there is nothing to refresh. + return False + current = agents_file.read_text(encoding="utf-8") if current == AGENTS_MD: return False - if agents_file.exists(): - backup = wiki_dir / "AGENTS.md.bak" - backup.write_text(current, encoding="utf-8") - click.echo(f" Backed up existing schema to {backup.relative_to(wiki_dir.parent)}") + backup = wiki_dir / "AGENTS.md.bak" + backup.write_text(current, encoding="utf-8") + click.echo(f" Backed up existing schema to {backup.relative_to(wiki_dir.parent)}") agents_file.write_text(AGENTS_MD, encoding="utf-8") click.echo(" Refreshed wiki/AGENTS.md to the current schema.") return True diff --git a/tests/test_recompile.py b/tests/test_recompile.py index 4a603fda..29d06137 100644 --- a/tests/test_recompile.py +++ b/tests/test_recompile.py @@ -224,15 +224,19 @@ def test_recompile_skips_long_missing_summary(kb_dir): def test_recompile_requires_doc_or_all(kb_dir): _seed_short(kb_dir) - result = _invoke(kb_dir, ["recompile"]) - assert result.exit_code != 0 or "Specify" in result.output or "--all" in result.output + with patch("openkb.agent.compiler.compile_short_doc", new_callable=AsyncMock) as short: + result = _invoke(kb_dir, ["recompile"]) + # Usage guard echoes a message and returns (exit 0); no compile runs. 
+ assert "Specify a document name or pass --all" in result.output + short.assert_not_called() def test_recompile_doc_and_all_conflict(kb_dir): _seed_short(kb_dir) - result = _invoke(kb_dir, ["recompile", "notes.md", "--all"]) - assert "both" in result.output.lower() or "either" in result.output.lower() \ - or result.exit_code != 0 + with patch("openkb.agent.compiler.compile_short_doc", new_callable=AsyncMock) as short: + result = _invoke(kb_dir, ["recompile", "notes.md", "--all"]) + assert "not both" in result.output.lower() + short.assert_not_called() def test_recompile_unknown_doc_friendly_error(kb_dir): @@ -294,3 +298,17 @@ def test_recompile_no_refresh_schema_by_default(kb_dir): # Untouched without the flag assert agents.read_text(encoding="utf-8") == "OLD CUSTOM SCHEMA\n" assert not (kb_dir / "wiki" / "AGENTS.md.bak").exists() + + +def test_recompile_refresh_schema_noop_when_agents_missing(kb_dir): + """Spec: --refresh-schema is a no-op when AGENTS.md is absent (runtime + already falls back to the bundled default), so nothing is written.""" + _seed_short(kb_dir) + agents = kb_dir / "wiki" / "AGENTS.md" + agents.unlink(missing_ok=True) + with patch("openkb.agent.compiler.compile_short_doc", new_callable=AsyncMock) as short: + result = _invoke(kb_dir, ["recompile", "notes.md", "--refresh-schema"]) + + assert result.exit_code == 0, result.output + assert not agents.exists() # not materialized + assert not (kb_dir / "wiki" / "AGENTS.md.bak").exists()