Skip to content

Commit bfb4e3c

Browse files
fix(compiler): YAML-safe brief in concept frontmatter (closes #67) (#76)
* fix(compiler): YAML-safe brief in concept frontmatter (closes #67) LLM-authored brief values often contain colons, quotes, or hashes; the existing f-string interpolation in _write_concept produced invalid frontmatter (e.g. "brief: Why X: Y" → mapping value not allowed). OpenKB's own reader uses string slicing so it never noticed, but external YAML-aware tools (VS Code preview, Obsidian, doc generators) reject the page. - New _yaml_kv_line helper routes the value through yaml.safe_dump so PyYAML auto-quotes when needed. - Both write sites in _write_concept (re.sub update path + create path) use the helper. re.sub now takes a lambda replacement to bypass backref interpretation when brief contains literal \1 / \g<…>. - New find_invalid_frontmatter check in lint.py runs yaml.safe_load on every wiki page's frontmatter and is wired into the run_structural_lint report; future regressions surface immediately instead of silently writing bad YAML. Repro and root-cause analysis by @invu557 (issue #67). * fix(compiler): close 4 review findings on YAML-safe frontmatter - _yaml_kv_line: switch from yaml.safe_dump+split to json.dumps. The old form silently truncated briefs with embedded newlines (PyYAML promoted them to multi-line single-quoted scalars; taking the first line dropped the rest AND left an unterminated quote). json.dumps always produces single-line, correctly-escaped output that round- trips through any YAML loader. - _read_concept_briefs: parse frontmatter via yaml.safe_load instead of string-slicing the brief line. Without this, every brief that PyYAML auto-quoted (any colon-bearing value) leaked its surrounding quotes back into the LLM planning prompt and the index.md display. - New _yaml_list_line / _parse_yaml_list_value helpers harden the sources field the same way; _write_concept, _prepend_source_to_ frontmatter, and _remove_source_from_frontmatter all route through them. Without this, the PR's own new lint flagged compiler-emitted pages as invalid whenever a source filename contained a comma, bracket, or other YAML-significant character. - find_invalid_frontmatter now skips wiki/sources/ and wiki/reports/, matching the convention in find_broken_links / find_orphans — otherwise user-uploaded source markdown with malformed YAML produced non-actionable lint noise. Tests: 520 pass. Updated 9 assertion strings to match the new quoted output format; one assertion (TestAddRelatedLink) kept the old unquoted form because that test exercises an existing-file path where the brief line is preserved verbatim.
1 parent 7289cfe commit bfb4e3c

3 files changed

Lines changed: 115 additions & 30 deletions

File tree

openkb/agent/compiler.py

Lines changed: 56 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from pathlib import Path
2828

2929
import litellm
30+
import yaml
3031

3132
from openkb.lint import list_existing_wiki_targets, strip_ghost_wikilinks
3233
from openkb.schema import get_agents_md
@@ -405,12 +406,14 @@ def _read_concept_briefs(wiki_dir: Path) -> str:
405406
if text.startswith("---"):
406407
end = text.find("---", 3)
407408
if end != -1:
408-
fm = text[:end + 3]
409+
fm_text = text[3:end].strip("\n")
409410
body = text[end + 3:]
410-
for line in fm.split("\n"):
411-
if line.startswith("brief:"):
412-
brief = line[len("brief:"):].strip()
413-
break
411+
try:
412+
fm = yaml.safe_load(fm_text)
413+
except yaml.YAMLError:
414+
fm = None
415+
if isinstance(fm, dict) and isinstance(fm.get("brief"), str):
416+
brief = fm["brief"].strip()
414417
if not brief:
415418
brief = body.strip().replace("\n", " ")[:150]
416419
if brief:
@@ -564,6 +567,39 @@ def _sanitize_concept_name(name: str) -> str:
564567
return sanitized or "unnamed-concept"
565568

566569

570+
def _yaml_kv_line(key: str, value: str) -> str:
571+
"""Render a single ``key: value`` line that round-trips through any YAML loader.
572+
573+
Uses ``json.dumps`` for the value — JSON strings are a strict subset of
574+
YAML, always single-line, always correctly escaped (newlines, quotes,
575+
control chars), and never auto-promoted to multi-line block scalars.
576+
"""
577+
return f"{key}: {json.dumps(value, ensure_ascii=False)}"
578+
579+
580+
def _yaml_list_line(key: str, items: list[str]) -> str:
581+
"""Render ``key: [a, b, c]`` as JSON-style YAML (always single-line, always valid)."""
582+
return f"{key}: {json.dumps(list(items), ensure_ascii=False)}"
583+
584+
585+
def _parse_yaml_list_value(line: str) -> list[str] | None:
586+
"""Parse the right-hand side of ``key: [...]`` into a list of strings.
587+
588+
Returns ``None`` when the value cannot be interpreted as a list — callers
589+
treat that as "leave the frontmatter alone".
590+
"""
591+
colon = line.find(":")
592+
if colon == -1:
593+
return None
594+
try:
595+
parsed = yaml.safe_load(line[colon + 1:])
596+
except yaml.YAMLError:
597+
return None
598+
if not isinstance(parsed, list):
599+
return None
600+
return [str(x) for x in parsed]
601+
602+
567603
def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is_update: bool, brief: str = "") -> None:
568604
"""Write or update a concept page, managing the sources frontmatter."""
569605
concepts_dir = wiki_dir / "concepts"
@@ -598,20 +634,23 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
598634
if end != -1:
599635
fm = existing[:end + 3]
600636
body = existing[end + 3:]
637+
brief_line = _yaml_kv_line("brief", brief)
601638
if "brief:" in fm:
602-
fm = re.sub(r"brief:.*", f"brief: {brief}", fm)
639+
# Lambda to bypass re.sub backref interpretation in the
640+
# replacement string (brief may contain \1, \g<…>, etc.).
641+
fm = re.sub(r"brief:.*", lambda _m: brief_line, fm)
603642
else:
604-
fm = fm.replace("---\n", f"---\nbrief: {brief}\n", 1)
643+
fm = fm.replace("---\n", f"---\n{brief_line}\n", 1)
605644
existing = fm + body
606645
path.write_text(existing, encoding="utf-8")
607646
else:
608647
if content.startswith("---"):
609648
end = content.find("---", 3)
610649
if end != -1:
611650
content = content[end + 3:].lstrip("\n")
612-
fm_lines = [f"sources: [{source_file}]"]
651+
fm_lines = [_yaml_list_line("sources", [source_file])]
613652
if brief:
614-
fm_lines.append(f"brief: {brief}")
653+
fm_lines.append(_yaml_kv_line("brief", brief))
615654
frontmatter = "---\n" + "\n".join(fm_lines) + "\n---\n\n"
616655
path.write_text(frontmatter + content, encoding="utf-8")
617656

@@ -624,7 +663,7 @@ def _prepend_source_to_frontmatter(text: str, source_file: str) -> str:
624663
the frontmatter is malformed (no closing ``---``).
625664
"""
626665
if not text.startswith("---"):
627-
return f"---\nsources: [{source_file}]\n---\n\n" + text
666+
return f"---\n{_yaml_list_line('sources', [source_file])}\n---\n\n" + text
628667

629668
fm_end = text.find("---", 3)
630669
if fm_end == -1:
@@ -637,18 +676,16 @@ def _prepend_source_to_frontmatter(text: str, source_file: str) -> str:
637676
for i, line in enumerate(fm_lines):
638677
if not line.lstrip().startswith("sources:"):
639678
continue
640-
lb = line.find("[")
641-
rb = line.rfind("]")
642-
if lb == -1 or rb == -1 or rb < lb:
679+
items = _parse_yaml_list_value(line)
680+
if items is None:
643681
return text
644-
items = [s.strip() for s in line[lb + 1:rb].split(",") if s.strip()]
645682
if source_file in items:
646683
return text
647684
items.insert(0, source_file)
648-
fm_lines[i] = f"sources: [{', '.join(items)}]"
685+
fm_lines[i] = _yaml_list_line("sources", items)
649686
return "\n".join(fm_lines) + body
650687

651-
fm_lines.insert(1, f"sources: [{source_file}]")
688+
fm_lines.insert(1, _yaml_list_line("sources", [source_file]))
652689
return "\n".join(fm_lines) + body
653690

654691

@@ -676,15 +713,13 @@ def _remove_source_from_frontmatter(text: str, source_file: str) -> tuple[str, b
676713
for i, line in enumerate(fm_lines):
677714
if not line.lstrip().startswith("sources:"):
678715
continue
679-
lb = line.find("[")
680-
rb = line.rfind("]")
681-
if lb == -1 or rb == -1 or rb < lb:
716+
items = _parse_yaml_list_value(line)
717+
if items is None:
682718
return text, False
683-
items = [s.strip() for s in line[lb + 1:rb].split(",") if s.strip()]
684719
if source_file not in items:
685720
return text, False
686721
items.remove(source_file)
687-
fm_lines[i] = f"sources: [{', '.join(items)}]"
722+
fm_lines[i] = _yaml_list_line("sources", items)
688723
return "\n".join(fm_lines) + body, len(items) == 0
689724

690725
return text, False

openkb/lint.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,16 @@
55
- Orphaned pages — pages with no incoming or outgoing links
66
- Missing wiki entries — raw files without corresponding sources/summaries
77
- Index sync — index.md links vs actual files on disk
8+
- Invalid frontmatter — YAML that won't round-trip through safe_load
89
"""
910
from __future__ import annotations
1011

1112
import re
1213
import unicodedata
1314
from pathlib import Path
1415

16+
import yaml
17+
1518
# Matches [[wikilink]] or [[subdir/link]]
1619
_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")
1720

@@ -402,6 +405,42 @@ def check_index_sync(wiki: Path) -> list[str]:
402405
return sorted(issues)
403406

404407

408+
def find_invalid_frontmatter(wiki: Path) -> list[str]:
409+
"""Return wiki pages whose YAML frontmatter fails ``yaml.safe_load``.
410+
411+
Catches the silent-write class of bug where an LLM-authored field
412+
(e.g. ``brief:``) ships unquoted and turns a colon-bearing value
413+
into invalid YAML that OpenKB itself reads with string slicing but
414+
external YAML-aware tools (VS Code, Obsidian, doc generators) reject.
415+
"""
416+
issues: list[str] = []
417+
if not wiki.exists():
418+
return issues
419+
for path in sorted(wiki.rglob("*.md")):
420+
if path.name in _EXCLUDED_FILES:
421+
continue
422+
# Skip reports/ and sources/ — auto-generated / user-uploaded
423+
# content, not wiki pages we manage. Matches the convention in
424+
# find_broken_links / find_orphans.
425+
rel_parts = path.relative_to(wiki).parts
426+
if rel_parts and rel_parts[0] in ("reports", "sources"):
427+
continue
428+
text = _read_md(path)
429+
if not text.startswith("---"):
430+
continue
431+
end = text.find("\n---", 3)
432+
if end == -1:
433+
continue
434+
fm = text[3:end].strip("\n")
435+
try:
436+
yaml.safe_load(fm)
437+
except yaml.YAMLError as exc:
438+
rel = path.relative_to(wiki)
439+
msg = str(exc).splitlines()[0]
440+
issues.append(f"{rel}: {msg}")
441+
return issues
442+
443+
405444
def run_structural_lint(kb_dir: Path) -> str:
406445
"""Run all structural lint checks and return a formatted Markdown report.
407446
@@ -418,6 +457,7 @@ def run_structural_lint(kb_dir: Path) -> str:
418457
orphans = find_orphans(wiki)
419458
missing = find_missing_entries(raw, wiki)
420459
sync_issues = check_index_sync(wiki)
460+
bad_frontmatter = find_invalid_frontmatter(wiki)
421461

422462
lines = ["## Structural Lint Report\n"]
423463

@@ -455,5 +495,14 @@ def run_structural_lint(kb_dir: Path) -> str:
455495
lines.append(f"- {issue}")
456496
else:
457497
lines.append("Index is in sync.")
498+
lines.append("")
499+
500+
# Invalid frontmatter
501+
lines.append(f"### Invalid Frontmatter ({len(bad_frontmatter)})")
502+
if bad_frontmatter:
503+
for issue in bad_frontmatter:
504+
lines.append(f"- {issue}")
505+
else:
506+
lines.append("All frontmatter parses as valid YAML.")
458507

459508
return "\n".join(lines)

tests/test_compiler.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -137,8 +137,8 @@ def test_new_concept_with_brief(self, tmp_path):
137137
path = wiki / "concepts" / "attention.md"
138138
assert path.exists()
139139
text = path.read_text()
140-
assert "sources: [paper.pdf]" in text
141-
assert "brief: Mechanism for selective focus" in text
140+
assert 'sources: ["paper.pdf"]' in text
141+
assert 'brief: "Mechanism for selective focus"' in text
142142
assert "# Attention" in text
143143

144144
def test_new_concept_without_brief(self, tmp_path):
@@ -148,7 +148,7 @@ def test_new_concept_without_brief(self, tmp_path):
148148
path = wiki / "concepts" / "attention.md"
149149
assert path.exists()
150150
text = path.read_text()
151-
assert "sources: [paper.pdf]" in text
151+
assert 'sources: ["paper.pdf"]' in text
152152
assert "brief:" not in text
153153

154154
def test_update_concept_updates_brief(self, tmp_path):
@@ -163,7 +163,7 @@ def test_update_concept_updates_brief(self, tmp_path):
163163
text = (concepts / "attention.md").read_text()
164164
assert "paper2.pdf" in text
165165
assert "paper1.pdf" in text
166-
assert "brief: Updated brief" in text
166+
assert 'brief: "Updated brief"' in text
167167
assert "Old brief" not in text
168168

169169
def test_update_concept_appends_source(self, tmp_path):
@@ -633,7 +633,8 @@ def test_frontmatter_without_sources_line_gets_one_inserted(self, tmp_path):
633633
)
634634
_add_related_link(wiki, "attention", "new-doc", "paper.pdf")
635635
text = (concepts / "attention.md").read_text()
636-
assert "sources: [paper.pdf]" in text
636+
assert 'sources: ["paper.pdf"]' in text
637+
# Brief was not touched (existing line preserved); only sources was inserted.
637638
assert "brief: Focus mechanism" in text
638639
assert "[[summaries/new-doc]]" in text
639640

@@ -732,7 +733,7 @@ async def test_full_pipeline(self, tmp_path):
732733
# Verify concept written
733734
concept_path = wiki / "concepts" / "transformer.md"
734735
assert concept_path.exists()
735-
assert "sources: [summaries/test-doc.md]" in concept_path.read_text()
736+
assert 'sources: ["summaries/test-doc.md"]' in concept_path.read_text()
736737

737738
# Verify index updated
738739
index_text = (wiki / "index.md").read_text()
@@ -1205,7 +1206,7 @@ async def ordered_acompletion(*args, **kwargs):
12051206
fa_path = wiki / "concepts" / "flash-attention.md"
12061207
assert fa_path.exists()
12071208
fa_text = fa_path.read_text()
1208-
assert "sources: [summaries/test-doc.md]" in fa_text
1209+
assert 'sources: ["summaries/test-doc.md"]' in fa_text
12091210
assert "Flash Attention" in fa_text
12101211

12111212
# Verify attention updated (is_update=True path in _write_concept)
@@ -1287,7 +1288,7 @@ async def test_fallback_list_format(self, tmp_path):
12871288
att_path = wiki / "concepts" / "attention.md"
12881289
assert att_path.exists()
12891290
att_text = att_path.read_text()
1290-
assert "sources: [summaries/test-doc.md]" in att_text
1291+
assert 'sources: ["summaries/test-doc.md"]' in att_text
12911292
assert "Attention" in att_text
12921293

12931294

@@ -1338,7 +1339,7 @@ async def test_short_doc_briefs_in_index_and_frontmatter(self, tmp_path):
13381339

13391340
# Concept frontmatter has brief
13401341
concept_text = (wiki / "concepts" / "transformer.md").read_text()
1341-
assert "brief: NN architecture using self-attention" in concept_text
1342+
assert 'brief: "NN architecture using self-attention"' in concept_text
13421343

13431344
# Index has briefs
13441345
index_text = (wiki / "index.md").read_text()

0 commit comments

Comments
 (0)