fix(skill): round-2 review fixes + drop community docs

KylinMountain · KylinMountain · commit bc916bc2cb25 · 2026-05-18T16:39:45.000+08:00
Code review round-2 flagged the eval pipeline reintroducing the
MaxTurnsExceeded/JSONDecodeError traceback leak that round-1 caught
for skill new. Apply the same shim inside skill_evaluator, plus the
other carryover items listed below:

- Translate MaxTurnsExceeded and json.JSONDecodeError to RuntimeError
  inside generate_eval_set and grade_one. CLI catch (RuntimeError) now
  covers both.
- Wrap _setup_llm_key in skill_eval with the same try/except/exit
  pattern as skill_new / query / chat.
- Move openkb/skill_evaluator.py -> openkb/agent/skill_evaluator.py.
  Modules that construct Agent live under openkb/agent/ per repo
  convention; top-level openkb/ keeps marketplace + generator (no
  agents SDK).
- Validator: reject '<' / '>' in description (Anthropic parser
  requirement); warn on unknown frontmatter keys (Anthropic spec
  allows a fixed set).
- Drop redundant in-function 'import asyncio' from skill_eval (already
  at module top).
- Drop unused EvalMiss import from tests.
- Validator module docstring updated to enumerate all checks.

Also delete community contribution scaffolding (CONTRIBUTING.md +
.github/PULL_REQUEST_TEMPLATE/skill_submission.md) - premature for the
project's current stage; will revisit when real contributors arrive.
diff --git a/.github/PULL_REQUEST_TEMPLATE/skill_submission.md b/.github/PULL_REQUEST_TEMPLATE/skill_submission.md
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
diff --git a/README.md b/README.md
@@ -274,8 +274,6 @@ openkb skill history karpathy-thinking
 openkb skill rollback karpathy-thinking --to 2
 ```
 
-See [CONTRIBUTING.md](CONTRIBUTING.md) for how to submit your compiled skill back to the community registry at [VectifyAI/OpenKB](https://github.com/VectifyAI/OpenKB).
-
 ### Configuration
 
 Settings are initialized by `openkb init`, and stored in `.openkb/config.yaml`:
diff --git a/openkb/agent/skill_evaluator.py b/openkb/agent/skill_evaluator.py
@@ -117,7 +117,14 @@ async def generate_eval_set(
         model=f"litellm/{model}",
         model_settings=ModelSettings(parallel_tool_calls=False),
     )
-    result = await Runner.run(agent, "Generate the eval set now.", max_turns=3)
+    from agents.exceptions import MaxTurnsExceeded
+    try:
+        result = await Runner.run(agent, "Generate the eval set now.", max_turns=3)
+    except MaxTurnsExceeded as exc:
+        raise RuntimeError(
+            "Eval set generation hit the max-turn cap. The model may be "
+            "looping; try a different model or a smaller --count."
+        ) from exc
     raw = (result.final_output or "").strip()
 
     # Strip optional code fence
@@ -126,7 +133,14 @@ async def generate_eval_set(
         if raw.startswith("json"):
             raw = raw[4:].lstrip()
 
-    data = json.loads(raw)
+    try:
+        data = json.loads(raw)
+    except json.JSONDecodeError as exc:
+        raise RuntimeError(
+            f"Eval set generator returned non-JSON output: {exc.msg}. "
+            f"Try a more capable model — small models often ignore "
+            f"'output only JSON' instructions. First 200 chars: {raw[:200]!r}"
+        ) from exc
     prompts: list[EvalPrompt] = []
     for q in data.get("should_trigger", []):
         prompts.append(EvalPrompt(question=q, expected="trigger"))
@@ -157,7 +171,14 @@ async def grade_one(
         model=f"litellm/{model}",
         model_settings=ModelSettings(parallel_tool_calls=False),
     )
-    result = await Runner.run(agent, f"Question: {question}", max_turns=2)
+    from agents.exceptions import MaxTurnsExceeded
+    try:
+        result = await Runner.run(agent, f"Question: {question}", max_turns=2)
+    except MaxTurnsExceeded as exc:
+        raise RuntimeError(
+            f"Trigger grader hit the max-turn cap on question: {question!r}. "
+            f"Try a more capable model."
+        ) from exc
     raw = (result.final_output or "").strip().upper()
     if "NO-TRIGGER" in raw or "NO TRIGGER" in raw:
         return "no-trigger"
diff --git a/openkb/cli.py b/openkb/cli.py
@@ -1744,8 +1744,7 @@ def skill_eval(ctx, name, save_flag, eval_set_path, count):
     the description should activate the skill for each prompt. Prints pass
     rate + miss list.
     """
-    import asyncio
-    from openkb.skill_evaluator import (
+    from openkb.agent.skill_evaluator import (
         run_eval, save_eval_set, load_eval_set, EvalPrompt,
     )
 
@@ -1759,7 +1758,11 @@ def skill_eval(ctx, name, save_flag, eval_set_path, count):
         click.echo(f"[ERROR] Skill '{name}' not found.", err=True)
         ctx.exit(1)
 
-    _setup_llm_key(kb_dir)
+    try:
+        _setup_llm_key(kb_dir)
+    except RuntimeError as exc:
+        click.echo(f"[ERROR] {exc}", err=True)
+        ctx.exit(1)
     config = load_config(kb_dir / ".openkb" / "config.yaml")
     model = config.get("model", DEFAULT_CONFIG["model"])
 
diff --git a/openkb/skill_validator.py b/openkb/skill_validator.py
@@ -4,12 +4,15 @@
 make a skill un-loadable or misleading to the agents that install it:
 
   * SKILL.md missing or unparseable
-  * frontmatter missing required fields
-  * name field doesn't match directory or violates the slug rule
-  * description too long (> 1024 chars per Anthropic spec)
-  * files too big (SKILL.md > 50 KB / references/*.md > 100 KB)
-  * `[[references/...]]` wikilinks pointing at files that don't exist
-  * (strict mode) scripts/*.py importing non-stdlib modules
+  * frontmatter present, parses as YAML, is a mapping
+  * required fields: name (matches dir + slug regex), description
+  * description length within bounds (warns < 20 chars, errors > 1024)
+  * description must not contain '<' or '>' (breaks activation parser)
+  * frontmatter keys limited to the Anthropic Skills allowed set
+    (warns on unknown keys; matches Anthropic's quick_validate.py)
+  * files within size limits (SKILL.md ≤ 50 KB / references/*.md ≤ 100 KB)
+  * `[[references/...]]` wikilinks resolve to actual files
+  * (strict mode) scripts/*.py imports only stdlib modules
 
 This is the deterministic counterpart to ``openkb skill eval`` — eval
 measures whether the description fires; validate ensures the structure
@@ -32,6 +35,9 @@
 REFERENCE_MAX_BYTES = 100 * 1024
 NAME_MAX_LEN = 64
 WIKILINK_RE = re.compile(r"\[\[references/([a-z0-9._/-]+)\]\]", re.IGNORECASE)
+ALLOWED_FRONTMATTER_KEYS = {
+    "name", "description", "license", "allowed-tools", "metadata", "compatibility",
+}
 
 
 @dataclass
@@ -98,6 +104,15 @@ def validate_skill(skill_dir: Path, *, strict: bool = False) -> ValidationResult
         result.errors.append("Frontmatter must be a YAML mapping.")
         return result
 
+    extras = set(meta.keys()) - ALLOWED_FRONTMATTER_KEYS
+    if extras:
+        # Treat as warning, not error — keeps strict mode user-controllable
+        result.warnings.append(
+            f"Frontmatter contains unknown keys: {sorted(extras)}. "
+            f"Anthropic Skills spec only allows: "
+            f"{sorted(ALLOWED_FRONTMATTER_KEYS)}."
+        )
+
     # name field
     name = meta.get("name")
     if not name:
@@ -133,6 +148,11 @@ def validate_skill(skill_dir: Path, *, strict: bool = False) -> ValidationResult
                 f"Frontmatter 'description:' is only {len(desc)} chars — "
                 f"too short to be a useful activation signal."
             )
+        if "<" in desc or ">" in desc:
+            result.errors.append(
+                "Frontmatter 'description:' must not contain '<' or '>' "
+                "characters — they break the activation parser in Claude Code."
+            )
 
     # references/ wikilink resolution
     wikilinks = WIKILINK_RE.findall(text)
diff --git a/tests/test_skill_cli.py b/tests/test_skill_cli.py
@@ -343,7 +343,7 @@ async def perfect_grader(description, question, *, model):
     runner = CliRunner()
     with patch("openkb.cli._find_kb_dir", return_value=kb), \
          patch("openkb.cli._setup_llm_key", return_value=None), \
-         patch("openkb.skill_evaluator.grade_one", side_effect=perfect_grader):
+         patch("openkb.agent.skill_evaluator.grade_one", side_effect=perfect_grader):
         result = runner.invoke(cli, [
             "skill", "eval", "demo", "--eval-set", str(eval_path),
         ])
@@ -374,7 +374,7 @@ async def biased_grader(description, question, *, model):
     runner = CliRunner()
     with patch("openkb.cli._find_kb_dir", return_value=kb), \
          patch("openkb.cli._setup_llm_key", return_value=None), \
-         patch("openkb.skill_evaluator.grade_one", side_effect=biased_grader):
+         patch("openkb.agent.skill_evaluator.grade_one", side_effect=biased_grader):
         result = runner.invoke(cli, [
             "skill", "eval", "demo", "--eval-set", str(eval_path),
         ])
diff --git a/tests/test_skill_evaluator.py b/tests/test_skill_evaluator.py
@@ -1,4 +1,4 @@
-"""Tests for openkb.skill_evaluator.
+"""Tests for openkb.agent.skill_evaluator.
 
 The Runner.run call is mocked everywhere — no real LLM tokens spent.
 What we DO verify:
@@ -17,8 +17,7 @@
 
 import pytest
 
-from openkb.skill_evaluator import (
-    EvalMiss,
+from openkb.agent.skill_evaluator import (
     EvalPrompt,
     EvalResult,
     _read_description,
@@ -80,7 +79,7 @@ async def test_generate_eval_set_parses_plain_json(tmp_path):
     async def fake_runner(*args, **kwargs):
         return SimpleNamespace(final_output=_fake_generator_payload(10))
 
-    with patch("openkb.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
+    with patch("openkb.agent.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
         prompts = await generate_eval_set(skill_dir, model="gpt-4o-mini", count=10)
 
     assert len(prompts) == 20
@@ -98,7 +97,7 @@ async def test_generate_eval_set_strips_code_fences(tmp_path):
     async def fake_runner(*args, **kwargs):
         return SimpleNamespace(final_output=fenced)
 
-    with patch("openkb.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
+    with patch("openkb.agent.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
         prompts = await generate_eval_set(skill_dir, model="gpt-4o-mini", count=3)
 
     assert len(prompts) == 6
@@ -112,7 +111,7 @@ async def test_grade_one_returns_trigger_for_trigger_response():
     async def fake_runner(*args, **kwargs):
         return SimpleNamespace(final_output="TRIGGER")
 
-    with patch("openkb.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
+    with patch("openkb.agent.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
         out = await grade_one("desc", "question?", model="gpt-4o-mini")
     assert out == "trigger"
 
@@ -122,7 +121,7 @@ async def test_grade_one_returns_no_trigger_for_negative_response():
     async def fake_runner(*args, **kwargs):
         return SimpleNamespace(final_output="NO-TRIGGER")
 
-    with patch("openkb.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
+    with patch("openkb.agent.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
         out = await grade_one("desc", "question?", model="gpt-4o-mini")
     assert out == "no-trigger"
 
@@ -132,7 +131,7 @@ async def test_grade_one_handles_mixed_case():
     async def fake_runner(*args, **kwargs):
         return SimpleNamespace(final_output="trigger")
 
-    with patch("openkb.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
+    with patch("openkb.agent.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
         out = await grade_one("desc", "question?", model="gpt-4o-mini")
     assert out == "trigger"
 
@@ -142,7 +141,7 @@ async def test_grade_one_handles_space_variant():
     async def fake_runner(*args, **kwargs):
         return SimpleNamespace(final_output="No Trigger")
 
-    with patch("openkb.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
+    with patch("openkb.agent.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
         out = await grade_one("desc", "question?", model="gpt-4o-mini")
     assert out == "no-trigger"
 
@@ -152,7 +151,7 @@ async def test_grade_one_defaults_to_no_trigger_on_ambiguous_output():
     async def fake_runner(*args, **kwargs):
         return SimpleNamespace(final_output="hmm not sure")
 
-    with patch("openkb.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
+    with patch("openkb.agent.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
         out = await grade_one("desc", "question?", model="gpt-4o-mini")
     assert out == "no-trigger"
 
@@ -179,7 +178,7 @@ async def fake_grade(description, question, *, model):
         match = next(p for p in eval_set if p.question == question)
         return match.expected
 
-    with patch("openkb.skill_evaluator.grade_one", side_effect=fake_grade):
+    with patch("openkb.agent.skill_evaluator.grade_one", side_effect=fake_grade):
         result = await run_eval(skill_dir, model="gpt-4o-mini", eval_set=eval_set)
 
     assert isinstance(result, EvalResult)
@@ -198,7 +197,7 @@ async def test_run_eval_reports_misses(tmp_path):
     async def fake_grade(description, question, *, model):
         return "trigger"
 
-    with patch("openkb.skill_evaluator.grade_one", side_effect=fake_grade):
+    with patch("openkb.agent.skill_evaluator.grade_one", side_effect=fake_grade):
         result = await run_eval(skill_dir, model="gpt-4o-mini", eval_set=eval_set)
 
     assert result.total == 6
@@ -230,3 +229,36 @@ def test_save_and_load_eval_set_round_trip(tmp_path):
     assert len(loaded) == 4
     assert [p.question for p in loaded if p.expected == "trigger"] == ["trig 0", "trig 1"]
     assert [p.question for p in loaded if p.expected == "no-trigger"] == ["no 0", "no 1"]
+
+
+# -------- RuntimeError translation for CLI catch -------------------------------
+
+
+@pytest.mark.asyncio
+async def test_generate_eval_set_translates_max_turns_to_runtime_error(tmp_path):
+    """MaxTurnsExceeded from Runner.run should become RuntimeError."""
+    from agents.exceptions import MaxTurnsExceeded
+
+    skill_dir = _make_skill(tmp_path)
+
+    async def fake_runner(*args, **kwargs):
+        raise MaxTurnsExceeded("ran out")
+
+    with patch("openkb.agent.skill_evaluator.Runner.run",
+               new=AsyncMock(side_effect=fake_runner)):
+        with pytest.raises(RuntimeError, match="max-turn cap"):
+            await generate_eval_set(skill_dir, model="gpt-4o-mini")
+
+
+@pytest.mark.asyncio
+async def test_generate_eval_set_translates_malformed_json_to_runtime_error(tmp_path):
+    """Non-JSON LLM output should produce a friendly RuntimeError."""
+    skill_dir = _make_skill(tmp_path)
+
+    async def fake_runner(*args, **kwargs):
+        return SimpleNamespace(final_output="this is not json at all")
+
+    with patch("openkb.agent.skill_evaluator.Runner.run",
+               new=AsyncMock(side_effect=fake_runner)):
+        with pytest.raises(RuntimeError, match="non-JSON output"):
+            await generate_eval_set(skill_dir, model="gpt-4o-mini")
diff --git a/tests/test_skill_validator.py b/tests/test_skill_validator.py