Skip to content

Commit bc916bc

Browse files
committed
fix(skill): round-2 review fixes + drop community docs
Code review round-2 flagged the eval pipeline reintroducing the MaxTurnsExceeded/JSONDecodeError traceback leak that round-1 caught for skill new. Apply the same shim inside skill_evaluator + 4 other carryover items: - Translate MaxTurnsExceeded and json.JSONDecodeError to RuntimeError inside generate_eval_set and grade_one. CLI catch (RuntimeError) now covers both. - Wrap _setup_llm_key in skill_eval with the same try/except/exit pattern as skill_new / query / chat. - Move openkb/skill_evaluator.py -> openkb/agent/skill_evaluator.py. Modules that construct Agent live under openkb/agent/ per repo convention; top-level openkb/ keeps marketplace + generator (no agents SDK). - Validator: reject '<' / '>' in description (Anthropic parser requirement); warn on unknown frontmatter keys (Anthropic spec allows a fixed set). - Drop redundant in-function 'import asyncio' from skill_eval (already at module top). - Drop unused EvalMiss import from tests. - Validator module docstring updated to enumerate all checks. Also delete community contribution scaffolding (CONTRIBUTING.md + .github/PULL_REQUEST_TEMPLATE/skill_submission.md) - premature for the project's current stage; will revisit when real contributors arrive.
1 parent a5fe567 commit bc916bc

9 files changed

Lines changed: 134 additions & 121 deletions

File tree

.github/PULL_REQUEST_TEMPLATE/skill_submission.md

Lines changed: 0 additions & 22 deletions
This file was deleted.

CONTRIBUTING.md

Lines changed: 0 additions & 71 deletions
This file was deleted.

README.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -274,8 +274,6 @@ openkb skill history karpathy-thinking
274274
openkb skill rollback karpathy-thinking --to 2
275275
```
276276

277-
See [CONTRIBUTING.md](CONTRIBUTING.md) for how to submit your compiled skill back to the community registry at [VectifyAI/OpenKB](https://github.com/VectifyAI/OpenKB).
278-
279277
### Configuration
280278

281279
Settings are initialized by `openkb init`, and stored in `.openkb/config.yaml`:
Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,14 @@ async def generate_eval_set(
117117
model=f"litellm/{model}",
118118
model_settings=ModelSettings(parallel_tool_calls=False),
119119
)
120-
result = await Runner.run(agent, "Generate the eval set now.", max_turns=3)
120+
from agents.exceptions import MaxTurnsExceeded
121+
try:
122+
result = await Runner.run(agent, "Generate the eval set now.", max_turns=3)
123+
except MaxTurnsExceeded as exc:
124+
raise RuntimeError(
125+
"Eval set generation hit the max-turn cap. The model may be "
126+
"looping; try a different model or a smaller --count."
127+
) from exc
121128
raw = (result.final_output or "").strip()
122129

123130
# Strip optional code fence
@@ -126,7 +133,14 @@ async def generate_eval_set(
126133
if raw.startswith("json"):
127134
raw = raw[4:].lstrip()
128135

129-
data = json.loads(raw)
136+
try:
137+
data = json.loads(raw)
138+
except json.JSONDecodeError as exc:
139+
raise RuntimeError(
140+
f"Eval set generator returned non-JSON output: {exc.msg}. "
141+
f"Try a more capable model — small models often ignore "
142+
f"'output only JSON' instructions. First 200 chars: {raw[:200]!r}"
143+
) from exc
130144
prompts: list[EvalPrompt] = []
131145
for q in data.get("should_trigger", []):
132146
prompts.append(EvalPrompt(question=q, expected="trigger"))
@@ -157,7 +171,14 @@ async def grade_one(
157171
model=f"litellm/{model}",
158172
model_settings=ModelSettings(parallel_tool_calls=False),
159173
)
160-
result = await Runner.run(agent, f"Question: {question}", max_turns=2)
174+
from agents.exceptions import MaxTurnsExceeded
175+
try:
176+
result = await Runner.run(agent, f"Question: {question}", max_turns=2)
177+
except MaxTurnsExceeded as exc:
178+
raise RuntimeError(
179+
f"Trigger grader hit the max-turn cap on question: {question!r}. "
180+
f"Try a more capable model."
181+
) from exc
161182
raw = (result.final_output or "").strip().upper()
162183
if "NO-TRIGGER" in raw or "NO TRIGGER" in raw:
163184
return "no-trigger"

openkb/cli.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1744,8 +1744,7 @@ def skill_eval(ctx, name, save_flag, eval_set_path, count):
17441744
the description should activate the skill for each prompt. Prints pass
17451745
rate + miss list.
17461746
"""
1747-
import asyncio
1748-
from openkb.skill_evaluator import (
1747+
from openkb.agent.skill_evaluator import (
17491748
run_eval, save_eval_set, load_eval_set, EvalPrompt,
17501749
)
17511750

@@ -1759,7 +1758,11 @@ def skill_eval(ctx, name, save_flag, eval_set_path, count):
17591758
click.echo(f"[ERROR] Skill '{name}' not found.", err=True)
17601759
ctx.exit(1)
17611760

1762-
_setup_llm_key(kb_dir)
1761+
try:
1762+
_setup_llm_key(kb_dir)
1763+
except RuntimeError as exc:
1764+
click.echo(f"[ERROR] {exc}", err=True)
1765+
ctx.exit(1)
17631766
config = load_config(kb_dir / ".openkb" / "config.yaml")
17641767
model = config.get("model", DEFAULT_CONFIG["model"])
17651768

openkb/skill_validator.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,15 @@
44
make a skill un-loadable or misleading to the agents that install it:
55
66
* SKILL.md missing or unparseable
7-
* frontmatter missing required fields
8-
* name field doesn't match directory or violates the slug rule
9-
* description too long (> 1024 chars per Anthropic spec)
10-
* files too big (SKILL.md > 50 KB / references/*.md > 100 KB)
11-
* `[[references/...]]` wikilinks pointing at files that don't exist
12-
* (strict mode) scripts/*.py importing non-stdlib modules
7+
* frontmatter present, parses as YAML, is a mapping
8+
* required fields: name (matches dir + slug regex), description
9+
* description length within bounds (warns < 20 chars, errors > 1024)
10+
* description must not contain '<' or '>' (breaks activation parser)
11+
* frontmatter keys limited to the Anthropic Skills allowed set
12+
(warns on unknown keys; matches Anthropic's quick_validate.py)
13+
* files within size limits (SKILL.md ≤ 50 KB / references/*.md ≤ 100 KB)
14+
* `[[references/...]]` wikilinks resolve to actual files
15+
* (strict mode) scripts/*.py imports only stdlib modules
1316
1417
This is the deterministic counterpart to ``openkb skill eval`` — eval
1518
measures whether the description fires; validate ensures the structure
@@ -32,6 +35,9 @@
3235
REFERENCE_MAX_BYTES = 100 * 1024
3336
NAME_MAX_LEN = 64
3437
WIKILINK_RE = re.compile(r"\[\[references/([a-z0-9._/-]+)\]\]", re.IGNORECASE)
38+
ALLOWED_FRONTMATTER_KEYS = {
39+
"name", "description", "license", "allowed-tools", "metadata", "compatibility",
40+
}
3541

3642

3743
@dataclass
@@ -98,6 +104,15 @@ def validate_skill(skill_dir: Path, *, strict: bool = False) -> ValidationResult
98104
result.errors.append("Frontmatter must be a YAML mapping.")
99105
return result
100106

107+
extras = set(meta.keys()) - ALLOWED_FRONTMATTER_KEYS
108+
if extras:
109+
# Treat as warning, not error — keeps strict mode user-controllable
110+
result.warnings.append(
111+
f"Frontmatter contains unknown keys: {sorted(extras)}. "
112+
f"Anthropic Skills spec only allows: "
113+
f"{sorted(ALLOWED_FRONTMATTER_KEYS)}."
114+
)
115+
101116
# name field
102117
name = meta.get("name")
103118
if not name:
@@ -133,6 +148,11 @@ def validate_skill(skill_dir: Path, *, strict: bool = False) -> ValidationResult
133148
f"Frontmatter 'description:' is only {len(desc)} chars — "
134149
f"too short to be a useful activation signal."
135150
)
151+
if "<" in desc or ">" in desc:
152+
result.errors.append(
153+
"Frontmatter 'description:' must not contain '<' or '>' "
154+
"characters — they break the activation parser in Claude Code."
155+
)
136156

137157
# references/ wikilink resolution
138158
wikilinks = WIKILINK_RE.findall(text)

tests/test_skill_cli.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,7 @@ async def perfect_grader(description, question, *, model):
343343
runner = CliRunner()
344344
with patch("openkb.cli._find_kb_dir", return_value=kb), \
345345
patch("openkb.cli._setup_llm_key", return_value=None), \
346-
patch("openkb.skill_evaluator.grade_one", side_effect=perfect_grader):
346+
patch("openkb.agent.skill_evaluator.grade_one", side_effect=perfect_grader):
347347
result = runner.invoke(cli, [
348348
"skill", "eval", "demo", "--eval-set", str(eval_path),
349349
])
@@ -374,7 +374,7 @@ async def biased_grader(description, question, *, model):
374374
runner = CliRunner()
375375
with patch("openkb.cli._find_kb_dir", return_value=kb), \
376376
patch("openkb.cli._setup_llm_key", return_value=None), \
377-
patch("openkb.skill_evaluator.grade_one", side_effect=biased_grader):
377+
patch("openkb.agent.skill_evaluator.grade_one", side_effect=biased_grader):
378378
result = runner.invoke(cli, [
379379
"skill", "eval", "demo", "--eval-set", str(eval_path),
380380
])

tests/test_skill_evaluator.py

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""Tests for openkb.skill_evaluator.
1+
"""Tests for openkb.agent.skill_evaluator.
22
33
The Runner.run call is mocked everywhere — no real LLM tokens spent.
44
What we DO verify:
@@ -17,8 +17,7 @@
1717

1818
import pytest
1919

20-
from openkb.skill_evaluator import (
21-
EvalMiss,
20+
from openkb.agent.skill_evaluator import (
2221
EvalPrompt,
2322
EvalResult,
2423
_read_description,
@@ -80,7 +79,7 @@ async def test_generate_eval_set_parses_plain_json(tmp_path):
8079
async def fake_runner(*args, **kwargs):
8180
return SimpleNamespace(final_output=_fake_generator_payload(10))
8281

83-
with patch("openkb.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
82+
with patch("openkb.agent.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
8483
prompts = await generate_eval_set(skill_dir, model="gpt-4o-mini", count=10)
8584

8685
assert len(prompts) == 20
@@ -98,7 +97,7 @@ async def test_generate_eval_set_strips_code_fences(tmp_path):
9897
async def fake_runner(*args, **kwargs):
9998
return SimpleNamespace(final_output=fenced)
10099

101-
with patch("openkb.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
100+
with patch("openkb.agent.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
102101
prompts = await generate_eval_set(skill_dir, model="gpt-4o-mini", count=3)
103102

104103
assert len(prompts) == 6
@@ -112,7 +111,7 @@ async def test_grade_one_returns_trigger_for_trigger_response():
112111
async def fake_runner(*args, **kwargs):
113112
return SimpleNamespace(final_output="TRIGGER")
114113

115-
with patch("openkb.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
114+
with patch("openkb.agent.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
116115
out = await grade_one("desc", "question?", model="gpt-4o-mini")
117116
assert out == "trigger"
118117

@@ -122,7 +121,7 @@ async def test_grade_one_returns_no_trigger_for_negative_response():
122121
async def fake_runner(*args, **kwargs):
123122
return SimpleNamespace(final_output="NO-TRIGGER")
124123

125-
with patch("openkb.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
124+
with patch("openkb.agent.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
126125
out = await grade_one("desc", "question?", model="gpt-4o-mini")
127126
assert out == "no-trigger"
128127

@@ -132,7 +131,7 @@ async def test_grade_one_handles_mixed_case():
132131
async def fake_runner(*args, **kwargs):
133132
return SimpleNamespace(final_output="trigger")
134133

135-
with patch("openkb.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
134+
with patch("openkb.agent.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
136135
out = await grade_one("desc", "question?", model="gpt-4o-mini")
137136
assert out == "trigger"
138137

@@ -142,7 +141,7 @@ async def test_grade_one_handles_space_variant():
142141
async def fake_runner(*args, **kwargs):
143142
return SimpleNamespace(final_output="No Trigger")
144143

145-
with patch("openkb.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
144+
with patch("openkb.agent.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
146145
out = await grade_one("desc", "question?", model="gpt-4o-mini")
147146
assert out == "no-trigger"
148147

@@ -152,7 +151,7 @@ async def test_grade_one_defaults_to_no_trigger_on_ambiguous_output():
152151
async def fake_runner(*args, **kwargs):
153152
return SimpleNamespace(final_output="hmm not sure")
154153

155-
with patch("openkb.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
154+
with patch("openkb.agent.skill_evaluator.Runner.run", new=AsyncMock(side_effect=fake_runner)):
156155
out = await grade_one("desc", "question?", model="gpt-4o-mini")
157156
assert out == "no-trigger"
158157

@@ -179,7 +178,7 @@ async def fake_grade(description, question, *, model):
179178
match = next(p for p in eval_set if p.question == question)
180179
return match.expected
181180

182-
with patch("openkb.skill_evaluator.grade_one", side_effect=fake_grade):
181+
with patch("openkb.agent.skill_evaluator.grade_one", side_effect=fake_grade):
183182
result = await run_eval(skill_dir, model="gpt-4o-mini", eval_set=eval_set)
184183

185184
assert isinstance(result, EvalResult)
@@ -198,7 +197,7 @@ async def test_run_eval_reports_misses(tmp_path):
198197
async def fake_grade(description, question, *, model):
199198
return "trigger"
200199

201-
with patch("openkb.skill_evaluator.grade_one", side_effect=fake_grade):
200+
with patch("openkb.agent.skill_evaluator.grade_one", side_effect=fake_grade):
202201
result = await run_eval(skill_dir, model="gpt-4o-mini", eval_set=eval_set)
203202

204203
assert result.total == 6
@@ -230,3 +229,36 @@ def test_save_and_load_eval_set_round_trip(tmp_path):
230229
assert len(loaded) == 4
231230
assert [p.question for p in loaded if p.expected == "trigger"] == ["trig 0", "trig 1"]
232231
assert [p.question for p in loaded if p.expected == "no-trigger"] == ["no 0", "no 1"]
232+
233+
234+
# -------- RuntimeError translation for CLI catch -------------------------------
235+
236+
237+
@pytest.mark.asyncio
238+
async def test_generate_eval_set_translates_max_turns_to_runtime_error(tmp_path):
239+
"""MaxTurnsExceeded from Runner.run should become RuntimeError."""
240+
from agents.exceptions import MaxTurnsExceeded
241+
242+
skill_dir = _make_skill(tmp_path)
243+
244+
async def fake_runner(*args, **kwargs):
245+
raise MaxTurnsExceeded("ran out")
246+
247+
with patch("openkb.agent.skill_evaluator.Runner.run",
248+
new=AsyncMock(side_effect=fake_runner)):
249+
with pytest.raises(RuntimeError, match="max-turn cap"):
250+
await generate_eval_set(skill_dir, model="gpt-4o-mini")
251+
252+
253+
@pytest.mark.asyncio
254+
async def test_generate_eval_set_translates_malformed_json_to_runtime_error(tmp_path):
255+
"""Non-JSON LLM output should produce a friendly RuntimeError."""
256+
skill_dir = _make_skill(tmp_path)
257+
258+
async def fake_runner(*args, **kwargs):
259+
return SimpleNamespace(final_output="this is not json at all")
260+
261+
with patch("openkb.agent.skill_evaluator.Runner.run",
262+
new=AsyncMock(side_effect=fake_runner)):
263+
with pytest.raises(RuntimeError, match="non-JSON output"):
264+
await generate_eval_set(skill_dir, model="gpt-4o-mini")

0 commit comments

Comments
 (0)