# agentv/evals/agentic-engineering/agent-plugin-review.eval.yaml at 707761b0e1d5464df6af851270d77d105853c7dc (EntityProcess/agentv, GitHub)

# Eval suite for the agent-plugin-review skill. The tests below each ask for a
# review of part of a mock "deploy-auto" plugin and assert that one planted
# issue is reported.
description: Evaluates that the agent-plugin-review skill is triggered and catches planted issues in a mock plugin

tags: [agent]

workspace:
  # Relative path to the workspace template directory.
  # NOTE(review): presumably copied to create the run workspace — confirm.
  template: ./workspace-template
  hooks:
    before_all:
      # Invokes node on scripts/setup.mjs under the workspace path.
      # NOTE(review): the hook name suggests it runs once before the tests,
      # and the script's effect is not visible here — confirm both.
      command:
        - node
        # Quoted so the leading "{{" is not parsed as a YAML flow mapping.
        - "{{workspace_path}}/scripts/setup.mjs"

tests:
  # Planted issue: the deploy-rollback skill has no corresponding eval file.
  - id: detect-missing-eval
    criteria: Identifies that deploy-rollback skill has no corresponding eval file
    input: |
      Review the deploy-auto plugin in this repo for completeness.
      Check that every skill has a corresponding eval file.
    assertions:
      # The only skill-trigger assertion among the tests visible here.
      # NOTE(review): presumably passes when the named skill was invoked —
      # confirm against the runner's assertion docs.
      - type: skill-trigger
        skill: agent-plugin-review
      # NOTE(review): presumably a substring check on the response — confirm.
      - type: contains
        value: deploy-rollback
      # The second rubric is a negative check: deploy-plan and deploy-execute
      # must not be reported as missing evals.
      - type: rubrics
        criteria:
          - Flags that deploy-rollback skill has no corresponding eval file
          - Does not flag deploy-plan or deploy-execute as missing evals

  # Planted issue: deploy-plan.yaml uses a bare .yaml extension instead of
  # .eval.yaml.
  - id: detect-eval-naming
    criteria: Identifies eval files using bare .yaml instead of .eval.yaml
    input: |
      Review the eval files under evals/deploy-auto/ for naming convention issues.
    assertions:
      # The prompt does not mention ".eval.yaml", so the string has to come
      # from the response itself.
      # NOTE(review): presumably a substring check on the response — confirm.
      - type: contains
        value: .eval.yaml
      # The last rubric is a negative check: the correctly named
      # deploy-execute.eval.yaml must not be flagged.
      - type: rubrics
        criteria:
          - Flags deploy-plan.yaml as using wrong extension
          - Recommends renaming to .eval.yaml
          - Does not flag deploy-execute.eval.yaml

  # Planted issue: deploy-plan.yaml defines no assertions and puts evaluation
  # criteria prose in expected_output.
  - id: detect-missing-assertions
    criteria: Identifies eval tests without assertions that rely solely on expected_output prose
    input: |
      Review evals/deploy-auto/deploy-plan.yaml for eval quality issues.
      Check assertion coverage and expected_output format.
    assertions:
      # Rubrics only; this test has no deterministic (contains) assertion.
      - type: rubrics
        criteria:
          - Flags that no assertions are defined in deploy-plan.yaml
          - Notes that expected_output contains evaluation criteria prose rather than sample responses
          - Suggests adding deterministic assertions

  # Planted issue: file paths in deploy-plan.yaml lack a leading slash.
  - id: detect-relative-file-paths
    criteria: Identifies eval file paths missing leading slash
    input: |
      Review evals/deploy-auto/deploy-plan.yaml for file path formatting issues.
    assertions:
      # Rubrics only: the review must both flag the problem and show the
      # corrected form.
      - type: rubrics
        criteria:
          - Flags that file paths are missing a leading slash
          - Shows the corrected path format with leading slash

  # Planted issue: deploy-plan.yaml repeats the same SKILL.md file input in
  # each of its tests instead of declaring it once.
  - id: detect-repeated-inputs
    criteria: Identifies eval files repeating the same file input in every test
    input: |
      Review evals/deploy-auto/deploy-plan.yaml for structural improvements.
      Look at how inputs are organized across test cases.
    assertions:
      # "all 3 tests" in the first rubric refers to the tests inside the
      # reviewed deploy-plan.yaml.
      - type: rubrics
        criteria:
          - Identifies the repeated SKILL.md file input across all 3 tests
          - Recommends using top-level input for the shared file reference

  # Planted issue: deploy-execute starts without checking that deploy-plan.md
  # exists, i.e. there is no hard gate between phases.
  - id: detect-missing-hard-gates
    criteria: Identifies that deploy-execute has no hard gate checking for deploy-plan.md
    input: |
      Review the deploy-auto plugin's workflow architecture.
      Check whether phases enforce prerequisites before proceeding.
    assertions:
      # Rubrics cover the finding, the recommended fix, and the expected
      # failure behaviour when a prerequisite is missing.
      - type: rubrics
        criteria:
          - Flags that deploy-execute does not check for deploy-plan.md before starting
          - Recommends adding hard gates between phases
          - Suggests stopping with a clear message if prerequisites are missing

  # Planted issue: the deploy-execute skill says pytest, while its eval says
  # python -m unittest.
  - id: detect-factual-contradiction
    criteria: Identifies that deploy-execute says pytest but its eval says python -m unittest
    input: |
      Review evals/deploy-auto/deploy-execute.eval.yaml for factual accuracy.
      Cross-check expected outputs against what the skills actually document.
    assertions:
      # The second rubric fixes the direction of the repair: the eval is
      # updated to match the skill, not the other way round.
      - type: rubrics
        criteria:
          - Flags the contradiction between pytest (skill) and python -m unittest (eval)
          - Recommends updating the eval to match the skill

  # Planted issue: the deploy-plan SKILL.md references /deploy-execute, which
  # does not exist as a slash command.
  - id: detect-nonexistent-command-reference
    criteria: Identifies that deploy-plan references /deploy-execute which is not a command
    input: |
      Review plugins/deploy-auto/skills/deploy-plan/SKILL.md for cross-reference issues.
      Check that referenced commands and skills actually exist.
    assertions:
      # The last rubric accepts either remedy: create the command or change
      # the handoff.
      - type: rubrics
        criteria:
          - Flags that /deploy-execute is referenced but does not exist as a slash command
          - Notes the distinction between skills and slash commands
          - Suggests either creating the command or updating the handoff

  # Planted issue: the deploy-execute SKILL.md contains a hardcoded local
  # kubeconfig path.
  - id: detect-hardcoded-paths
    criteria: Identifies hardcoded local paths in deploy-execute skill
    input: |
      Review plugins/deploy-auto/skills/deploy-execute/SKILL.md for portability issues.
    assertions:
      - type: rubrics
        criteria:
          # Plain (unquoted) scalar: backslashes are literal and "C:" is not
          # followed by a space, so this stays a single string.
          - Flags the hardcoded path C:\Users\admin\.kube\config
          - Recommends using environment variables or configurable defaults