# Run Evals — GitHub Actions workflow
# NOTE(review): this file was captured from the Actions UI "Workflow file for
# this run" view (run #36); the surrounding web-page chrome has been converted
# to this comment so the file parses as YAML.
name: Run Evals

# Manually triggered eval run. All three inputs are optional; each has a
# repository-variable fallback resolved in the "Resolve inputs" step.
on:
  workflow_dispatch:
    inputs:
      suite_filter:
        description: "Comma-separated glob patterns for eval files to run"
        required: false
        default: "evals/**/*.eval.yaml,examples/**/*.eval.yaml,examples/**/*.EVAL.yaml,examples/**/EVAL.yaml"
      target:
        description: "Optional target override (leave empty to use each eval's own target)"
        required: false
        default: ""
      threshold:
        description: "Minimum score threshold (0-1)"
        required: false
        default: "0.8"

jobs:
  evals:
    name: Run AgentV Evals
    runs-on: ubuntu-latest
    permissions:
      contents: read
      checks: write   # needed by dorny/test-reporter to publish check runs
      models: read    # needed for GitHub Models API access
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          # Quoted so a generic YAML parser cannot retype the version.
          node-version: "22"

      - uses: ./.github/actions/setup-bun

      - name: Build
        run: bun run build

      - name: Install GitHub Copilot CLI
        run: npm install -g @github/copilot

      - name: Install Pi CLI
        # Best-effort install: the trailing echo keeps the step green on failure.
        run: npm install -g @mariozechner/pi-coding-agent || echo "pi-cli install failed (non-fatal)"

      - name: Install uv (Python package manager)
        run: curl -LsSf https://astral.sh/uv/install.sh | sh

      - name: Configure credentials
        # Writes a .env file consumed by the eval CLI. ${{ }} expressions are
        # expanded by the Actions runner before the shell sees the heredoc.
        run: |
          cat > .env <<EOF
          GH_MODELS_TOKEN=${{ secrets.COPILOT_PAT || secrets.GH_MODELS_TOKEN || secrets.GITHUB_TOKEN }}
          GH_MODELS_MODEL=${{ vars.GH_MODELS_MODEL || 'gpt-5-mini' }}
          COPILOT_MODEL=${{ vars.COPILOT_MODEL || 'gpt-5-mini' }}
          AGENT_TARGET=${{ vars.AGENT_TARGET || 'copilot-cli' }}
          GRADER_TARGET=${{ vars.GRADER_TARGET || 'openrouter' }}
          GOOGLE_GENERATIVE_AI_API_KEY=${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
          OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY }}
          OPENROUTER_MODEL=${{ vars.OPENROUTER_MODEL || 'openai/gpt-5.4-mini' }}
          GEMINI_MODEL_NAME=${{ vars.GEMINI_MODEL_NAME || 'gemini-2.0-flash' }}
          EOF

      - name: Resolve inputs
        id: filter
        # Resolution order: dispatch input > repository variable > default.
        # User-controllable values are passed via env (not inlined into the
        # script) to avoid shell script injection through ${{ }} expansion.
        env:
          DEFAULT_PATTERNS: "evals/**/*.eval.yaml,examples/**/*.eval.yaml,examples/**/*.EVAL.yaml,examples/**/EVAL.yaml"
          # Exclude evals that need local scripts or multiple agent targets.
          # Negation patterns (!glob) are supported by the CLI.
          # multi-model-benchmark: needs multiple agents
          # copilot-log-eval: needs copilot session files on disk
          # batch-cli: batch output format mismatch (pre-existing)
          # file-changes-graders: workspace cwd bug on retries (pre-existing)
          EXCLUDE_PATTERNS: "!examples/showcase/multi-model-benchmark/**,!examples/features/copilot-log-eval/**,!examples/features/batch-cli/**,!examples/features/file-changes-graders/**,!examples/showcase/cross-repo-sync/**"
          INPUT_PATTERNS: ${{ github.event.inputs.suite_filter || vars.EVAL_PATTERNS }}
          INPUT_EXCLUDES: ${{ vars.EVAL_EXCLUDE_PATTERNS }}
          INPUT_TARGET: ${{ github.event.inputs.target || vars.EVAL_TARGET || '' }}
          INPUT_THRESHOLD: ${{ github.event.inputs.threshold || '0.8' }}
        run: |
          PATTERNS="${INPUT_PATTERNS:-$DEFAULT_PATTERNS}"
          EXCLUDES="${INPUT_EXCLUDES:-$EXCLUDE_PATTERNS}"
          echo "patterns=${PATTERNS},${EXCLUDES}" >> "$GITHUB_OUTPUT"
          echo "target=${INPUT_TARGET}" >> "$GITHUB_OUTPUT"
          echo "threshold=${INPUT_THRESHOLD}" >> "$GITHUB_OUTPUT"

      - name: Run AgentV evals
        id: run-evals
        env:
          COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_PAT }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # Resolved values come in via env so user input never reaches the
          # script text itself (script-injection hardening).
          EVAL_PATTERNS: ${{ steps.filter.outputs.patterns }}
          EVAL_TARGET: ${{ steps.filter.outputs.target }}
          EVAL_THRESHOLD: ${{ steps.filter.outputs.threshold }}
        run: |
          mkdir -p .agentv/ci-results
          # Split comma-separated patterns into positional args
          IFS=',' read -ra PATTERNS <<< "$EVAL_PATTERNS"
          # Build optional --target flag (empty = use each eval's own target)
          TARGET_FLAG=()
          if [ -n "$EVAL_TARGET" ]; then
            TARGET_FLAG=(--target "$EVAL_TARGET")
          fi
          # NOTE: this relies on the default Linux run shell (bash -e, NO
          # pipefail): `| tee` masks the CLI's exit status so the step stays
          # green, and PIPESTATUS[0] captures the real code for the final
          # threshold gate. Do not add `shell: bash` (it enables pipefail).
          bun apps/cli/dist/cli.js eval run "${PATTERNS[@]}" \
            "${TARGET_FLAG[@]}" \
            --workers 3 \
            --threshold "$EVAL_THRESHOLD" \
            --output .agentv/ci-results/junit.xml \
            --benchmark-json .agentv/ci-results/benchmark.json \
            --artifacts .agentv/ci-results/artifacts \
            2>&1 | tee .agentv/ci-results/eval-output.log
          echo "exit_code=${PIPESTATUS[0]}" >> "$GITHUB_OUTPUT"

      - name: Post eval summary
        if: always()
        run: bun run scripts/ci-summary.ts .agentv/ci-results >> "$GITHUB_STEP_SUMMARY"

      - name: Publish JUnit test results
        if: always()
        continue-on-error: true
        uses: dorny/test-reporter@v1
        with:
          name: AgentV Eval Results
          path: .agentv/ci-results/junit.xml
          reporter: java-junit
          fail-on-error: false

      - name: Upload eval artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ github.run_id }}
          path: .agentv/ci-results/
          retention-days: 30

      - name: Fail if threshold not met
        if: always()
        # Deferred failure: the run step always "succeeds" (see note there) so
        # summary/report/artifact steps execute; the captured exit code decides
        # the job's final status here.
        env:
          EVAL_EXIT_CODE: ${{ steps.run-evals.outputs.exit_code }}
          EVAL_THRESHOLD: ${{ steps.filter.outputs.threshold }}
        run: |
          if [ "$EVAL_EXIT_CODE" != "0" ]; then
            echo "::error::Eval score below threshold ($EVAL_THRESHOLD)"
            exit 1
          fi