# Run Evals — GitHub Actions workflow
# NOTE(review): this file was captured from the Actions UI "Workflow file for
# this run" view (run #36); the surrounding web-page chrome has been converted
# to this comment so the file parses as YAML.
name: Run Evals

# Manually triggered eval run. All three inputs are optional; each has a
# repository-variable fallback resolved in the "Resolve inputs" step.
on:
  workflow_dispatch:
    inputs:
      suite_filter:
        description: "Comma-separated glob patterns for eval files to run"
        required: false
        default: "evals/**/*.eval.yaml,examples/**/*.eval.yaml,examples/**/*.EVAL.yaml,examples/**/EVAL.yaml"
      target:
        description: "Optional target override (leave empty to use each eval's own target)"
        required: false
        default: ""
      threshold:
        description: "Minimum score threshold (0-1)"
        required: false
        default: "0.8"

jobs:
  evals:
    name: Run AgentV Evals
    runs-on: ubuntu-latest
    permissions:
      contents: read
      checks: write   # needed by dorny/test-reporter to publish check runs
      models: read    # needed for GitHub Models API access
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          # Quoted so a generic YAML parser cannot retype the version.
          node-version: "22"

      - uses: ./.github/actions/setup-bun

      - name: Build
        run: bun run build

      - name: Install GitHub Copilot CLI
        run: npm install -g @github/copilot

      - name: Install Pi CLI
        # Best-effort install: the trailing echo keeps the step green on failure.
        run: npm install -g @mariozechner/pi-coding-agent || echo "pi-cli install failed (non-fatal)"

      - name: Install uv (Python package manager)
        run: curl -LsSf https://astral.sh/uv/install.sh | sh

      - name: Configure credentials
        # Writes a .env file consumed by the eval CLI. ${{ }} expressions are
        # expanded by the Actions runner before the shell sees the heredoc.
        run: |
          cat > .env <<EOF
          GH_MODELS_TOKEN=${{ secrets.COPILOT_PAT || secrets.GH_MODELS_TOKEN || secrets.GITHUB_TOKEN }}
          GH_MODELS_MODEL=${{ vars.GH_MODELS_MODEL || 'gpt-5-mini' }}
          COPILOT_MODEL=${{ vars.COPILOT_MODEL || 'gpt-5-mini' }}
          AGENT_TARGET=${{ vars.AGENT_TARGET || 'copilot-cli' }}
          GRADER_TARGET=${{ vars.GRADER_TARGET || 'openrouter' }}
          GOOGLE_GENERATIVE_AI_API_KEY=${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
          OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY }}
          OPENROUTER_MODEL=${{ vars.OPENROUTER_MODEL || 'openai/gpt-5.4-mini' }}
          GEMINI_MODEL_NAME=${{ vars.GEMINI_MODEL_NAME || 'gemini-2.0-flash' }}
          EOF

      - name: Resolve inputs
        id: filter
        # Resolution order: dispatch input > repository variable > default.
        # User-controllable values are passed via env (not inlined into the
        # script) to avoid shell script injection through ${{ }} expansion.
        env:
          DEFAULT_PATTERNS: "evals/**/*.eval.yaml,examples/**/*.eval.yaml,examples/**/*.EVAL.yaml,examples/**/EVAL.yaml"
          # Exclude evals that need local scripts or multiple agent targets.
          # Negation patterns (!glob) are supported by the CLI.
          # multi-model-benchmark: needs multiple agents
          # copilot-log-eval: needs copilot session files on disk
          # batch-cli: batch output format mismatch (pre-existing)
          # file-changes-graders: workspace cwd bug on retries (pre-existing)
          EXCLUDE_PATTERNS: "!examples/showcase/multi-model-benchmark/**,!examples/features/copilot-log-eval/**,!examples/features/batch-cli/**,!examples/features/file-changes-graders/**,!examples/showcase/cross-repo-sync/**"
          INPUT_PATTERNS: ${{ github.event.inputs.suite_filter || vars.EVAL_PATTERNS }}
          INPUT_EXCLUDES: ${{ vars.EVAL_EXCLUDE_PATTERNS }}
          INPUT_TARGET: ${{ github.event.inputs.target || vars.EVAL_TARGET || '' }}
          INPUT_THRESHOLD: ${{ github.event.inputs.threshold || '0.8' }}
        run: |
          PATTERNS="${INPUT_PATTERNS:-$DEFAULT_PATTERNS}"
          EXCLUDES="${INPUT_EXCLUDES:-$EXCLUDE_PATTERNS}"
          echo "patterns=${PATTERNS},${EXCLUDES}" >> "$GITHUB_OUTPUT"
          echo "target=${INPUT_TARGET}" >> "$GITHUB_OUTPUT"
          echo "threshold=${INPUT_THRESHOLD}" >> "$GITHUB_OUTPUT"

      - name: Run AgentV evals
        id: run-evals
        env:
          COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_PAT }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # Resolved values come in via env so user input never reaches the
          # script text itself (script-injection hardening).
          EVAL_PATTERNS: ${{ steps.filter.outputs.patterns }}
          EVAL_TARGET: ${{ steps.filter.outputs.target }}
          EVAL_THRESHOLD: ${{ steps.filter.outputs.threshold }}
        run: |
          mkdir -p .agentv/ci-results
          # Split comma-separated patterns into positional args
          IFS=',' read -ra PATTERNS <<< "$EVAL_PATTERNS"
          # Build optional --target flag (empty = use each eval's own target)
          TARGET_FLAG=()
          if [ -n "$EVAL_TARGET" ]; then
            TARGET_FLAG=(--target "$EVAL_TARGET")
          fi
          # NOTE: this relies on the default Linux run shell (bash -e, NO
          # pipefail): `| tee` masks the CLI's exit status so the step stays
          # green, and PIPESTATUS[0] captures the real code for the final
          # threshold gate. Do not add `shell: bash` (it enables pipefail).
          bun apps/cli/dist/cli.js eval run "${PATTERNS[@]}" \
            "${TARGET_FLAG[@]}" \
            --workers 3 \
            --threshold "$EVAL_THRESHOLD" \
            --output .agentv/ci-results/junit.xml \
            --benchmark-json .agentv/ci-results/benchmark.json \
            --artifacts .agentv/ci-results/artifacts \
            2>&1 | tee .agentv/ci-results/eval-output.log
          echo "exit_code=${PIPESTATUS[0]}" >> "$GITHUB_OUTPUT"

      - name: Post eval summary
        if: always()
        run: bun run scripts/ci-summary.ts .agentv/ci-results >> "$GITHUB_STEP_SUMMARY"

      - name: Publish JUnit test results
        if: always()
        continue-on-error: true
        uses: dorny/test-reporter@v1
        with:
          name: AgentV Eval Results
          path: .agentv/ci-results/junit.xml
          reporter: java-junit
          fail-on-error: false

      - name: Upload eval artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ github.run_id }}
          path: .agentv/ci-results/
          retention-days: 30

      - name: Fail if threshold not met
        if: always()
        # Deferred failure: the run step always "succeeds" (see note there) so
        # summary/report/artifact steps execute; the captured exit code decides
        # the job's final status here.
        env:
          EVAL_EXIT_CODE: ${{ steps.run-evals.outputs.exit_code }}
          EVAL_THRESHOLD: ${{ steps.filter.outputs.threshold }}
        run: |
          if [ "$EVAL_EXIT_CODE" != "0" ]; then
            echo "::error::Eval score below threshold ($EVAL_THRESHOLD)"
            exit 1
          fi