# Run Evals #36 - workflow file for this run.
# Viewer notice: this file contains hidden or bidirectional Unicode text that
# may be interpreted or compiled differently than what appears below. To
# review, open the file in an editor that reveals hidden Unicode characters.
# (Learn more about bidirectional Unicode characters.)
| name: Run Evals | |
| on: | |
| workflow_dispatch: | |
| inputs: | |
| suite_filter: | |
| description: "Comma-separated glob patterns for eval files to run" | |
| required: false | |
| default: "evals/**/*.eval.yaml,examples/**/*.eval.yaml,examples/**/*.EVAL.yaml,examples/**/EVAL.yaml" | |
| target: | |
| description: "Optional target override (leave empty to use each eval's own target)" | |
| required: false | |
| default: "" | |
| threshold: | |
| description: "Minimum score threshold (0-1)" | |
| required: false | |
| default: "0.8" | |
| jobs: | |
| evals: | |
| name: Run AgentV Evals | |
| runs-on: ubuntu-latest | |
| permissions: | |
| contents: read | |
| checks: write | |
| models: read | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - uses: actions/setup-node@v4 | |
| with: | |
| node-version: 22 | |
| - uses: ./.github/actions/setup-bun | |
| - name: Build | |
| run: bun run build | |
| - name: Install GitHub Copilot CLI | |
| run: npm install -g @github/copilot | |
| - name: Install Pi CLI | |
| run: npm install -g @mariozechner/pi-coding-agent || echo "pi-cli install failed (non-fatal)" | |
| - name: Install uv (Python package manager) | |
| run: curl -LsSf https://astral.sh/uv/install.sh | sh | |
| - name: Configure credentials | |
| run: | | |
| cat > .env <<EOF | |
| GH_MODELS_TOKEN=${{ secrets.COPILOT_PAT || secrets.GH_MODELS_TOKEN || secrets.GITHUB_TOKEN }} | |
| GH_MODELS_MODEL=${{ vars.GH_MODELS_MODEL || 'gpt-5-mini' }} | |
| COPILOT_MODEL=${{ vars.COPILOT_MODEL || 'gpt-5-mini' }} | |
| AGENT_TARGET=${{ vars.AGENT_TARGET || 'copilot-cli' }} | |
| GRADER_TARGET=${{ vars.GRADER_TARGET || 'openrouter' }} | |
| GOOGLE_GENERATIVE_AI_API_KEY=${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} | |
| OPENROUTER_API_KEY=${{ secrets.OPENROUTER_API_KEY }} | |
| OPENROUTER_MODEL=${{ vars.OPENROUTER_MODEL || 'openai/gpt-5.4-mini' }} | |
| GEMINI_MODEL_NAME=${{ vars.GEMINI_MODEL_NAME || 'gemini-2.0-flash' }} | |
| EOF | |
| - name: Resolve inputs | |
| id: filter | |
| env: | |
| DEFAULT_PATTERNS: "evals/**/*.eval.yaml,examples/**/*.eval.yaml,examples/**/*.EVAL.yaml,examples/**/EVAL.yaml" | |
| # Exclude evals that need local scripts or multiple agent targets. | |
| # Negation patterns (!glob) are supported by the CLI. | |
| # multi-model-benchmark: needs multiple agents | |
| # copilot-log-eval: needs copilot session files on disk | |
| # batch-cli: batch output format mismatch (pre-existing) | |
| # file-changes-graders: workspace cwd bug on retries (pre-existing) | |
| EXCLUDE_PATTERNS: "!examples/showcase/multi-model-benchmark/**,!examples/features/copilot-log-eval/**,!examples/features/batch-cli/**,!examples/features/file-changes-graders/**,!examples/showcase/cross-repo-sync/**" | |
| run: | | |
| PATTERNS="${{ github.event.inputs.suite_filter || vars.EVAL_PATTERNS || env.DEFAULT_PATTERNS }}" | |
| EXCLUDES="${{ vars.EVAL_EXCLUDE_PATTERNS || env.EXCLUDE_PATTERNS }}" | |
| echo "patterns=${PATTERNS},${EXCLUDES}" >> "$GITHUB_OUTPUT" | |
| echo "target=${{ github.event.inputs.target || vars.EVAL_TARGET || '' }}" >> "$GITHUB_OUTPUT" | |
| echo "threshold=${{ github.event.inputs.threshold || '0.8' }}" >> "$GITHUB_OUTPUT" | |
| - name: Run AgentV evals | |
| id: run-evals | |
| env: | |
| COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_PAT }} | |
| GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | |
| run: | | |
| mkdir -p .agentv/ci-results | |
| # Split comma-separated patterns into positional args | |
| IFS=',' read -ra PATTERNS <<< "${{ steps.filter.outputs.patterns }}" | |
| # Build optional --target flag (empty = use each eval's own target) | |
| TARGET_FLAG=() | |
| if [ -n "${{ steps.filter.outputs.target }}" ]; then | |
| TARGET_FLAG=(--target "${{ steps.filter.outputs.target }}") | |
| fi | |
| bun apps/cli/dist/cli.js eval run "${PATTERNS[@]}" \ | |
| "${TARGET_FLAG[@]}" \ | |
| --workers 3 \ | |
| --threshold ${{ steps.filter.outputs.threshold }} \ | |
| --output .agentv/ci-results/junit.xml \ | |
| --benchmark-json .agentv/ci-results/benchmark.json \ | |
| --artifacts .agentv/ci-results/artifacts \ | |
| 2>&1 | tee .agentv/ci-results/eval-output.log | |
| echo "exit_code=${PIPESTATUS[0]}" >> "$GITHUB_OUTPUT" | |
| - name: Post eval summary | |
| if: always() | |
| run: bun run scripts/ci-summary.ts .agentv/ci-results >> "$GITHUB_STEP_SUMMARY" | |
| - name: Publish JUnit test results | |
| if: always() | |
| continue-on-error: true | |
| uses: dorny/test-reporter@v1 | |
| with: | |
| name: AgentV Eval Results | |
| path: .agentv/ci-results/junit.xml | |
| reporter: java-junit | |
| fail-on-error: false | |
| - name: Upload eval artifacts | |
| if: always() | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: eval-results-${{ github.run_id }} | |
| path: .agentv/ci-results/ | |
| retention-days: 30 | |
| - name: Fail if threshold not met | |
| if: always() | |
| run: | | |
| if [ "${{ steps.run-evals.outputs.exit_code }}" != "0" ]; then | |
| echo "::error::Eval score below threshold (${{ steps.filter.outputs.threshold }})" | |
| exit 1 | |
| fi |