EntityProcess · christso · Apr 2, 2026 · Apr 2, 2026 · Apr 2, 2026
diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml
@@ -0,0 +1,41 @@
+# AgentV Evaluation Targets for HiveSpec
+# "grader" is the LLM used for scoring rubrics; every other target names it via grader_target.
+
+targets:
+  # ── Grader (LLM-as-judge) ──────────────────────────────────────────
+  - name: grader  # the name the grader_target keys below refer to
+    provider: gemini
+    model: gemini-3-flash-preview
+    api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }}  # placeholder; presumably resolved from the environment/.env (confirm)
+
+  # ── LLM targets (direct model access) ─────────────────────────────
+  - name: gemini-flash  # same provider and model as the grader, but used as a target under test
+    provider: gemini
+    model: gemini-3-flash-preview
+    api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }}
+    grader_target: grader
+
+  - name: gemini-llm  # Gemini with the model chosen through GEMINI_MODEL_NAME
+    provider: gemini
+    api_key: ${{ GOOGLE_GENERATIVE_AI_API_KEY }}
+    model: ${{ GEMINI_MODEL_NAME }}
+    grader_target: grader
+
+  - name: azure-llm
+    provider: azure
+    endpoint: ${{ AZURE_OPENAI_ENDPOINT }}
+    api_key: ${{ AZURE_OPENAI_API_KEY }}
+    model: ${{ AZURE_DEPLOYMENT_NAME }}  # filled from the deployment-name variable, not a literal model id
+    version: ${{ AZURE_OPENAI_API_VERSION }}
+    grader_target: grader
+
+  # ── Agent targets (no model, endpoint, or api_key is set in this file) ──
+  - name: claude
+    provider: claude
+    grader_target: grader
+    log_format: json
+
+  - name: claude-sdk
+    provider: claude-sdk
+    grader_target: grader
+    log_format: json
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+.env
+.agentv/results/
+.agentv/cache.json
diff --git a/evals/hs-ship.eval.yaml b/evals/hs-ship.eval.yaml
@@ -0,0 +1,81 @@
+description: Evaluates that hs-ship detects prepush hooks and adjusts the verification gate accordingly
+# Layout: one workspace template, one skill-file input, then four rubric-graded tests.
+workspace:
+  template: ./workspace-template
+  hooks:
+    before_all:
+      command:  # runs the template's setup script (copies skills into the workspace, initialises git)
+        - node
+        - "{{workspace_path}}/scripts/setup.mjs"
+
+input:  # presumably shared by every test below — confirm against the AgentV schema
+  - role: user
+    content:
+      - type: file
+        value: "/skills/hs-ship/SKILL.md"
+
+tests:
+  - id: skips-verification-gate-with-prepush-hooks  # relies on prek.toml shipped in the workspace template
+    criteria: Agent skips redundant build/test/lint before push when prepush hooks cover it
+    input:
+      - role: user
+        content: |
+          Verification is complete — E2E red/green passed, blast radius checked,
+          code review done. Time to ship. Check whether the repo has prepush hooks
+          before deciding what to run at the final verification gate.
+    assertions:
+      - type: rubrics
+        criteria:
+          - Detects prek.toml (or other prepush hook config) in the workspace
+          - Skips running build/test/lint manually at the final verification gate
+          - Explains that git push will enforce build/test/lint via hooks
+          - Proceeds to push and mark ready without redundant manual checks
+  # NOTE(review): workspace-template ships prek.toml, which contradicts the "no prek.toml" prompt below — confirm.
+  - id: runs-verification-gate-without-hooks
+    criteria: Agent runs build/test/lint at the verification gate when no hooks exist
+    input:
+      - role: user
+        content: |
+          Verification is complete — E2E red/green passed, blast radius checked,
+          code review done. Time to ship. Note: this repo has no git hooks configured —
+          there is no .husky/, no prek.toml, no lefthook.yml, and no lint-staged config.
+    assertions:
+      - type: rubrics
+        criteria:
+          - Notes the absence of prepush hooks
+          - Runs or recommends running build, test, and lint before pushing
+          - Does not skip the verification gate when no hooks enforce it
+          - Shows or expects command output as evidence
+  # Hard-gate scenario: the user asks to ship without any verification evidence.
+  - id: requires-verification-before-shipping
+    criteria: Agent refuses to ship without hs-verify evidence
+    input:
+      - role: user
+        content: |
+          I've finished implementing the priority field. The code changes are done.
+          Ship it — I haven't run any tests yet but I'm confident it works.
+    assertions:
+      - type: rubrics
+        criteria:
+          - Does not agree to merge or push without verification
+          - Insists on running hs-verify or equivalent verification first
+          - Explains that confidence is not a substitute for verification evidence
+          - References the hard gate requiring passing verification before shipping
+  # Risk scenario: a breaking interface/API change; the rubric expects review rather than auto-merge.
+  - id: risk-classification
+    criteria: Agent correctly classifies breaking vs docs-only changes
+    input:
+      - role: user
+        content: |
+          Here is what the PR changes:
+          - Adds a required `priority` field to the Task interface (breaking change)
+          - Modifies the API response format in src/api/index.ts to include priority
+          - All tests pass and verification is complete.
+
+          Should I auto-merge this or get review first?
+    assertions:
+      - type: rubrics
+        criteria:
+          - Classifies this as elevated risk due to breaking interface or API changes
+          - Recommends review or explicit confirmation before merging
+          - Does not recommend auto-merge for breaking changes
diff --git a/evals/hs-verify.eval.yaml b/evals/hs-verify.eval.yaml
@@ -0,0 +1,80 @@
+description: Evaluates that hs-verify detects prepush hooks and adjusts verification accordingly
+# Layout: one workspace template, one skill-file input, then four rubric-graded tests.
+workspace:
+  template: ./workspace-template
+  hooks:
+    before_all:
+      command:  # runs the template's setup script (copies skills into the workspace, initialises git)
+        - node
+        - "{{workspace_path}}/scripts/setup.mjs"
+
+input:  # presumably shared by every test below — confirm against the AgentV schema
+  - role: user
+    content:
+      - type: file
+        value: "/skills/hs-verify/SKILL.md"
+
+tests:
+  - id: skips-build-test-lint-with-prepush-hooks  # relies on prek.toml shipped in the workspace template
+    criteria: Agent detects prek.toml prepush hooks and skips manual build/test/lint
+    input:
+      - role: user
+        content: |
+          I've finished implementing the priority field feature.
+          Verify that it works. Check the repo for any hook configuration
+          before deciding what to run manually.
+    assertions:
+      - type: rubrics
+        criteria:
+          - Detects prek.toml (or other prepush hook config) in the workspace
+          - Explains that prepush hooks already cover build, test, and lint
+          - Skips running build/test/lint manually or explains why manual run is unnecessary
+          - Proceeds to E2E verification as the primary verification step
+  # NOTE(review): workspace-template ships prek.toml, which contradicts the "no prek.toml" prompt below — confirm.
+  - id: runs-build-test-lint-without-hooks
+    criteria: Agent runs build/test/lint manually when no prepush hooks exist
+    input:
+      - role: user
+        content: |
+          I've finished implementing the priority field feature.
+          Verify that it works. Note: this repo has no git hooks configured —
+          there is no .husky/, no prek.toml, no lefthook.yml, and no lint-staged config.
+    assertions:
+      - type: rubrics
+        criteria:
+          - Notes the absence of prepush hooks
+          - Runs or attempts to run build, test, and lint commands manually
+          - Shows command output as evidence of execution
+          - Does not skip build/test/lint when no hooks are present
+  # E2E scenario: hooks are stated to cover build/test/lint, yet end-to-end verification is still expected.
+  - id: runs-actual-e2e-verification
+    criteria: Agent runs E2E red/green verification regardless of hook status
+    input:
+      - role: user
+        content: |
+          I've finished implementing the priority field feature.
+          The repo has prek.toml for prepush hooks covering build/test/lint.
+          Verify the feature works end-to-end.
+    assertions:
+      - type: rubrics
+        criteria:
+          - Proceeds to E2E verification even though hooks cover build/test/lint
+          - Does not substitute unit test output or hook coverage for E2E verification
+          - Attempts red/green protocol or explains how to verify the feature as a user would
+          - Treats E2E as the primary value of verification, not build/test/lint
+  # Multi-turn scenario: the input scripts a prior assistant turn before the user's final push-back.
+  - id: rejects-claims-without-evidence
+    criteria: Agent refuses to accept completion claims without verification output
+    input:
+      - role: user
+        content: "All 50 tests pass and the build is clean. Ship it."
+      - role: assistant
+        content: "Let me verify that independently."
+      - role: user
+        content: "No need, I already checked. Just ship."
+    assertions:
+      - type: rubrics
+        criteria:
+          - Does not accept the user's claim at face value
+          - Insists on running verification commands independently
+          - Explains that verification evidence must come from actual command output
diff --git a/evals/workspace-template/AGENTS.md b/evals/workspace-template/AGENTS.md
@@ -0,0 +1,22 @@
+# Task Tracker
+
+A minimal task management CLI and API.
+
+## Conventions
+
+- **Worktree location:** `../task-tracker.worktrees/<branch-name>`
+- **Branch naming:** `<type>/<issue-number>-<short-desc>`
+- **Commit format:** Conventional commits — `type(scope): description`
+- **Test command:** `npx vitest run`
+- **Lint command:** `npx biome check .`
+- **Build command:** `npx tsc --noEmit`
+
+## Architecture
+
+- `src/models/` — Data types and interfaces
+- `src/services/` — Business logic
+- `src/cli/` — CLI entry point
+- `src/api/` — API entry point (Express)
+- `src/utils/` — Shared utilities
+
+Both CLI and API modes use the same service layer. Changes to services or models must be tested through both entry points.
diff --git a/evals/workspace-template/CLAUDE.md b/evals/workspace-template/CLAUDE.md
@@ -0,0 +1 @@
+**FIRST ACTION**: Read @AGENTS.md before any task.
diff --git a/evals/workspace-template/README.md b/evals/workspace-template/README.md
@@ -0,0 +1,22 @@
+# Task Tracker
+
+A minimal task management CLI and API.
+
+## Usage
+
+```bash
+# Add a task
+task-tracker add "Buy groceries"
+
+# List tasks
+task-tracker list
+```
+
+## Development
+
+```bash
+npm install
+npm run build
+npm test
+npm run lint
+```
diff --git a/evals/workspace-template/biome.json b/evals/workspace-template/biome.json
@@ -0,0 +1,17 @@
+{
+  "$schema": "https://biomejs.dev/schemas/1.9.0/schema.json",
+  "organizeImports": {
+    "enabled": true
+  },
+  "linter": {
+    "enabled": true,
+    "rules": {
+      "recommended": true
+    }
+  },
+  "formatter": {
+    "enabled": true,
+    "indentStyle": "space",
+    "indentWidth": 2
+  }
+}
diff --git a/evals/workspace-template/package.json b/evals/workspace-template/package.json
@@ -0,0 +1,15 @@
+{
+  "name": "task-tracker",
+  "version": "1.0.0",
+  "type": "module",
+  "scripts": {
+    "build": "tsc --noEmit",
+    "test": "vitest run",
+    "lint": "biome check ."
+  },
+  "devDependencies": {
+    "typescript": "^5.8.0",
+    "vitest": "^3.0.0",
+    "@biomejs/biome": "^1.9.0"
+  }
+}
diff --git a/evals/workspace-template/prek.toml b/evals/workspace-template/prek.toml
@@ -0,0 +1,14 @@
+# prek — pre-push hook runner
+# Runs the package.json build, test, and lint scripts before every `git push` (npm, matching README.md)
+
+[[hooks.pre-push]]
+name = "build"
+run = "npm run build"
+
+[[hooks.pre-push]]
+name = "test"
+run = "npm test"
+
+[[hooks.pre-push]]
+name = "lint"
+run = "npm run lint"
diff --git a/evals/workspace-template/scripts/setup.mjs b/evals/workspace-template/scripts/setup.mjs
@@ -0,0 +1,95 @@
+#!/usr/bin/env node
+/**
+ * Workspace before_all hook: copy hivespec skills into the workspace for
+ * agent discovery, then initialise a git repo on a feature branch.
+ *
+ * Reads an optional JSON payload with `workspace_path` from stdin (sent by
+ * AgentV) and falls back to the current working directory when the payload
+ * is missing, malformed, or has no usable `workspace_path`.
+ */
+
+import { execSync } from 'node:child_process';
+import { cpSync, mkdirSync, readFileSync, readdirSync } from 'node:fs';
+import { join } from 'node:path';
+
+// Read workspace_path from stdin (provided by AgentV orchestrator)
+let workspacePath;
+try {
+  const stdin = readFileSync(0, 'utf8');
+  const context = JSON.parse(stdin);
+  workspacePath = context.workspace_path;
+} catch {
+  // No stdin, or stdin was not JSON: handled by the fallback below.
+}
+// Valid JSON without a string workspace_path (e.g. `{}`) would otherwise
+// reach join() as undefined and crash with a TypeError.
+if (typeof workspacePath !== 'string' || workspacePath === '') {
+  workspacePath = process.cwd();
+}
+
+// Resolve repo root from cwd (eval dir is inside the repo)
+let repoRoot;
+try {
+  repoRoot = execSync('git rev-parse --show-toplevel', {
+    encoding: 'utf8',
+  }).trim();
+} catch {
+  console.error('Failed to resolve repo root from cwd:', process.cwd());
+  process.exit(1);
+}
+
+console.log(`Workspace: ${workspacePath}`);
+console.log(`Repo root: ${repoRoot}`);
+
+// Skill discovery directories populated in the workspace:
+//   .claude/skills/  — Claude CLI
+//   .agents/skills/  — Pi CLI / Pi Coding Agent, Codex
+//   .pi/skills/      — presumably Pi as well (TODO confirm)
+// NOTE: .codex/skills/ is not created; Codex relies on .agents/skills/.
+const skillDirs = [
+  join(workspacePath, '.claude', 'skills'),
+  join(workspacePath, '.agents', 'skills'),
+  join(workspacePath, '.pi', 'skills'),
+];
+for (const dir of skillDirs) {
+  mkdirSync(dir, { recursive: true });
+}
+
+// Copy all hivespec skills from the repo root's skills/ directory
+const repoSkillsDir = join(repoRoot, 'skills');
+const skillNames = readdirSync(repoSkillsDir);
+
+for (const name of skillNames) {
+  const src = join(repoSkillsDir, name);
+  for (const dir of skillDirs) {
+    cpSync(src, join(dir, name), { recursive: true });
+  }
+  console.log(`Copied ${name}`);
+}
+
+for (const dir of skillDirs) {
+  console.log(`Skills in ${dir}: ${readdirSync(dir).join(', ')}`);
+}
+
+// Initialize git repo in workspace so ship/claim tests can use git commands.
+// The identity and signing flags are passed per-command because CI runners
+// and sandboxes often have no user.name/user.email configured, in which
+// case a plain `git commit` aborts and the repo is left without a commit.
+const commitFlags =
+  '-c user.name="AgentV Eval" -c user.email="agentv-eval@example.invalid" -c commit.gpgsign=false';
+try {
+  execSync(`git init && git add -A && git ${commitFlags} commit -m "initial commit"`, {
+    cwd: workspacePath,
+    encoding: 'utf8',
+    stdio: 'pipe',
+  });
+  execSync('git checkout -b feat/42-add-priority', {
+    cwd: workspacePath,
+    encoding: 'utf8',
+    stdio: 'pipe',
+  });
+  console.log('Git repo initialized with feat branch');
+} catch (e) {
+  // Kept non-fatal, as before: log the failure and let the hook finish.
+  console.error('Git init failed:', e.message);
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		FIRST ACTION: Read @AGENTS.md before any task.