diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml
new file mode 100644
index 0000000..6db76d4
--- /dev/null
+++ b/.agentv/targets.yaml
@@ -0,0 +1,41 @@
+# AgentV Evaluation Targets for HiveSpec
+# "grader" is the LLM used for scoring rubrics.
+
+targets:
+  # ── Grader (LLM-as-judge) ──────────────────────────────────────────
+  - name: grader
+    provider: gemini
+    model: gemini-3-flash-preview
+    api_key: "${{ GOOGLE_GENERATIVE_AI_API_KEY }}"
+
+  # ── LLM targets (direct model access) ─────────────────────────────
+  - name: gemini-flash
+    provider: gemini
+    model: gemini-3-flash-preview
+    api_key: "${{ GOOGLE_GENERATIVE_AI_API_KEY }}"
+    grader_target: grader
+
+  - name: gemini-llm
+    provider: gemini
+    model: "${{ GEMINI_MODEL_NAME }}"
+    api_key: "${{ GOOGLE_GENERATIVE_AI_API_KEY }}"
+    grader_target: grader
+
+  - name: azure-llm
+    provider: azure
+    endpoint: "${{ AZURE_OPENAI_ENDPOINT }}"
+    api_key: "${{ AZURE_OPENAI_API_KEY }}"
+    model: "${{ AZURE_DEPLOYMENT_NAME }}"
+    version: "${{ AZURE_OPENAI_API_VERSION }}"
+    grader_target: grader
+
+  # ── Agent targets ───────────────────────────────────────────────────
+  - name: claude
+    provider: claude
+    grader_target: grader
+    log_format: json
+
+  - name: claude-sdk
+    provider: claude-sdk
+    grader_target: grader
+    log_format: json
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9339a59
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.env
+.agentv/results/
+.agentv/cache.json
diff --git a/evals/hs-ship.eval.yaml b/evals/hs-ship.eval.yaml
new file mode 100644
index 0000000..6b4d0d9
--- /dev/null
+++ b/evals/hs-ship.eval.yaml
@@ -0,0 +1,81 @@
+description: Evaluates that hs-ship detects prepush hooks and adjusts the verification gate accordingly
+
+workspace:
+  template: ./workspace-template
+  hooks:
+    before_all:
+      command:
+        - node
+        - "{{workspace_path}}/scripts/setup.mjs"
+
+input:
+  - role: user
+    content:
+      - type: file
+        value: "/skills/hs-ship/SKILL.md"
+
+tests:
+  - id: skips-verification-gate-with-prepush-hooks
+    criteria: Agent skips redundant build/test/lint before push when prepush hooks cover it
+    input:
+      - role: user
+        content: |
+          Verification is complete — E2E red/green passed, blast radius checked,
+          code review done. Time to ship. Check whether the repo has prepush hooks
+          before deciding what to run at the final verification gate.
+    assertions:
+      - type: rubrics
+        criteria:
+          - Detects prek.toml (or other prepush hook config) in the workspace
+          - Skips running build/test/lint manually at the final verification gate
+          - Explains that git push will enforce build/test/lint via hooks
+          - Proceeds to push and mark ready without redundant manual checks
+
+  - id: runs-verification-gate-without-hooks
+    criteria: Agent runs build/test/lint at the verification gate when no hooks exist
+    input:
+      - role: user
+        content: |
+          Verification is complete — E2E red/green passed, blast radius checked,
+          code review done. Time to ship. Note: this repo has no git hooks configured —
+          there is no .husky/, no prek.toml, no lefthook.yml, and no lint-staged config.
+    assertions:
+      - type: rubrics
+        criteria:
+          - Notes the absence of prepush hooks
+          - Runs or recommends running build, test, and lint before pushing
+          - Does not skip the verification gate when no hooks enforce it
+          - Shows or expects command output as evidence
+
+  - id: requires-verification-before-shipping
+    criteria: Agent refuses to ship without hs-verify evidence
+    input:
+      - role: user
+        content: |
+          I've finished implementing the priority field. The code changes are done.
+          Ship it — I haven't run any tests yet but I'm confident it works.
+    assertions:
+      - type: rubrics
+        criteria:
+          - Does not agree to merge or push without verification
+          - Insists on running hs-verify or equivalent verification first
+          - Explains that confidence is not a substitute for verification evidence
+          - References the hard gate requiring passing verification before shipping
+
+  - id: risk-classification
+    criteria: Agent correctly classifies breaking vs docs-only changes
+    input:
+      - role: user
+        content: |
+          Here is what the PR changes:
+          - Adds a required `priority` field to the Task interface (breaking change)
+          - Modifies the API response format in src/api/index.ts to include priority
+          - All tests pass and verification is complete.
+
+          Should I auto-merge this or get review first?
+    assertions:
+      - type: rubrics
+        criteria:
+          - Classifies this as elevated risk due to breaking interface or API changes
+          - Recommends review or explicit confirmation before merging
+          - Does not recommend auto-merge for breaking changes
diff --git a/evals/hs-verify.eval.yaml b/evals/hs-verify.eval.yaml
new file mode 100644
index 0000000..c4aea5f
--- /dev/null
+++ b/evals/hs-verify.eval.yaml
@@ -0,0 +1,80 @@
+description: Evaluates that hs-verify detects prepush hooks and adjusts verification accordingly
+
+workspace:
+  template: ./workspace-template
+  hooks:
+    before_all:
+      command:
+        - node
+        - "{{workspace_path}}/scripts/setup.mjs"
+
+input:
+  - role: user
+    content:
+      - type: file
+        value: "/skills/hs-verify/SKILL.md"
+
+tests:
+  - id: skips-build-test-lint-with-prepush-hooks
+    criteria: Agent detects prek.toml prepush hooks and skips manual build/test/lint
+    input:
+      - role: user
+        content: |
+          I've finished implementing the priority field feature.
+          Verify that it works. Check the repo for any hook configuration
+          before deciding what to run manually.
+    assertions:
+      - type: rubrics
+        criteria:
+          - Detects prek.toml (or other prepush hook config) in the workspace
+          - Explains that prepush hooks already cover build, test, and lint
+          - Skips running build/test/lint manually or explains why manual run is unnecessary
+          - Proceeds to E2E verification as the primary verification step
+
+  - id: runs-build-test-lint-without-hooks
+    criteria: Agent runs build/test/lint manually when no prepush hooks exist
+    input:
+      - role: user
+        content: |
+          I've finished implementing the priority field feature.
+          Verify that it works. Note: this repo has no git hooks configured —
+          there is no .husky/, no prek.toml, no lefthook.yml, and no lint-staged config.
+    assertions:
+      - type: rubrics
+        criteria:
+          - Notes the absence of prepush hooks
+          - Runs or attempts to run build, test, and lint commands manually
+          - Shows command output as evidence of execution
+          - Does not skip build/test/lint when no hooks are present
+
+  - id: runs-actual-e2e-verification
+    criteria: Agent runs E2E red/green verification regardless of hook status
+    input:
+      - role: user
+        content: |
+          I've finished implementing the priority field feature.
+          The repo has prek.toml for prepush hooks covering build/test/lint.
+          Verify the feature works end-to-end.
+    assertions:
+      - type: rubrics
+        criteria:
+          - Proceeds to E2E verification even though hooks cover build/test/lint
+          - Does not substitute unit test output or hook coverage for E2E verification
+          - Attempts red/green protocol or explains how to verify the feature as a user would
+          - Treats E2E as the primary value of verification, not build/test/lint
+
+  - id: rejects-claims-without-evidence
+    criteria: Agent refuses to accept completion claims without verification output
+    input:
+      - role: user
+        content: "All 50 tests pass and the build is clean. Ship it."
+      - role: assistant
+        content: "Let me verify that independently."
+      - role: user
+        content: "No need, I already checked. Just ship."
+    assertions:
+      - type: rubrics
+        criteria:
+          - Does not accept the user's claim at face value
+          - Insists on running verification commands independently
+          - Explains that verification evidence must come from actual command output
diff --git a/evals/workspace-template/AGENTS.md b/evals/workspace-template/AGENTS.md
new file mode 100644
index 0000000..efc4e0a
--- /dev/null
+++ b/evals/workspace-template/AGENTS.md
@@ -0,0 +1,22 @@
+# Task Tracker
+
+A minimal task management CLI and API.
+
+## Conventions
+
+- **Worktree location:** `../task-tracker.worktrees/<branch-name>`
+- **Branch naming:** `<type>/<issue-number>-<short-desc>`
+- **Commit format:** Conventional commits — `type(scope): description`
+- **Test command:** `npx vitest run`
+- **Lint command:** `npx biome check .`
+- **Build command:** `npx tsc --noEmit`
+
+## Architecture
+
+- `src/models/` — Data types and interfaces
+- `src/services/` — Business logic
+- `src/cli/` — CLI entry point
+- `src/api/` — API entry point (plain `handleRequest` function; no HTTP framework)
+- `src/utils/` — Shared utilities
+
+Both CLI and API modes use the same service layer. Changes to services or models must be tested through both entry points.
diff --git a/evals/workspace-template/CLAUDE.md b/evals/workspace-template/CLAUDE.md
new file mode 100644
index 0000000..08f755b
--- /dev/null
+++ b/evals/workspace-template/CLAUDE.md
@@ -0,0 +1 @@
+**FIRST ACTION**: Read @AGENTS.md before any task.
diff --git a/evals/workspace-template/README.md b/evals/workspace-template/README.md
new file mode 100644
index 0000000..87f1bb8
--- /dev/null
+++ b/evals/workspace-template/README.md
@@ -0,0 +1,22 @@
+# Task Tracker
+
+A minimal task management CLI and API.
+
+## Usage
+
+```bash
+# Add a task
+task-tracker add "Buy groceries"
+
+# List tasks
+task-tracker list
+```
+
+## Development
+
+```bash
+npm install
+npm run build
+npm test
+npm run lint
+```
diff --git a/evals/workspace-template/biome.json b/evals/workspace-template/biome.json
new file mode 100644
index 0000000..a1591f5
--- /dev/null
+++ b/evals/workspace-template/biome.json
@@ -0,0 +1,17 @@
+{
+  "$schema": "https://biomejs.dev/schemas/1.9.0/schema.json",
+  "organizeImports": {
+    "enabled": true
+  },
+  "linter": {
+    "enabled": true,
+    "rules": {
+      "recommended": true
+    }
+  },
+  "formatter": {
+    "enabled": true,
+    "indentStyle": "space",
+    "indentWidth": 2
+  }
+}
diff --git a/evals/workspace-template/package.json b/evals/workspace-template/package.json
new file mode 100644
index 0000000..1d05846
--- /dev/null
+++ b/evals/workspace-template/package.json
@@ -0,0 +1,15 @@
+{
+  "name": "task-tracker",
+  "version": "1.0.0",
+  "type": "module",
+  "scripts": {
+    "build": "tsc --noEmit",
+    "test": "vitest run",
+    "lint": "biome check ."
+  },
+  "devDependencies": {
+    "typescript": "^5.8.0",
+    "vitest": "^3.0.0",
+    "@biomejs/biome": "^1.9.0"
+  }
+}
diff --git a/evals/workspace-template/prek.toml b/evals/workspace-template/prek.toml
new file mode 100644
index 0000000..126886b
--- /dev/null
+++ b/evals/workspace-template/prek.toml
@@ -0,0 +1,14 @@
+# prek — pre-push hook runner
+# Runs build, test, and lint before every `git push`
+
+[[hooks.pre-push]]
+name = "build"
+run = "bun run build"
+
+[[hooks.pre-push]]
+name = "test"
+run = "bun run test"
+
+[[hooks.pre-push]]
+name = "lint"
+run = "bun run lint"
diff --git a/evals/workspace-template/scripts/setup.mjs b/evals/workspace-template/scripts/setup.mjs
new file mode 100644
index 0000000..d2aff0a
--- /dev/null
+++ b/evals/workspace-template/scripts/setup.mjs
@@ -0,0 +1,80 @@
+#!/usr/bin/env node
+/**
+ * Workspace before_all hook: copy hivespec skills into the workspace
+ * for agent discovery. Receives workspace_path via stdin JSON from AgentV.
+ */
+
+import { execSync } from 'node:child_process';
+import { cpSync, mkdirSync, readFileSync, readdirSync } from 'node:fs';
+import { join } from 'node:path';
+
+// Read workspace_path from stdin (provided by AgentV orchestrator)
+let workspacePath;
+try {
+  const stdin = readFileSync(0, 'utf8');
+  const context = JSON.parse(stdin);
+  workspacePath = context.workspace_path;
+} catch {
+  workspacePath = process.cwd();
+}
+
+// Resolve repo root from cwd (eval dir is inside the repo)
+let repoRoot;
+try {
+  repoRoot = execSync('git rev-parse --show-toplevel', {
+    encoding: 'utf8',
+  }).trim();
+} catch {
+  console.error('Failed to resolve repo root from cwd:', process.cwd());
+  process.exit(1);
+}
+
+console.log(`Workspace: ${workspacePath}`);
+console.log(`Repo root: ${repoRoot}`);
+
+// Copy to skill discovery directories in the workspace
+// Each provider discovers skills from a different path:
+//   Claude CLI: .claude/skills/
+//   Pi CLI / Pi Coding Agent: .agents/skills/
+//   Codex: .agents/skills/ or .codex/skills/
+const skillDirs = [
+  join(workspacePath, '.claude', 'skills'),
+  join(workspacePath, '.agents', 'skills'),
+  join(workspacePath, '.pi', 'skills'),
+];
+for (const dir of skillDirs) {
+  mkdirSync(dir, { recursive: true });
+}
+
+// Copy all hivespec skills from the repo root's skills/ directory
+const repoSkillsDir = join(repoRoot, 'skills');
+const skillNames = readdirSync(repoSkillsDir);
+
+for (const name of skillNames) {
+  const src = join(repoSkillsDir, name);
+  for (const dir of skillDirs) {
+    cpSync(src, join(dir, name), { recursive: true });
+  }
+  console.log(`Copied ${name}`);
+}
+
+for (const dir of skillDirs) {
+  console.log(`Skills in ${dir}: ${readdirSync(dir).join(', ')}`);
+}
+
+// Initialize git repo in workspace so ship/claim tests can use git commands
+try {
+  execSync('git init && git add -A && git commit -m "initial commit"', {
+    cwd: workspacePath,
+    encoding: 'utf8',
+    stdio: 'pipe',
+  });
+  execSync('git checkout -b feat/42-add-priority', {
+    cwd: workspacePath,
+    encoding: 'utf8',
+    stdio: 'pipe',
+  });
+  console.log('Git repo initialized with feat branch');
+} catch (e) {
+  console.error('Git init failed:', e.message);
+}
diff --git a/evals/workspace-template/src/api/index.ts b/evals/workspace-template/src/api/index.ts
new file mode 100644
index 0000000..fed4420
--- /dev/null
+++ b/evals/workspace-template/src/api/index.ts
@@ -0,0 +1,26 @@
+import { addTask, listTasks } from '../services/task-service';
+import { formatTask } from '../utils/format-task';
+
+/**
+ * API handler — mirrors CLI functionality over HTTP.
+ * GET /tasks → 200 with formatted task lines; POST /tasks → 201 with the new
+ * task, or 400 unless `body.title` is a non-empty string; otherwise 404.
+ */
+export function handleRequest(
+  method: string,
+  path: string,
+  body?: Record<string, unknown>,
+): { status: number; body: unknown } {
+  if (method === 'GET' && path === '/tasks') {
+    return { status: 200, body: listTasks().map(formatTask) };
+  }
+
+  if (method === 'POST' && path === '/tasks') {
+    // `body` is untrusted: validate at runtime instead of asserting the type.
+    const title = typeof body?.title === 'string' ? body.title : '';
+    if (!title) return { status: 400, body: { error: 'title is required' } };
+    return { status: 201, body: addTask(title) };
+  }
+
+  return { status: 404, body: { error: 'not found' } };
+}
diff --git a/evals/workspace-template/src/cli/index.ts b/evals/workspace-template/src/cli/index.ts
new file mode 100644
index 0000000..f7cf249
--- /dev/null
+++ b/evals/workspace-template/src/cli/index.ts
@@ -0,0 +1,17 @@
+import { addTask, getFormattedTasks } from '../services/task-service';
+import { formatTask } from '../utils/format-task';
+
+/** Dispatches a CLI invocation (`add <title…>` or `list`) and returns the text to print. */
+export function runCli(args: string[]): string {
+  const [command, ...rest] = args;
+  if (command === 'add') {
+    const created = addTask(rest.join(' '));
+    return `Created: ${formatTask(created)}`;
+  }
+
+  if (command === 'list') {
+    return getFormattedTasks().join('\n') || 'No tasks found.';
+  }
+
+  return 'Usage: task-tracker [add|list] [args...]';
+}
diff --git a/evals/workspace-template/src/models/task.ts b/evals/workspace-template/src/models/task.ts
new file mode 100644
index 0000000..3fd8d2e
--- /dev/null
+++ b/evals/workspace-template/src/models/task.ts
@@ -0,0 +1,12 @@
+export interface Task { // A single tracked task; every field is read-only.
+  readonly id: string; // assigned by addTask as `task-<n>`
+  readonly title: string; // free-text title supplied by the caller
+  readonly status: 'todo' | 'in_progress' | 'done'; // addTask creates tasks as 'todo'
+  readonly createdAt: Date; // set by addTask when the task is created
+  readonly updatedAt: Date; // initialised by addTask alongside createdAt
+}
+
+export interface TaskFilter { // Optional criteria for listTasks; omitted fields do not filter.
+  readonly status?: Task['status']; // keep only tasks with exactly this status
+  readonly search?: string; // case-insensitive substring match against the title
+}
diff --git a/evals/workspace-template/src/reports/summary.ts b/evals/workspace-template/src/reports/summary.ts
new file mode 100644
index 0000000..dcbd9cd
--- /dev/null
+++ b/evals/workspace-template/src/reports/summary.ts
@@ -0,0 +1,11 @@
+import { listTasks } from '../services/task-service';
+import { formatTask } from '../utils/format-task';
+
+/**
+ * Builds the plain-text summary report; the third consumer of formatTask.
+ */
+export function generateSummary(): string {
+  const all = listTasks();
+  const header = [`Task Summary (${all.length} total)`, '---'];
+  return header.concat(all.map(formatTask)).join('\n');
+}
diff --git a/evals/workspace-template/src/services/task-service.ts b/evals/workspace-template/src/services/task-service.ts
new file mode 100644
index 0000000..88ea454
--- /dev/null
+++ b/evals/workspace-template/src/services/task-service.ts
@@ -0,0 +1,32 @@
+import type { Task, TaskFilter } from '../models/task';
+import { formatTask } from '../utils/format-task';
+
+const tasks: Task[] = [];
+
+/** Appends a new 'todo' task with a sequential `task-<n>` id and returns it. */
+export function addTask(title: string): Task {
+  const id = `task-${tasks.length + 1}`;
+  const createdAt = new Date();
+  const updatedAt = new Date();
+
+  const created: Task = { id, title, status: 'todo', createdAt, updatedAt };
+  tasks.push(created);
+
+  return created;
+}
+
+/** Returns a copy of the stored tasks narrowed by the optional filter. */
+export function listTasks(filter?: TaskFilter): Task[] {
+  const wantedStatus = filter?.status;
+  const needle = filter?.search ? filter.search.toLowerCase() : undefined;
+
+  return tasks.filter((task) => {
+    if (wantedStatus && task.status !== wantedStatus) return false;
+    if (needle !== undefined && !task.title.toLowerCase().includes(needle)) return false;
+    return true;
+  });
+}
+
+export function getFormattedTasks(filter?: TaskFilter): string[] {
+  return listTasks(filter).map((task) => formatTask(task)); // display lines for the matching tasks
+}
diff --git a/evals/workspace-template/src/utils/format-task.ts b/evals/workspace-template/src/utils/format-task.ts
new file mode 100644
index 0000000..4212b31
--- /dev/null
+++ b/evals/workspace-template/src/utils/format-task.ts
@@ -0,0 +1,19 @@
+import type { Task } from '../models/task';
+
+/** Shared by CLI, API, and reports: renders a task as "<icon> [<id>] <title>". */
+export function formatTask(task: Task): string {
+  let statusIcon = '○';
+  if (task.status === 'done') statusIcon = '✓';
+  else if (task.status === 'in_progress') statusIcon = '→';
+
+  return `${statusIcon} [${task.id}] ${task.title}`;
+}
+
+/**
+ * Partial implementation of priority derivation: in-progress tasks are 'high',
+ * everything else is 'medium'. Custom priority rules are not supported yet.
+ */
+export function derivePriority(task: Task): 'high' | 'medium' | 'low' {
+  // 'low' is part of the return type but is never produced here.
+  return task.status === 'in_progress' ? 'high' : 'medium';
+}
diff --git a/evals/workspace-template/tsconfig.json b/evals/workspace-template/tsconfig.json
new file mode 100644
index 0000000..9b274f2
--- /dev/null
+++ b/evals/workspace-template/tsconfig.json
@@ -0,0 +1,12 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "module": "ESNext",
+    "moduleResolution": "bundler",
+    "strict": true,
+    "noEmit": true,
+    "esModuleInterop": true,
+    "skipLibCheck": true
+  },
+  "include": ["src/**/*.ts"]
+}
diff --git a/skills/hs-ship/SKILL.md b/skills/hs-ship/SKILL.md
index aea6efe..10acbcd 100644
--- a/skills/hs-ship/SKILL.md
+++ b/skills/hs-ship/SKILL.md
@@ -21,13 +21,20 @@ Must have passing verification evidence from hs-verify before shipping. If no ve
 
 ### Step 1: Final verification gate
 
-Run the full check chain one last time:
+Check whether the repo's git prepush hooks already run build/test/lint:
+
+```bash
+# Check for prepush hook configuration (e.g., .husky/pre-push, prek.toml, lefthook.yml, lint-staged)
+```
+
+- **If prepush hooks cover build/test/lint:** skip the manual run — `git push` in Step 3 will enforce it. Proceed to blast radius check.
+- **If no prepush hooks (or partial coverage):** run the full check chain manually:
 
 ```bash
 bun run build && bun run test && bun run lint
 ```
 
-All must pass with output as evidence. Do not skip this even if hs-verify ran recently — code may have changed since.
+All must pass, with output as evidence. If any fails, fix it before proceeding.
 
 ### Step 2: Final blast radius check
 
diff --git a/skills/hs-verify/SKILL.md b/skills/hs-verify/SKILL.md
index c7492b0..07eb4a2 100644
--- a/skills/hs-verify/SKILL.md
+++ b/skills/hs-verify/SKILL.md
@@ -19,15 +19,23 @@ Prove the implementation works before claiming completion. E2E red/green testing
 
 ## Process
 
-### Step 1: Build, test, lint
+### Step 1: Check for prepush hooks, then build/test/lint if needed
 
-Run the full check chain and capture the output:
+Before running build/test/lint manually, check whether the repo's git prepush hooks already cover them:
+
+```bash
+# Check for prepush hook configuration (e.g., .husky/pre-push, prek.toml, lefthook.yml, lint-staged)
+# If hooks run build + test + lint on push, skip to Step 2 — the push will catch regressions.
+```
+
+- **If prepush hooks cover build/test/lint:** skip this step — `git push` will enforce it.
+- **If no prepush hooks (or partial coverage):** run the full check chain manually and capture output:
 
 ```bash
 bun run build && bun run test && bun run lint
 ```
 
-All three must pass. If any fails, fix it before proceeding.
+All must pass. If any fails, fix it before proceeding.
 
 ### Step 2: E2E red/green protocol
 
@@ -96,9 +104,7 @@ If a reviewer suggests "implementing properly" or adding abstraction, grep the c
 
 Before proceeding to hs-ship, confirm:
 
-- [ ] Build passes (with output)
-- [ ] All tests pass (with output showing test count)
-- [ ] Lint passes (with output)
+- [ ] Build/test/lint pass (either via prepush hooks or manual run, with output)
 - [ ] E2E red/green completed (with evidence of both states)
 - [ ] All execution modes tested
 - [ ] Blast radius check completed (no untouched consumers of modified interfaces)
@@ -115,7 +121,7 @@ Before proceeding to hs-ship, confirm:
 
 ## Hard Gates
 
-- Must run build, tests, and lint before claiming completion
+- Build, tests, and lint must pass before claiming completion (via prepush hooks or manual run)
 - Must have verification command output as evidence
-- E2E must show red-then-green (not just green)
+- E2E must show red-then-green (not just green) — this is the primary value of hs-verify, since unit tests and hooks cannot cover manual E2E scenarios
 - Must check blast radius for any change to types, interfaces, or shared utilities