From 5a53322a76bc2499b2245ed5e098f9e84d6b7030 Mon Sep 17 00:00:00 2001
From: Caleb Leak <caleb.leak@gmail.com>
Date: Sat, 2 May 2026 16:19:45 -0700
Subject: [PATCH 1/4] feat(snapshot): dedupe rebuild work, lock per key, and
 prune the store
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Eliminates three sources of waste in the shared snapshot index store
under `<git-common-dir>/tempyr/snapshots/<key>/`:

1. `tempyr index rebuild` short-circuits to a free seed when the shared
   snapshot for the current graph state already exists, instead of
   rebuilding from scratch and producing identical bytes. Adds `--force`
   for corruption recovery.

2. New `SnapshotBuildLock` (per snapshot key, exclusive file lock under
   `snapshots/.locks/`) serializes concurrent rebuilds. A second worktree
   racing on the same never-seen graph state waits for the first to
   publish, then short-circuits through path 1.

3. New `tempyr snapshot {prune,list}` commands implement Nix-style
   hybrid retention: pinned (live worktree cursors cross-checked against
   `git worktree list`) ∪ recent buffer (`--keep-recent`, default 20) ∪
   soft size cap (`--max-size`, default 500 MB). Two-phase rename-then-
   remove avoids open-file races on Windows. Auto-runs at the tail of
   `index rebuild` and `index update`.

`tempyr doctor` now reports snapshot-store dir count + total bytes with
a `consider \`tempyr snapshot prune\`` hint above the configurable
thresholds. `is_snapshot_key` and `short_path_hash` are exposed from
`tempyr-core::project` so health and prune use identical filtering.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 Cargo.lock                                    |  33 +-
 crates/tempyr-cli/Cargo.toml                  |   1 +
 crates/tempyr-cli/src/commands/doctor.rs      |  75 ++
 crates/tempyr-cli/src/commands/index_cmd.rs   | 395 ++++++++-
 crates/tempyr-cli/src/commands/init.rs        |   2 +-
 crates/tempyr-cli/src/commands/mod.rs         |   1 +
 .../tempyr-cli/src/commands/snapshot_cmd.rs   | 792 ++++++++++++++++++
 crates/tempyr-cli/src/config.rs               |  20 +-
 crates/tempyr-cli/src/main.rs                 |  71 +-
 crates/tempyr-core/src/project.rs             | 135 ++-
 crates/tempyr-index/Cargo.toml                |   1 +
 crates/tempyr-index/src/health.rs             |  57 ++
 12 files changed, 1566 insertions(+), 17 deletions(-)
 create mode 100644 crates/tempyr-cli/src/commands/snapshot_cmd.rs
diff --git a/Cargo.lock b/Cargo.lock
index dae51aa..2d15015 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -906,6 +906,17 @@ dependencies = [
  "simd-adler32",
 ]
 
+[[package]]
+name = "filetime"
+version = "0.2.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "libredox",
+]
+
 [[package]]
 name = "find-msvc-tools"
 version = "0.1.9"
@@ -1657,7 +1668,10 @@ version = "0.1.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7ddbf48fd451246b1f8c2610bd3b4ac0cc6e149d89832867093ab69a17194f08"
 dependencies = [
+ "bitflags",
  "libc",
+ "plain",
+ "redox_syscall 0.7.4",
 ]
 
 [[package]]
@@ -2096,7 +2110,7 @@ checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
 dependencies = [
  "cfg-if",
  "libc",
- "redox_syscall",
+ "redox_syscall 0.5.18",
  "smallvec",
  "windows-link",
 ]
@@ -2152,6 +2166,12 @@ version = "0.3.32"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
 
+[[package]]
+name = "plain"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
+
 [[package]]
 name = "png"
 version = "0.18.1"
@@ -2454,6 +2474,15 @@ dependencies = [
  "bitflags",
 ]
 
+[[package]]
+name = "redox_syscall"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f450ad9c3b1da563fb6948a8e0fb0fb9269711c9c73d9ea1de5058c79c8d643a"
+dependencies = [
+ "bitflags",
+]
+
 [[package]]
 name = "redox_users"
 version = "0.5.2"
@@ -3104,6 +3133,7 @@ dependencies = [
  "chrono",
  "clap",
  "crossterm",
+ "filetime",
  "libc",
  "predicates",
  "ratatui",
@@ -3161,6 +3191,7 @@ dependencies = [
  "thiserror",
  "tokio",
  "toml",
+ "walkdir",
 ]
 
 [[package]]
diff --git a/crates/tempyr-cli/Cargo.toml b/crates/tempyr-cli/Cargo.toml
index d991bb4..5f6f06f 100644
--- a/crates/tempyr-cli/Cargo.toml
+++ b/crates/tempyr-cli/Cargo.toml
@@ -34,6 +34,7 @@ tempfile = { workspace = true }
 assert_cmd = "2"
 predicates = "3"
 rmcp = { workspace = true }
+filetime = "0.2"
 
 [target.'cfg(unix)'.dev-dependencies]
 libc = { workspace = true }
diff --git a/crates/tempyr-cli/src/commands/doctor.rs b/crates/tempyr-cli/src/commands/doctor.rs
index a8e574b..fbb19f6 100644
--- a/crates/tempyr-cli/src/commands/doctor.rs
+++ b/crates/tempyr-cli/src/commands/doctor.rs
@@ -5,6 +5,7 @@
 
 use std::io::{self, Write};
 
+use crate::commands::snapshot_cmd::{SNAPSHOT_STORE_HINT_BYTES, SNAPSHOT_STORE_HINT_DIRS};
 use crate::config::ProjectContext;
 use tempyr_index::health::{self, HealthInputs, HealthReport};
 use tempyr_journal::path as jpath;
@@ -197,6 +198,21 @@ pub(crate) fn render_text(out: &mut impl Write, report: &HealthReport) -> io::Re
     if let Some(count) = report.index.embedding_count_for_index {
         writeln!(out, "  embedded nodes: {count}")?;
     }
+    if let (Some(count), Some(bytes)) = (
+        report.index.snapshot_store_count,
+        report.index.snapshot_store_bytes,
+    ) {
+        let hint = if count > SNAPSHOT_STORE_HINT_DIRS || bytes > SNAPSHOT_STORE_HINT_BYTES {
+            "consider `tempyr snapshot prune`"
+        } else {
+            "ok"
+        };
+        writeln!(
+            out,
+            "  snapshot store: {count} dirs, {} ({hint})",
+            human_bytes(bytes),
+        )?;
+    }
 
     if let Some(journal) = &report.journal {
         writeln!(out, "\nJournal")?;
@@ -255,6 +271,21 @@ fn missing_marker(exists: bool) -> &'static str {
     if exists { "" } else { " (MISSING)" }
 }
 
+fn human_bytes(bytes: u64) -> String {
+    const KB: u64 = 1024;
+    const MB: u64 = KB * 1024;
+    const GB: u64 = MB * 1024;
+    if bytes >= GB {
+        format!("{:.2} GB", bytes as f64 / GB as f64)
+    } else if bytes >= MB {
+        format!("{:.1} MB", bytes as f64 / MB as f64)
+    } else if bytes >= KB {
+        format!("{:.1} KB", bytes as f64 / KB as f64)
+    } else {
+        format!("{bytes} B")
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -318,6 +349,8 @@ mod tests {
                 snapshot_key_error: None,
                 fts_entries: None,
                 embedding_count_for_index: None,
+                snapshot_store_count: None,
+                snapshot_store_bytes: None,
             },
             journal: None,
             warnings: vec!["something is amiss".to_string()],
@@ -397,6 +430,48 @@ mod tests {
         assert!(!out.contains("\nJournal\n"));
     }
 
+    #[test]
+    fn human_bytes_formats_known_magnitudes() {
+        assert_eq!(human_bytes(0), "0 B");
+        assert_eq!(human_bytes(512), "512 B");
+        assert_eq!(human_bytes(2 * 1024), "2.0 KB");
+        assert_eq!(human_bytes(3 * 1024 * 1024), "3.0 MB");
+        assert_eq!(human_bytes(4 * 1024 * 1024 * 1024), "4.00 GB");
+    }
+
+    #[test]
+    fn render_text_includes_snapshot_store_when_populated() {
+        let mut report = fixture_report();
+        report.index.snapshot_store_count = Some(884);
+        report.index.snapshot_store_bytes = Some(1_649_267_441); // ~1.54 GB
+        let out = render_to_string(&report);
+
+        assert!(out.contains("snapshot store: 884 dirs"));
+        assert!(out.contains("GB"));
+        assert!(
+            out.contains("consider `tempyr snapshot prune`"),
+            "high-usage stores should hint at prune; got:\n{out}"
+        );
+    }
+
+    #[test]
+    fn render_text_marks_small_snapshot_store_ok() {
+        let mut report = fixture_report();
+        report.index.snapshot_store_count = Some(5);
+        report.index.snapshot_store_bytes = Some(8 * 1024 * 1024);
+        let out = render_to_string(&report);
+        assert!(out.contains("snapshot store: 5 dirs"));
+        assert!(out.contains("ok"));
+        assert!(!out.contains("consider `tempyr snapshot prune`"));
+    }
+
+    #[test]
+    fn render_text_omits_snapshot_store_when_unprobed() {
+        let report = fixture_report(); // snapshot_store_count is None
+        let out = render_to_string(&report);
+        assert!(!out.contains("snapshot store:"));
+    }
+
     #[test]
     fn render_text_includes_journal_section_when_present() {
         use tempyr_journal::JournalHealthReport;
diff --git a/crates/tempyr-cli/src/commands/index_cmd.rs b/crates/tempyr-cli/src/commands/index_cmd.rs
index 9bbc283..f04b5a2 100644
--- a/crates/tempyr-cli/src/commands/index_cmd.rs
+++ b/crates/tempyr-cli/src/commands/index_cmd.rs
@@ -1,15 +1,139 @@
 use std::path::Path;
+use std::thread;
+use std::time::{Duration, Instant};
 
+use crate::commands::snapshot_cmd::{self, PruneOptions};
 use crate::config::ProjectContext;
 use tempyr_core::graph::Graph;
+use tempyr_core::project::SnapshotBuildLock;
 use tempyr_index::embeddings::{self, EmbeddingStore};
 use tempyr_index::indexer::Index;
 
-pub fn run_rebuild(ctx: &ProjectContext, json: bool, skip_embeddings: bool) -> anyhow::Result<()> {
+/// How long to wait for a competing builder to finish before giving up.
+/// Used both for the "wait for the other process to publish the snapshot"
+/// poll and the "block on the build lock" retry. Builds are typically a
+/// few seconds; the budget is generous to cover cold caches.
+const BUILD_LOCK_WAIT: Duration = Duration::from_secs(60);
+
+/// Polling interval while waiting for another process to publish a snapshot
+/// or release the build lock.
+const BUILD_LOCK_POLL: Duration = Duration::from_millis(100);
+
+/// Outcome of negotiating with concurrent rebuilds for the current snapshot.
+#[cfg_attr(test, derive(Debug))]
+enum RebuildSlot {
+    /// Caller holds the build lock and must rebuild the snapshot from scratch.
+    /// Held until the caller drops the value at end of scope.
+    Build(#[allow(dead_code)] SnapshotBuildLock),
+    /// The shared snapshot exists; caller should seed-and-report rather than
+    /// rebuild. The bool records whether another process built it during this
+    /// invocation (true) or whether it was already there at entry (false).
+    UseExisting { built_by_other: bool },
+}
+
+pub fn run_rebuild(
+    ctx: &ProjectContext,
+    json: bool,
+    skip_embeddings: bool,
+    force: bool,
+) -> anyhow::Result<()> {
     let graph = Graph::load_from_directory(&ctx.graph_dir, ctx.schema.clone())?;
-    let (snapshot_key, index_path) = ctx.ensure_active_index_seeded()?;
+    let (snapshot_key, _) = ctx.ensure_active_index_seeded()?;
+    let shared = ctx.shared_snapshot_index_path()?;
+
+    let slot = negotiate_rebuild_slot(ctx, &shared, force, BUILD_LOCK_WAIT)?;
+    if let RebuildSlot::UseExisting { built_by_other } = slot {
+        return seed_and_report(
+            ctx,
+            &graph,
+            &snapshot_key,
+            json,
+            skip_embeddings,
+            built_by_other,
+        );
+    }
+
+    rebuild_from_scratch(ctx, &graph, &snapshot_key, json, skip_embeddings)?;
+
+    // Best-effort snapshot store maintenance. Failures here do not break the
+    // rebuild — pruning is a hygiene step.
+    let _ = snapshot_cmd::run_prune(
+        ctx,
+        &PruneOptions::default(),
+        false,
+        snapshot_cmd::PruneOutput::Silent,
+    );
+
+    Ok(())
+}
+
+/// Decide whether this caller should rebuild the snapshot or reuse an
+/// existing one, while serializing against any concurrent builder.
+///
+/// Returns:
+/// - [`RebuildSlot::UseExisting`] when `force = false` and either (a) the
+///   shared snapshot is already on disk at entry, or (b) it appears while
+///   we wait for another process to publish.
+/// - [`RebuildSlot::Build`] holding the per-key build lock, when this
+///   caller must rebuild. Drop the value to release.
+///
+/// `wait_budget` bounds **both** the wait-for-publish poll and the
+/// block-acquire retry, so the worst-case total wait is roughly
+/// `2 * wait_budget`. Tests can pass a small budget to exercise the
+/// blocking path without sleeping for the production default.
+fn negotiate_rebuild_slot(
+    ctx: &ProjectContext,
+    shared: &Path,
+    force: bool,
+    wait_budget: Duration,
+) -> anyhow::Result<RebuildSlot> {
+    if !force && shared.exists() {
+        return Ok(RebuildSlot::UseExisting {
+            built_by_other: false,
+        });
+    }
+
+    if let Some(lock) = ctx.try_acquire_snapshot_build_lock()? {
+        if !force && shared.exists() {
+            // Another process won the race and published while we were
+            // taking the lock. Release and use what they built.
+            drop(lock);
+            return Ok(RebuildSlot::UseExisting {
+                built_by_other: true,
+            });
+        }
+        return Ok(RebuildSlot::Build(lock));
+    }
+
+    // Lock is held by another rebuild. Wait for them to publish.
+    wait_for_snapshot(shared, wait_budget);
+    if !force && shared.exists() {
+        return Ok(RebuildSlot::UseExisting {
+            built_by_other: true,
+        });
+    }
+
+    // Other builder didn't publish in time. Block on the lock so the work
+    // is still serialized — better to wait than to double-build.
+    let lock = acquire_build_lock_blocking(ctx, wait_budget)?;
+    if !force && shared.exists() {
+        drop(lock);
+        return Ok(RebuildSlot::UseExisting {
+            built_by_other: true,
+        });
+    }
+    Ok(RebuildSlot::Build(lock))
+}
+
+fn rebuild_from_scratch(
+    ctx: &ProjectContext,
+    graph: &Graph,
+    snapshot_key: &str,
+    json: bool,
+    skip_embeddings: bool,
+) -> anyhow::Result<()> {
+    let (_key, index_path) = ctx.ensure_active_index_seeded()?;
 
-    // Remove existing index
     if index_path.exists() {
         std::fs::remove_file(&index_path)?;
     }
@@ -18,12 +142,11 @@ pub fn run_rebuild(ctx: &ProjectContext, json: bool, skip_embeddings: bool) -> a
     }
 
     let index = Index::create(&index_path)?;
-    let stats = index.rebuild(&graph)?;
+    let stats = index.rebuild(graph)?;
 
-    // Try to generate embeddings
-    let embed_result = maybe_embed(&graph, ctx, skip_embeddings);
-    ctx.write_active_snapshot_key(&snapshot_key)?;
-    ctx.publish_active_snapshot(&snapshot_key)?;
+    let embed_result = maybe_embed(graph, ctx, skip_embeddings);
+    ctx.write_active_snapshot_key(snapshot_key)?;
+    ctx.publish_active_snapshot(snapshot_key)?;
 
     if json {
         let mut result = serde_json::json!({
@@ -31,6 +154,7 @@ pub fn run_rebuild(ctx: &ProjectContext, json: bool, skip_embeddings: bool) -> a
             "edge_count": stats.edge_count,
             "fts_entries": stats.fts_entries,
             "nodes_by_type": stats.nodes_by_type,
+            "source": "rebuilt",
         });
         if let Ok(ref es) = embed_result {
             result["embeddings"] = serde_json::json!({
@@ -54,12 +178,96 @@ pub fn run_rebuild(ctx: &ProjectContext, json: bool, skip_embeddings: bool) -> a
     Ok(())
 }
 
+fn wait_for_snapshot(path: &Path, max: Duration) {
+    let deadline = Instant::now() + max;
+    while Instant::now() < deadline {
+        if path.exists() {
+            return;
+        }
+        thread::sleep(BUILD_LOCK_POLL);
+    }
+}
+
+fn acquire_build_lock_blocking(
+    ctx: &ProjectContext,
+    wait_budget: Duration,
+) -> anyhow::Result<SnapshotBuildLock> {
+    let deadline = Instant::now() + wait_budget;
+    loop {
+        if let Some(lock) = ctx.try_acquire_snapshot_build_lock()? {
+            return Ok(lock);
+        }
+        if Instant::now() >= deadline {
+            anyhow::bail!(
+                "Could not acquire snapshot build lock within {wait_budget:?}; another rebuild is still running."
+            );
+        }
+        thread::sleep(BUILD_LOCK_POLL);
+    }
+}
+
+fn seed_and_report(
+    ctx: &ProjectContext,
+    graph: &Graph,
+    snapshot_key: &str,
+    json: bool,
+    skip_embeddings: bool,
+    waited_for_concurrent_builder: bool,
+) -> anyhow::Result<()> {
+    let index_path = ctx.queryable_index_path()?;
+    let index = Index::open(&index_path)?;
+    let stats = index.stats()?;
+    let embed_result = maybe_embed(graph, ctx, skip_embeddings);
+    ctx.write_active_snapshot_key(snapshot_key)?;
+
+    let source = if waited_for_concurrent_builder {
+        "snapshot_built_by_other_process"
+    } else {
+        "existing_snapshot"
+    };
+
+    if json {
+        let mut result = serde_json::json!({
+            "node_count": stats.node_count,
+            "edge_count": stats.edge_count,
+            "fts_entries": stats.fts_entries,
+            "nodes_by_type": stats.nodes_by_type,
+            "source": source,
+            "snapshot_key": snapshot_key,
+        });
+        if let Ok(ref es) = embed_result {
+            result["embeddings"] = serde_json::json!({
+                "embedded": es.embedded,
+                "cached": es.skipped,
+                "dimensions": es.dimensions,
+            });
+        }
+        println!("{}", serde_json::to_string_pretty(&result)?);
+    } else {
+        let prefix = if waited_for_concurrent_builder {
+            "Index reused after concurrent build"
+        } else {
+            "Index reused from existing snapshot"
+        };
+        println!(
+            "{prefix}: {} nodes, {} edges, {} FTS entries (snapshot {snapshot_key})",
+            stats.node_count, stats.edge_count, stats.fts_entries
+        );
+        for (node_type, count) in &stats.nodes_by_type {
+            println!("  {node_type}: {count}");
+        }
+        render_embedding_message(&embed_result);
+    }
+
+    Ok(())
+}
+
 pub fn run_update(ctx: &ProjectContext, json: bool, skip_embeddings: bool) -> anyhow::Result<()> {
     let graph = Graph::load_from_directory(&ctx.graph_dir, ctx.schema.clone())?;
     let (snapshot_key, index_path) = ctx.ensure_active_index_seeded()?;
 
     if !index_path.exists() {
-        return run_rebuild(ctx, json, skip_embeddings);
+        return run_rebuild(ctx, json, skip_embeddings, false);
     }
 
     let index = Index::open(&index_path)?;
@@ -87,6 +295,14 @@ pub fn run_update(ctx: &ProjectContext, json: bool, skip_embeddings: bool) -> an
         render_embedding_message(&embed_result);
     }
 
+    // Tier 3: best-effort snapshot store maintenance.
+    let _ = snapshot_cmd::run_prune(
+        ctx,
+        &PruneOptions::default(),
+        false,
+        snapshot_cmd::PruneOutput::Silent,
+    );
+
     Ok(())
 }
 
@@ -228,3 +444,164 @@ fn render_stats(
 
     Ok(())
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::fs;
+    use tempyr_core::project as project_mod;
+
+    fn make_ctx(tmp: &Path) -> ProjectContext {
+        let tempyr_dir = tmp.join(".tempyr");
+        fs::create_dir_all(&tempyr_dir).unwrap();
+        fs::create_dir_all(tmp.join("graph")).unwrap();
+        fs::write(tempyr_dir.join("schema.toml"), "name = 'x'\n").unwrap();
+        let cache = project_mod::cache_layout(tmp, &tempyr_dir);
+        ProjectContext {
+            root: tmp.to_path_buf(),
+            graph_dir: tmp.join("graph"),
+            tempyr_dir,
+            cache,
+            schema: tempyr_core::schema::Schema {
+                meta: tempyr_core::schema::SchemaMeta {
+                    version: "1".to_string(),
+                    description: String::new(),
+                },
+                node_types: Default::default(),
+                edge_types: Default::default(),
+            },
+        }
+    }
+
+    fn touch_shared_snapshot(ctx: &ProjectContext) {
+        let shared = ctx.shared_snapshot_index_path().unwrap();
+        fs::create_dir_all(shared.parent().unwrap()).unwrap();
+        fs::write(&shared, b"pretend index").unwrap();
+    }
+
+    #[test]
+    fn slot_uses_existing_when_snapshot_present_and_not_forced() {
+        let tmp = tempfile::tempdir().unwrap();
+        let ctx = make_ctx(tmp.path());
+        touch_shared_snapshot(&ctx);
+        let shared = ctx.shared_snapshot_index_path().unwrap();
+
+        let slot = negotiate_rebuild_slot(&ctx, &shared, false, Duration::from_millis(50)).unwrap();
+
+        assert!(matches!(
+            slot,
+            RebuildSlot::UseExisting {
+                built_by_other: false
+            }
+        ));
+    }
+
+    #[test]
+    fn slot_takes_lock_to_build_when_snapshot_missing() {
+        let tmp = tempfile::tempdir().unwrap();
+        let ctx = make_ctx(tmp.path());
+        let shared = ctx.shared_snapshot_index_path().unwrap();
+
+        let slot = negotiate_rebuild_slot(&ctx, &shared, false, Duration::from_millis(50)).unwrap();
+
+        assert!(matches!(slot, RebuildSlot::Build(_)));
+    }
+
+    #[test]
+    fn slot_forces_build_even_when_snapshot_present() {
+        let tmp = tempfile::tempdir().unwrap();
+        let ctx = make_ctx(tmp.path());
+        touch_shared_snapshot(&ctx);
+        let shared = ctx.shared_snapshot_index_path().unwrap();
+
+        // force=true must bypass the short-circuit and take the build lock.
+        let slot = negotiate_rebuild_slot(&ctx, &shared, true, Duration::from_millis(50)).unwrap();
+        assert!(matches!(slot, RebuildSlot::Build(_)));
+    }
+
+    #[test]
+    fn slot_blocking_acquire_succeeds_after_other_releases() {
+        // Simulate: another process holds the build lock, never publishes a
+        // snapshot, then releases. The waiter should escalate from "wait for
+        // snapshot" → "block-acquire" → finally take the lock and rebuild.
+        let tmp = tempfile::tempdir().unwrap();
+        let ctx = make_ctx(tmp.path());
+        let shared = ctx.shared_snapshot_index_path().unwrap();
+
+        let other_lock = ctx.try_acquire_snapshot_build_lock().unwrap().unwrap();
+
+        let cache = ctx.cache.clone();
+        let snapshot_key = {
+            let layout =
+                project_mod::IndexLayout::resolve(&ctx.root, &ctx.graph_dir, &ctx.tempyr_dir)
+                    .unwrap();
+            layout.snapshot_key().unwrap()
+        };
+
+        let releaser = std::thread::spawn(move || {
+            // Hold long enough that the waiter exhausts the snapshot poll
+            // and enters the blocking-acquire loop, then release.
+            std::thread::sleep(Duration::from_millis(60));
+            drop(other_lock);
+            // Keep `cache` and `snapshot_key` alive for the duration so the
+            // lock-file path stays stable.
+            let _ = (&cache, &snapshot_key);
+        });
+
+        // Total budget = 2 * wait_budget = 200ms. Release happens at ~60ms,
+        // so the blocking-acquire phase should succeed well within that.
+        let slot =
+            negotiate_rebuild_slot(&ctx, &shared, false, Duration::from_millis(100)).unwrap();
+        assert!(matches!(slot, RebuildSlot::Build(_)));
+        releaser.join().unwrap();
+    }
+
+    #[test]
+    fn slot_uses_snapshot_when_other_publishes_during_wait() {
+        // Other process holds the lock, publishes the snapshot during the
+        // wait, then releases. The waiter should short-circuit to UseExisting
+        // without re-doing the work.
+        let tmp = tempfile::tempdir().unwrap();
+        let ctx = make_ctx(tmp.path());
+        let shared = ctx.shared_snapshot_index_path().unwrap();
+        let shared_for_thread = shared.clone();
+
+        let other_lock = ctx.try_acquire_snapshot_build_lock().unwrap().unwrap();
+
+        let publisher = std::thread::spawn(move || {
+            std::thread::sleep(Duration::from_millis(40));
+            fs::create_dir_all(shared_for_thread.parent().unwrap()).unwrap();
+            fs::write(&shared_for_thread, b"published").unwrap();
+            drop(other_lock);
+        });
+
+        let slot =
+            negotiate_rebuild_slot(&ctx, &shared, false, Duration::from_millis(500)).unwrap();
+        assert!(matches!(
+            slot,
+            RebuildSlot::UseExisting {
+                built_by_other: true
+            }
+        ));
+        publisher.join().unwrap();
+    }
+
+    #[test]
+    fn slot_blocking_acquire_times_out_when_lock_never_releases() {
+        let tmp = tempfile::tempdir().unwrap();
+        let ctx = make_ctx(tmp.path());
+        let shared = ctx.shared_snapshot_index_path().unwrap();
+
+        let _held = ctx.try_acquire_snapshot_build_lock().unwrap().unwrap();
+
+        // With no publisher and no release, total time bounded by 2*budget.
+        // Use a small budget so the test is fast.
+        let err =
+            negotiate_rebuild_slot(&ctx, &shared, false, Duration::from_millis(60)).unwrap_err();
+        assert!(
+            err.to_string()
+                .contains("Could not acquire snapshot build lock"),
+            "unexpected error: {err}"
+        );
+    }
+}
diff --git a/crates/tempyr-cli/src/commands/init.rs b/crates/tempyr-cli/src/commands/init.rs
index 9d5942a..509804c 100644
--- a/crates/tempyr-cli/src/commands/init.rs
+++ b/crates/tempyr-cli/src/commands/init.rs
@@ -271,7 +271,7 @@ fn initialize_project(root: &Path, selections: &OnboardingSelections) -> anyhow:
         let ctx = ProjectContext::find(Some(root.join("graph").as_path()))?;
         println!();
         println!("Running initial index rebuild...");
-        if let Err(err) = index_cmd::run_rebuild(&ctx, false, false) {
+        if let Err(err) = index_cmd::run_rebuild(&ctx, false, false, false) {
             eprintln!("Warning: initial index rebuild failed: {err}");
         }
     }
diff --git a/crates/tempyr-cli/src/commands/mod.rs b/crates/tempyr-cli/src/commands/mod.rs
index dac8dd4..0cf56d7 100644
--- a/crates/tempyr-cli/src/commands/mod.rs
+++ b/crates/tempyr-cli/src/commands/mod.rs
@@ -23,6 +23,7 @@ pub(crate) mod process_utils;
 pub mod rename;
 pub mod render_cmd;
 pub mod search;
+pub mod snapshot_cmd;
 pub mod status_cmd;
 pub mod traverse;
 pub mod update;
diff --git a/crates/tempyr-cli/src/commands/snapshot_cmd.rs b/crates/tempyr-cli/src/commands/snapshot_cmd.rs
new file mode 100644
index 0000000..6820d28
--- /dev/null
+++ b/crates/tempyr-cli/src/commands/snapshot_cmd.rs
@@ -0,0 +1,792 @@
+//! Snapshot store management.
+//!
+//! The shared snapshot store at `<git-common-dir>/tempyr/snapshots/<key>/`
+//! accumulates one ~1.8 MB SQLite index per unique graph state ever indexed
+//! by any worktree. With no upper bound, it can balloon to gigabytes.
+//!
+//! `tempyr snapshot prune` enforces a Nix-style hybrid retention policy:
+//!
+//! 1. **Pinned set** — every snapshot key cited by a live worktree's
+//!    `<shared_root>/worktrees/<wt>/snapshot-key.txt` cursor is permanent.
+//!    Stale worktree dirs (no matching `.git/worktrees/<wt>/` private admin
+//!    dir, when `git worktree list` is available) are excluded from the pin
+//!    set so their old cursors don't keep dead snapshots alive forever.
+//! 2. **Recent buffer** — after pinning, keep up to `--keep-recent` of the
+//!    most-recently-modified snapshots, even if they are not pinned. This
+//!    cushions branch-switching workflows so the immediate previous state is
+//!    not evicted when a new snapshot lands.
+//! 3. **Size cap** — among everything that is neither pinned nor in the
+//!    recent buffer, evict in least-recently-modified order until the total
+//!    snapshot-store size is under `--max-size`.
+//!
+//! Deletion is two-phase to dodge open-file races on Windows: each victim is
+//! first renamed to a sibling `<shared_root>/snapshots/.gc-<key>-<pid>-<ts>/`
+//! (invisible to fresh `current_index_path` lookups), then `remove_dir_all`
+//! is attempted. If removal fails (most often because a long-running query
+//! still holds the file open), the renamed dir is left for the next pass.
+
+use std::collections::HashSet;
+use std::fs;
+use std::io;
+use std::path::{Path, PathBuf};
+use std::time::{SystemTime, UNIX_EPOCH};
+
+use anyhow::Context;
+use serde::Serialize;
+use tempyr_core::project::{self, CacheLayout};
+
+use crate::config::ProjectContext;
+
+/// Prefix marking a snapshot dir that has been atomically renamed for
+/// deletion by [`run_prune`] but not yet `remove_dir_all`-ed. New readers
+/// look up snapshots by exact key, so a renamed dir is invisible to them.
+const GC_PREFIX: &str = ".gc-";
+
+/// Default size cap for `tempyr snapshot prune --max-size`. At ~1.8 MB per
+/// snapshot this fits ~280 snapshots, comfortably more than the typical
+/// working set across worktrees on one repo. Also surfaced by `tempyr
+/// doctor` (see [`SNAPSHOT_STORE_HINT_BYTES`]).
+pub const DEFAULT_MAX_SIZE_BYTES: u64 = 500 * 1024 * 1024;
+
+/// Default size of the "recent buffer" — the number of newest snapshots to
+/// keep beyond the pinned set, so branch-switching doesn't immediately
+/// evict the previous state.
+pub const DEFAULT_KEEP_RECENT: usize = 20;
+
+/// `tempyr doctor` switches the snapshot-store summary from "ok" to
+/// "consider `tempyr snapshot prune`" when either threshold is exceeded.
+/// Set well below [`DEFAULT_MAX_SIZE_BYTES`] so users see the hint before
+/// they hit the cap.
+pub const SNAPSHOT_STORE_HINT_DIRS: usize = 200;
+pub const SNAPSHOT_STORE_HINT_BYTES: u64 = 256 * 1024 * 1024;
+
+#[derive(Debug, Clone)]
+pub struct PruneOptions {
+    pub keep_recent: usize,
+    pub max_size_bytes: u64,
+}
+
+impl Default for PruneOptions {
+    fn default() -> Self {
+        Self {
+            keep_recent: DEFAULT_KEEP_RECENT,
+            max_size_bytes: DEFAULT_MAX_SIZE_BYTES,
+        }
+    }
+}
+
+/// Outcome of one [`run_prune`] invocation.
+///
+/// All three "kept" counts sum to the number of snapshots remaining on
+/// disk after the prune. Each snapshot is counted in exactly one bucket:
+///
+/// - `kept_pinned`: cited by a live worktree's `snapshot-key.txt` cursor.
+/// - `kept_buffer`: not pinned, but among the most-recent `keep_recent` by mtime.
+/// - `kept_under_cap`: neither pinned nor in the buffer, but kept because
+///   evicting them wasn't necessary to fit under `max_size_bytes`.
+///
+/// `total_bytes_after_estimate` is the sum of sizes for all three kept
+/// buckets — it's the on-disk size we expect after eviction completes.
+#[derive(Debug, Serialize)]
+pub struct PruneReport {
+    pub kept_pinned: usize,
+    pub kept_buffer: usize,
+    pub kept_under_cap: usize,
+    pub evicted: Vec<EvictedEntry>,
+    /// Snapshots whose `rename` to a `.gc-*` stub or `remove_dir_all`
+    /// failed (most often a long-running reader still holds the SQLite
+    /// file open on Windows). The stub is left in place; the next prune
+    /// pass sweeps it up.
+    pub failures: Vec<EvictionFailure>,
+    pub total_bytes_before: u64,
+    pub total_bytes_after_estimate: u64,
+}
+
+#[derive(Debug, Serialize)]
+pub struct EvictedEntry {
+    pub snapshot_key: String,
+    pub bytes: u64,
+}
+
+#[derive(Debug, Serialize)]
+pub struct EvictionFailure {
+    pub snapshot_key: String,
+    pub message: String,
+}
+
+#[derive(Debug)]
+struct SnapshotEntry {
+    snapshot_key: String,
+    path: PathBuf,
+    bytes: u64,
+    modified_secs: u64,
+}
+
+/// Parse `--max-size` strings like `200`, `200K`, `500M`, `2G`.
+pub fn parse_size(input: &str) -> anyhow::Result<u64> {
+    let trimmed = input.trim();
+    if trimmed.is_empty() {
+        anyhow::bail!("size value is empty");
+    }
+    let bytes = trimmed.as_bytes();
+    let last = bytes[bytes.len() - 1];
+    let (digits, multiplier): (&str, u64) = match last {
+        b'k' | b'K' => (&trimmed[..trimmed.len() - 1], 1024),
+        b'm' | b'M' => (&trimmed[..trimmed.len() - 1], 1024 * 1024),
+        b'g' | b'G' => (&trimmed[..trimmed.len() - 1], 1024 * 1024 * 1024),
+        b'0'..=b'9' => (trimmed, 1),
+        _ => anyhow::bail!("unrecognized size suffix in {input:?}; use plain bytes or K/M/G"),
+    };
+    let n: u64 = digits
+        .parse()
+        .with_context(|| format!("could not parse size value {input:?}"))?;
+    Ok(n.saturating_mul(multiplier))
+}
+
+/// Output format for `tempyr snapshot prune`. Auto-prune callers
+/// (rebuild/update tails) use [`PruneOutput::Silent`]; the CLI passes
+/// `Human` or `Json` based on the global `--json` flag.
+#[derive(Debug, Clone, Copy)]
+pub enum PruneOutput {
+    Silent,
+    Human,
+    Json,
+}
+
+pub fn run_prune(
+    ctx: &ProjectContext,
+    opts: &PruneOptions,
+    dry_run: bool,
+    output: PruneOutput,
+) -> anyhow::Result<PruneReport> {
+    let cache = ctx.cache_layout();
+    let snapshots_root = cache.snapshots_root();
+
+    let report = if snapshots_root.exists() {
+        plan_and_evict(&snapshots_root, cache, opts, dry_run)?
+    } else {
+        PruneReport {
+            kept_pinned: 0,
+            kept_buffer: 0,
+            kept_under_cap: 0,
+            evicted: Vec::new(),
+            failures: Vec::new(),
+            total_bytes_before: 0,
+            total_bytes_after_estimate: 0,
+        }
+    };
+
+    render_prune_output(&report, dry_run, output)?;
+    Ok(report)
+}
+
+fn plan_and_evict(
+    snapshots_root: &Path,
+    cache: &CacheLayout,
+    opts: &PruneOptions,
+    dry_run: bool,
+) -> anyhow::Result<PruneReport> {
+    let mut entries = enumerate_snapshots(snapshots_root)?;
+    entries.sort_by_key(|e| std::cmp::Reverse(e.modified_secs));
+
+    let pinned = collect_pin_set(cache);
+
+    // Pass 1: keep all pinned, plus the most-recent `keep_recent` of the rest.
+    let mut kept: HashSet<String> = HashSet::new();
+    let mut kept_pinned = 0;
+    let mut kept_buffer = 0;
+    for entry in &entries {
+        if pinned.contains(&entry.snapshot_key) && kept.insert(entry.snapshot_key.clone()) {
+            kept_pinned += 1;
+        }
+    }
+    for entry in &entries {
+        if kept_buffer >= opts.keep_recent {
+            break;
+        }
+        if kept.insert(entry.snapshot_key.clone()) {
+            kept_buffer += 1;
+        }
+    }
+
+    // Pass 2: among unkept, walk newest-first and keep each whose size fits
+    // under the remaining cap headroom. Older unkept entries that can't fit
+    // are evicted. This is a soft cap, not a hard LRU: a single huge
+    // snapshot doesn't kick out smaller older ones that do fit.
+    let total_bytes_before: u64 = entries.iter().map(|e| e.bytes).sum();
+    let kept_bytes: u64 = entries
+        .iter()
+        .filter(|e| kept.contains(&e.snapshot_key))
+        .map(|e| e.bytes)
+        .sum();
+    let mut running = kept_bytes;
+    let mut kept_under_cap = 0;
+    let mut evict: Vec<&SnapshotEntry> = Vec::new();
+    let mut unkept: Vec<&SnapshotEntry> = entries
+        .iter()
+        .filter(|e| !kept.contains(&e.snapshot_key))
+        .collect();
+    unkept.sort_by_key(|e| std::cmp::Reverse(e.modified_secs));
+    for entry in &unkept {
+        if running + entry.bytes <= opts.max_size_bytes {
+            running += entry.bytes;
+            kept_under_cap += 1;
+        } else {
+            evict.push(*entry);
+        }
+    }
+
+    let mut report = PruneReport {
+        kept_pinned,
+        kept_buffer,
+        kept_under_cap,
+        evicted: Vec::new(),
+        failures: Vec::new(),
+        total_bytes_before,
+        total_bytes_after_estimate: running,
+    };
+
+    for entry in &evict {
+        if dry_run {
+            report.evicted.push(EvictedEntry {
+                snapshot_key: entry.snapshot_key.clone(),
+                bytes: entry.bytes,
+            });
+            continue;
+        }
+        match two_phase_remove(&entry.path) {
+            Ok(()) => report.evicted.push(EvictedEntry {
+                snapshot_key: entry.snapshot_key.clone(),
+                bytes: entry.bytes,
+            }),
+            Err(err) => report.failures.push(EvictionFailure {
+                snapshot_key: entry.snapshot_key.clone(),
+                message: err.to_string(),
+            }),
+        }
+    }
+
+    // Sweep `.gc-*` stubs that prior runs left behind (e.g. EBUSY on Windows).
+    if !dry_run {
+        let _ = sweep_orphaned_gc_dirs(snapshots_root);
+    }
+
+    Ok(report)
+}
+
+fn render_prune_output(
+    report: &PruneReport,
+    dry_run: bool,
+    output: PruneOutput,
+) -> anyhow::Result<()> {
+    match output {
+        PruneOutput::Silent => Ok(()),
+        PruneOutput::Json => {
+            println!("{}", serde_json::to_string_pretty(report)?);
+            Ok(())
+        }
+        PruneOutput::Human => {
+            let verb = if dry_run { "would evict" } else { "evicted" };
+            let total_kept = report.kept_pinned + report.kept_buffer + report.kept_under_cap;
+            println!(
+                "Snapshot prune{}: {verb} {} snapshots, kept {total_kept} ({} pinned, {} buffer, {} under cap)",
+                if dry_run { " (dry run)" } else { "" },
+                report.evicted.len(),
+                report.kept_pinned,
+                report.kept_buffer,
+                report.kept_under_cap,
+            );
+            if !report.failures.is_empty() {
+                println!("  {} eviction(s) failed:", report.failures.len());
+                for failure in &report.failures {
+                    println!("    {}: {}", failure.snapshot_key, failure.message);
+                }
+            }
+            Ok(())
+        }
+    }
+}
+
+pub fn run_list(ctx: &ProjectContext, json: bool) -> anyhow::Result<()> {
+    let cache = ctx.cache_layout();
+    let snapshots_root = cache.snapshots_root();
+    let entries = if snapshots_root.exists() {
+        enumerate_snapshots(&snapshots_root)?
+    } else {
+        Vec::new()
+    };
+    let pinned = collect_pin_set(cache);
+
+    if json {
+        #[derive(Serialize)]
+        struct Row<'a> {
+            snapshot_key: &'a str,
+            bytes: u64,
+            modified_secs: u64,
+            pinned: bool,
+        }
+        let rows: Vec<Row> = entries
+            .iter()
+            .map(|e| Row {
+                snapshot_key: &e.snapshot_key,
+                bytes: e.bytes,
+                modified_secs: e.modified_secs,
+                pinned: pinned.contains(&e.snapshot_key),
+            })
+            .collect();
+        println!("{}", serde_json::to_string_pretty(&rows)?);
+    } else {
+        println!("Snapshots in {}:", snapshots_root.display());
+        for entry in &entries {
+            let marker = if pinned.contains(&entry.snapshot_key) {
+                "PINNED"
+            } else {
+                "      "
+            };
+            println!(
+                "  {marker}  {}  {:>10} bytes  mtime={}",
+                entry.snapshot_key, entry.bytes, entry.modified_secs
+            );
+        }
+        println!(
+            "Total: {} snapshots, {} pinned",
+            entries.len(),
+            entries
+                .iter()
+                .filter(|e| pinned.contains(&e.snapshot_key))
+                .count()
+        );
+    }
+    Ok(())
+}
+
+fn enumerate_snapshots(snapshots_root: &Path) -> io::Result<Vec<SnapshotEntry>> {
+    let mut out = Vec::new();
+    for entry in fs::read_dir(snapshots_root)? {
+        let entry = entry?;
+        let file_type = entry.file_type()?;
+        if !file_type.is_dir() {
+            continue;
+        }
+        let name = match entry.file_name().into_string() {
+            Ok(name) => name,
+            Err(_) => continue,
+        };
+        // Skip the `.locks` coordination dir and any partial-prune `.gc-*`
+        // stubs. `is_snapshot_key` accepts only the 16-hex-char names.
+        if !project::is_snapshot_key(&name) {
+            continue;
+        }
+        let path = entry.path();
+        let bytes = directory_size(&path).unwrap_or(0);
+        let modified_secs = directory_modified_secs(&path).unwrap_or(0);
+        out.push(SnapshotEntry {
+            snapshot_key: name,
+            path,
+            bytes,
+            modified_secs,
+        });
+    }
+    Ok(out)
+}
+
+/// Build the set of snapshot keys that no eviction may touch.
+///
+/// Each per-worktree `<shared_root>/worktrees/<wt-id>/snapshot-key.txt`
+/// is a GC root pinning whatever snapshot key it contains. When `git
+/// worktree list` works against the cache's owning repo we additionally
+/// drop cursors whose `wt-id` no longer corresponds to a live worktree —
+/// those are dangling pointers from worktrees the user deleted. When
+/// git is unavailable we trust every cursor on disk: better to retain
+/// too many snapshots than to silently evict one a worktree is still
+/// pointing at.
+fn collect_pin_set(cache: &CacheLayout) -> HashSet<String> {
+    let mut pinned = HashSet::new();
+    let worktrees_root = cache.worktrees_root();
+    let live_wt_ids = live_worktree_ids(cache);
+    let Ok(read) = fs::read_dir(&worktrees_root) else {
+        return pinned;
+    };
+    for entry in read.flatten() {
+        let Ok(file_type) = entry.file_type() else {
+            continue;
+        };
+        if !file_type.is_dir() {
+            continue;
+        }
+        let cursor_path = entry.path().join("snapshot-key.txt");
+        let Ok(raw) = fs::read_to_string(&cursor_path) else {
+            continue;
+        };
+        let key = raw.trim().to_string();
+        if !project::is_snapshot_key(&key) {
+            continue;
+        }
+
+        // When we have a live worktree list, drop cursors whose worktree-id
+        // is not present — those are dangling. Without git we keep all of
+        // them.
+        if let Some(ref live) = live_wt_ids {
+            let wt_id = entry.file_name().to_string_lossy().to_string();
+            if !live.contains(&wt_id) {
+                continue;
+            }
+        }
+        pinned.insert(key);
+    }
+    pinned
+}
+
+/// Best-effort enumeration of live git worktree-ids for the repo that
+/// owns this snapshot store. Each id is [`project::short_path_hash`] of
+/// that worktree's private `.git` admin dir — the same hash
+/// [`tempyr_core::project::IndexLayout`] uses to derive each worktree's
+/// cache subdir.
+///
+/// We pass `--git-dir=<owning common dir>` rather than relying on the
+/// process cwd, so a `cargo test` running from the Tempyr repo doesn't
+/// accidentally treat the *test's* synthetic store as belonging to the
+/// Tempyr worktree. Returns `None` when the cache isn't backed by a git
+/// repo (non-git tempyr projects keep their cache under `.tempyr/cache/`)
+/// or when git is unavailable; callers treat `None` as "trust all
+/// on-disk cursors" so we never over-evict.
+fn live_worktree_ids(cache: &CacheLayout) -> Option<HashSet<String>> {
+    // For git-backed projects, `cache.shared_root` is `<common-dir>/tempyr`.
+    // Bail out if the parent doesn't look like a git common dir (no `HEAD`
+    // file, no `commondir` file pointing elsewhere).
+    let common_dir = cache.shared_root.parent()?;
+    if !common_dir.join("HEAD").is_file() && !common_dir.join("commondir").is_file() {
+        return None;
+    }
+    let output = std::process::Command::new("git")
+        .arg(format!("--git-dir={}", common_dir.display()))
+        .args(["worktree", "list", "--porcelain"])
+        .output()
+        .ok()?;
+    if !output.status.success() {
+        return None;
+    }
+    let mut ids = HashSet::new();
+    let stdout = String::from_utf8_lossy(&output.stdout);
+    for line in stdout.lines() {
+        let Some(path) = line.strip_prefix("worktree ") else {
+            continue;
+        };
+        if let Some(dirs) = project::resolve_git_dirs(&PathBuf::from(path)) {
+            ids.insert(project::short_path_hash(&dirs.git_dir));
+        }
+    }
+    Some(ids)
+}
+
+fn directory_size(path: &Path) -> io::Result<u64> {
+    let mut total: u64 = 0;
+    for entry in walkdir::WalkDir::new(path) {
+        let Ok(entry) = entry else { continue };
+        let metadata = match entry.metadata() {
+            Ok(m) => m,
+            Err(_) => continue,
+        };
+        if metadata.is_file() {
+            total = total.saturating_add(metadata.len());
+        }
+    }
+    Ok(total)
+}
+
+fn directory_modified_secs(path: &Path) -> io::Result<u64> {
+    let metadata = fs::metadata(path)?;
+    Ok(metadata
+        .modified()
+        .ok()
+        .and_then(|m| m.duration_since(UNIX_EPOCH).ok())
+        .map(|d| d.as_secs())
+        .unwrap_or(0))
+}
+
+fn two_phase_remove(snapshot_dir: &Path) -> io::Result<()> {
+    let parent = snapshot_dir
+        .parent()
+        .ok_or_else(|| io::Error::other("snapshot dir has no parent"))?;
+    let name = snapshot_dir
+        .file_name()
+        .ok_or_else(|| io::Error::other("snapshot dir has no name"))?
+        .to_string_lossy()
+        .to_string();
+    let nonce = SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .unwrap_or_default()
+        .as_nanos();
+    let staged = parent.join(format!("{GC_PREFIX}{name}-{}-{nonce}", std::process::id()));
+
+    fs::rename(snapshot_dir, &staged)?;
+    // Try to remove the renamed dir. On Windows this can fail with EBUSY
+    // if a long-running reader still has the index file open; in that
+    // case the staged dir lingers and the next prune sweeps it up.
+    fs::remove_dir_all(&staged)
+}
+
+fn sweep_orphaned_gc_dirs(snapshots_root: &Path) -> io::Result<()> {
+    for entry in fs::read_dir(snapshots_root)? {
+        let Ok(entry) = entry else { continue };
+        let name = entry.file_name();
+        let name = name.to_string_lossy();
+        if !name.starts_with(GC_PREFIX) {
+            continue;
+        }
+        let _ = fs::remove_dir_all(entry.path());
+    }
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parse_size_accepts_plain_bytes() {
+        assert_eq!(parse_size("1024").unwrap(), 1024);
+    }
+
+    #[test]
+    fn parse_size_accepts_suffixes() {
+        assert_eq!(parse_size("1K").unwrap(), 1024);
+        assert_eq!(parse_size("2k").unwrap(), 2048);
+        assert_eq!(parse_size("3M").unwrap(), 3 * 1024 * 1024);
+        assert_eq!(parse_size("4g").unwrap(), 4 * 1024 * 1024 * 1024);
+    }
+
+    #[test]
+    fn parse_size_rejects_unknown_suffix() {
+        assert!(parse_size("1T").is_err());
+        assert!(parse_size("1.5M").is_err());
+        assert!(parse_size("").is_err());
+    }
+
+    use std::fs;
+
+    /// Build a synthetic snapshot store under `tmp` with the given keys, each
+    /// containing an `index.db` of `bytes_per_snapshot` bytes. Modified-time
+    /// is back-dated so the first key is oldest and the last key is newest.
+    fn populate_snapshot_store(
+        tmp: &Path,
+        keys: &[&str],
+        bytes_per_snapshot: usize,
+    ) -> CacheLayout {
+        let cache = CacheLayout {
+            shared_root: tmp.to_path_buf(),
+            worktree_root: tmp.join("worktrees").join("default"),
+        };
+        let snapshots_root = cache.snapshots_root();
+        fs::create_dir_all(&snapshots_root).unwrap();
+        let payload = vec![0u8; bytes_per_snapshot];
+        for (i, key) in keys.iter().enumerate() {
+            let dir = snapshots_root.join(key);
+            fs::create_dir_all(&dir).unwrap();
+            let path = dir.join("index.db");
+            fs::write(&path, &payload).unwrap();
+            // Set mtime via filetime so test ordering is deterministic.
+            let mtime = filetime_for_offset_secs((keys.len() - i) as i64 * 60);
+            filetime::set_file_mtime(&path, mtime).unwrap();
+            filetime::set_file_mtime(&dir, mtime).unwrap();
+        }
+        cache
+    }
+
+    fn write_worktree_cursor(cache: &CacheLayout, wt_id: &str, snapshot_key: &str) {
+        let dir = cache.worktrees_root().join(wt_id);
+        fs::create_dir_all(&dir).unwrap();
+        fs::write(dir.join("snapshot-key.txt"), snapshot_key).unwrap();
+    }
+
+    fn filetime_for_offset_secs(secs_ago: i64) -> filetime::FileTime {
+        let now = std::time::SystemTime::now();
+        let target = if secs_ago >= 0 {
+            now - std::time::Duration::from_secs(secs_ago as u64)
+        } else {
+            now + std::time::Duration::from_secs((-secs_ago) as u64)
+        };
+        filetime::FileTime::from_system_time(target)
+    }
+
+    fn make_ctx_for_cache(cache: &CacheLayout) -> ProjectContext {
+        let root = cache.shared_root.clone();
+        let tempyr_dir = root.join(".tempyr");
+        std::fs::create_dir_all(&tempyr_dir).unwrap();
+        ProjectContext {
+            root: root.clone(),
+            graph_dir: root.join("graph"),
+            tempyr_dir,
+            cache: cache.clone(),
+            schema: tempyr_core::schema::Schema {
+                meta: tempyr_core::schema::SchemaMeta {
+                    version: "1".to_string(),
+                    description: String::new(),
+                },
+                node_types: Default::default(),
+                edge_types: Default::default(),
+            },
+        }
+    }
+
+    fn keys_on_disk(cache: &CacheLayout) -> Vec<String> {
+        let mut keys: Vec<String> = fs::read_dir(cache.snapshots_root())
+            .unwrap()
+            .filter_map(|e| e.ok())
+            .filter(|e| e.file_type().map(|t| t.is_dir()).unwrap_or(false))
+            .filter_map(|e| e.file_name().into_string().ok())
+            .filter(|n| n.len() == 16 && n.bytes().all(|b| b.is_ascii_hexdigit()))
+            .collect();
+        keys.sort();
+        keys
+    }
+
+    #[test]
+    fn prune_keeps_pinned_snapshots() {
+        let tmp = tempfile::tempdir().unwrap();
+        let keys = [
+            "0000000000000001",
+            "0000000000000002",
+            "0000000000000003",
+            "0000000000000004",
+        ];
+        let cache = populate_snapshot_store(tmp.path(), &keys, 8);
+        write_worktree_cursor(&cache, "wt-a", "0000000000000001");
+        let ctx = make_ctx_for_cache(&cache);
+
+        let opts = PruneOptions {
+            keep_recent: 0,
+            max_size_bytes: 1, // force eviction of everything not pinned/buffered
+        };
+        let report = run_prune(&ctx, &opts, false, PruneOutput::Silent).unwrap();
+
+        // The pinned key must survive even though it's the oldest.
+        let remaining = keys_on_disk(&cache);
+        assert!(remaining.contains(&"0000000000000001".to_string()));
+        assert_eq!(report.kept_pinned, 1);
+        // Everything else evicted.
+        assert!(
+            report
+                .evicted
+                .iter()
+                .all(|e| e.snapshot_key != "0000000000000001")
+        );
+        assert_eq!(remaining.len(), 1);
+    }
+
+    #[test]
+    fn prune_keeps_recent_buffer_above_pin_set() {
+        let tmp = tempfile::tempdir().unwrap();
+        let keys = [
+            "1111111111111111", // oldest
+            "2222222222222222",
+            "3333333333333333",
+            "4444444444444444",
+            "5555555555555555", // newest
+        ];
+        let cache = populate_snapshot_store(tmp.path(), &keys, 8);
+        // No worktree cursors → no pins.
+        let ctx = make_ctx_for_cache(&cache);
+
+        let opts = PruneOptions {
+            keep_recent: 2,
+            max_size_bytes: 1, // cap forces only buffer to survive
+        };
+        let report = run_prune(&ctx, &opts, false, PruneOutput::Silent).unwrap();
+
+        assert_eq!(report.kept_pinned, 0);
+        assert_eq!(report.kept_buffer, 2);
+        // Only the 2 newest survive.
+        let remaining = keys_on_disk(&cache);
+        assert_eq!(
+            remaining,
+            vec![
+                "4444444444444444".to_string(),
+                "5555555555555555".to_string()
+            ]
+        );
+    }
+
+    #[test]
+    fn prune_evicts_lru_under_size_cap() {
+        let tmp = tempfile::tempdir().unwrap();
+        let keys = [
+            "aaaaaaaaaaaaaaaa",
+            "bbbbbbbbbbbbbbbb",
+            "cccccccccccccccc",
+            "dddddddddddddddd",
+        ];
+        // 100 bytes per snapshot → 400 total.
+        let cache = populate_snapshot_store(tmp.path(), &keys, 100);
+        let ctx = make_ctx_for_cache(&cache);
+
+        let opts = PruneOptions {
+            keep_recent: 0,
+            max_size_bytes: 250, // fits 2 snapshots, evict 2
+        };
+        let report = run_prune(&ctx, &opts, false, PruneOutput::Silent).unwrap();
+
+        // The 2 newest survive (cccc + dddd).
+        let remaining = keys_on_disk(&cache);
+        assert_eq!(
+            remaining,
+            vec![
+                "cccccccccccccccc".to_string(),
+                "dddddddddddddddd".to_string()
+            ]
+        );
+        assert_eq!(report.evicted.len(), 2);
+        assert!(report.total_bytes_after_estimate <= opts.max_size_bytes);
+    }
+
+    #[test]
+    fn prune_dry_run_does_not_delete() {
+        let tmp = tempfile::tempdir().unwrap();
+        let keys = ["1234567890abcdef", "abcdef1234567890"];
+        let cache = populate_snapshot_store(tmp.path(), &keys, 50);
+        let ctx = make_ctx_for_cache(&cache);
+
+        let opts = PruneOptions {
+            keep_recent: 0,
+            max_size_bytes: 1,
+        };
+        let report = run_prune(&ctx, &opts, true, PruneOutput::Silent).unwrap();
+
+        assert_eq!(report.evicted.len(), 2);
+        assert_eq!(keys_on_disk(&cache).len(), 2, "dry run must not delete");
+    }
+
+    #[test]
+    fn prune_skips_locks_dir_and_gc_stubs() {
+        let tmp = tempfile::tempdir().unwrap();
+        let keys = ["0123456789abcdef"];
+        let cache = populate_snapshot_store(tmp.path(), &keys, 8);
+        // Create a .locks dir and a .gc-* stub — these must be ignored.
+        std::fs::create_dir_all(cache.snapshot_locks_dir()).unwrap();
+        std::fs::create_dir_all(cache.snapshots_root().join(".gc-stale-12345")).unwrap();
+        std::fs::write(
+            cache
+                .snapshots_root()
+                .join(".gc-stale-12345")
+                .join("index.db"),
+            b"orphan",
+        )
+        .unwrap();
+
+        let ctx = make_ctx_for_cache(&cache);
+        let opts = PruneOptions {
+            keep_recent: 100,
+            max_size_bytes: u64::MAX,
+        };
+        let report = run_prune(&ctx, &opts, false, PruneOutput::Silent).unwrap();
+
+        // Real snapshot survived.
+        assert!(keys_on_disk(&cache).contains(&"0123456789abcdef".to_string()));
+        // Orphan .gc-* dirs got swept.
+        assert!(!cache.snapshots_root().join(".gc-stale-12345").exists());
+        // .locks dir unaffected.
+        assert!(cache.snapshot_locks_dir().exists());
+        // Nothing real evicted.
+        assert!(report.evicted.is_empty());
+    }
+}
diff --git a/crates/tempyr-cli/src/config.rs b/crates/tempyr-cli/src/config.rs
index 6244692..0e18a41 100644
--- a/crates/tempyr-cli/src/config.rs
+++ b/crates/tempyr-cli/src/config.rs
@@ -2,7 +2,7 @@ use std::path::{Path, PathBuf};
 
 use tempyr_core::graph::Graph;
 use tempyr_core::project;
-use tempyr_core::project::{CacheLayout, IndexLayout};
+use tempyr_core::project::{CacheLayout, IndexLayout, SnapshotBuildLock};
 use tempyr_core::schema::Schema;
 use tempyr_index::embeddings::{
     self, EmbeddingConfig, ResolvedEmbeddingConfig, resolve_embedding_config,
@@ -101,6 +101,24 @@ impl ProjectContext {
         Ok(layout.publish_active_snapshot()?)
     }
 
+    /// Path to the shared content-addressed snapshot index for the current
+    /// graph state. The file may or may not exist yet.
+    pub fn shared_snapshot_index_path(&self) -> anyhow::Result<PathBuf> {
+        let layout = self.index_layout()?;
+        Ok(layout.shared_snapshot_index_path()?)
+    }
+
+    /// Try to acquire the per-snapshot build lock. Returns `Ok(None)` if
+    /// another process is already building this snapshot.
+    pub fn try_acquire_snapshot_build_lock(&self) -> anyhow::Result<Option<SnapshotBuildLock>> {
+        let layout = self.index_layout()?;
+        Ok(layout.try_acquire_snapshot_build_lock()?)
+    }
+
+    pub fn cache_layout(&self) -> &CacheLayout {
+        &self.cache
+    }
+
     /// Refresh the index so query commands keep working after graph mutations.
     pub fn refresh_index_for_current_snapshot(&self) -> anyhow::Result<()> {
         let layout = self.index_layout()?;
diff --git a/crates/tempyr-cli/src/main.rs b/crates/tempyr-cli/src/main.rs
index e36d83b..8818f50 100644
--- a/crates/tempyr-cli/src/main.rs
+++ b/crates/tempyr-cli/src/main.rs
@@ -248,6 +248,13 @@ pub enum Commands {
         action: IndexAction,
     },
 
+    /// Snapshot store management (the content-addressed shared index store
+    /// under `<git-common-dir>/tempyr/snapshots/`).
+    Snapshot {
+        #[command(subcommand)]
+        action: SnapshotAction,
+    },
+
     /// Linear integration
     Linear {
         #[command(subcommand)]
@@ -313,6 +320,33 @@ pub enum InterviewAction {
     List,
 }
 
+#[derive(Subcommand)]
+pub enum SnapshotAction {
+    /// Prune historical snapshots from the shared store. Pinned snapshots
+    /// (cited by any live worktree's `snapshot-key.txt`) are never evicted.
+    /// A configurable buffer of the most-recent snapshots is also kept;
+    /// beyond pinned + buffer, snapshots are evicted in LRU order until the
+    /// total snapshot-store size is under the cap.
+    Prune {
+        /// Keep at least this many of the most-recently-modified snapshots
+        /// even if they are not pinned.
+        #[arg(long, default_value_t = 20)]
+        keep_recent: usize,
+        /// Soft cap on the total bytes used by `<shared_root>/snapshots/`.
+        /// Accepts plain bytes or a suffix like `500M`, `2G`. The pruner
+        /// evicts unpinned, non-buffered snapshots in LRU order until the
+        /// total falls below this value. Pinned snapshots are never counted
+        /// against the cap.
+        #[arg(long, default_value = "500M")]
+        max_size: String,
+        /// Show what would be deleted without removing anything.
+        #[arg(long)]
+        dry_run: bool,
+    },
+    /// List snapshots in the shared store with their pinned/buffer status.
+    List,
+}
+
 #[derive(Subcommand)]
 pub enum IndexAction {
     /// Full index rebuild from source files
@@ -320,6 +354,14 @@ pub enum IndexAction {
         /// Refresh structural search data only; do not call embedding providers
         #[arg(long)]
         skip_embeddings: bool,
+        /// Rebuild from scratch even when the shared snapshot already exists.
+        /// Use this to recover from a corrupted snapshot index. Without this
+        /// flag, `rebuild` short-circuits to a free seed when the snapshot
+        /// for the current graph state has already been built (by this
+        /// worktree or another), since rebuilding would produce identical
+        /// output.
+        #[arg(long)]
+        force: bool,
     },
     /// Incremental update (changed files only)
     Update {
@@ -775,15 +817,38 @@ fn run(cli: Cli) -> anyhow::Result<()> {
         Commands::Index { action } => {
             let ctx = config::ProjectContext::find(cli.graph_dir.as_deref())?;
             match action {
-                IndexAction::Rebuild { skip_embeddings } => {
-                    commands::index_cmd::run_rebuild(&ctx, cli.json, skip_embeddings)
-                }
+                IndexAction::Rebuild {
+                    skip_embeddings,
+                    force,
+                } => commands::index_cmd::run_rebuild(&ctx, cli.json, skip_embeddings, force),
                 IndexAction::Update { skip_embeddings } => {
                     commands::index_cmd::run_update(&ctx, cli.json, skip_embeddings)
                 }
                 IndexAction::Stats => commands::index_cmd::run_stats(&ctx, cli.json),
             }
         }
+        Commands::Snapshot { action } => {
+            let ctx = config::ProjectContext::find(cli.graph_dir.as_deref())?;
+            match action {
+                SnapshotAction::Prune {
+                    keep_recent,
+                    max_size,
+                    dry_run,
+                } => {
+                    let opts = commands::snapshot_cmd::PruneOptions {
+                        keep_recent,
+                        max_size_bytes: commands::snapshot_cmd::parse_size(&max_size)?,
+                    };
+                    let output = if cli.json {
+                        commands::snapshot_cmd::PruneOutput::Json
+                    } else {
+                        commands::snapshot_cmd::PruneOutput::Human
+                    };
+                    commands::snapshot_cmd::run_prune(&ctx, &opts, dry_run, output).map(|_| ())
+                }
+                SnapshotAction::List => commands::snapshot_cmd::run_list(&ctx, cli.json),
+            }
+        }
         Commands::Linear { action } => {
             let ctx = config::ProjectContext::find(cli.graph_dir.as_deref())?;
             match action {
diff --git a/crates/tempyr-core/src/project.rs b/crates/tempyr-core/src/project.rs
index 202fa83..ef2a131 100644
--- a/crates/tempyr-core/src/project.rs
+++ b/crates/tempyr-core/src/project.rs
@@ -10,7 +10,7 @@
 //! from a different working directory than the active repo.
 
 use std::cell::RefCell;
-use std::fs;
+use std::fs::{self, File, OpenOptions};
 use std::io;
 use std::path::{Path, PathBuf};
 use std::time::{SystemTime, UNIX_EPOCH};
@@ -84,6 +84,27 @@ impl CacheLayout {
             .join("index.db")
     }
 
+    pub fn snapshots_root(&self) -> PathBuf {
+        self.shared_root.join("snapshots")
+    }
+
+    /// Directory holding per-snapshot build coordination locks.
+    /// Lock files live here so the snapshot dir itself does not need to exist
+    /// before a builder takes the lock.
+    pub fn snapshot_locks_dir(&self) -> PathBuf {
+        self.shared_root.join("snapshots").join(".locks")
+    }
+
+    pub fn snapshot_build_lock_path(&self, snapshot_key: &str) -> PathBuf {
+        self.snapshot_locks_dir()
+            .join(format!("{snapshot_key}.build.lock"))
+    }
+
+    /// Root containing every per-worktree cursor directory.
+    pub fn worktrees_root(&self) -> PathBuf {
+        self.shared_root.join("worktrees")
+    }
+
     pub fn embeddings_dir(&self) -> PathBuf {
         self.shared_root.join("embeddings")
     }
@@ -98,6 +119,42 @@ pub struct IndexLayout {
     pub legacy_index_path: PathBuf,
 }
 
+/// Held exclusive lock for building one specific snapshot key. Two worktrees
+/// that race to build the same never-seen graph state will see one acquire
+/// and the other receive `None` — the loser should wait briefly then re-check
+/// whether the shared snapshot now exists rather than redoing the work.
+///
+/// Drop releases the OS lock; the lockfile is left behind under
+/// `<shared_root>/snapshots/.locks/` for reuse by the next builder.
+#[derive(Debug)]
+pub struct SnapshotBuildLock {
+    _file: File,
+}
+
+impl SnapshotBuildLock {
+    /// Attempt a non-blocking exclusive acquire of the per-key build lock.
+    pub fn try_acquire(cache: &CacheLayout, snapshot_key: &str) -> io::Result<Option<Self>> {
+        let locks_dir = cache.snapshot_locks_dir();
+        fs::create_dir_all(&locks_dir)?;
+        let path = cache.snapshot_build_lock_path(snapshot_key);
+
+        // `read(true)` is required for `File::try_lock` on Windows even though
+        // we never read — same constraint observed in PublisherLock.
+        let file = OpenOptions::new()
+            .read(true)
+            .write(true)
+            .create(true)
+            .truncate(false)
+            .open(&path)?;
+
+        match file.try_lock() {
+            Ok(()) => Ok(Some(SnapshotBuildLock { _file: file })),
+            Err(std::fs::TryLockError::WouldBlock) => Ok(None),
+            Err(std::fs::TryLockError::Error(e)) => Err(e),
+        }
+    }
+}
+
 struct StagedActiveIndex {
     active_path: PathBuf,
     staged_path: PathBuf,
@@ -207,6 +264,13 @@ impl IndexLayout {
         Ok(self.cache.snapshot_index_path(&self.snapshot_key()?))
     }
 
+    /// Try to acquire the build lock for the current snapshot key.
+    /// Caller decides whether to wait, retry, or short-circuit if `None`.
+    pub fn try_acquire_snapshot_build_lock(&self) -> io::Result<Option<SnapshotBuildLock>> {
+        let snapshot_key = self.snapshot_key()?;
+        SnapshotBuildLock::try_acquire(&self.cache, &snapshot_key)
+    }
+
     pub fn current_index_path(&self) -> io::Result<Option<PathBuf>> {
         let snapshot_key = self.snapshot_key()?;
         let shared = self.cache.snapshot_index_path(&snapshot_key);
@@ -711,7 +775,11 @@ fn read_commondir(git_dir: &Path) -> Option<PathBuf> {
     fs::canonicalize(resolved).ok()
 }
 
-fn short_path_hash(path: &Path) -> String {
+/// Stable per-worktree id derived from the path of the worktree's private
+/// `.git` admin dir. Used to namespace per-worktree state under
+/// `<shared_root>/worktrees/<id>/` and exposed so snapshot pruning can
+/// recompute the same id from a `git worktree list` result.
+pub fn short_path_hash(path: &Path) -> String {
     let canonical = fs::canonicalize(path).unwrap_or_else(|_| path.to_path_buf());
     let mut hasher = Hasher::new();
     hasher.update(canonical.to_string_lossy().as_bytes());
@@ -719,6 +787,14 @@ fn short_path_hash(path: &Path) -> String {
     hex[..12].to_string()
 }
 
+/// Snapshot keys are the first 16 hex chars of a BLAKE3 digest, see
+/// [`graph_snapshot_key`]. This predicate filters out the `.locks`
+/// coordination dir and any partial-prune `.gc-*` stubs from a `read_dir`
+/// over `<shared_root>/snapshots/`.
+pub fn is_snapshot_key(name: &str) -> bool {
+    name.len() == 16 && name.bytes().all(|b| b.is_ascii_hexdigit())
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -1222,6 +1298,61 @@ mod tests {
         );
     }
 
+    #[test]
+    fn is_snapshot_key_accepts_only_canonical_form() {
+        assert!(is_snapshot_key("12971a01c28ece80"));
+        assert!(is_snapshot_key("0000000000000000"));
+        assert!(!is_snapshot_key(""));
+        assert!(!is_snapshot_key("12971a01c28ece8"));
+        assert!(!is_snapshot_key("12971a01c28ece800"));
+        assert!(!is_snapshot_key(".locks"));
+        assert!(!is_snapshot_key(".gc-12971a01c28ece80"));
+        assert!(!is_snapshot_key("ZZ971a01c28ece80"));
+    }
+
+    #[test]
+    fn snapshot_build_lock_is_exclusive_per_key() {
+        let tmp = tempfile::tempdir().unwrap();
+        let cache = CacheLayout {
+            shared_root: tmp.path().to_path_buf(),
+            worktree_root: tmp.path().join("worktrees").join("default"),
+        };
+        let key = "deadbeefcafef00d";
+
+        let first = SnapshotBuildLock::try_acquire(&cache, key).unwrap();
+        assert!(first.is_some());
+
+        let second = SnapshotBuildLock::try_acquire(&cache, key).unwrap();
+        assert!(
+            second.is_none(),
+            "second acquire while first still held should return None"
+        );
+
+        // Distinct key — no contention.
+        let other = SnapshotBuildLock::try_acquire(&cache, "abc1234567890def").unwrap();
+        assert!(other.is_some());
+
+        drop(first);
+        let after_release = SnapshotBuildLock::try_acquire(&cache, key).unwrap();
+        assert!(after_release.is_some());
+    }
+
+    #[test]
+    fn snapshot_build_lock_creates_locks_dir() {
+        let tmp = tempfile::tempdir().unwrap();
+        let cache = CacheLayout {
+            shared_root: tmp.path().to_path_buf(),
+            worktree_root: tmp.path().join("worktrees").join("default"),
+        };
+
+        let _lock = SnapshotBuildLock::try_acquire(&cache, "cafefacecafefade")
+            .unwrap()
+            .unwrap();
+
+        assert!(cache.snapshot_locks_dir().is_dir());
+        assert!(cache.snapshot_build_lock_path("cafefacecafefade").is_file());
+    }
+
     #[test]
     fn publish_active_snapshot_does_not_overwrite_existing_shared_snapshot() {
         let tmp = tempfile::tempdir().unwrap();
diff --git a/crates/tempyr-index/Cargo.toml b/crates/tempyr-index/Cargo.toml
index 549de0d..3556d51 100644
--- a/crates/tempyr-index/Cargo.toml
+++ b/crates/tempyr-index/Cargo.toml
@@ -17,6 +17,7 @@ reqwest = { workspace = true }
 async-trait = { workspace = true }
 blake3 = { workspace = true }
 toml = { workspace = true }
+walkdir = { workspace = true }
 
 [features]
 default = ["local-embeddings"]
diff --git a/crates/tempyr-index/src/health.rs b/crates/tempyr-index/src/health.rs
index 03dfc7b..e9d829d 100644
--- a/crates/tempyr-index/src/health.rs
+++ b/crates/tempyr-index/src/health.rs
@@ -118,6 +118,13 @@ pub struct IndexSection {
     pub snapshot_key_error: Option<String>,
     pub fts_entries: Option<usize>,
     pub embedding_count_for_index: Option<usize>,
+    /// Number of historical snapshot directories under
+    /// `<shared_root>/snapshots/`. Excludes the `.locks` dir and any
+    /// `.gc-*` eviction stubs left by a partial prune.
+    pub snapshot_store_count: Option<usize>,
+    /// Total bytes used by historical snapshot directories. A high number
+    /// here is the signal to run `tempyr snapshot prune`.
+    pub snapshot_store_bytes: Option<u64>,
 }
 
 const LOCAL_EMBEDDINGS_COMPILED_IN: bool = cfg!(feature = "local-embeddings");
@@ -360,6 +367,8 @@ fn build_index_section(inputs: &HealthInputs<'_>, store_path: Option<&Path>) ->
     let layout = match IndexLayout::resolve(inputs.root, inputs.graph_dir, inputs.tempyr_dir) {
         Ok(layout) => layout,
         Err(err) => {
+            let (snapshot_store_count, snapshot_store_bytes) =
+                probe_snapshot_store(&inputs.cache.snapshots_root());
             return IndexSection {
                 active_index_path: inputs.cache.active_index_path(),
                 active_index_exists: inputs.cache.active_index_path().exists(),
@@ -372,6 +381,8 @@ fn build_index_section(inputs: &HealthInputs<'_>, store_path: Option<&Path>) ->
                 snapshot_key_error: Some(err.to_string()),
                 fts_entries: None,
                 embedding_count_for_index: None,
+                snapshot_store_count,
+                snapshot_store_bytes,
             };
         }
     };
@@ -398,6 +409,9 @@ fn build_index_section(inputs: &HealthInputs<'_>, store_path: Option<&Path>) ->
         None => (None, None),
     };
 
+    let (snapshot_store_count, snapshot_store_bytes) =
+        probe_snapshot_store(&inputs.cache.snapshots_root());
+
     IndexSection {
         active_index_path,
         active_index_exists,
@@ -410,7 +424,50 @@ fn build_index_section(inputs: &HealthInputs<'_>, store_path: Option<&Path>) ->
         snapshot_key_error,
         fts_entries,
         embedding_count_for_index,
+        snapshot_store_count,
+        snapshot_store_bytes,
+    }
+}
+
+/// Walk `<shared_root>/snapshots/` and return `(snapshot_dir_count, total_bytes)`.
+/// Uses [`project::is_snapshot_key`] so the count agrees with what
+/// `tempyr snapshot prune` would consider — the `.locks` coordination dir
+/// and any partial-prune `.gc-*` stubs are excluded. Returns `(None, None)`
+/// if the dir does not exist or cannot be read (snapshot store unused on
+/// this project yet).
+fn probe_snapshot_store(snapshots_root: &Path) -> (Option<usize>, Option<u64>) {
+    if !snapshots_root.is_dir() {
+        return (None, None);
+    }
+    let read = match std::fs::read_dir(snapshots_root) {
+        Ok(read) => read,
+        Err(_) => return (None, None),
+    };
+    let mut count = 0usize;
+    let mut bytes = 0u64;
+    for entry in read.flatten() {
+        let Ok(file_type) = entry.file_type() else {
+            continue;
+        };
+        if !file_type.is_dir() {
+            continue;
+        }
+        let name = entry.file_name();
+        let name = name.to_string_lossy();
+        if !project::is_snapshot_key(&name) {
+            continue;
+        }
+        count += 1;
+        for sub in walkdir::WalkDir::new(entry.path()) {
+            let Ok(sub) = sub else { continue };
+            if let Ok(meta) = sub.metadata()
+                && meta.is_file()
+            {
+                bytes = bytes.saturating_add(meta.len());
+            }
+        }
     }
+    (Some(count), Some(bytes))
 }
 
 fn probe_index(index_path: &Path, store_path: Option<&Path>) -> (Option<usize>, Option<usize>) {

From 1c89c1717d7c40d321a2983603350d42d86a7619 Mon Sep 17 00:00:00 2001
From: Caleb Leak <caleb.leak@gmail.com>
Date: Sat, 2 May 2026 17:46:40 -0700
Subject: [PATCH 2/4] review: address PR feedback on snapshot store changes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three fixes from review on PR #45, plus a regression test pinning the
if-let drop semantics that one piece of feedback misidentified as a bug.

- `seed_and_report` now re-seeds the local active.db from the shared
  snapshot before opening it. Without this, a `built_by_other = true`
  outcome (concurrent builder published during our wait) would update
  the worktree cursor to the new key while leaving a stale local index
  on disk. A subsequent query that fell through to the active path
  (e.g. after the shared snapshot was pruned) would then read stale
  bytes against a cursor that claimed they were current. Narrow window
  but real.

- `project::is_snapshot_key` now rejects uppercase A–F. Snapshot keys
  are produced by BLAKE3 `to_hex()` which always emits lowercase, so
  any uppercase-named directory under `snapshots/` was created by some
  foreign process. Treating it as a snapshot would let `tempyr snapshot
  prune` evict it. Added a paired test that the predicate accepts
  whatever `graph_snapshot_key()` produces, so the two can't drift.

- `health::probe_snapshot_store` now returns `(None, None)` on any IO
  failure during the walk instead of silently undercounting via
  `read.flatten()` and per-entry `continue`s. An undercount that hides
  a multi-GB snapshot store from the doctor's prune hint is worse than
  a clear "unavailable" signal. Concurrent prune (rename-then-remove)
  can briefly trigger this; a re-run after prune completes succeeds.

- New `build_lock_survives_if_let_match_against_use_existing_arm` test
  empirically pins the Rust drop semantics in `run_rebuild`: the
  `if let RebuildSlot::UseExisting { .. } = slot { return ... }`
  pattern leaves `slot` (and any `Build(SnapshotBuildLock)` it carries)
  alive for the rest of the function, so the lock is held through
  `rebuild_from_scratch`. A future refactor that moves out of `slot`
  in the matched arm would fail this test.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .agents/skills/tempyr-interview/SKILL.md    | 197 ++++++++++++++++++++
 crates/tempyr-cli/src/commands/index_cmd.rs |  40 ++++
 crates/tempyr-core/src/project.rs           |  38 +++-
 crates/tempyr-index/src/health.rs           |  34 ++--
 4 files changed, 294 insertions(+), 15 deletions(-)
 create mode 100644 .agents/skills/tempyr-interview/SKILL.md

diff --git a/.agents/skills/tempyr-interview/SKILL.md b/.agents/skills/tempyr-interview/SKILL.md
new file mode 100644
index 0000000..df9d5d2
--- /dev/null
+++ b/.agents/skills/tempyr-interview/SKILL.md
@@ -0,0 +1,197 @@
+---
+name: tempyr-interview
+description: >
+  Guides the user through creating graph nodes via structured interview.
+  Activate when the user wants to: add a feature, create an epic, plan a
+  project, capture requirements, do a brain dump, or create a PRD/TDD.
+  Keywords: interview, new feature, brain dump, PRD, TDD, requirements.
+allowed-tools:
+  - mcp__tempyr__interview_start
+  - mcp__tempyr__interview_answer
+  - mcp__tempyr__interview_show
+  - mcp__tempyr__interview_commit
+  - mcp__tempyr__interview_adjust
+  - mcp__tempyr__interview_resume
+  - mcp__tempyr__interview_add_node
+  - mcp__tempyr__interview_add_edge
+  - mcp__tempyr__graph_search
+  - mcp__tempyr__graph_list
+  - mcp__tempyr__graph_context
+  - mcp__tempyr__graph_traverse
+  - mcp__tempyr__graph_get_node
+  - mcp__tempyr__graph_update_node
+---
+
+# Tempyr Interview Skill
+
+You are conducting a structured interview to create knowledge graph nodes.
+The MCP server handles state, gap detection, and phase transitions. Your job
+is the CONVERSATION — phrasing questions naturally, extracting structured
+entities from answers, and presenting proposals clearly.
+
+## CRITICAL: You are the interviewer, NOT the interviewee
+
+**NEVER answer your own questions.** You ask questions, then STOP and WAIT
+for the user to respond. Every call to `interview_answer` MUST contain text
+that the user actually typed — never your own fabricated answers, inferences,
+or "obvious" gap-fills. If you think you know the answer from context, you
+still ask — the user may disagree, clarify, or have context you lack.
+
+Concretely:
+- After calling `interview_start`, present the gaps/questions and **stop**.
+- After each user reply, call `interview_answer` with **their words**, then
+  present the next questions and **stop**.
+- Do NOT batch-answer gaps. Do NOT pre-fill answers from existing graph
+  context. Do NOT call `interview_answer` multiple times in a row without
+  user input between each call.
+- The only valid argument to `interview_answer` is a quote or close
+  paraphrase of what the user just said.
+
+## Core workflow
+
+### Starting an interview
+
+When the user describes something they want to build/plan/capture:
+1. Call `interview_start` with their input as `brain_dump`
+2. The server returns: tentative root node, existing graph context, gaps
+3. Present what the server found in the existing graph FIRST
+4. Show the tentative root node it created from the brain dump
+5. Ask the first 2-3 questions from `next_questions`
+
+### Processing answers — the extraction loop
+
+When the user answers a question (or gives additional context — meaning they
+typed something and you received it as a user message):
+
+1. **Record** the answer: call `interview_answer` with the user's actual response
+2. **Extract** entities from the answer text. For each entity you identify:
+   - If the node **already exists in the graph** (not tentative), call
+     `graph_update_node` with the node_id and the fields to change (body,
+     status, owner, tags). Only provided fields are overwritten.
+   - If the node is **new**, call `interview_add_node` with the `session_id`,
+     a human-readable `slug` (e.g. `session-replay`, `p99-latency`), and
+     `node_type`. The system generates a 6-char suffix automatically and
+     returns the full ID. The node is stored as tentative (not written to
+     disk) until `interview_commit`.
+   - Call `interview_add_edge` using the `session_id` and the full ID
+     returned by `interview_add_node`. You can reference tentative nodes
+     (including the root node), existing graph nodes, or use 6-char suffixes.
+   - Do NOT include type prefixes in slugs — use `session-replay` not
+     `feat-session-replay`. The `node_type` field handles typing.
+3. **Show** what was created/linked in compact format (see below)
+4. **Ask** the next 2-3 questions from the server's gap list
+
+Alternatively, spawn the `tempyr-extractor` subagent for complex answers
+(wall-of-text brain dumps, multi-entity responses). Pass it:
+- The question that was asked
+- The user's answer
+- Current tentative nodes (from `interview_show`)
+- Existing graph context
+
+Then apply its JSON output by calling `interview_add_node` (new) or
+`graph_update_node` (existing graph nodes) and `interview_add_edge` for
+each entity.
+
+### How to present tentative nodes
+
+Use this compact format — NOT full YAML:
+
+```
+Here's what I've added:
++ constraint: P99 replay load < 2s (from your latency requirement)
++ decision: separate ingestion pipeline (status: proposed)
+  -> linked to: comp-event-pipeline (existing), constraint-p99-latency (new)
+```
+
+### How to phrase questions
+
+The MCP server returns structured gap descriptions with context for
+natural phrasing. Use `suggested_angle` as your approach hint.
+
+Server returns gap objects like:
+```json
+{
+  "gap_type": "MissingSuccessMetric",
+  "priority": "Required",
+  "context": "'feat-replay' has no measured_by relationship to any metric.",
+  "existing_related": ["metric-mttr-reduction"],
+  "question_type": "Closed",
+  "suggested_angle": "Ask what success looks like -- quantitative if possible."
+}
+```
+
+When `existing_related` is populated, reference those nodes:
+"The observability epic tracks MTTR reduction — does that cover session
+replay too, or does this feature need its own success metric?"
+
+When `existing_related` is empty, ask open-ended:
+"How will we know this feature is successful? What would you measure?"
+
+### Question rules
+
+- NEVER ask more than 3 questions per turn
+- NEVER ask questions the graph already answers — check `existing_related`
+- For `Closed` question_type: phrase as confirmation ("Is X the right...?")
+- For `Open` question_type: ask one focused question
+- For `ForcedChoice` question_type: present the candidates from `existing_related`
+- For `Implication` question_type: frame as "have you thought about X?"
+  with a specific number or consequence
+
+### Phase transitions
+
+The server manages phases internally. When the response includes
+`phase_changed: true`, acknowledge the shift naturally:
+
+"Good — I have a clear picture of who this is for and what success looks
+like. Let me ask about the technical side now."
+
+Do NOT announce phase names ("entering Technical phase"). The user
+experiences a conversation, not a state machine.
+
+Phases flow: Discovery -> Product -> Technical -> Decomposition -> Review.
+Each phase focuses on different gap types. The server handles this
+automatically — just follow the `next_questions` it returns.
+
+### Handling tangents and corrections
+
+If the user:
+- **Goes off-topic**: still call `interview_answer` — extract any relevant
+  entities and the gap analysis will catch what's still missing
+- **Wants to correct something**: call `interview_adjust` with the node_id
+  and the changes (body, status, or new_id for renaming)
+- **Wants to skip ahead**: call `interview_answer` with "user wants to
+  skip to technical/decomposition/review"
+- **Dumps a wall of text**: spawn the `tempyr-extractor` subagent — it
+  handles multi-entity extraction better in isolated context
+
+### Review phase
+
+When the server returns `"phase": "Review"`:
+1. Call `interview_show` to get the full tentative state
+2. Present a structured summary organized by node type:
+   - Features, with their linked personas, metrics, constraints
+   - Decisions and their rationale
+   - Tasks and dependencies
+   - Risks and open questions
+3. Show progress: "X nodes, Y edges proposed"
+4. Ask: "Anything to add, change, or remove before I commit?"
+5. On approval, call `interview_commit`
+6. Report the files created and any validation warnings
+7. Mention: `tempyr render prd <id>` or `tempyr render tdd <id>`
+
+### Resuming an interrupted interview
+
+If the user mentions a previous interview or wants to continue:
+1. Call `interview_resume` with the session_id
+2. The server returns the full current state
+3. Summarize where they left off: phase, nodes created, gaps remaining
+4. Continue asking questions from `next_questions`
+
+If the user doesn't know the session_id, they can run
+`tempyr interview list` in the terminal to see active sessions.
+
+### When NOT to interview
+
+If the user just wants to quickly add a single note or insight:
+- Use `graph_add_node` directly, skip the interview
+- The interview is for features, epics, and multi-node creation
diff --git a/crates/tempyr-cli/src/commands/index_cmd.rs b/crates/tempyr-cli/src/commands/index_cmd.rs
index f04b5a2..0281fc9 100644
--- a/crates/tempyr-cli/src/commands/index_cmd.rs
+++ b/crates/tempyr-cli/src/commands/index_cmd.rs
@@ -214,6 +214,15 @@ fn seed_and_report(
     skip_embeddings: bool,
     waited_for_concurrent_builder: bool,
 ) -> anyhow::Result<()> {
+    // If a concurrent builder published the snapshot during our wait, the
+    // initial `ensure_active_index_seeded` (called at the top of `run_rebuild`)
+    // ran before the snapshot existed and so left the worktree's local index
+    // unrefreshed. Re-seed now so the local copy and cursor stay in sync with
+    // the shared snapshot — otherwise a later query that falls through to the
+    // active path (e.g. after the shared snapshot is pruned) would read stale
+    // bytes against a cursor that claims they're current.
+    let _ = ctx.ensure_active_index_seeded()?;
+
     let index_path = ctx.queryable_index_path()?;
     let index = Index::open(&index_path)?;
     let stats = index.stats()?;
@@ -586,6 +595,37 @@ mod tests {
         publisher.join().unwrap();
     }
 
+    /// Regression guard for the if-let drop semantics in `run_rebuild`:
+    /// `if let RebuildSlot::UseExisting { .. } = slot { return ... }` must
+    /// leave `slot` (and the `Build(SnapshotBuildLock)` it might carry)
+    /// alive for the rest of the function, so that the lock is held through
+    /// `rebuild_from_scratch`. If a future refactor changes this — e.g.
+    /// moves out of `slot` in the matched arm — the test would fail by
+    /// observing the lock as available to a second acquirer mid-rebuild.
+    #[test]
+    fn build_lock_survives_if_let_match_against_use_existing_arm() {
+        let tmp = tempfile::tempdir().unwrap();
+        let ctx = make_ctx(tmp.path());
+        let shared = ctx.shared_snapshot_index_path().unwrap();
+
+        // Snapshot doesn't exist → negotiate returns Build(lock).
+        let slot = negotiate_rebuild_slot(&ctx, &shared, false, Duration::from_millis(50)).unwrap();
+        assert!(matches!(slot, RebuildSlot::Build(_)));
+
+        // Mirror the exact pattern used in run_rebuild.
+        if let RebuildSlot::UseExisting { built_by_other: _ } = slot {
+            unreachable!("Build was constructed but UseExisting matched");
+        }
+
+        // After the if-let, `slot` (and its lock) must still be held —
+        // confirm by trying to acquire from a fresh ProjectContext.
+        let probe = ctx.try_acquire_snapshot_build_lock().unwrap();
+        assert!(
+            probe.is_none(),
+            "lock was released too early; if-let against UseExisting arm dropped Build(lock)"
+        );
+    }
+
     #[test]
     fn slot_blocking_acquire_times_out_when_lock_never_releases() {
         let tmp = tempfile::tempdir().unwrap();
diff --git a/crates/tempyr-core/src/project.rs b/crates/tempyr-core/src/project.rs
index ef2a131..a714f74 100644
--- a/crates/tempyr-core/src/project.rs
+++ b/crates/tempyr-core/src/project.rs
@@ -787,12 +787,21 @@ pub fn short_path_hash(path: &Path) -> String {
     hex[..12].to_string()
 }
 
-/// Snapshot keys are the first 16 hex chars of a BLAKE3 digest, see
-/// [`graph_snapshot_key`]. This predicate filters out the `.locks`
+/// Snapshot keys are the first 16 lowercase hex chars of a BLAKE3 digest,
+/// see [`graph_snapshot_key`]. This predicate filters out the `.locks`
 /// coordination dir and any partial-prune `.gc-*` stubs from a `read_dir`
 /// over `<shared_root>/snapshots/`.
+///
+/// Lowercase-only matters: BLAKE3's `to_hex()` always emits lowercase, so
+/// any uppercase-named directory under `snapshots/` was created by some
+/// foreign process, not by Tempyr. Treating it as a snapshot would let
+/// `tempyr snapshot prune` evict it — refusing to recognize it is the
+/// safer default.
 pub fn is_snapshot_key(name: &str) -> bool {
-    name.len() == 16 && name.bytes().all(|b| b.is_ascii_hexdigit())
+    name.len() == 16
+        && name
+            .bytes()
+            .all(|b| b.is_ascii_digit() || (b'a'..=b'f').contains(&b))
 }
 
 #[cfg(test)]
@@ -1302,12 +1311,35 @@ mod tests {
     fn is_snapshot_key_accepts_only_canonical_form() {
         assert!(is_snapshot_key("12971a01c28ece80"));
         assert!(is_snapshot_key("0000000000000000"));
+        assert!(is_snapshot_key("abcdef0123456789"));
         assert!(!is_snapshot_key(""));
         assert!(!is_snapshot_key("12971a01c28ece8"));
         assert!(!is_snapshot_key("12971a01c28ece800"));
         assert!(!is_snapshot_key(".locks"));
         assert!(!is_snapshot_key(".gc-12971a01c28ece80"));
         assert!(!is_snapshot_key("ZZ971a01c28ece80"));
+        // Tightened: uppercase A-F is rejected (BLAKE3 to_hex emits lowercase).
+        assert!(!is_snapshot_key("ABCDEF0123456789"));
+        assert!(!is_snapshot_key("12971A01C28ECE80"));
+    }
+
+    #[test]
+    fn is_snapshot_key_matches_graph_snapshot_key_output() {
+        // Whatever real snapshot-key the producer emits must satisfy the
+        // predicate, otherwise prune/health would silently drop it.
+        let tmp = tempfile::tempdir().unwrap();
+        let root = tmp.path();
+        let graph_dir = root.join("graph");
+        let tempyr_dir = root.join(".tempyr");
+        fs::create_dir_all(&graph_dir).unwrap();
+        fs::create_dir_all(&tempyr_dir).unwrap();
+        fs::write(tempyr_dir.join("schema.toml"), "name = 'x'\n").unwrap();
+
+        let key = graph_snapshot_key(&graph_dir, &tempyr_dir).unwrap();
+        assert!(
+            is_snapshot_key(&key),
+            "graph_snapshot_key() output {key:?} must satisfy is_snapshot_key()"
+        );
     }
 
     #[test]
diff --git a/crates/tempyr-index/src/health.rs b/crates/tempyr-index/src/health.rs
index ed3d3db..bc6eaa8 100644
--- a/crates/tempyr-index/src/health.rs
+++ b/crates/tempyr-index/src/health.rs
@@ -432,22 +432,29 @@ fn build_index_section(inputs: &HealthInputs<'_>, store_path: Option<&Path>) ->
 /// Walk `<shared_root>/snapshots/` and return `(snapshot_dir_count, total_bytes)`.
 /// Uses [`project::is_snapshot_key`] so the count agrees with what
 /// `tempyr snapshot prune` would consider — the `.locks` coordination dir
-/// and any partial-prune `.gc-*` stubs are excluded. Returns `(None, None)`
-/// if the dir does not exist or cannot be read (snapshot store unused on
-/// this project yet).
+/// and any partial-prune `.gc-*` stubs are excluded.
+///
+/// Returns `(None, None)` for any IO failure rather than reporting
+/// partial totals: an undercount that hides a multi-GB store from the
+/// `tempyr doctor` prune hint is worse than a clear "unavailable" signal.
+/// `tempyr snapshot prune` running concurrently can briefly trigger this
+/// (rename-then-remove races with our walk); a re-run of doctor after
+/// the prune completes will succeed.
 fn probe_snapshot_store(snapshots_root: &Path) -> (Option<usize>, Option<u64>) {
     if !snapshots_root.is_dir() {
         return (None, None);
     }
-    let read = match std::fs::read_dir(snapshots_root) {
-        Ok(read) => read,
-        Err(_) => return (None, None),
+    let Ok(read) = std::fs::read_dir(snapshots_root) else {
+        return (None, None);
     };
     let mut count = 0usize;
     let mut bytes = 0u64;
-    for entry in read.flatten() {
+    for entry in read {
+        let Ok(entry) = entry else {
+            return (None, None);
+        };
         let Ok(file_type) = entry.file_type() else {
-            continue;
+            return (None, None);
         };
         if !file_type.is_dir() {
             continue;
@@ -459,10 +466,13 @@ fn probe_snapshot_store(snapshots_root: &Path) -> (Option<usize>, Option<u64>) {
         }
         count += 1;
         for sub in walkdir::WalkDir::new(entry.path()) {
-            let Ok(sub) = sub else { continue };
-            if let Ok(meta) = sub.metadata()
-                && meta.is_file()
-            {
+            let Ok(sub) = sub else {
+                return (None, None);
+            };
+            let Ok(meta) = sub.metadata() else {
+                return (None, None);
+            };
+            if meta.is_file() {
                 bytes = bytes.saturating_add(meta.len());
             }
         }

From b7406e8378ea254c44007a334e9a0c8ad7d5fa60 Mon Sep 17 00:00:00 2001
From: Caleb Leak <caleb.leak@gmail.com>
Date: Sat, 2 May 2026 17:46:58 -0700
Subject: [PATCH 3/4] review: drop accidentally staged .agents/ scratch file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The .agents/ dir was already untracked before this branch — it's local
agent state, not part of the snapshot store work. Got swept in by a
`git add -A` in the prior commit; removing here.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .agents/skills/tempyr-interview/SKILL.md | 197 -----------------------
 1 file changed, 197 deletions(-)
 delete mode 100644 .agents/skills/tempyr-interview/SKILL.md

diff --git a/.agents/skills/tempyr-interview/SKILL.md b/.agents/skills/tempyr-interview/SKILL.md
deleted file mode 100644
index df9d5d2..0000000
--- a/.agents/skills/tempyr-interview/SKILL.md
+++ /dev/null
@@ -1,197 +0,0 @@
----
-name: tempyr-interview
-description: >
-  Guides the user through creating graph nodes via structured interview.
-  Activate when the user wants to: add a feature, create an epic, plan a
-  project, capture requirements, do a brain dump, or create a PRD/TDD.
-  Keywords: interview, new feature, brain dump, PRD, TDD, requirements.
-allowed-tools:
-  - mcp__tempyr__interview_start
-  - mcp__tempyr__interview_answer
-  - mcp__tempyr__interview_show
-  - mcp__tempyr__interview_commit
-  - mcp__tempyr__interview_adjust
-  - mcp__tempyr__interview_resume
-  - mcp__tempyr__interview_add_node
-  - mcp__tempyr__interview_add_edge
-  - mcp__tempyr__graph_search
-  - mcp__tempyr__graph_list
-  - mcp__tempyr__graph_context
-  - mcp__tempyr__graph_traverse
-  - mcp__tempyr__graph_get_node
-  - mcp__tempyr__graph_update_node
----
-
-# Tempyr Interview Skill
-
-You are conducting a structured interview to create knowledge graph nodes.
-The MCP server handles state, gap detection, and phase transitions. Your job
-is the CONVERSATION — phrasing questions naturally, extracting structured
-entities from answers, and presenting proposals clearly.
-
-## CRITICAL: You are the interviewer, NOT the interviewee
-
-**NEVER answer your own questions.** You ask questions, then STOP and WAIT
-for the user to respond. Every call to `interview_answer` MUST contain text
-that the user actually typed — never your own fabricated answers, inferences,
-or "obvious" gap-fills. If you think you know the answer from context, you
-still ask — the user may disagree, clarify, or have context you lack.
-
-Concretely:
-- After calling `interview_start`, present the gaps/questions and **stop**.
-- After each user reply, call `interview_answer` with **their words**, then
-  present the next questions and **stop**.
-- Do NOT batch-answer gaps. Do NOT pre-fill answers from existing graph
-  context. Do NOT call `interview_answer` multiple times in a row without
-  user input between each call.
-- The only valid argument to `interview_answer` is a quote or close
-  paraphrase of what the user just said.
-
-## Core workflow
-
-### Starting an interview
-
-When the user describes something they want to build/plan/capture:
-1. Call `interview_start` with their input as `brain_dump`
-2. The server returns: tentative root node, existing graph context, gaps
-3. Present what the server found in the existing graph FIRST
-4. Show the tentative root node it created from the brain dump
-5. Ask the first 2-3 questions from `next_questions`
-
-### Processing answers — the extraction loop
-
-When the user answers a question (or gives additional context — meaning they
-typed something and you received it as a user message):
-
-1. **Record** the answer: call `interview_answer` with the user's actual response
-2. **Extract** entities from the answer text. For each entity you identify:
-   - If the node **already exists in the graph** (not tentative), call
-     `graph_update_node` with the node_id and the fields to change (body,
-     status, owner, tags). Only provided fields are overwritten.
-   - If the node is **new**, call `interview_add_node` with the `session_id`,
-     a human-readable `slug` (e.g. `session-replay`, `p99-latency`), and
-     `node_type`. The system generates a 6-char suffix automatically and
-     returns the full ID. The node is stored as tentative (not written to
-     disk) until `interview_commit`.
-   - Call `interview_add_edge` using the `session_id` and the full ID
-     returned by `interview_add_node`. You can reference tentative nodes
-     (including the root node), existing graph nodes, or use 6-char suffixes.
-   - Do NOT include type prefixes in slugs — use `session-replay` not
-     `feat-session-replay`. The `node_type` field handles typing.
-3. **Show** what was created/linked in compact format (see below)
-4. **Ask** the next 2-3 questions from the server's gap list
-
-Alternatively, spawn the `tempyr-extractor` subagent for complex answers
-(wall-of-text brain dumps, multi-entity responses). Pass it:
-- The question that was asked
-- The user's answer
-- Current tentative nodes (from `interview_show`)
-- Existing graph context
-
-Then apply its JSON output by calling `interview_add_node` (new) or
-`graph_update_node` (existing graph nodes) and `interview_add_edge` for
-each entity.
-
-### How to present tentative nodes
-
-Use this compact format — NOT full YAML:
-
-```
-Here's what I've added:
-+ constraint: P99 replay load < 2s (from your latency requirement)
-+ decision: separate ingestion pipeline (status: proposed)
-  -> linked to: comp-event-pipeline (existing), constraint-p99-latency (new)
-```
-
-### How to phrase questions
-
-The MCP server returns structured gap descriptions with context for
-natural phrasing. Use `suggested_angle` as your approach hint.
-
-Server returns gap objects like:
-```json
-{
-  "gap_type": "MissingSuccessMetric",
-  "priority": "Required",
-  "context": "'feat-replay' has no measured_by relationship to any metric.",
-  "existing_related": ["metric-mttr-reduction"],
-  "question_type": "Closed",
-  "suggested_angle": "Ask what success looks like -- quantitative if possible."
-}
-```
-
-When `existing_related` is populated, reference those nodes:
-"The observability epic tracks MTTR reduction — does that cover session
-replay too, or does this feature need its own success metric?"
-
-When `existing_related` is empty, ask open-ended:
-"How will we know this feature is successful? What would you measure?"
-
-### Question rules
-
-- NEVER ask more than 3 questions per turn
-- NEVER ask questions the graph already answers — check `existing_related`
-- For `Closed` question_type: phrase as confirmation ("Is X the right...?")
-- For `Open` question_type: ask one focused question
-- For `ForcedChoice` question_type: present the candidates from `existing_related`
-- For `Implication` question_type: frame as "have you thought about X?"
-  with a specific number or consequence
-
-### Phase transitions
-
-The server manages phases internally. When the response includes
-`phase_changed: true`, acknowledge the shift naturally:
-
-"Good — I have a clear picture of who this is for and what success looks
-like. Let me ask about the technical side now."
-
-Do NOT announce phase names ("entering Technical phase"). The user
-experiences a conversation, not a state machine.
-
-Phases flow: Discovery -> Product -> Technical -> Decomposition -> Review.
-Each phase focuses on different gap types. The server handles this
-automatically — just follow the `next_questions` it returns.
-
-### Handling tangents and corrections
-
-If the user:
-- **Goes off-topic**: still call `interview_answer` — extract any relevant
-  entities and the gap analysis will catch what's still missing
-- **Wants to correct something**: call `interview_adjust` with the node_id
-  and the changes (body, status, or new_id for renaming)
-- **Wants to skip ahead**: call `interview_answer` with "user wants to
-  skip to technical/decomposition/review"
-- **Dumps a wall of text**: spawn the `tempyr-extractor` subagent — it
-  handles multi-entity extraction better in isolated context
-
-### Review phase
-
-When the server returns `"phase": "Review"`:
-1. Call `interview_show` to get the full tentative state
-2. Present a structured summary organized by node type:
-   - Features, with their linked personas, metrics, constraints
-   - Decisions and their rationale
-   - Tasks and dependencies
-   - Risks and open questions
-3. Show progress: "X nodes, Y edges proposed"
-4. Ask: "Anything to add, change, or remove before I commit?"
-5. On approval, call `interview_commit`
-6. Report the files created and any validation warnings
-7. Mention: `tempyr render prd <id>` or `tempyr render tdd <id>`
-
-### Resuming an interrupted interview
-
-If the user mentions a previous interview or wants to continue:
-1. Call `interview_resume` with the session_id
-2. The server returns the full current state
-3. Summarize where they left off: phase, nodes created, gaps remaining
-4. Continue asking questions from `next_questions`
-
-If the user doesn't know the session_id, they can run
-`tempyr interview list` in the terminal to see active sessions.
-
-### When NOT to interview
-
-If the user just wants to quickly add a single note or insight:
-- Use `graph_add_node` directly, skip the interview
-- The interview is for features, epics, and multi-node creation

From fe40e73d7163a89f2b9f52838c68e170328c62f2 Mon Sep 17 00:00:00 2001
From: Caleb Leak <caleb.leak@gmail.com>
Date: Sat, 2 May 2026 21:13:16 -0700
Subject: [PATCH 4/4] review: harden snapshot_key validation and clarify
 empty-store probe

Two more fixes from review on PR #45.

- `SnapshotBuildLock::try_acquire` now validates `snapshot_key` against
  `is_snapshot_key()` before doing IO. The path helpers (`snapshot_locks_dir`,
  `snapshot_build_lock_path`) are pure path construction and would happily
  produce a path outside the locks directory if given input like
  `"../../etc/passwd"`. All current internal callers source the key from
  `IndexLayout::snapshot_key()` (always BLAKE3 hex), but the function is
  `pub` so external code could pass arbitrary input. Validation at the IO
  boundary closes that surface without splaying `Result` returns across
  every path helper. Test: `snapshot_build_lock_rejects_path_traversal_keys`.

- `health::probe_snapshot_store` now distinguishes "no snapshot store yet"
  from "probe failed". A missing snapshots directory returns
  `(Some(0), Some(0))` so `tempyr doctor` shows
  `snapshot store: 0 dirs, 0 B (ok)` on a fresh project instead of silently
  suppressing the line. Real anomalies (path exists but is not a directory,
  IO errors mid-walk) still return `(None, None)` so doctor's render can
  signal "unavailable". Tests: missing-dir, non-directory, and a fresh test
  that the canonical-snapshot filter excludes `.locks` and `.gc-*`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 crates/tempyr-core/src/project.rs | 49 +++++++++++++++++++++++++
 crates/tempyr-index/src/health.rs | 61 +++++++++++++++++++++++++++++++
 2 files changed, 110 insertions(+)

diff --git a/crates/tempyr-core/src/project.rs b/crates/tempyr-core/src/project.rs
index a714f74..92ef96c 100644
--- a/crates/tempyr-core/src/project.rs
+++ b/crates/tempyr-core/src/project.rs
@@ -133,7 +133,23 @@ pub struct SnapshotBuildLock {
 
 impl SnapshotBuildLock {
     /// Attempt a non-blocking exclusive acquire of the per-key build lock.
+    ///
+    /// Rejects any `snapshot_key` that does not satisfy [`is_snapshot_key`]
+    /// — the path helpers it composes (`snapshot_locks_dir` +
+    /// `snapshot_build_lock_path`) are pure path construction and would
+    /// happily produce a path outside the locks directory if given an
+    /// input like `"../../etc/passwd"`. The validation here, at the IO
+    /// boundary, ensures no caller (internal or external) can use this
+    /// API to open or lock a file outside the snapshots store.
     pub fn try_acquire(cache: &CacheLayout, snapshot_key: &str) -> io::Result<Option<Self>> {
+        if !is_snapshot_key(snapshot_key) {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                format!(
+                    "invalid snapshot key {snapshot_key:?}: expected 16 lowercase hex characters"
+                ),
+            ));
+        }
         let locks_dir = cache.snapshot_locks_dir();
         fs::create_dir_all(&locks_dir)?;
         let path = cache.snapshot_build_lock_path(snapshot_key);
@@ -1369,6 +1385,39 @@ mod tests {
         assert!(after_release.is_some());
     }
 
+    #[test]
+    fn snapshot_build_lock_rejects_path_traversal_keys() {
+        let tmp = tempfile::tempdir().unwrap();
+        let cache = CacheLayout {
+            shared_root: tmp.path().to_path_buf(),
+            worktree_root: tmp.path().join("worktrees").join("default"),
+        };
+
+        for bad in [
+            "..",
+            "../escape",
+            "../../etc/passwd",
+            "ABCDEF1234567890", // wrong case
+            "deadbeef",         // wrong length
+            "",
+            "deadbeefcafef00g", // 'g' is not hex
+        ] {
+            let err = SnapshotBuildLock::try_acquire(&cache, bad).unwrap_err();
+            assert_eq!(
+                err.kind(),
+                io::ErrorKind::InvalidInput,
+                "expected InvalidInput for {bad:?}, got {err:?}"
+            );
+        }
+
+        // Sanity: nothing got created outside the cache root.
+        let escaped = tmp.path().parent().unwrap().join("etc");
+        assert!(
+            !escaped.exists(),
+            "rejection must not produce any side effects outside the cache root"
+        );
+    }
+
     #[test]
     fn snapshot_build_lock_creates_locks_dir() {
         let tmp = tempfile::tempdir().unwrap();
diff --git a/crates/tempyr-index/src/health.rs b/crates/tempyr-index/src/health.rs
index bc6eaa8..3624478 100644
--- a/crates/tempyr-index/src/health.rs
+++ b/crates/tempyr-index/src/health.rs
@@ -434,6 +434,12 @@ fn build_index_section(inputs: &HealthInputs<'_>, store_path: Option<&Path>) ->
 /// `tempyr snapshot prune` would consider — the `.locks` coordination dir
 /// and any partial-prune `.gc-*` stubs are excluded.
 ///
+/// A missing snapshots directory returns `(Some(0), Some(0))` — this is
+/// a definitively-empty store, not a probe failure (a fresh project
+/// before any rebuild, or a non-git tempyr project before any indexing).
+/// Distinguishing this from probe failure lets `tempyr doctor` show
+/// `snapshot store: 0 dirs, 0 B (ok)` instead of suppressing the line.
+///
 /// Returns `(None, None)` for any IO failure rather than reporting
 /// partial totals: an undercount that hides a multi-GB store from the
 /// `tempyr doctor` prune hint is worse than a clear "unavailable" signal.
@@ -441,7 +447,13 @@ fn build_index_section(inputs: &HealthInputs<'_>, store_path: Option<&Path>) ->
 /// (rename-then-remove races with our walk); a re-run of doctor after
 /// the prune completes will succeed.
 fn probe_snapshot_store(snapshots_root: &Path) -> (Option<usize>, Option<u64>) {
+    if !snapshots_root.exists() {
+        // Fresh project — no rebuild has populated the store yet.
+        return (Some(0), Some(0));
+    }
     if !snapshots_root.is_dir() {
+        // Path exists but isn't a directory — that's a real anomaly,
+        // not "empty". Surface as unavailable.
         return (None, None);
     }
     let Ok(read) = std::fs::read_dir(snapshots_root) else {
@@ -656,4 +668,53 @@ allowed_edges = []
             .unwrap();
         assert!(config_entry.exists);
     }
+
+    #[test]
+    fn probe_snapshot_store_treats_missing_dir_as_empty() {
+        let tmp = tempfile::tempdir().unwrap();
+        let snapshots_root = tmp.path().join("never-created");
+
+        let (count, bytes) = probe_snapshot_store(&snapshots_root);
+
+        assert_eq!(
+            count,
+            Some(0),
+            "missing dir should be empty store, not failure"
+        );
+        assert_eq!(bytes, Some(0));
+    }
+
+    #[test]
+    fn probe_snapshot_store_treats_non_directory_as_unavailable() {
+        let tmp = tempfile::tempdir().unwrap();
+        let snapshots_root = tmp.path().join("not-a-dir");
+        fs::write(&snapshots_root, b"oops").unwrap();
+
+        let (count, bytes) = probe_snapshot_store(&snapshots_root);
+
+        assert_eq!(count, None, "non-directory at the path is a real anomaly");
+        assert_eq!(bytes, None);
+    }
+
+    #[test]
+    fn probe_snapshot_store_counts_canonical_snapshots_only() {
+        let tmp = tempfile::tempdir().unwrap();
+        let snapshots_root = tmp.path().join("snapshots");
+        fs::create_dir_all(&snapshots_root).unwrap();
+
+        // Two real snapshots
+        for key in ["0123456789abcdef", "fedcba9876543210"] {
+            let dir = snapshots_root.join(key);
+            fs::create_dir_all(&dir).unwrap();
+            fs::write(dir.join("index.db"), vec![0u8; 100]).unwrap();
+        }
+        // Plus a `.locks` dir and a `.gc-*` stub that must be ignored.
+        fs::create_dir_all(snapshots_root.join(".locks")).unwrap();
+        fs::create_dir_all(snapshots_root.join(".gc-stale")).unwrap();
+        fs::write(snapshots_root.join(".gc-stale").join("index.db"), b"x").unwrap();
+
+        let (count, bytes) = probe_snapshot_store(&snapshots_root);
+        assert_eq!(count, Some(2));
+        assert_eq!(bytes, Some(200));
+    }
 }