Skip to content

Commit d843720

Browse files
hyperpolymath and claude committed
Phase 1.1: Persistent vector store via redb (first of 6 modalities)
Add RedbVectorStore — durable storage of embeddings with an ephemeral in-memory index for fast similarity search. On startup, all embeddings are loaded from redb and the index is rebuilt.

Pattern:
- TypedStore<RedbBackend> with namespace "vec" handles serialisation.
- In-memory HashMap + brute-force search for queries (same as existing).
- Writes go to redb first (durable), then update the in-memory index.

Two tests prove persistence:
- test_persistent_vector_roundtrip: create, insert, drop, reopen, verify
- test_persistent_vector_delete: delete persists across restarts

This is the template for the remaining 5 modalities (tensor, semantic, temporal, provenance, spatial). Feature-gated behind redb-backend.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent c3c3dca commit d843720

4 files changed

Lines changed: 302 additions & 2 deletions

File tree

verisimdb/Cargo.lock

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

verisimdb/rust-core/verisim-vector/Cargo.toml

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,18 @@ thiserror.workspace = true
1717
tracing.workspace = true
1818
async-trait.workspace = true
1919
tokio.workspace = true
20-
# bincode removed — not used in crate
21-
# bincode.workspace = true
20+
serde_json.workspace = true
21+
22+
# Optional: persistent storage via redb
23+
verisim-storage = { path = "../verisim-storage", optional = true }
24+
25+
[features]
26+
default = []
27+
redb-backend = ["verisim-storage/redb-backend"]
2228

2329
[dev-dependencies]
2430
proptest.workspace = true
2531
criterion.workspace = true
32+
tempfile = "3"
33+
tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
2634

verisimdb/rust-core/verisim-vector/src/lib.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,12 @@
66
77
#![forbid(unsafe_code)]
88
mod hnsw;
9+
#[cfg(feature = "redb-backend")]
10+
pub mod persistent;
911

1012
pub use hnsw::{HnswConfig, HnswVectorStore};
13+
#[cfg(feature = "redb-backend")]
14+
pub use persistent::RedbVectorStore;
1115

1216
use async_trait::async_trait;
1317
use ndarray::{Array1, ArrayView1};
Lines changed: 285 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,285 @@
1+
// SPDX-License-Identifier: PMPL-1.0-or-later
2+
// Copyright (c) 2026 Jonathan D.A. Jewell (hyperpolymath) <j.d.a.jewell@open.ac.uk>
3+
//
4+
// Persistent vector store backed by redb via verisim-storage.
5+
//
6+
// Durable storage of embeddings with an ephemeral in-memory index for fast
7+
// similarity search. On startup, all embeddings are loaded from redb and the
8+
// index is rebuilt. Writes go to both redb (durable) and the in-memory index
9+
// (fast search).
10+
//
11+
// Design:
12+
// - TypedStore<RedbBackend> with namespace "vec" handles serialisation + persistence
13+
// - In-memory HashMap + brute-force search for queries (same as BruteForceVectorStore)
14+
// - Startup: load all embeddings from redb, populate in-memory index
15+
// - Upsert: write to redb first (durable), then update in-memory index
16+
// - Delete: remove from redb first, then remove from in-memory index
17+
// - Search: in-memory only (fast, no disk I/O)
18+
19+
use std::collections::HashMap;
20+
use std::path::Path;
21+
use std::sync::{Arc, RwLock};
22+
23+
use async_trait::async_trait;
24+
use tracing::{debug, info};
25+
use verisim_storage::redb_backend::RedbBackend;
26+
use verisim_storage::typed::TypedStore;
27+
28+
use crate::{DistanceMetric, Embedding, SearchResult, VectorError, VectorStore};
29+
30+
/// Persistent vector store: redb for durability, in-memory index for search.
31+
pub struct RedbVectorStore {
32+
/// Dimensionality of stored vectors.
33+
dimension: usize,
34+
/// Distance metric for similarity computation.
35+
metric: DistanceMetric,
36+
/// Durable storage: TypedStore<RedbBackend> with namespace "vec".
37+
store: TypedStore<RedbBackend>,
38+
/// Ephemeral in-memory index for fast similarity search.
39+
/// Rebuilt from redb on startup.
40+
index: Arc<RwLock<HashMap<String, Embedding>>>,
41+
}
42+
43+
impl RedbVectorStore {
44+
/// Open or create a persistent vector store at the given path.
45+
///
46+
/// On first open, creates an empty redb database.
47+
/// On subsequent opens, loads all embeddings from redb and rebuilds the
48+
/// in-memory index. Returns the number of embeddings loaded.
49+
pub async fn open(
50+
path: impl AsRef<Path>,
51+
dimension: usize,
52+
metric: DistanceMetric,
53+
) -> Result<Self, VectorError> {
54+
let backend = RedbBackend::open(path.as_ref()).map_err(|e| {
55+
VectorError::IndexError(format!("Failed to open redb: {}", e))
56+
})?;
57+
let store = TypedStore::new(backend, "vec");
58+
59+
let mut index = HashMap::new();
60+
61+
// Load all existing embeddings from redb into memory
62+
let entries: Vec<(String, Embedding)> = store
63+
.scan_prefix("", 1_000_000)
64+
.await
65+
.map_err(|e| VectorError::IndexError(format!("Failed to scan redb: {}", e)))?;
66+
67+
for (id, embedding) in &entries {
68+
// Validate dimensionality
69+
if embedding.dim() != dimension {
70+
debug!(
71+
id = %id,
72+
expected = dimension,
73+
actual = embedding.dim(),
74+
"Skipping embedding with wrong dimensionality"
75+
);
76+
continue;
77+
}
78+
index.insert(id.clone(), embedding.clone());
79+
}
80+
81+
info!(
82+
count = index.len(),
83+
dimension = dimension,
84+
path = %path.as_ref().display(),
85+
"Loaded vector store from redb"
86+
);
87+
88+
Ok(Self {
89+
dimension,
90+
metric,
91+
store,
92+
index: Arc::new(RwLock::new(index)),
93+
})
94+
}
95+
96+
/// Normalise a vector for cosine similarity.
97+
fn normalize(v: &[f32]) -> Vec<f32> {
98+
let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
99+
if norm > 0.0 {
100+
v.iter().map(|x| x / norm).collect()
101+
} else {
102+
v.to_vec()
103+
}
104+
}
105+
106+
/// Compute similarity between two vectors.
107+
fn similarity(&self, a: &[f32], b: &[f32]) -> f32 {
108+
match self.metric {
109+
DistanceMetric::Cosine => {
110+
let a_norm = Self::normalize(a);
111+
let b_norm = Self::normalize(b);
112+
a_norm.iter().zip(b_norm.iter()).map(|(x, y)| x * y).sum()
113+
}
114+
DistanceMetric::DotProduct => {
115+
a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
116+
}
117+
DistanceMetric::Euclidean => {
118+
let dist_sq: f32 = a
119+
.iter()
120+
.zip(b.iter())
121+
.map(|(x, y)| (x - y).powi(2))
122+
.sum();
123+
1.0 / (1.0 + dist_sq.sqrt())
124+
}
125+
}
126+
}
127+
}
128+
129+
#[async_trait]
130+
impl VectorStore for RedbVectorStore {
131+
async fn upsert(&self, embedding: &Embedding) -> Result<(), VectorError> {
132+
if embedding.dim() != self.dimension {
133+
return Err(VectorError::DimensionMismatch {
134+
expected: self.dimension,
135+
actual: embedding.dim(),
136+
});
137+
}
138+
139+
// Write to redb first (durable)
140+
self.store
141+
.put(&embedding.id, embedding)
142+
.await
143+
.map_err(|e| VectorError::IndexError(format!("redb put: {}", e)))?;
144+
145+
// Then update in-memory index
146+
let mut idx = self.index.write().map_err(|_| VectorError::LockPoisoned)?;
147+
idx.insert(embedding.id.clone(), embedding.clone());
148+
149+
Ok(())
150+
}
151+
152+
async fn search(&self, query: &[f32], k: usize) -> Result<Vec<SearchResult>, VectorError> {
153+
if query.len() != self.dimension {
154+
return Err(VectorError::DimensionMismatch {
155+
expected: self.dimension,
156+
actual: query.len(),
157+
});
158+
}
159+
160+
let idx = self.index.read().map_err(|_| VectorError::LockPoisoned)?;
161+
162+
let mut results: Vec<SearchResult> = idx
163+
.values()
164+
.map(|emb| SearchResult {
165+
id: emb.id.clone(),
166+
score: self.similarity(query, &emb.vector),
167+
})
168+
.collect();
169+
170+
results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap_or(std::cmp::Ordering::Equal));
171+
results.truncate(k);
172+
173+
Ok(results)
174+
}
175+
176+
async fn get(&self, id: &str) -> Result<Option<Embedding>, VectorError> {
177+
// Read from in-memory index (fast path)
178+
let idx = self.index.read().map_err(|_| VectorError::LockPoisoned)?;
179+
Ok(idx.get(id).cloned())
180+
}
181+
182+
async fn delete(&self, id: &str) -> Result<(), VectorError> {
183+
// Delete from redb first (durable)
184+
self.store
185+
.delete(id)
186+
.await
187+
.map_err(|e| VectorError::IndexError(format!("redb delete: {}", e)))?;
188+
189+
// Then remove from in-memory index
190+
let mut idx = self.index.write().map_err(|_| VectorError::LockPoisoned)?;
191+
idx.remove(id);
192+
193+
Ok(())
194+
}
195+
196+
fn dimension(&self) -> usize {
197+
self.dimension
198+
}
199+
}
200+
201+
#[cfg(test)]
mod tests {
    use super::*;

    /// Opens a 3-dimensional cosine store at `path`, panicking on failure.
    async fn open_cosine3(path: &std::path::Path) -> RedbVectorStore {
        RedbVectorStore::open(path, 3, DistanceMetric::Cosine)
            .await
            .unwrap()
    }

    /// Embeddings written through one store instance are visible, and
    /// searchable, through a second instance opened on the same file.
    #[tokio::test]
    async fn test_persistent_vector_roundtrip() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("vector.redb");

        // First session: populate the store and sanity-check search.
        let first = open_cosine3(&path).await;
        let seed = [
            ("a", vec![1.0_f32, 0.0, 0.0]),
            ("b", vec![0.0_f32, 1.0, 0.0]),
            ("c", vec![0.9_f32, 0.1, 0.0]),
        ];
        for (id, vector) in seed {
            first.upsert(&Embedding::new(id, vector)).await.unwrap();
        }

        let hits = first.search(&[1.0, 0.0, 0.0], 2).await.unwrap();
        assert_eq!(hits.len(), 2);
        assert_eq!(hits[0].id, "a"); // Most similar to [1,0,0]

        // Close the first session before the file is opened again.
        drop(first);

        // Second session: data should have survived the reopen.
        let second = open_cosine3(&path).await;

        let a = second.get("a").await.unwrap();
        assert!(a.is_some());
        assert_eq!(a.unwrap().vector, vec![1.0, 0.0, 0.0]);

        let b = second.get("b").await.unwrap();
        assert!(b.is_some());

        // Search must behave the same on the rebuilt index.
        let hits = second.search(&[1.0, 0.0, 0.0], 2).await.unwrap();
        assert_eq!(hits.len(), 2);
        assert_eq!(hits[0].id, "a");
    }

    /// A deleted embedding is gone immediately and stays gone after reopen.
    #[tokio::test]
    async fn test_persistent_vector_delete() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("vector-del.redb");

        // First session: insert, delete, and confirm the entry is gone.
        let first = open_cosine3(&path).await;
        first
            .upsert(&Embedding::new("x", vec![1.0_f32, 0.0, 0.0]))
            .await
            .unwrap();
        first.delete("x").await.unwrap();
        assert!(first.get("x").await.unwrap().is_none());
        drop(first);

        // Second session: the deletion should persist.
        let second = open_cosine3(&path).await;
        assert!(second.get("x").await.unwrap().is_none());
    }
}

0 commit comments

Comments
 (0)