From 77cc936ed8241b34f3baf75a0842ddc863d367ff Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Thu, 7 May 2026 01:18:55 +0530 Subject: [PATCH 1/8] Add benchmark crate for multi-vector --- Cargo.lock | 17 + Cargo.toml | 1 + diskann-benchmark-multi-vector/Cargo.toml | 30 + diskann-benchmark-multi-vector/README.md | 136 ++ .../examples/multi-vector.json | 70 + .../examples/test.json | 47 + .../examples/tolerance.json | 16 + diskann-benchmark-multi-vector/src/bin.rs | 96 + diskann-benchmark-multi-vector/src/lib.rs | 992 ++++++++ results.json | 2150 +++++++++++++++++ 10 files changed, 3555 insertions(+) create mode 100644 diskann-benchmark-multi-vector/Cargo.toml create mode 100644 diskann-benchmark-multi-vector/README.md create mode 100644 diskann-benchmark-multi-vector/examples/multi-vector.json create mode 100644 diskann-benchmark-multi-vector/examples/test.json create mode 100644 diskann-benchmark-multi-vector/examples/tolerance.json create mode 100644 diskann-benchmark-multi-vector/src/bin.rs create mode 100644 diskann-benchmark-multi-vector/src/lib.rs create mode 100644 results.json diff --git a/Cargo.lock b/Cargo.lock index beac316c4..fc0a7cc87 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -697,6 +697,23 @@ dependencies = [ "tokio", ] +[[package]] +name = "diskann-benchmark-multi-vector" +version = "0.50.1" +dependencies = [ + "anyhow", + "diskann-benchmark-runner", + "diskann-quantization", + "diskann-utils", + "diskann-vector", + "half", + "rand 0.9.4", + "serde", + "serde_json", + "tempfile", + "thiserror 2.0.17", +] + [[package]] name = "diskann-benchmark-runner" version = "0.50.1" diff --git a/Cargo.toml b/Cargo.toml index 6f31a1ae2..13fcbdd9c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ members = [ "diskann-benchmark-runner", "diskann-benchmark-core", "diskann-benchmark-simd", + "diskann-benchmark-multi-vector", "diskann-benchmark", "diskann-tools", "vectorset", diff --git a/diskann-benchmark-multi-vector/Cargo.toml 
b/diskann-benchmark-multi-vector/Cargo.toml new file mode 100644 index 000000000..f8eb937e1 --- /dev/null +++ b/diskann-benchmark-multi-vector/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "diskann-benchmark-multi-vector" +version.workspace = true +description.workspace = true +authors.workspace = true +documentation.workspace = true +license.workspace = true +edition.workspace = true + +[[bin]] +name = "benchmark-multi-vector" +path = "src/bin.rs" + +[dependencies] +anyhow.workspace = true +diskann-utils = { workspace = true, default-features = false } +half = { workspace = true, features = ["rand_distr"] } +diskann-benchmark-runner = { workspace = true } +diskann-quantization = { workspace = true } +diskann-vector = { workspace = true } +rand.workspace = true +serde = { workspace = true, features = ["derive"] } +serde_json.workspace = true +thiserror.workspace = true + +[lints] +workspace = true + +[dev-dependencies] +tempfile.workspace = true diff --git a/diskann-benchmark-multi-vector/README.md b/diskann-benchmark-multi-vector/README.md new file mode 100644 index 000000000..014a393a1 --- /dev/null +++ b/diskann-benchmark-multi-vector/README.md @@ -0,0 +1,136 @@ +# diskann-benchmark-multi-vector + +Benchmarks and regression detection for the **multi-vector distance +operations** exposed by `diskann-quantization` — `Chamfer` and `MaxSim` — +across `f32` and `f16` element types. + +## Layout + +- `src/lib.rs` — benchmark library: input/tolerance schemas, kernel + dispatch, regression checker. +- `src/bin.rs` — `benchmark-multi-vector` CLI entry point. +- `examples/multi-vector.json` — full benchmark matrix covering both + operations across the registered kernels and a representative range of + shapes. +- `examples/test.json` — minimal smoke configuration consumed by the + integration tests. +- `examples/tolerance.json` — default regression thresholds. 
+ +## Registered kernels + +The crate registers four kernels — one per `(element_type, implementation)` +pair: + +| Tag | Element | Implementation | +| -------------------------------- | ------- | -------------------- | +| `multi-vector-op-f32-optimized` | `f32` | `QueryComputer` | +| `multi-vector-op-f16-optimized` | `f16` | `QueryComputer` | +| `multi-vector-op-f32-reference` | `f32` | `Chamfer` / `MaxSim` | +| `multi-vector-op-f16-reference` | `f16` | `Chamfer` / `MaxSim` | + +The **optimized** path constructs a `QueryComputer` once per shape (which +internally selects the best available SIMD kernel for the host) and calls +`chamfer` / `max_sim` inside the timed loop. The **reference** path drives +the `Chamfer` / `MaxSim` fallback used by the `multi_vector` unit tests — +useful both as a numerical ground truth and as a baseline to measure SIMD +speedups against. + +## Time normalization + +Per-measurement latency is normalized to **nanoseconds per inner-product +call**, abbreviated `ns/IP`: + +``` +ns/IP = min_latency_µs * 1000 / (Q * D * loops_per_measurement) +``` + +Two important properties: + +- **Independent of `Q`, `D`, and `loops_per_measurement`.** Reshaping the + benchmark or scaling the loop budget leaves the metric unchanged, so + cache-residency effects and SIMD utilization show up directly. +- **Approximately linear in `Dim`.** Each inner-product call is itself an + O(`Dim`) operation, so `ns/IP` grows with `Dim` — that is why the table + headers read `ns/IP @ Dim`. Compare across rows with the same `Dim`; to + compare across different `Dim`s, divide further by `Dim` to recover ns + per scalar multiply. + +This is the right metric for the two things this crate cares about: +detecting per-shape regressions (the `Dim` factor cancels) and comparing +optimized vs. reference at a fixed shape. 
+ +## Usage + +All examples below assume you are inside the crate directory and use a +small shell function for brevity: + +```bash +bench() { cargo run --release -p diskann-benchmark-multi-vector --bin benchmark-multi-vector -- "$@"; } +``` + +### Run benchmarks + +`run` executes every job in the input file and writes per-measurement +latencies plus percentiles to the output file: + +```bash +bench run --input-file examples/multi-vector.json --output-file before.json +``` + +### Regression check workflow + +The check workflow is **two-phase**: validate the tolerance file once, then +compare two recorded result files. + +**Phase 1 — preflight.** No benchmarks are executed. The verifier confirms +that every entry in `tolerance.json` matches at least one job in the input +file, and that every job is matched by exactly one entry. Run it whenever +you edit `tolerance.json`: + +```bash +bench check verify \ + --tolerances examples/tolerance.json \ + --input-file examples/multi-vector.json +``` + +**Phase 2 — comparison.** Record results before and after a code change, +then compare. The command exits non-zero if any run regresses past its +tolerance: + +```bash +# On the baseline commit +bench run --input-file examples/multi-vector.json --output-file before.json + +# On the change commit +bench run --input-file examples/multi-vector.json --output-file after.json + +# Compare +bench check run \ + --tolerances examples/tolerance.json \ + --input-file examples/multi-vector.json \ + --before before.json --after after.json \ + --output-file checks.json +``` + +A run **fails** when its post-change `ns/IP` minimum exceeds the +baseline minimum by more than `min_time_regression` (default `0.05` = +5%). Improvements (negative change) always pass. + +### How tolerances are matched to jobs + +Each entry in `tolerance.json` has the shape `{ input, tolerance }`. 
The +`input` block acts as a **partial template** against the jobs in the input +file: any field present must match; missing fields are wildcards. + +The shipped `tolerance.json` uses an empty `"content": {}`, which matches +every `multi-vector-op` job — so a single 5% threshold applies to all four +kernels. To apply different thresholds per implementation, add more +specific entries, e.g.: + +```json +{ "input": { "type": "multi-vector-op", "content": { "implementation": "reference" } }, + "tolerance": { "type": "multi-vector-tolerance", "content": { "min_time_regression": 0.10 } } } +``` + +`check verify` will reject the file if entries overlap or leave any job +unmatched. diff --git a/diskann-benchmark-multi-vector/examples/multi-vector.json b/diskann-benchmark-multi-vector/examples/multi-vector.json new file mode 100644 index 000000000..2626e5047 --- /dev/null +++ b/diskann-benchmark-multi-vector/examples/multi-vector.json @@ -0,0 +1,70 @@ +{ + "search_directories": [], + "jobs": [ + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "implementation": "optimized", + "runs": [ + { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { 
"operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 20 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 }, + + { "operation": "max_sim", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 20 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float16", + "implementation": "optimized", + "runs": [ + { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 
}, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "implementation": "reference", + "runs": [ + { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 50, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 2, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 50, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 2, "num_measurements": 50 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float16", + "implementation": "reference", + "runs": [ + { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 50, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 2, "num_measurements": 50 } + ] + } + } + ] +} diff --git a/diskann-benchmark-multi-vector/examples/test.json b/diskann-benchmark-multi-vector/examples/test.json new file mode 100644 index 000000000..28e9b9d64 --- /dev/null +++ b/diskann-benchmark-multi-vector/examples/test.json @@ -0,0 +1,47 @@ +{ + "search_directories": [], + "jobs": [ + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "implementation": "optimized", + "runs": [ + { 
"operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 2, "num_measurements": 1 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float16", + "implementation": "optimized", + "runs": [ + { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float32", + "implementation": "reference", + "runs": [ + { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 2, "num_measurements": 1 } + ] + } + }, + { + "type": "multi-vector-op", + "content": { + "element_type": "float16", + "implementation": "reference", + "runs": [ + { "operation": "max_sim", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 2, "num_measurements": 1 } + ] + } + } + ] +} diff --git a/diskann-benchmark-multi-vector/examples/tolerance.json b/diskann-benchmark-multi-vector/examples/tolerance.json new file mode 100644 index 000000000..8d5997199 --- /dev/null +++ b/diskann-benchmark-multi-vector/examples/tolerance.json @@ -0,0 +1,16 @@ +{ + "checks": [ + { + "input": { + "type": "multi-vector-op", + "content": {} + }, + "tolerance": { + "type": "multi-vector-tolerance", + "content": { + "min_time_regression": 0.05 + } + } + } + ] +} diff --git a/diskann-benchmark-multi-vector/src/bin.rs b/diskann-benchmark-multi-vector/src/bin.rs new file mode 100644 index 000000000..d595533e7 --- /dev/null +++ b/diskann-benchmark-multi-vector/src/bin.rs @@ -0,0 +1,96 @@ +/* + * Copyright (c) Microsoft Corporation. 
+ * Licensed under the MIT license. + */ + +use diskann_benchmark_multi_vector::{register, MultiVectorOp}; +use diskann_benchmark_runner::{output, registry, App, Output}; + +pub fn main() -> anyhow::Result<()> { + // Create the pocket bench application. + let app = App::parse(); + main_inner(&app, &mut output::default()) +} + +fn main_inner(app: &App, output: &mut dyn Output) -> anyhow::Result<()> { + // Register inputs and benchmarks. + let mut inputs = registry::Inputs::new(); + inputs.register::<MultiVectorOp>()?; + + let mut benchmarks = registry::Benchmarks::new(); + register(&mut benchmarks); + + // Here we go! + app.run(&inputs, &benchmarks, output) +} + +/////////// +// Tests // +/////////// + +#[cfg(test)] +mod tests { + use super::*; + + use std::path::{Path, PathBuf}; + + use diskann_benchmark_runner::app::{Check, Commands}; + + fn run_integration_test(input_file: &Path, output_file: &Path) { + let commands = Commands::Run { + input_file: input_file.to_str().unwrap().into(), + output_file: output_file.to_str().unwrap().into(), + dry_run: false, + allow_debug: true, + }; + + let app = App::from_commands(commands); + + let mut output = output::Memory::new(); + main_inner(&app, &mut output).unwrap(); + println!( + "output = {}", + String::from_utf8(output.into_inner()).unwrap() + ); + + assert!(output_file.exists()); + } + + fn run_check_test(input_file: &Path, tolerances: &Path) -> String { + let commands = Commands::Check(Check::Verify { + tolerances: tolerances.to_str().unwrap().into(), + input_file: input_file.to_str().unwrap().into(), + }); + + let app = App::from_commands(commands); + + let mut output = output::Memory::new(); + main_inner(&app, &mut output).unwrap(); + String::from_utf8(output.into_inner()).unwrap() + } + + #[test] + fn integration_test() { + let input_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("examples") + .join("test.json"); + + let tempdir = tempfile::tempdir().unwrap(); + let output_path = tempdir.path().join("output.json"); + + 
run_integration_test(&input_path, &output_path); + } + + #[test] + fn check_verify() { + let input_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("examples") + .join("test.json"); + let tolerance_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("examples") + .join("tolerance.json"); + + let stdout = run_check_test(&input_path, &tolerance_path); + println!("stdout = {}", stdout); + } +} diff --git a/diskann-benchmark-multi-vector/src/lib.rs b/diskann-benchmark-multi-vector/src/lib.rs new file mode 100644 index 000000000..7cadf4f29 --- /dev/null +++ b/diskann-benchmark-multi-vector/src/lib.rs @@ -0,0 +1,992 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Multi-vector distance benchmarks with regression detection. + +use std::{io::Write, num::NonZeroUsize}; + +use diskann_quantization::multi_vector::{Chamfer, MatRef, MaxSim, QueryComputer, Standard}; +use diskann_vector::distance::InnerProduct; +use diskann_vector::{DistanceFunctionMut, PureDistanceFunction}; +use half::f16; +use rand::{ + distr::{Distribution, StandardUniform}, + rngs::StdRng, + SeedableRng, +}; +use serde::{Deserialize, Serialize}; +use thiserror::Error; + +use diskann_benchmark_runner::{ + benchmark::{PassFail, Regression}, + dispatcher::{Description, DispatchRule, FailureScore, MatchScore}, + utils::{ + datatype::{self, DataType}, + num::{relative_change, NonNegativeFinite}, + percentiles, MicroSeconds, + }, + Any, Benchmark, CheckDeserialization, Checker, Input, +}; + +//////////////// +// Public API // +//////////////// + +/// Register all multi-vector benchmarks with the runner's dispatcher. 
+pub fn register(dispatcher: &mut diskann_benchmark_runner::registry::Benchmarks) { + register_benchmarks_impl(dispatcher) +} + +/////////// +// Utils // +/////////// + +#[derive(Debug, Clone, Copy)] +struct DisplayWrapper<'a, T: ?Sized>(&'a T); + +impl std::ops::Deref for DisplayWrapper<'_, T> { + type Target = T; + fn deref(&self) -> &T { + self.0 + } +} + +//////////// +// Inputs // +//////////// + +/// The two distance operations exposed by [`QueryComputer`]. +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum Operation { + Chamfer, + MaxSim, +} + +impl std::fmt::Display for Operation { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let st = match self { + Self::Chamfer => "chamfer", + Self::MaxSim => "max_sim", + }; + write!(f, "{}", st) + } +} + +/// Which implementation tier to benchmark. +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +enum Implementation { + Optimized, + Reference, +} + +impl std::fmt::Display for Implementation { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let st = match self { + Self::Optimized => "optimized", + Self::Reference => "reference", + }; + write!(f, "{}", st) + } +} + +/// One benchmark configuration: a single (operation, shape) measurement. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +struct Run { + operation: Operation, + num_query_vectors: NonZeroUsize, + num_doc_vectors: NonZeroUsize, + dim: NonZeroUsize, + loops_per_measurement: NonZeroUsize, + num_measurements: NonZeroUsize, +} + +/// A complete multi-vector benchmark job. +#[derive(Debug, Serialize, Deserialize)] +pub struct MultiVectorOp { + element_type: DataType, + implementation: Implementation, + runs: Vec, +} + +impl CheckDeserialization for MultiVectorOp { + fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> { + Ok(()) + } +} + +macro_rules! 
write_field { + ($f:ident, $field:tt, $($expr:tt)*) => { + writeln!($f, "{:>18}: {}", $field, $($expr)*) + } +} + +impl MultiVectorOp { + fn summarize_fields(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write_field!(f, "element type", self.element_type)?; + write_field!(f, "implementation", self.implementation)?; + write_field!(f, "number of runs", self.runs.len())?; + Ok(()) + } +} + +impl std::fmt::Display for MultiVectorOp { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "Multi-Vector Operation\n")?; + write_field!(f, "tag", Self::tag())?; + self.summarize_fields(f) + } +} + +impl Input for MultiVectorOp { + fn tag() -> &'static str { + "multi-vector-op" + } + + fn try_deserialize( + serialized: &serde_json::Value, + checker: &mut Checker, + ) -> anyhow::Result { + checker.any(Self::deserialize(serialized)?) + } + + fn example() -> anyhow::Result { + const NUM_QUERY_VECTORS: NonZeroUsize = NonZeroUsize::new(32).unwrap(); + const NUM_DOC_VECTORS: NonZeroUsize = NonZeroUsize::new(64).unwrap(); + const DIM: NonZeroUsize = NonZeroUsize::new(128).unwrap(); + const LOOPS_PER_MEASUREMENT: NonZeroUsize = NonZeroUsize::new(200).unwrap(); + const NUM_MEASUREMENTS: NonZeroUsize = NonZeroUsize::new(100).unwrap(); + + let runs = vec![ + Run { + operation: Operation::Chamfer, + num_query_vectors: NUM_QUERY_VECTORS, + num_doc_vectors: NUM_DOC_VECTORS, + dim: DIM, + loops_per_measurement: LOOPS_PER_MEASUREMENT, + num_measurements: NUM_MEASUREMENTS, + }, + Run { + operation: Operation::MaxSim, + num_query_vectors: NUM_QUERY_VECTORS, + num_doc_vectors: NUM_DOC_VECTORS, + dim: DIM, + loops_per_measurement: LOOPS_PER_MEASUREMENT, + num_measurements: NUM_MEASUREMENTS, + }, + ]; + + Ok(serde_json::to_value(&Self { + element_type: DataType::Float32, + implementation: Implementation::Optimized, + runs, + })?) 
+ } +} + +////////////////////// +// Regression Check // +////////////////////// + +/// Tolerance thresholds for multi-vector benchmark regression detection. +/// +/// Each field specifies the maximum allowed relative increase in the corresponding metric. +/// For example, a value of `0.05` means a 5% increase is tolerated. +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +struct MultiVectorTolerance { + min_time_regression: NonNegativeFinite, +} + +impl CheckDeserialization for MultiVectorTolerance { + fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> { + Ok(()) + } +} + +impl Input for MultiVectorTolerance { + fn tag() -> &'static str { + "multi-vector-tolerance" + } + + fn try_deserialize( + serialized: &serde_json::Value, + checker: &mut Checker, + ) -> anyhow::Result { + checker.any(Self::deserialize(serialized)?) + } + + fn example() -> anyhow::Result { + const EXAMPLE: NonNegativeFinite = match NonNegativeFinite::new(0.05) { + Ok(v) => v, + Err(_) => panic!("use a non-negative finite please"), + }; + + Ok(serde_json::to_value(MultiVectorTolerance { + min_time_regression: EXAMPLE, + })?) + } +} + +/// Per-run comparison result showing before/after percentile differences. +#[derive(Debug, Serialize)] +struct Comparison { + run: Run, + tolerance: MultiVectorTolerance, + before_min: f64, + after_min: f64, +} + +/// Aggregated result of the regression check across all runs. 
+#[derive(Debug, Serialize)] +struct CheckResult { + checks: Vec<Comparison>, +} + +impl std::fmt::Display for CheckResult { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let header = [ + "Operation", + "Q", + "D", + "Dim", + "Min Before (ns/IP @ Dim)", + "Min After (ns/IP @ Dim)", + "Change (%)", + "Remark", + ]; + + let mut table = diskann_benchmark_runner::utils::fmt::Table::new(header, self.checks.len()); + + for (i, c) in self.checks.iter().enumerate() { + let mut row = table.row(i); + let change = relative_change(c.before_min, c.after_min); + + row.insert(c.run.operation, 0); + row.insert(c.run.num_query_vectors, 1); + row.insert(c.run.num_doc_vectors, 2); + row.insert(c.run.dim, 3); + row.insert(format!("{:.3}", c.before_min), 4); + row.insert(format!("{:.3}", c.after_min), 5); + match change { + Ok(change) => { + row.insert(format!("{:.3} %", change * 100.0), 6); + if change > c.tolerance.min_time_regression.get() { + row.insert("FAIL", 7); + } + } + Err(err) => { + row.insert("invalid", 6); + row.insert(err, 7); + } + } + } + + table.fmt(f) + } +} + +//////////////////////////// +// Benchmark Registration // +//////////////////////////// + +fn register_benchmarks_impl(dispatcher: &mut diskann_benchmark_runner::registry::Benchmarks) { + // Optimized (architecture-dispatched QueryComputer). + dispatcher.register_regression( + "multi-vector-op-f32-optimized", + Kernel::<Optimized, f32>::new(), + ); + dispatcher.register_regression( + "multi-vector-op-f16-optimized", + Kernel::<Optimized, f16>::new(), + ); + + // Reference (Chamfer / MaxSim fallback path). + dispatcher.register_regression( + "multi-vector-op-f32-reference", + Kernel::<Reference, f32>::new(), + ); + dispatcher.register_regression( + "multi-vector-op-f16-reference", + Kernel::<Reference, f16>::new(), + ); +} + +////////////// +// Dispatch // +////////////// + +/// Dispatch marker for the [`QueryComputer`] implementation. +#[derive(Debug)] +struct Optimized; + +/// Dispatch marker for the [`Chamfer`] / [`MaxSim`] fallback. 
+#[derive(Debug)] +struct Reference; + +/// A multi-vector benchmark. +struct Kernel { + _type: std::marker::PhantomData<(I, T)>, +} + +impl Kernel { + fn new() -> Self { + Self { + _type: std::marker::PhantomData, + } + } +} + +#[derive(Debug, Error)] +#[error("implementation {0} is not registered for this benchmark")] +pub(crate) struct ImplementationMismatch(Implementation); + +impl DispatchRule for Optimized { + type Error = ImplementationMismatch; + + fn try_match(from: &Implementation) -> Result { + if *from == Implementation::Optimized { + Ok(MatchScore(0)) + } else { + Err(FailureScore(1)) + } + } + + fn convert(from: Implementation) -> Result { + if from == Implementation::Optimized { + Ok(Optimized) + } else { + Err(ImplementationMismatch(from)) + } + } + + fn description( + f: &mut std::fmt::Formatter<'_>, + from: Option<&Implementation>, + ) -> std::fmt::Result { + match from { + None => write!(f, "QueryComputer (architecture-dispatched)"), + Some(impl_) => { + if Self::try_match(impl_).is_ok() { + write!(f, "matched {}", impl_) + } else { + write!(f, "expected {}, got {}", Implementation::Optimized, impl_) + } + } + } + } +} + +impl DispatchRule for Reference { + type Error = ImplementationMismatch; + + fn try_match(from: &Implementation) -> Result { + if *from == Implementation::Reference { + Ok(MatchScore(0)) + } else { + Err(FailureScore(1)) + } + } + + fn convert(from: Implementation) -> Result { + if from == Implementation::Reference { + Ok(Reference) + } else { + Err(ImplementationMismatch(from)) + } + } + + fn description( + f: &mut std::fmt::Formatter<'_>, + from: Option<&Implementation>, + ) -> std::fmt::Result { + match from { + None => write!(f, "Chamfer / MaxSim fallback"), + Some(impl_) => { + if Self::try_match(impl_).is_ok() { + write!(f, "matched {}", impl_) + } else { + write!(f, "expected {}, got {}", Implementation::Reference, impl_) + } + } + } + } +} + +impl Benchmark for Kernel +where + datatype::Type: DispatchRule, + I: 
DispatchRule + 'static, + Kernel: RunBenchmark, + T: 'static, +{ + type Input = MultiVectorOp; + type Output = Vec; + + fn try_match(&self, from: &MultiVectorOp) -> Result { + let mut failscore: Option = None; + if datatype::Type::::try_match(&from.element_type).is_err() { + *failscore.get_or_insert(0) += 10; + } + if let Err(FailureScore(score)) = I::try_match(&from.implementation) { + *failscore.get_or_insert(0) += 2 + score; + } + + match failscore { + None => Ok(MatchScore(0)), + Some(score) => Err(FailureScore(score)), + } + } + + fn run( + &self, + input: &MultiVectorOp, + _: diskann_benchmark_runner::Checkpoint<'_>, + mut output: &mut dyn diskann_benchmark_runner::Output, + ) -> anyhow::Result { + let _ = I::convert(input.implementation)?; + writeln!(output, "{}", input)?; + let results = self.run_benchmark(input)?; + writeln!(output, "\n\n{}", DisplayWrapper(&*results))?; + Ok(results) + } + + fn description( + &self, + f: &mut std::fmt::Formatter<'_>, + input: Option<&MultiVectorOp>, + ) -> std::fmt::Result { + match input { + None => { + writeln!( + f, + "- Element Type: {}", + Description::>::new() + )?; + writeln!( + f, + "- Implementation: {}", + Description::::new() + )?; + } + Some(input) => { + if let Err(err) = datatype::Type::::try_match_verbose(&input.element_type) { + writeln!(f, "\n - Mismatched element type: {}", err)?; + } + if let Err(err) = I::try_match_verbose(&input.implementation) { + writeln!(f, "\n - Mismatched implementation: {}", err)?; + } + } + } + Ok(()) + } +} + +impl Regression for Kernel +where + datatype::Type: DispatchRule, + I: DispatchRule + 'static, + Kernel: RunBenchmark, + T: 'static, +{ + type Tolerances = MultiVectorTolerance; + type Pass = CheckResult; + type Fail = CheckResult; + + fn check( + &self, + tolerance: &MultiVectorTolerance, + _input: &MultiVectorOp, + before: &Vec, + after: &Vec, + ) -> anyhow::Result> { + anyhow::ensure!( + before.len() == after.len(), + "before has {} runs but after has {}", + 
before.len(), + after.len(), + ); + + let mut passed = true; + let checks: Vec = std::iter::zip(before.iter(), after.iter()) + .enumerate() + .map(|(i, (b, a))| { + anyhow::ensure!(b.run == a.run, "run {i} mismatched"); + + let computations_per_latency = b.computations_per_latency() as f64; + + let before_min = b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; + let after_min = a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; + + let comparison = Comparison { + run: b.run.clone(), + tolerance: *tolerance, + before_min, + after_min, + }; + + match relative_change(before_min, after_min) { + Ok(change) => { + if change > tolerance.min_time_regression.get() { + passed = false; + } + } + Err(_) => passed = false, + }; + + Ok(comparison) + }) + .collect::>>()?; + + let check = CheckResult { checks }; + + if passed { + Ok(PassFail::Pass(check)) + } else { + Ok(PassFail::Fail(check)) + } + } +} + +/////////////// +// Benchmark // +/////////////// + +trait RunBenchmark { + fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error>; +} + +#[derive(Debug, Serialize, Deserialize)] +struct RunResult { + /// The configuration for this run. + run: Run, + /// Per-measurement latencies (over `loops_per_measurement` calls). + latencies: Vec, + /// Latency percentiles. + percentiles: percentiles::Percentiles, +} + +impl RunResult { + fn computations_per_latency(&self) -> usize { + self.run.num_query_vectors.get() + * self.run.num_doc_vectors.get() + * self.run.loops_per_measurement.get() + } +} + +impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.is_empty() { + return Ok(()); + } + + // ns/IP is normalized as `min_latency_us * 1000 / (Q * D * loops)` and is + // approximately linear in `dim`. Compare across rows with the same `Dim`; + // divide further by `Dim` to recover ns per scalar multiply. 
+ writeln!( + f, + "ns/IP = time per (query, doc) inner-product call (~ linear in Dim)" + )?; + + let header = [ + "Operation", + "Q", + "D", + "Dim", + "Min Time (ns/IP @ Dim)", + "Mean Time (ns/IP @ Dim)", + "Loops", + "Measurements", + ]; + + let mut table = diskann_benchmark_runner::utils::fmt::Table::new(header, self.len()); + + self.iter().enumerate().for_each(|(row, r)| { + let mut row = table.row(row); + + let min_latency = r + .latencies + .iter() + .min() + .copied() + .unwrap_or(MicroSeconds::new(u64::MAX)); + let mean_latency = r.percentiles.mean; + + let computations_per_latency = r.computations_per_latency() as f64; + + // Convert time from micro-seconds to nano-seconds per inner-product call + // (one (query, doc) pair, ~ linear in dim). + let min_time = min_latency.as_f64() / computations_per_latency * 1000.0; + let mean_time = mean_latency / computations_per_latency * 1000.0; + + row.insert(r.run.operation, 0); + row.insert(r.run.num_query_vectors, 1); + row.insert(r.run.num_doc_vectors, 2); + row.insert(r.run.dim, 3); + row.insert(format!("{:.3}", min_time), 4); + row.insert(format!("{:.3}", mean_time), 5); + row.insert(r.run.loops_per_measurement, 6); + row.insert(r.run.num_measurements, 7); + }); + + table.fmt(f) + } +} + +fn run_loops(run: &Run, mut body: F) -> RunResult +where + F: FnMut(), +{ + let mut latencies = Vec::with_capacity(run.num_measurements.get()); + + for _ in 0..run.num_measurements.get() { + let start = std::time::Instant::now(); + for _ in 0..run.loops_per_measurement.get() { + body(); + } + latencies.push(start.elapsed().into()); + } + + let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap(); + RunResult { + run: run.clone(), + latencies, + percentiles, + } +} + +/////////////////// +// Data fixtures // +/////////////////// + +const RNG_SEED: u64 = 0x12345; + +struct Data { + query_data: Box<[T]>, + doc_data: Box<[T]>, +} + +impl Data +where + StandardUniform: Distribution, +{ + fn new(run: &Run) -> Self 
{ + let mut rng = StdRng::seed_from_u64(RNG_SEED); + let query_data: Box<[T]> = (0..run.num_query_vectors.get() * run.dim.get()) + .map(|_| StandardUniform.sample(&mut rng)) + .collect(); + let doc_data: Box<[T]> = (0..run.num_doc_vectors.get() * run.dim.get()) + .map(|_| StandardUniform.sample(&mut rng)) + .collect(); + + Self { + query_data, + doc_data, + } + } + + fn query(&self, run: &Run) -> MatRef<'_, Standard> { + MatRef::new( + Standard::new(run.num_query_vectors.get(), run.dim.get()).unwrap(), + &self.query_data, + ) + .unwrap() + } + + fn doc(&self, run: &Run) -> MatRef<'_, Standard> { + MatRef::new( + Standard::new(run.num_doc_vectors.get(), run.dim.get()).unwrap(), + &self.doc_data, + ) + .unwrap() + } +} + +///////////////////// +// Implementations // +///////////////////// + +fn run_optimized(input: &MultiVectorOp) -> anyhow::Result> +where + T: Copy, + StandardUniform: Distribution, + QueryComputer: NewFromMatRef, +{ + let mut results = Vec::with_capacity(input.runs.len()); + for run in input.runs.iter() { + let data = Data::::new(run); + let computer = as NewFromMatRef>::new_from(data.query(run)); + let doc = data.doc(run); + + let result = match run.operation { + Operation::Chamfer => run_loops(run, || { + let v = computer.chamfer(doc); + std::hint::black_box(v); + }), + Operation::MaxSim => { + let mut scores = vec![0.0f32; run.num_query_vectors.get()]; + run_loops(run, || { + computer.max_sim(doc, &mut scores); + std::hint::black_box(&mut scores); + }) + } + }; + results.push(result); + } + Ok(results) +} + +/// Drive the [`Chamfer`] / [`MaxSim`] fallback path. 
+fn run_reference(input: &MultiVectorOp) -> anyhow::Result> +where + T: Copy, + StandardUniform: Distribution, + InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>, +{ + let mut results = Vec::with_capacity(input.runs.len()); + for run in input.runs.iter() { + let data = Data::::new(run); + let query = data.query(run); + let doc = data.doc(run); + + let result = match run.operation { + Operation::Chamfer => run_loops(run, || { + let v = Chamfer::evaluate(query.into(), doc); + std::hint::black_box(v); + }), + Operation::MaxSim => { + let mut scores = vec![0.0f32; run.num_query_vectors.get()]; + run_loops(run, || { + let mut max_sim = MaxSim::new(&mut scores).unwrap(); + let _ = max_sim.evaluate(query.into(), doc); + std::hint::black_box(&mut scores); + }) + } + }; + results.push(result); + } + Ok(results) +} + +impl RunBenchmark for Kernel { + fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error> { + run_optimized::(input) + } +} + +impl RunBenchmark for Kernel { + fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error> { + run_optimized::(input) + } +} + +impl RunBenchmark for Kernel { + fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error> { + run_reference::(input) + } +} + +impl RunBenchmark for Kernel { + fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error> { + run_reference::(input) + } +} + +/// Element-type-erasing constructor for [`QueryComputer`]. 
+trait NewFromMatRef { + fn new_from(query: MatRef<'_, Standard>) -> QueryComputer; +} + +impl NewFromMatRef for QueryComputer { + fn new_from(query: MatRef<'_, Standard>) -> QueryComputer { + QueryComputer::::new(query) + } +} + +impl NewFromMatRef for QueryComputer { + fn new_from(query: MatRef<'_, Standard>) -> QueryComputer { + QueryComputer::::new(query) + } +} + +/////////// +// Tests // +/////////// + +#[cfg(test)] +mod tests { + use super::*; + + use diskann_benchmark_runner::{ + benchmark::{PassFail, Regression}, + utils::percentiles::compute_percentiles, + }; + + fn tiny_run(operation: Operation) -> Run { + Run { + operation, + num_query_vectors: NonZeroUsize::new(2).unwrap(), + num_doc_vectors: NonZeroUsize::new(2).unwrap(), + dim: NonZeroUsize::new(4).unwrap(), + loops_per_measurement: NonZeroUsize::new(1).unwrap(), + num_measurements: NonZeroUsize::new(1).unwrap(), + } + } + + fn tiny_op() -> MultiVectorOp { + MultiVectorOp { + element_type: DataType::Float32, + implementation: Implementation::Optimized, + runs: vec![tiny_run(Operation::Chamfer)], + } + } + + fn tiny_result(operation: Operation, minimum: u64) -> RunResult { + let run = tiny_run(operation); + let minimum = MicroSeconds::new(minimum); + let mut latencies = vec![minimum]; + let percentiles = compute_percentiles(&mut latencies).unwrap(); + RunResult { + run, + latencies, + percentiles, + } + } + + fn tolerance(limit: f64) -> MultiVectorTolerance { + MultiVectorTolerance { + min_time_regression: NonNegativeFinite::new(limit).unwrap(), + } + } + + #[test] + fn check_rejects_mismatched_runs() { + let kernel = Kernel::::new(); + + let err = kernel + .check( + &tolerance(0.0), + &tiny_op(), + &vec![tiny_result(Operation::Chamfer, 100)], + &vec![tiny_result(Operation::MaxSim, 100)], + ) + .unwrap_err(); + + assert_eq!(err.to_string(), "run 0 mismatched"); + } + + #[test] + fn check_allows_negative_relative_change() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + 
&tolerance(0.0), + &tiny_op(), + &vec![tiny_result(Operation::Chamfer, 100)], + &vec![tiny_result(Operation::Chamfer, 95)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Pass(_))); + } + + #[test] + fn check_passes_on_tolerance_boundary() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(Operation::Chamfer, 100)], + &vec![tiny_result(Operation::Chamfer, 105)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Pass(_))); + } + + #[test] + fn check_fails_above_tolerance_boundary() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(Operation::Chamfer, 100)], + &vec![tiny_result(Operation::Chamfer, 106)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Fail(_))); + } + + #[test] + fn check_result_display_includes_failure_details() { + let check = CheckResult { + checks: vec![Comparison { + run: tiny_run(Operation::Chamfer), + tolerance: tolerance(0.05), + before_min: 100.0, + after_min: 106.0, + }], + }; + + let rendered = check.to_string(); + assert!(rendered.contains("Operation"), "rendered = {rendered}"); + assert!(rendered.contains("chamfer"), "rendered = {rendered}"); + assert!(rendered.contains("100.000"), "rendered = {rendered}"); + assert!(rendered.contains("106.000"), "rendered = {rendered}"); + assert!(rendered.contains("6.000 %"), "rendered = {rendered}"); + assert!(rendered.contains("FAIL"), "rendered = {rendered}"); + } + + /// A "before" value of 0 means the measurement was too fast to obtain a + /// reliable signal, so we *could* be letting a regression through. We + /// require at least a non-zero value. 
+ #[test] + fn zero_values_rejected() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(Operation::Chamfer, 0)], + &vec![tiny_result(Operation::Chamfer, 0)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Fail(_))); + } + + /// Sanity-check that the optimized kernel and the reference path produce + /// numerically equivalent Chamfer scores on a small fixture. + #[test] + fn optimized_chamfer_matches_reference_f32() { + let run = Run { + operation: Operation::Chamfer, + num_query_vectors: NonZeroUsize::new(5).unwrap(), + num_doc_vectors: NonZeroUsize::new(7).unwrap(), + dim: NonZeroUsize::new(16).unwrap(), + loops_per_measurement: NonZeroUsize::new(1).unwrap(), + num_measurements: NonZeroUsize::new(1).unwrap(), + }; + + let data = Data::::new(&run); + let query = data.query(&run); + let doc = data.doc(&run); + + let optimized = QueryComputer::::new(query).chamfer(doc); + let reference = Chamfer::evaluate(query.into(), doc); + + assert!( + (optimized - reference).abs() < 1e-4, + "optimized={optimized}, reference={reference}", + ); + } +} diff --git a/results.json b/results.json new file mode 100644 index 000000000..f061f6750 --- /dev/null +++ b/results.json @@ -0,0 +1,2150 @@ +[ + { + "input": { + "content": { + "element_type": "float32", + "implementation": "optimized", + "runs": [ + { + "dim": 128, + "loops_per_measurement": 500, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 8, + "operation": "chamfer" + }, + { + "dim": 256, + "loops_per_measurement": 100, + "num_doc_vectors": 64, + "num_measurements": 50, + "num_query_vectors": 16, + "operation": "chamfer" + }, + { + "dim": 384, + "loops_per_measurement": 20, + "num_doc_vectors": 128, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + }, + { + "dim": 256, + "loops_per_measurement": 200, + "num_doc_vectors": 16, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": 
"chamfer" + }, + { + "dim": 264, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 64, + "operation": "chamfer" + }, + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_vectors": 1250, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + }, + { + "dim": 512, + "loops_per_measurement": 2, + "num_doc_vectors": 1250, + "num_measurements": 20, + "num_query_vectors": 64, + "operation": "chamfer" + }, + { + "dim": 128, + "loops_per_measurement": 200, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 64, + "operation": "chamfer" + }, + { + "dim": 512, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + }, + { + "dim": 128, + "loops_per_measurement": 500, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 8, + "operation": "max_sim" + }, + { + "dim": 256, + "loops_per_measurement": 100, + "num_doc_vectors": 64, + "num_measurements": 50, + "num_query_vectors": 16, + "operation": "max_sim" + }, + { + "dim": 384, + "loops_per_measurement": 20, + "num_doc_vectors": 128, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + }, + { + "dim": 256, + "loops_per_measurement": 200, + "num_doc_vectors": 16, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + }, + { + "dim": 264, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 64, + "operation": "max_sim" + }, + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_vectors": 1250, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + }, + { + "dim": 512, + "loops_per_measurement": 2, + "num_doc_vectors": 1250, + "num_measurements": 20, + "num_query_vectors": 64, + "operation": "max_sim" + }, + { + "dim": 128, + "loops_per_measurement": 200, + "num_doc_vectors": 32, + 
"num_measurements": 50, + "num_query_vectors": 64, + "operation": "max_sim" + }, + { + "dim": 512, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + ] + }, + "type": "multi-vector-op" + }, + "results": [ + { + "latencies": [ + 777, + 777, + 778, + 780, + 780, + 781, + 804, + 838, + 838, + 838, + 838, + 839, + 839, + 839, + 840, + 842, + 845, + 850, + 899, + 926, + 927, + 931, + 932, + 937, + 939, + 956, + 978, + 1034, + 1035, + 1036, + 1053, + 1064, + 1065, + 1147, + 1164, + 1165, + 1165, + 1166, + 1173, + 1221, + 1323, + 1333, + 1350, + 1352, + 1353, + 1353, + 1357, + 1393, + 1529, + 1537 + ], + "percentiles": { + "mean": 1030.32, + "median": 947.5, + "minimum": 777, + "p90": 1353, + "p99": 1537 + }, + "run": { + "dim": 128, + "loops_per_measurement": 500, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 8, + "operation": "chamfer" + } + }, + { + "latencies": [ + 1029, + 1029, + 1030, + 1030, + 1030, + 1030, + 1030, + 1031, + 1032, + 1034, + 1035, + 1038, + 1050, + 1058, + 1070, + 1112, + 1112, + 1112, + 1112, + 1112, + 1112, + 1112, + 1113, + 1117, + 1119, + 1120, + 1123, + 1145, + 1146, + 1146, + 1146, + 1148, + 1152, + 1167, + 1192, + 1192, + 1192, + 1192, + 1193, + 1207, + 1235, + 1251, + 1254, + 1256, + 1257, + 1261, + 1293, + 1330, + 1330, + 1344 + ], + "percentiles": { + "mean": 1139.22, + "median": 1119.5, + "minimum": 1029, + "p90": 1261, + "p99": 1344 + }, + "run": { + "dim": 256, + "loops_per_measurement": 100, + "num_doc_vectors": 64, + "num_measurements": 50, + "num_query_vectors": 16, + "operation": "chamfer" + } + }, + { + "latencies": [ + 1210, + 1210, + 1210, + 1210, + 1210, + 1210, + 1210, + 1210, + 1210, + 1211, + 1212, + 1212, + 1212, + 1212, + 1213, + 1213, + 1213, + 1213, + 1213, + 1214, + 1217, + 1217, + 1220, + 1223, + 1225, + 1226, + 1227, + 1229, + 1231, + 1235, + 1235, + 1239, + 1239, + 1240, + 1244, + 1249, + 1252, + 1259, 
+ 1264, + 1270, + 1281, + 1294, + 1299, + 1306, + 1312, + 1315, + 1332, + 1341, + 1383, + 1484 + ], + "percentiles": { + "mean": 1246.32, + "median": 1225.5, + "minimum": 1210, + "p90": 1315, + "p99": 1484 + }, + "run": { + "dim": 384, + "loops_per_measurement": 20, + "num_doc_vectors": 128, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + } + }, + { + "latencies": [ + 958, + 958, + 958, + 958, + 958, + 960, + 960, + 960, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 962, + 962, + 963, + 964, + 964, + 965, + 965, + 965, + 966, + 966, + 973, + 974, + 974, + 981, + 981, + 983, + 985, + 987, + 987, + 987, + 990, + 999, + 999 + ], + "percentiles": { + "mean": 967.42, + "median": 961.0, + "minimum": 958, + "p90": 987, + "p99": 999 + }, + "run": { + "dim": 256, + "loops_per_measurement": 200, + "num_doc_vectors": 16, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + } + }, + { + "latencies": [ + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1018, + 1018, + 1018, + 1018, + 1018, + 1018, + 1018, + 1018, + 1018, + 1019, + 1019, + 1019, + 1019, + 1020, + 1020, + 1020, + 1020, + 1020, + 1020, + 1021, + 1022, + 1023, + 1023, + 1026, + 1029, + 1031, + 1032, + 1033, + 1034, + 1035, + 1036, + 1037, + 1041, + 1044, + 1044, + 1045, + 1046, + 1065 + ], + "percentiles": { + "mean": 1024.58, + "median": 1019.5, + "minimum": 1017, + "p90": 1044, + "p99": 1065 + }, + "run": { + "dim": 264, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 64, + "operation": "chamfer" + } + }, + { + "latencies": [ + 1854, + 1855, + 1855, + 1855, + 1855, + 1855, + 1856, + 1856, + 1856, + 1857, + 1857, + 1857, + 1857, + 1857, + 1857, + 1858, + 1858, + 1858, + 1858, + 1858, + 1858, + 1858, + 1859, + 1860, + 1861, + 1861, + 1863, + 1866, + 1869, + 1870, + 
1871, + 1871, + 1871, + 1872, + 1874, + 1875, + 1881, + 1883, + 1885, + 1885, + 1890, + 1892, + 1892, + 1892, + 1892, + 1899, + 1906, + 1909, + 1909, + 1916 + ], + "percentiles": { + "mean": 1870.38, + "median": 1861.0, + "minimum": 1854, + "p90": 1899, + "p99": 1916 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_vectors": 1250, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + } + }, + { + "latencies": [ + 3180, + 3180, + 3180, + 3180, + 3180, + 3181, + 3181, + 3181, + 3181, + 3183, + 3185, + 3187, + 3205, + 3206, + 3207, + 3208, + 3211, + 3218, + 3220, + 3268 + ], + "percentiles": { + "mean": 3196.1, + "median": 3184.0, + "minimum": 3180, + "p90": 3220, + "p99": 3268 + }, + "run": { + "dim": 512, + "loops_per_measurement": 2, + "num_doc_vectors": 1250, + "num_measurements": 20, + "num_query_vectors": 64, + "operation": "chamfer" + } + }, + { + "latencies": [ + 1784, + 1784, + 1784, + 1784, + 1784, + 1784, + 1785, + 1785, + 1790, + 1791, + 1791, + 1792, + 1792, + 1792, + 1792, + 1792, + 1792, + 1792, + 1795, + 1795, + 1796, + 1796, + 1796, + 1796, + 1798, + 1800, + 1803, + 1805, + 1814, + 1815, + 1817, + 1818, + 1821, + 1826, + 1840, + 1845, + 1856, + 1858, + 1878, + 1879, + 1879, + 1884, + 1888, + 1890, + 1893, + 1905, + 1907, + 1912, + 1918, + 1950 + ], + "percentiles": { + "mean": 1825.26, + "median": 1799.0, + "minimum": 1784, + "p90": 1905, + "p99": 1950 + }, + "run": { + "dim": 128, + "loops_per_measurement": 200, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 64, + "operation": "chamfer" + } + }, + { + "latencies": [ + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1018, + 1018, + 1018, + 1018, + 1018, + 1019, + 1020, + 1020, + 1021, + 1021, + 1022, + 1022, + 1022, + 1023, + 1027, + 1030, + 1030, + 1035, + 1043, + 1043, + 
1044, + 1045, + 1049, + 1049, + 1060 + ], + "percentiles": { + "mean": 1023.2, + "median": 1017.5, + "minimum": 1017, + "p90": 1044, + "p99": 1060 + }, + "run": { + "dim": 512, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + } + }, + { + "latencies": [ + 567, + 567, + 567, + 567, + 567, + 567, + 567, + 567, + 567, + 567, + 567, + 567, + 567, + 567, + 567, + 569, + 569, + 569, + 569, + 569, + 569, + 570, + 570, + 570, + 570, + 570, + 570, + 570, + 570, + 570, + 571, + 571, + 571, + 571, + 571, + 571, + 571, + 571, + 571, + 571, + 571, + 571, + 571, + 571, + 574, + 578, + 578, + 594, + 595, + 598 + ], + "percentiles": { + "mean": 571.2, + "median": 570.0, + "minimum": 567, + "p90": 578, + "p99": 598 + }, + "run": { + "dim": 128, + "loops_per_measurement": 500, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 8, + "operation": "max_sim" + } + }, + { + "latencies": [ + 988, + 988, + 988, + 988, + 988, + 988, + 988, + 988, + 988, + 989, + 989, + 989, + 989, + 989, + 989, + 989, + 989, + 989, + 991, + 991, + 991, + 991, + 991, + 991, + 991, + 991, + 991, + 991, + 991, + 991, + 991, + 991, + 992, + 992, + 992, + 992, + 992, + 992, + 992, + 992, + 992, + 996, + 996, + 1004, + 1009, + 1013, + 1018, + 1020, + 1047, + 1057 + ], + "percentiles": { + "mean": 995.1, + "median": 991.0, + "minimum": 988, + "p90": 1013, + "p99": 1057 + }, + "run": { + "dim": 256, + "loops_per_measurement": 100, + "num_doc_vectors": 64, + "num_measurements": 50, + "num_query_vectors": 16, + "operation": "max_sim" + } + }, + { + "latencies": [ + 1210, + 1210, + 1210, + 1210, + 1210, + 1210, + 1210, + 1210, + 1211, + 1211, + 1211, + 1212, + 1213, + 1213, + 1213, + 1213, + 1213, + 1213, + 1213, + 1213, + 1213, + 1213, + 1214, + 1214, + 1214, + 1214, + 1214, + 1214, + 1214, + 1214, + 1214, + 1214, + 1216, + 1217, + 1217, + 1217, + 1218, + 1220, + 1222, + 1223, + 1224, + 1224, + 1225, + 1227, 
+ 1238, + 1239, + 1239, + 1241, + 1242, + 1243 + ], + "percentiles": { + "mean": 1217.74, + "median": 1214.0, + "minimum": 1210, + "p90": 1239, + "p99": 1243 + }, + "run": { + "dim": 384, + "loops_per_measurement": 20, + "num_doc_vectors": 128, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + }, + { + "latencies": [ + 953, + 953, + 953, + 953, + 953, + 953, + 954, + 954, + 956, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 957, + 958, + 958, + 958, + 958, + 958, + 958, + 960, + 961, + 961, + 961, + 961, + 961, + 961, + 961, + 962, + 963, + 971, + 976, + 978, + 984, + 984, + 987 + ], + "percentiles": { + "mean": 960.1, + "median": 957.0, + "minimum": 953, + "p90": 976, + "p99": 987 + }, + "run": { + "dim": 256, + "loops_per_measurement": 200, + "num_doc_vectors": 16, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + }, + { + "latencies": [ + 1016, + 1016, + 1016, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1018, + 1018, + 1018, + 1018, + 1018, + 1018, + 1018, + 1018, + 1019, + 1019, + 1019, + 1019, + 1019, + 1019, + 1019, + 1021, + 1021, + 1023, + 1023, + 1025, + 1032, + 1044, + 1045, + 1045, + 1045, + 1047, + 1052, + 1058, + 1061 + ], + "percentiles": { + "mean": 1023.46, + "median": 1018.0, + "minimum": 1016, + "p90": 1045, + "p99": 1061 + }, + "run": { + "dim": 264, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 64, + "operation": "max_sim" + } + }, + { + "latencies": [ + 1858, + 1858, + 1860, + 1860, + 1860, + 1860, + 1860, + 1860, + 1860, + 1860, + 1860, + 1861, + 1861, + 1861, + 1861, + 1861, + 1861, + 1861, + 1862, + 1863, + 1863, + 1864, + 1865, + 1867, + 1868, + 1872, + 1873, + 1876, + 1878, + 1881, + 1882, + 1883, + 1888, + 1889, + 1889, + 1890, + 
1890, + 1890, + 1891, + 1892, + 1905, + 1906, + 1908, + 1934, + 1962, + 1967, + 1974, + 1988, + 2004, + 2014 + ], + "percentiles": { + "mean": 1887.22, + "median": 1870.0, + "minimum": 1858, + "p90": 1967, + "p99": 2014 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_vectors": 1250, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + }, + { + "latencies": [ + 3177, + 3177, + 3177, + 3179, + 3192, + 3201, + 3212, + 3222, + 3251, + 3251, + 3255, + 3256, + 3256, + 3321, + 3381, + 3399, + 3400, + 3419, + 3422, + 3445 + ], + "percentiles": { + "mean": 3279.65, + "median": 3253.0, + "minimum": 3177, + "p90": 3422, + "p99": 3445 + }, + "run": { + "dim": 512, + "loops_per_measurement": 2, + "num_doc_vectors": 1250, + "num_measurements": 20, + "num_query_vectors": 64, + "operation": "max_sim" + } + }, + { + "latencies": [ + 1783, + 1784, + 1787, + 1791, + 1791, + 1791, + 1813, + 1838, + 1853, + 1868, + 1871, + 1882, + 1882, + 1884, + 1890, + 1899, + 1899, + 1899, + 1900, + 1901, + 1905, + 1906, + 1908, + 1909, + 1911, + 1911, + 1911, + 1911, + 1914, + 1915, + 1915, + 1916, + 1916, + 1917, + 1919, + 1922, + 1922, + 1923, + 1923, + 1925, + 1927, + 1927, + 1928, + 1929, + 1929, + 1933, + 1937, + 1938, + 1940, + 1983 + ], + "percentiles": { + "mean": 1893.52, + "median": 1911.0, + "minimum": 1783, + "p90": 1933, + "p99": 1983 + }, + "run": { + "dim": 128, + "loops_per_measurement": 200, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 64, + "operation": "max_sim" + } + }, + { + "latencies": [ + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1017, + 1020, + 1023, + 1023, + 1025, + 1028, + 1033, + 1033, + 1034, + 1037, + 1038, + 1040, + 1043, + 1044, + 1052, + 1052, + 1057, + 1060, + 1063, + 1078, + 1088, + 1088, + 1088, + 1088, + 1088, + 1088, + 1088, + 1088, + 1090, + 1090, + 1090, + 1092, + 1093, + 1093, + 1094, + 1094 + ], + 
"percentiles": { + "mean": 1049.56, + "median": 1039.0, + "minimum": 1017, + "p90": 1092, + "p99": 1094 + }, + "run": { + "dim": 512, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + } + ] + }, + { + "input": { + "content": { + "element_type": "float16", + "implementation": "optimized", + "runs": [ + { + "dim": 256, + "loops_per_measurement": 100, + "num_doc_vectors": 64, + "num_measurements": 50, + "num_query_vectors": 16, + "operation": "chamfer" + }, + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_vectors": 1250, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + }, + { + "dim": 256, + "loops_per_measurement": 100, + "num_doc_vectors": 64, + "num_measurements": 50, + "num_query_vectors": 16, + "operation": "max_sim" + }, + { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_vectors": 1250, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + ] + }, + "type": "multi-vector-op" + }, + "results": [ + { + "latencies": [ + 1734, + 1734, + 1736, + 1736, + 1737, + 1737, + 1737, + 1738, + 1738, + 1738, + 1738, + 1739, + 1740, + 1740, + 1741, + 1744, + 1744, + 1751, + 1751, + 1753, + 1754, + 1754, + 1756, + 1759, + 1761, + 1764, + 1767, + 1767, + 1767, + 1768, + 1768, + 1769, + 1769, + 1773, + 1774, + 1775, + 1779, + 1787, + 1794, + 1808, + 1822, + 1825, + 1829, + 1829, + 1844, + 1846, + 1852, + 1859, + 1903, + 2194 + ], + "percentiles": { + "mean": 1780.44, + "median": 1762.5, + "minimum": 1734, + "p90": 1846, + "p99": 2194 + }, + "run": { + "dim": 256, + "loops_per_measurement": 100, + "num_doc_vectors": 64, + "num_measurements": 50, + "num_query_vectors": 16, + "operation": "chamfer" + } + }, + { + "latencies": [ + 2130, + 2130, + 2130, + 2131, + 2133, + 2133, + 2140, + 2142, + 2149, + 2151, + 2158, + 2160, + 2163, + 2164, + 2166, + 2167, + 2167, + 2168, + 2171, + 2173, + 2174, + 2176, + 2177, + 2178, 
+ 2178, + 2181, + 2184, + 2189, + 2195, + 2195, + 2197, + 2198, + 2198, + 2201, + 2203, + 2207, + 2215, + 2217, + 2220, + 2229, + 2240, + 2242, + 2243, + 2249, + 2250, + 2291, + 2305, + 2438, + 2613, + 2643 + ], + "percentiles": { + "mean": 2209.04, + "median": 2179.5, + "minimum": 2130, + "p90": 2291, + "p99": 2643 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_vectors": 1250, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + } + }, + { + "latencies": [ + 1731, + 1733, + 1737, + 1737, + 1737, + 1741, + 1741, + 1745, + 1745, + 1750, + 1750, + 1750, + 1750, + 1751, + 1754, + 1754, + 1755, + 1758, + 1758, + 1759, + 1761, + 1761, + 1766, + 1768, + 1770, + 1771, + 1771, + 1772, + 1773, + 1773, + 1775, + 1776, + 1776, + 1778, + 1785, + 1788, + 1789, + 1791, + 1795, + 1800, + 1804, + 1808, + 1814, + 1822, + 1832, + 1833, + 1834, + 1864, + 1867, + 1869 + ], + "percentiles": { + "mean": 1776.44, + "median": 1770.5, + "minimum": 1731, + "p90": 1833, + "p99": 1869 + }, + "run": { + "dim": 256, + "loops_per_measurement": 100, + "num_doc_vectors": 64, + "num_measurements": 50, + "num_query_vectors": 16, + "operation": "max_sim" + } + }, + { + "latencies": [ + 2127, + 2127, + 2129, + 2130, + 2132, + 2141, + 2142, + 2142, + 2147, + 2148, + 2149, + 2150, + 2154, + 2154, + 2159, + 2162, + 2166, + 2168, + 2170, + 2173, + 2177, + 2180, + 2180, + 2181, + 2181, + 2182, + 2183, + 2187, + 2196, + 2196, + 2199, + 2200, + 2204, + 2211, + 2213, + 2216, + 2224, + 2255, + 2256, + 2271, + 2354, + 2488, + 2493, + 2495, + 2498, + 2505, + 2525, + 2653, + 2657, + 3515 + ], + "percentiles": { + "mean": 2264.9, + "median": 2181.5, + "minimum": 2127, + "p90": 2505, + "p99": 3515 + }, + "run": { + "dim": 128, + "loops_per_measurement": 10, + "num_doc_vectors": 1250, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + } + ] + }, + { + "input": { + "content": { + "element_type": "float32", + "implementation": 
"reference", + "runs": [ + { + "dim": 128, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 8, + "operation": "chamfer" + }, + { + "dim": 384, + "loops_per_measurement": 2, + "num_doc_vectors": 128, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + }, + { + "dim": 128, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 8, + "operation": "max_sim" + }, + { + "dim": 384, + "loops_per_measurement": 2, + "num_doc_vectors": 128, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + ] + }, + "type": "multi-vector-op" + }, + "results": [ + { + "latencies": [ + 64, + 64, + 64, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 67, + 67, + 67, + 67, + 67, + 67, + 67, + 67, + 67, + 68, + 68, + 69, + 71, + 127 + ], + "percentiles": { + "mean": 67.52, + "median": 66.0, + "minimum": 64, + "p90": 68, + "p99": 127 + }, + "run": { + "dim": 128, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 8, + "operation": "chamfer" + } + }, + { + "latencies": [ + 130, + 130, + 130, + 130, + 130, + 130, + 130, + 130, + 130, + 130, + 131, + 131, + 131, + 131, + 132, + 132, + 133, + 133, + 135, + 136, + 136, + 137, + 138, + 138, + 138, + 138, + 138, + 138, + 138, + 138, + 138, + 138, + 138, + 138, + 138, + 138, + 138, + 138, + 139, + 139, + 139, + 139, + 139, + 140, + 140, + 140, + 141, + 143, + 147, + 161 + ], + "percentiles": { + "mean": 136.26, + "median": 138.0, + "minimum": 130, + "p90": 140, + "p99": 161 + }, + "run": { + "dim": 384, + "loops_per_measurement": 2, + "num_doc_vectors": 128, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "chamfer" + } + }, + { + "latencies": [ + 62, + 62, + 62, + 62, + 
62, + 63, + 63, + 63, + 63, + 63, + 63, + 63, + 63, + 63, + 63, + 64, + 64, + 65, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 66, + 67, + 67, + 67, + 67, + 67, + 67, + 67, + 68, + 68, + 69, + 71, + 72, + 78, + 106 + ], + "percentiles": { + "mean": 66.44, + "median": 66.0, + "minimum": 62, + "p90": 69, + "p99": 106 + }, + "run": { + "dim": 128, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 8, + "operation": "max_sim" + } + }, + { + "latencies": [ + 130, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 131, + 132, + 132, + 132, + 132, + 132, + 133, + 133, + 134, + 134, + 135, + 135, + 135, + 136, + 136, + 137, + 139, + 139, + 140, + 142, + 142, + 143, + 144, + 145, + 145, + 147, + 155, + 158 + ], + "percentiles": { + "mean": 135.18, + "median": 132.0, + "minimum": 130, + "p90": 145, + "p99": 158 + }, + "run": { + "dim": 384, + "loops_per_measurement": 2, + "num_doc_vectors": 128, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + } + ] + }, + { + "input": { + "content": { + "element_type": "float16", + "implementation": "reference", + "runs": [ + { + "dim": 128, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 8, + "operation": "chamfer" + }, + { + "dim": 384, + "loops_per_measurement": 2, + "num_doc_vectors": 128, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + ] + }, + "type": "multi-vector-op" + }, + "results": [ + { + "latencies": [ + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 73, + 74, + 74, + 74, + 74, + 74, + 74, + 75, + 75, + 76, + 76, + 76, + 76, + 76, + 76, + 77, + 77, + 77, + 77, + 77, + 77, + 77, + 77, + 77, + 78, + 78, + 78, + 79, + 80, + 80, + 80, + 84, + 87, + 92 
+ ], + "percentiles": { + "mean": 76.0, + "median": 75.5, + "minimum": 73, + "p90": 80, + "p99": 92 + }, + "run": { + "dim": 128, + "loops_per_measurement": 50, + "num_doc_vectors": 32, + "num_measurements": 50, + "num_query_vectors": 8, + "operation": "chamfer" + } + }, + { + "latencies": [ + 135, + 135, + 135, + 135, + 135, + 135, + 135, + 135, + 136, + 136, + 137, + 138, + 140, + 141, + 141, + 141, + 141, + 141, + 142, + 142, + 142, + 142, + 142, + 142, + 142, + 142, + 142, + 142, + 142, + 142, + 142, + 142, + 142, + 142, + 143, + 143, + 143, + 144, + 144, + 145, + 145, + 145, + 147, + 150, + 151, + 151, + 153, + 154, + 158, + 158 + ], + "percentiles": { + "mean": 142.36, + "median": 142.0, + "minimum": 135, + "p90": 151, + "p99": 158 + }, + "run": { + "dim": 384, + "loops_per_measurement": 2, + "num_doc_vectors": 128, + "num_measurements": 50, + "num_query_vectors": 32, + "operation": "max_sim" + } + } + ] + } +] \ No newline at end of file From 54a21ec8f274006c433fcddf111cd2580aa184e1 Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Thu, 7 May 2026 02:38:23 +0530 Subject: [PATCH 2/8] Move some repetetive code to macros and add more benchmark cases --- .../examples/multi-vector.json | 65 ++++- diskann-benchmark-multi-vector/src/lib.rs | 228 +++++++----------- 2 files changed, 141 insertions(+), 152 deletions(-) diff --git a/diskann-benchmark-multi-vector/examples/multi-vector.json b/diskann-benchmark-multi-vector/examples/multi-vector.json index 2626e5047..553a6a9d8 100644 --- a/diskann-benchmark-multi-vector/examples/multi-vector.json +++ b/diskann-benchmark-multi-vector/examples/multi-vector.json @@ -13,7 +13,7 @@ { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, { "operation": "chamfer", "num_query_vectors": 32, 
"num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 20 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 }, @@ -23,7 +23,7 @@ { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 20 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } ] @@ -35,10 +35,25 @@ "element_type": "float16", "implementation": "optimized", "runs": [ + { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, 
"loops_per_measurement": 100, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 }, + + { "operation": "max_sim", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 } + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 
1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } ] } }, @@ -48,10 +63,25 @@ "element_type": "float32", "implementation": "reference", "runs": [ - { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 50, "num_measurements": 50 }, - { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 2, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 50, "num_measurements": 50 }, - { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 2, "num_measurements": 50 } + { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 
10, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 }, + + { "operation": "max_sim", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } ] } }, @@ -61,8 +91,25 @@ "element_type": "float16", "implementation": "reference", "runs": [ - { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 50, "num_measurements": 50 }, - { 
"operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 2, "num_measurements": 50 } + { "operation": "chamfer", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "chamfer", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 }, + + { "operation": "max_sim", "num_query_vectors": 8, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 500, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 16, "num_doc_vectors": 64, "dim": 256, "loops_per_measurement": 100, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 128, "dim": 384, "loops_per_measurement": 20, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 16, "dim": 256, "loops_per_measurement": 200, 
"num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 264, "loops_per_measurement": 50, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 1250, "dim": 128, "loops_per_measurement": 10, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 1250, "dim": 512, "loops_per_measurement": 2, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 64, "num_doc_vectors": 32, "dim": 128, "loops_per_measurement": 200, "num_measurements": 50 }, + { "operation": "max_sim", "num_query_vectors": 32, "num_doc_vectors": 32, "dim": 512, "loops_per_measurement": 50, "num_measurements": 50 } ] } } diff --git a/diskann-benchmark-multi-vector/src/lib.rs b/diskann-benchmark-multi-vector/src/lib.rs index 7cadf4f29..ea6a09715 100644 --- a/diskann-benchmark-multi-vector/src/lib.rs +++ b/diskann-benchmark-multi-vector/src/lib.rs @@ -293,25 +293,19 @@ impl std::fmt::Display for CheckResult { //////////////////////////// fn register_benchmarks_impl(dispatcher: &mut diskann_benchmark_runner::registry::Benchmarks) { + macro_rules! register { + ($impl:ident, $t:ty, $tag:literal) => { + dispatcher.register_regression($tag, Kernel::<$impl, $t>::new()); + }; + } + // Optimized (architecture-dispatched QueryComputer). - dispatcher.register_regression( - "multi-vector-op-f32-optimized", - Kernel::::new(), - ); - dispatcher.register_regression( - "multi-vector-op-f16-optimized", - Kernel::::new(), - ); + register!(Optimized, f32, "multi-vector-op-f32-optimized"); + register!(Optimized, f16, "multi-vector-op-f16-optimized"); // Reference (Chamfer / MaxSim fallback path). 
- dispatcher.register_regression( - "multi-vector-op-f32-reference", - Kernel::::new(), - ); - dispatcher.register_regression( - "multi-vector-op-f16-reference", - Kernel::::new(), - ); + register!(Reference, f32, "multi-vector-op-f32-reference"); + register!(Reference, f16, "multi-vector-op-f16-reference"); } ////////////// @@ -340,81 +334,52 @@ impl Kernel { } #[derive(Debug, Error)] -#[error("implementation {0} is not registered for this benchmark")] +#[error("this kernel handles a different implementation than {0}")] pub(crate) struct ImplementationMismatch(Implementation); -impl DispatchRule for Optimized { - type Error = ImplementationMismatch; - - fn try_match(from: &Implementation) -> Result { - if *from == Implementation::Optimized { - Ok(MatchScore(0)) - } else { - Err(FailureScore(1)) - } - } - - fn convert(from: Implementation) -> Result { - if from == Implementation::Optimized { - Ok(Optimized) - } else { - Err(ImplementationMismatch(from)) - } - } +macro_rules! impl_dispatch_rule { + ($marker:ident, $variant:ident, $description:literal) => { + impl DispatchRule for $marker { + type Error = ImplementationMismatch; - fn description( - f: &mut std::fmt::Formatter<'_>, - from: Option<&Implementation>, - ) -> std::fmt::Result { - match from { - None => write!(f, "QueryComputer (architecture-dispatched)"), - Some(impl_) => { - if Self::try_match(impl_).is_ok() { - write!(f, "matched {}", impl_) + fn try_match(from: &Implementation) -> Result { + if *from == Implementation::$variant { + Ok(MatchScore(0)) } else { - write!(f, "expected {}, got {}", Implementation::Optimized, impl_) + Err(FailureScore(1)) } } - } - } -} - -impl DispatchRule for Reference { - type Error = ImplementationMismatch; - fn try_match(from: &Implementation) -> Result { - if *from == Implementation::Reference { - Ok(MatchScore(0)) - } else { - Err(FailureScore(1)) - } - } - - fn convert(from: Implementation) -> Result { - if from == Implementation::Reference { - Ok(Reference) - } else { 
- Err(ImplementationMismatch(from)) - } - } - - fn description( - f: &mut std::fmt::Formatter<'_>, - from: Option<&Implementation>, - ) -> std::fmt::Result { - match from { - None => write!(f, "Chamfer / MaxSim fallback"), - Some(impl_) => { - if Self::try_match(impl_).is_ok() { - write!(f, "matched {}", impl_) + fn convert(from: Implementation) -> Result { + if from == Implementation::$variant { + Ok($marker) } else { - write!(f, "expected {}, got {}", Implementation::Reference, impl_) + Err(ImplementationMismatch(from)) + } + } + + fn description( + f: &mut std::fmt::Formatter<'_>, + from: Option<&Implementation>, + ) -> std::fmt::Result { + match from { + None => write!(f, $description), + Some(impl_) => { + if Self::try_match(impl_).is_ok() { + write!(f, "matched {}", impl_) + } else { + write!(f, "expected {}, got {}", Implementation::$variant, impl_) + } + } } } } - } + }; } +impl_dispatch_rule!(Optimized, Optimized, "QueryComputer (architecture-dispatched)"); +impl_dispatch_rule!(Reference, Reference, "Chamfer / MaxSim fallback"); + impl Benchmark for Kernel where datatype::Type: DispatchRule, @@ -446,7 +411,9 @@ where _: diskann_benchmark_runner::Checkpoint<'_>, mut output: &mut dyn diskann_benchmark_runner::Output, ) -> anyhow::Result { - let _ = I::convert(input.implementation)?; + // The dispatcher only invokes `run` after `try_match` has already accepted + // the input, so a failure here would indicate a dispatcher bug. + I::convert(input.implementation).expect("try_match accepted the input"); writeln!(output, "{}", input)?; let results = self.run_benchmark(input)?; writeln!(output, "\n\n{}", DisplayWrapper(&*results))?; @@ -717,6 +684,9 @@ where let mut results = Vec::with_capacity(input.runs.len()); for run in input.runs.iter() { let data = Data::::new(run); + // `QueryComputer` performs query-side precomputation that is intentionally + // amortized across many `chamfer` / `max_sim` calls; construct it once per + // shape, outside the timed loop. 
let computer = as NewFromMatRef>::new_from(data.query(run)); let doc = data.doc(run); @@ -748,20 +718,23 @@ where let mut results = Vec::with_capacity(input.runs.len()); for run in input.runs.iter() { let data = Data::::new(run); - let query = data.query(run); let doc = data.doc(run); + // Hoist out of the timed loop to mirror the optimized path's + // per-shape precomputation. + let query: diskann_quantization::multi_vector::distance::QueryMatRef<'_, _> = + data.query(run).into(); let result = match run.operation { Operation::Chamfer => run_loops(run, || { - let v = Chamfer::evaluate(query.into(), doc); + let v = Chamfer::evaluate(query, doc); std::hint::black_box(v); }), Operation::MaxSim => { let mut scores = vec![0.0f32; run.num_query_vectors.get()]; + let mut max_sim = MaxSim::new(&mut scores).unwrap(); run_loops(run, || { - let mut max_sim = MaxSim::new(&mut scores).unwrap(); - let _ = max_sim.evaluate(query.into(), doc); - std::hint::black_box(&mut scores); + let _ = max_sim.evaluate(query, doc); + std::hint::black_box(max_sim.scores_mut()); }) } }; @@ -770,47 +743,42 @@ where Ok(results) } -impl RunBenchmark for Kernel { - fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error> { - run_optimized::(input) - } -} - -impl RunBenchmark for Kernel { - fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error> { - run_optimized::(input) - } -} - -impl RunBenchmark for Kernel { - fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error> { - run_reference::(input) - } -} - -impl RunBenchmark for Kernel { - fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error> { - run_reference::(input) - } -} - /// Element-type-erasing constructor for [`QueryComputer`]. trait NewFromMatRef { fn new_from(query: MatRef<'_, Standard>) -> QueryComputer; } -impl NewFromMatRef for QueryComputer { - fn new_from(query: MatRef<'_, Standard>) -> QueryComputer { - QueryComputer::::new(query) - } -} +macro_rules! 
impl_kernel_for { + ($t:ty) => { + impl NewFromMatRef<$t> for QueryComputer<$t> { + fn new_from(query: MatRef<'_, Standard<$t>>) -> QueryComputer<$t> { + QueryComputer::<$t>::new(query) + } + } -impl NewFromMatRef for QueryComputer { - fn new_from(query: MatRef<'_, Standard>) -> QueryComputer { - QueryComputer::::new(query) - } + impl RunBenchmark for Kernel { + fn run_benchmark( + &self, + input: &MultiVectorOp, + ) -> Result, anyhow::Error> { + run_optimized::<$t>(input) + } + } + + impl RunBenchmark for Kernel { + fn run_benchmark( + &self, + input: &MultiVectorOp, + ) -> Result, anyhow::Error> { + run_reference::<$t>(input) + } + } + }; } +impl_kernel_for!(f32); +impl_kernel_for!(f16); + /////////// // Tests // /////////// @@ -963,30 +931,4 @@ mod tests { assert!(matches!(result, PassFail::Fail(_))); } - - /// Sanity-check that the optimized kernel and the reference path produce - /// numerically equivalent Chamfer scores on a small fixture. - #[test] - fn optimized_chamfer_matches_reference_f32() { - let run = Run { - operation: Operation::Chamfer, - num_query_vectors: NonZeroUsize::new(5).unwrap(), - num_doc_vectors: NonZeroUsize::new(7).unwrap(), - dim: NonZeroUsize::new(16).unwrap(), - loops_per_measurement: NonZeroUsize::new(1).unwrap(), - num_measurements: NonZeroUsize::new(1).unwrap(), - }; - - let data = Data::::new(&run); - let query = data.query(&run); - let doc = data.doc(&run); - - let optimized = QueryComputer::::new(query).chamfer(doc); - let reference = Chamfer::evaluate(query.into(), doc); - - assert!( - (optimized - reference).abs() < 1e-4, - "optimized={optimized}, reference={reference}", - ); - } } From f3a5d9fb33cc2dbb0864c88f8bd90bbb65e26dca Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Thu, 7 May 2026 02:38:43 +0530 Subject: [PATCH 3/8] Move some repetetive code to macros and add more benchmark cases --- diskann-benchmark-multi-vector/src/lib.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git 
a/diskann-benchmark-multi-vector/src/lib.rs b/diskann-benchmark-multi-vector/src/lib.rs index ea6a09715..df08d93dd 100644 --- a/diskann-benchmark-multi-vector/src/lib.rs +++ b/diskann-benchmark-multi-vector/src/lib.rs @@ -377,7 +377,11 @@ macro_rules! impl_dispatch_rule { }; } -impl_dispatch_rule!(Optimized, Optimized, "QueryComputer (architecture-dispatched)"); +impl_dispatch_rule!( + Optimized, + Optimized, + "QueryComputer (architecture-dispatched)" +); impl_dispatch_rule!(Reference, Reference, "Chamfer / MaxSim fallback"); impl Benchmark for Kernel From 8efdbcd5e79bf48068dfc8e1b4d6a6cdfadd35f0 Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Thu, 7 May 2026 02:39:33 +0530 Subject: [PATCH 4/8] Move some repetetive code to macros and add more benchmark cases --- results.json | 2150 -------------------------------------------------- 1 file changed, 2150 deletions(-) delete mode 100644 results.json diff --git a/results.json b/results.json deleted file mode 100644 index f061f6750..000000000 --- a/results.json +++ /dev/null @@ -1,2150 +0,0 @@ -[ - { - "input": { - "content": { - "element_type": "float32", - "implementation": "optimized", - "runs": [ - { - "dim": 128, - "loops_per_measurement": 500, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 8, - "operation": "chamfer" - }, - { - "dim": 256, - "loops_per_measurement": 100, - "num_doc_vectors": 64, - "num_measurements": 50, - "num_query_vectors": 16, - "operation": "chamfer" - }, - { - "dim": 384, - "loops_per_measurement": 20, - "num_doc_vectors": 128, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - }, - { - "dim": 256, - "loops_per_measurement": 200, - "num_doc_vectors": 16, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - }, - { - "dim": 264, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 64, - "operation": "chamfer" - }, - { - "dim": 128, - 
"loops_per_measurement": 10, - "num_doc_vectors": 1250, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - }, - { - "dim": 512, - "loops_per_measurement": 2, - "num_doc_vectors": 1250, - "num_measurements": 20, - "num_query_vectors": 64, - "operation": "chamfer" - }, - { - "dim": 128, - "loops_per_measurement": 200, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 64, - "operation": "chamfer" - }, - { - "dim": 512, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - }, - { - "dim": 128, - "loops_per_measurement": 500, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 8, - "operation": "max_sim" - }, - { - "dim": 256, - "loops_per_measurement": 100, - "num_doc_vectors": 64, - "num_measurements": 50, - "num_query_vectors": 16, - "operation": "max_sim" - }, - { - "dim": 384, - "loops_per_measurement": 20, - "num_doc_vectors": 128, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - }, - { - "dim": 256, - "loops_per_measurement": 200, - "num_doc_vectors": 16, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - }, - { - "dim": 264, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 64, - "operation": "max_sim" - }, - { - "dim": 128, - "loops_per_measurement": 10, - "num_doc_vectors": 1250, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - }, - { - "dim": 512, - "loops_per_measurement": 2, - "num_doc_vectors": 1250, - "num_measurements": 20, - "num_query_vectors": 64, - "operation": "max_sim" - }, - { - "dim": 128, - "loops_per_measurement": 200, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 64, - "operation": "max_sim" - }, - { - "dim": 512, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 
32, - "operation": "max_sim" - } - ] - }, - "type": "multi-vector-op" - }, - "results": [ - { - "latencies": [ - 777, - 777, - 778, - 780, - 780, - 781, - 804, - 838, - 838, - 838, - 838, - 839, - 839, - 839, - 840, - 842, - 845, - 850, - 899, - 926, - 927, - 931, - 932, - 937, - 939, - 956, - 978, - 1034, - 1035, - 1036, - 1053, - 1064, - 1065, - 1147, - 1164, - 1165, - 1165, - 1166, - 1173, - 1221, - 1323, - 1333, - 1350, - 1352, - 1353, - 1353, - 1357, - 1393, - 1529, - 1537 - ], - "percentiles": { - "mean": 1030.32, - "median": 947.5, - "minimum": 777, - "p90": 1353, - "p99": 1537 - }, - "run": { - "dim": 128, - "loops_per_measurement": 500, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 8, - "operation": "chamfer" - } - }, - { - "latencies": [ - 1029, - 1029, - 1030, - 1030, - 1030, - 1030, - 1030, - 1031, - 1032, - 1034, - 1035, - 1038, - 1050, - 1058, - 1070, - 1112, - 1112, - 1112, - 1112, - 1112, - 1112, - 1112, - 1113, - 1117, - 1119, - 1120, - 1123, - 1145, - 1146, - 1146, - 1146, - 1148, - 1152, - 1167, - 1192, - 1192, - 1192, - 1192, - 1193, - 1207, - 1235, - 1251, - 1254, - 1256, - 1257, - 1261, - 1293, - 1330, - 1330, - 1344 - ], - "percentiles": { - "mean": 1139.22, - "median": 1119.5, - "minimum": 1029, - "p90": 1261, - "p99": 1344 - }, - "run": { - "dim": 256, - "loops_per_measurement": 100, - "num_doc_vectors": 64, - "num_measurements": 50, - "num_query_vectors": 16, - "operation": "chamfer" - } - }, - { - "latencies": [ - 1210, - 1210, - 1210, - 1210, - 1210, - 1210, - 1210, - 1210, - 1210, - 1211, - 1212, - 1212, - 1212, - 1212, - 1213, - 1213, - 1213, - 1213, - 1213, - 1214, - 1217, - 1217, - 1220, - 1223, - 1225, - 1226, - 1227, - 1229, - 1231, - 1235, - 1235, - 1239, - 1239, - 1240, - 1244, - 1249, - 1252, - 1259, - 1264, - 1270, - 1281, - 1294, - 1299, - 1306, - 1312, - 1315, - 1332, - 1341, - 1383, - 1484 - ], - "percentiles": { - "mean": 1246.32, - "median": 1225.5, - "minimum": 1210, - "p90": 1315, - "p99": 
1484 - }, - "run": { - "dim": 384, - "loops_per_measurement": 20, - "num_doc_vectors": 128, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - } - }, - { - "latencies": [ - 958, - 958, - 958, - 958, - 958, - 960, - 960, - 960, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 962, - 962, - 963, - 964, - 964, - 965, - 965, - 965, - 966, - 966, - 973, - 974, - 974, - 981, - 981, - 983, - 985, - 987, - 987, - 987, - 990, - 999, - 999 - ], - "percentiles": { - "mean": 967.42, - "median": 961.0, - "minimum": 958, - "p90": 987, - "p99": 999 - }, - "run": { - "dim": 256, - "loops_per_measurement": 200, - "num_doc_vectors": 16, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - } - }, - { - "latencies": [ - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1018, - 1018, - 1018, - 1018, - 1018, - 1018, - 1018, - 1018, - 1018, - 1019, - 1019, - 1019, - 1019, - 1020, - 1020, - 1020, - 1020, - 1020, - 1020, - 1021, - 1022, - 1023, - 1023, - 1026, - 1029, - 1031, - 1032, - 1033, - 1034, - 1035, - 1036, - 1037, - 1041, - 1044, - 1044, - 1045, - 1046, - 1065 - ], - "percentiles": { - "mean": 1024.58, - "median": 1019.5, - "minimum": 1017, - "p90": 1044, - "p99": 1065 - }, - "run": { - "dim": 264, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 64, - "operation": "chamfer" - } - }, - { - "latencies": [ - 1854, - 1855, - 1855, - 1855, - 1855, - 1855, - 1856, - 1856, - 1856, - 1857, - 1857, - 1857, - 1857, - 1857, - 1857, - 1858, - 1858, - 1858, - 1858, - 1858, - 1858, - 1858, - 1859, - 1860, - 1861, - 1861, - 1863, - 1866, - 1869, - 1870, - 1871, - 1871, - 1871, - 1872, - 1874, - 1875, - 1881, - 1883, - 1885, - 1885, - 1890, - 1892, - 1892, - 1892, - 1892, - 1899, - 1906, - 1909, - 1909, - 1916 - ], - "percentiles": { - "mean": 1870.38, - 
"median": 1861.0, - "minimum": 1854, - "p90": 1899, - "p99": 1916 - }, - "run": { - "dim": 128, - "loops_per_measurement": 10, - "num_doc_vectors": 1250, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - } - }, - { - "latencies": [ - 3180, - 3180, - 3180, - 3180, - 3180, - 3181, - 3181, - 3181, - 3181, - 3183, - 3185, - 3187, - 3205, - 3206, - 3207, - 3208, - 3211, - 3218, - 3220, - 3268 - ], - "percentiles": { - "mean": 3196.1, - "median": 3184.0, - "minimum": 3180, - "p90": 3220, - "p99": 3268 - }, - "run": { - "dim": 512, - "loops_per_measurement": 2, - "num_doc_vectors": 1250, - "num_measurements": 20, - "num_query_vectors": 64, - "operation": "chamfer" - } - }, - { - "latencies": [ - 1784, - 1784, - 1784, - 1784, - 1784, - 1784, - 1785, - 1785, - 1790, - 1791, - 1791, - 1792, - 1792, - 1792, - 1792, - 1792, - 1792, - 1792, - 1795, - 1795, - 1796, - 1796, - 1796, - 1796, - 1798, - 1800, - 1803, - 1805, - 1814, - 1815, - 1817, - 1818, - 1821, - 1826, - 1840, - 1845, - 1856, - 1858, - 1878, - 1879, - 1879, - 1884, - 1888, - 1890, - 1893, - 1905, - 1907, - 1912, - 1918, - 1950 - ], - "percentiles": { - "mean": 1825.26, - "median": 1799.0, - "minimum": 1784, - "p90": 1905, - "p99": 1950 - }, - "run": { - "dim": 128, - "loops_per_measurement": 200, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 64, - "operation": "chamfer" - } - }, - { - "latencies": [ - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1018, - 1018, - 1018, - 1018, - 1018, - 1019, - 1020, - 1020, - 1021, - 1021, - 1022, - 1022, - 1022, - 1023, - 1027, - 1030, - 1030, - 1035, - 1043, - 1043, - 1044, - 1045, - 1049, - 1049, - 1060 - ], - "percentiles": { - "mean": 1023.2, - "median": 1017.5, - "minimum": 1017, - "p90": 1044, - "p99": 1060 - }, - "run": { - "dim": 512, - "loops_per_measurement": 
50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - } - }, - { - "latencies": [ - 567, - 567, - 567, - 567, - 567, - 567, - 567, - 567, - 567, - 567, - 567, - 567, - 567, - 567, - 567, - 569, - 569, - 569, - 569, - 569, - 569, - 570, - 570, - 570, - 570, - 570, - 570, - 570, - 570, - 570, - 571, - 571, - 571, - 571, - 571, - 571, - 571, - 571, - 571, - 571, - 571, - 571, - 571, - 571, - 574, - 578, - 578, - 594, - 595, - 598 - ], - "percentiles": { - "mean": 571.2, - "median": 570.0, - "minimum": 567, - "p90": 578, - "p99": 598 - }, - "run": { - "dim": 128, - "loops_per_measurement": 500, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 8, - "operation": "max_sim" - } - }, - { - "latencies": [ - 988, - 988, - 988, - 988, - 988, - 988, - 988, - 988, - 988, - 989, - 989, - 989, - 989, - 989, - 989, - 989, - 989, - 989, - 991, - 991, - 991, - 991, - 991, - 991, - 991, - 991, - 991, - 991, - 991, - 991, - 991, - 991, - 992, - 992, - 992, - 992, - 992, - 992, - 992, - 992, - 992, - 996, - 996, - 1004, - 1009, - 1013, - 1018, - 1020, - 1047, - 1057 - ], - "percentiles": { - "mean": 995.1, - "median": 991.0, - "minimum": 988, - "p90": 1013, - "p99": 1057 - }, - "run": { - "dim": 256, - "loops_per_measurement": 100, - "num_doc_vectors": 64, - "num_measurements": 50, - "num_query_vectors": 16, - "operation": "max_sim" - } - }, - { - "latencies": [ - 1210, - 1210, - 1210, - 1210, - 1210, - 1210, - 1210, - 1210, - 1211, - 1211, - 1211, - 1212, - 1213, - 1213, - 1213, - 1213, - 1213, - 1213, - 1213, - 1213, - 1213, - 1213, - 1214, - 1214, - 1214, - 1214, - 1214, - 1214, - 1214, - 1214, - 1214, - 1214, - 1216, - 1217, - 1217, - 1217, - 1218, - 1220, - 1222, - 1223, - 1224, - 1224, - 1225, - 1227, - 1238, - 1239, - 1239, - 1241, - 1242, - 1243 - ], - "percentiles": { - "mean": 1217.74, - "median": 1214.0, - "minimum": 1210, - "p90": 1239, - "p99": 1243 - }, - "run": { - "dim": 384, - 
"loops_per_measurement": 20, - "num_doc_vectors": 128, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - } - }, - { - "latencies": [ - 953, - 953, - 953, - 953, - 953, - 953, - 954, - 954, - 956, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 957, - 958, - 958, - 958, - 958, - 958, - 958, - 960, - 961, - 961, - 961, - 961, - 961, - 961, - 961, - 962, - 963, - 971, - 976, - 978, - 984, - 984, - 987 - ], - "percentiles": { - "mean": 960.1, - "median": 957.0, - "minimum": 953, - "p90": 976, - "p99": 987 - }, - "run": { - "dim": 256, - "loops_per_measurement": 200, - "num_doc_vectors": 16, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - } - }, - { - "latencies": [ - 1016, - 1016, - 1016, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1018, - 1018, - 1018, - 1018, - 1018, - 1018, - 1018, - 1018, - 1019, - 1019, - 1019, - 1019, - 1019, - 1019, - 1019, - 1021, - 1021, - 1023, - 1023, - 1025, - 1032, - 1044, - 1045, - 1045, - 1045, - 1047, - 1052, - 1058, - 1061 - ], - "percentiles": { - "mean": 1023.46, - "median": 1018.0, - "minimum": 1016, - "p90": 1045, - "p99": 1061 - }, - "run": { - "dim": 264, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 64, - "operation": "max_sim" - } - }, - { - "latencies": [ - 1858, - 1858, - 1860, - 1860, - 1860, - 1860, - 1860, - 1860, - 1860, - 1860, - 1860, - 1861, - 1861, - 1861, - 1861, - 1861, - 1861, - 1861, - 1862, - 1863, - 1863, - 1864, - 1865, - 1867, - 1868, - 1872, - 1873, - 1876, - 1878, - 1881, - 1882, - 1883, - 1888, - 1889, - 1889, - 1890, - 1890, - 1890, - 1891, - 1892, - 1905, - 1906, - 1908, - 1934, - 1962, - 1967, - 1974, - 1988, - 2004, - 2014 - ], - "percentiles": { - "mean": 1887.22, - "median": 1870.0, - "minimum": 1858, - 
"p90": 1967, - "p99": 2014 - }, - "run": { - "dim": 128, - "loops_per_measurement": 10, - "num_doc_vectors": 1250, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - } - }, - { - "latencies": [ - 3177, - 3177, - 3177, - 3179, - 3192, - 3201, - 3212, - 3222, - 3251, - 3251, - 3255, - 3256, - 3256, - 3321, - 3381, - 3399, - 3400, - 3419, - 3422, - 3445 - ], - "percentiles": { - "mean": 3279.65, - "median": 3253.0, - "minimum": 3177, - "p90": 3422, - "p99": 3445 - }, - "run": { - "dim": 512, - "loops_per_measurement": 2, - "num_doc_vectors": 1250, - "num_measurements": 20, - "num_query_vectors": 64, - "operation": "max_sim" - } - }, - { - "latencies": [ - 1783, - 1784, - 1787, - 1791, - 1791, - 1791, - 1813, - 1838, - 1853, - 1868, - 1871, - 1882, - 1882, - 1884, - 1890, - 1899, - 1899, - 1899, - 1900, - 1901, - 1905, - 1906, - 1908, - 1909, - 1911, - 1911, - 1911, - 1911, - 1914, - 1915, - 1915, - 1916, - 1916, - 1917, - 1919, - 1922, - 1922, - 1923, - 1923, - 1925, - 1927, - 1927, - 1928, - 1929, - 1929, - 1933, - 1937, - 1938, - 1940, - 1983 - ], - "percentiles": { - "mean": 1893.52, - "median": 1911.0, - "minimum": 1783, - "p90": 1933, - "p99": 1983 - }, - "run": { - "dim": 128, - "loops_per_measurement": 200, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 64, - "operation": "max_sim" - } - }, - { - "latencies": [ - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1017, - 1020, - 1023, - 1023, - 1025, - 1028, - 1033, - 1033, - 1034, - 1037, - 1038, - 1040, - 1043, - 1044, - 1052, - 1052, - 1057, - 1060, - 1063, - 1078, - 1088, - 1088, - 1088, - 1088, - 1088, - 1088, - 1088, - 1088, - 1090, - 1090, - 1090, - 1092, - 1093, - 1093, - 1094, - 1094 - ], - "percentiles": { - "mean": 1049.56, - "median": 1039.0, - "minimum": 1017, - "p90": 1092, - "p99": 1094 - }, - "run": { - "dim": 512, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - 
"num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - } - } - ] - }, - { - "input": { - "content": { - "element_type": "float16", - "implementation": "optimized", - "runs": [ - { - "dim": 256, - "loops_per_measurement": 100, - "num_doc_vectors": 64, - "num_measurements": 50, - "num_query_vectors": 16, - "operation": "chamfer" - }, - { - "dim": 128, - "loops_per_measurement": 10, - "num_doc_vectors": 1250, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - }, - { - "dim": 256, - "loops_per_measurement": 100, - "num_doc_vectors": 64, - "num_measurements": 50, - "num_query_vectors": 16, - "operation": "max_sim" - }, - { - "dim": 128, - "loops_per_measurement": 10, - "num_doc_vectors": 1250, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - } - ] - }, - "type": "multi-vector-op" - }, - "results": [ - { - "latencies": [ - 1734, - 1734, - 1736, - 1736, - 1737, - 1737, - 1737, - 1738, - 1738, - 1738, - 1738, - 1739, - 1740, - 1740, - 1741, - 1744, - 1744, - 1751, - 1751, - 1753, - 1754, - 1754, - 1756, - 1759, - 1761, - 1764, - 1767, - 1767, - 1767, - 1768, - 1768, - 1769, - 1769, - 1773, - 1774, - 1775, - 1779, - 1787, - 1794, - 1808, - 1822, - 1825, - 1829, - 1829, - 1844, - 1846, - 1852, - 1859, - 1903, - 2194 - ], - "percentiles": { - "mean": 1780.44, - "median": 1762.5, - "minimum": 1734, - "p90": 1846, - "p99": 2194 - }, - "run": { - "dim": 256, - "loops_per_measurement": 100, - "num_doc_vectors": 64, - "num_measurements": 50, - "num_query_vectors": 16, - "operation": "chamfer" - } - }, - { - "latencies": [ - 2130, - 2130, - 2130, - 2131, - 2133, - 2133, - 2140, - 2142, - 2149, - 2151, - 2158, - 2160, - 2163, - 2164, - 2166, - 2167, - 2167, - 2168, - 2171, - 2173, - 2174, - 2176, - 2177, - 2178, - 2178, - 2181, - 2184, - 2189, - 2195, - 2195, - 2197, - 2198, - 2198, - 2201, - 2203, - 2207, - 2215, - 2217, - 2220, - 2229, - 2240, - 2242, - 2243, - 2249, - 2250, - 2291, - 2305, - 2438, 
- 2613, - 2643 - ], - "percentiles": { - "mean": 2209.04, - "median": 2179.5, - "minimum": 2130, - "p90": 2291, - "p99": 2643 - }, - "run": { - "dim": 128, - "loops_per_measurement": 10, - "num_doc_vectors": 1250, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - } - }, - { - "latencies": [ - 1731, - 1733, - 1737, - 1737, - 1737, - 1741, - 1741, - 1745, - 1745, - 1750, - 1750, - 1750, - 1750, - 1751, - 1754, - 1754, - 1755, - 1758, - 1758, - 1759, - 1761, - 1761, - 1766, - 1768, - 1770, - 1771, - 1771, - 1772, - 1773, - 1773, - 1775, - 1776, - 1776, - 1778, - 1785, - 1788, - 1789, - 1791, - 1795, - 1800, - 1804, - 1808, - 1814, - 1822, - 1832, - 1833, - 1834, - 1864, - 1867, - 1869 - ], - "percentiles": { - "mean": 1776.44, - "median": 1770.5, - "minimum": 1731, - "p90": 1833, - "p99": 1869 - }, - "run": { - "dim": 256, - "loops_per_measurement": 100, - "num_doc_vectors": 64, - "num_measurements": 50, - "num_query_vectors": 16, - "operation": "max_sim" - } - }, - { - "latencies": [ - 2127, - 2127, - 2129, - 2130, - 2132, - 2141, - 2142, - 2142, - 2147, - 2148, - 2149, - 2150, - 2154, - 2154, - 2159, - 2162, - 2166, - 2168, - 2170, - 2173, - 2177, - 2180, - 2180, - 2181, - 2181, - 2182, - 2183, - 2187, - 2196, - 2196, - 2199, - 2200, - 2204, - 2211, - 2213, - 2216, - 2224, - 2255, - 2256, - 2271, - 2354, - 2488, - 2493, - 2495, - 2498, - 2505, - 2525, - 2653, - 2657, - 3515 - ], - "percentiles": { - "mean": 2264.9, - "median": 2181.5, - "minimum": 2127, - "p90": 2505, - "p99": 3515 - }, - "run": { - "dim": 128, - "loops_per_measurement": 10, - "num_doc_vectors": 1250, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - } - } - ] - }, - { - "input": { - "content": { - "element_type": "float32", - "implementation": "reference", - "runs": [ - { - "dim": 128, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 8, - "operation": "chamfer" - }, - { - "dim": 384, 
- "loops_per_measurement": 2, - "num_doc_vectors": 128, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - }, - { - "dim": 128, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 8, - "operation": "max_sim" - }, - { - "dim": 384, - "loops_per_measurement": 2, - "num_doc_vectors": 128, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - } - ] - }, - "type": "multi-vector-op" - }, - "results": [ - { - "latencies": [ - 64, - 64, - 64, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 67, - 67, - 67, - 67, - 67, - 67, - 67, - 67, - 67, - 68, - 68, - 69, - 71, - 127 - ], - "percentiles": { - "mean": 67.52, - "median": 66.0, - "minimum": 64, - "p90": 68, - "p99": 127 - }, - "run": { - "dim": 128, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 8, - "operation": "chamfer" - } - }, - { - "latencies": [ - 130, - 130, - 130, - 130, - 130, - 130, - 130, - 130, - 130, - 130, - 131, - 131, - 131, - 131, - 132, - 132, - 133, - 133, - 135, - 136, - 136, - 137, - 138, - 138, - 138, - 138, - 138, - 138, - 138, - 138, - 138, - 138, - 138, - 138, - 138, - 138, - 138, - 138, - 139, - 139, - 139, - 139, - 139, - 140, - 140, - 140, - 141, - 143, - 147, - 161 - ], - "percentiles": { - "mean": 136.26, - "median": 138.0, - "minimum": 130, - "p90": 140, - "p99": 161 - }, - "run": { - "dim": 384, - "loops_per_measurement": 2, - "num_doc_vectors": 128, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "chamfer" - } - }, - { - "latencies": [ - 62, - 62, - 62, - 62, - 62, - 63, - 63, - 63, - 63, - 63, - 63, - 63, - 63, - 63, - 63, - 64, - 64, - 65, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 66, - 67, - 
67, - 67, - 67, - 67, - 67, - 67, - 68, - 68, - 69, - 71, - 72, - 78, - 106 - ], - "percentiles": { - "mean": 66.44, - "median": 66.0, - "minimum": 62, - "p90": 69, - "p99": 106 - }, - "run": { - "dim": 128, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 8, - "operation": "max_sim" - } - }, - { - "latencies": [ - 130, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 131, - 132, - 132, - 132, - 132, - 132, - 133, - 133, - 134, - 134, - 135, - 135, - 135, - 136, - 136, - 137, - 139, - 139, - 140, - 142, - 142, - 143, - 144, - 145, - 145, - 147, - 155, - 158 - ], - "percentiles": { - "mean": 135.18, - "median": 132.0, - "minimum": 130, - "p90": 145, - "p99": 158 - }, - "run": { - "dim": 384, - "loops_per_measurement": 2, - "num_doc_vectors": 128, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - } - } - ] - }, - { - "input": { - "content": { - "element_type": "float16", - "implementation": "reference", - "runs": [ - { - "dim": 128, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - "num_measurements": 50, - "num_query_vectors": 8, - "operation": "chamfer" - }, - { - "dim": 384, - "loops_per_measurement": 2, - "num_doc_vectors": 128, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - } - ] - }, - "type": "multi-vector-op" - }, - "results": [ - { - "latencies": [ - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 73, - 74, - 74, - 74, - 74, - 74, - 74, - 75, - 75, - 76, - 76, - 76, - 76, - 76, - 76, - 77, - 77, - 77, - 77, - 77, - 77, - 77, - 77, - 77, - 78, - 78, - 78, - 79, - 80, - 80, - 80, - 84, - 87, - 92 - ], - "percentiles": { - "mean": 76.0, - "median": 75.5, - "minimum": 73, - "p90": 80, - "p99": 92 - }, - "run": { - "dim": 128, - "loops_per_measurement": 50, - "num_doc_vectors": 32, - 
"num_measurements": 50, - "num_query_vectors": 8, - "operation": "chamfer" - } - }, - { - "latencies": [ - 135, - 135, - 135, - 135, - 135, - 135, - 135, - 135, - 136, - 136, - 137, - 138, - 140, - 141, - 141, - 141, - 141, - 141, - 142, - 142, - 142, - 142, - 142, - 142, - 142, - 142, - 142, - 142, - 142, - 142, - 142, - 142, - 142, - 142, - 143, - 143, - 143, - 144, - 144, - 145, - 145, - 145, - 147, - 150, - 151, - 151, - 153, - 154, - 158, - 158 - ], - "percentiles": { - "mean": 142.36, - "median": 142.0, - "minimum": 135, - "p90": 151, - "p99": 158 - }, - "run": { - "dim": 384, - "loops_per_measurement": 2, - "num_doc_vectors": 128, - "num_measurements": 50, - "num_query_vectors": 32, - "operation": "max_sim" - } - } - ] - } -] \ No newline at end of file From 3a89c3750bef66322e5e5c3f90e91d43e941a74b Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Thu, 7 May 2026 12:09:00 +0530 Subject: [PATCH 5/8] Add Cargo.lock --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index c7b68684e..e179d3320 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -699,7 +699,7 @@ dependencies = [ [[package]] name = "diskann-benchmark-multi-vector" -version = "0.50.1" +version = "0.51.0" dependencies = [ "anyhow", "diskann-benchmark-runner", From 96d17b30378159ffdfc04b5afbeb0bf225992de2 Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Fri, 8 May 2026 01:48:37 +0530 Subject: [PATCH 6/8] Remove unused scalar benchmark config file --- .../graph_index_scalar_oai_large.json | 115 ------------------ 1 file changed, 115 deletions(-) delete mode 100644 diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json diff --git a/diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json b/diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json deleted file mode 100644 index 09752477a..000000000 --- a/diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json +++ /dev/null @@ -1,115 +0,0 @@ -{ - 
"search_directories": [ - "/mnt/nvme/s" - ], - "jobs": [ - { - "type": "graph-index-build-sq", - "content": { - "build": { - "data_type": "float16", - "data": "SentenceChunk_OAILarge_1M_normalized_1000000.bin", - "distance": "squared_l2", - "max_degree": 32, - "l_build": 100, - "alpha": 1.2, - "backedge_ratio": 1.0, - "num_threads": 8, - "multi_insert":null, - "search_phase": { - "queries": "SentenceChunk_OAILarge_query_normalized_6809.bin", - "groundtruth": "SentenceChunk-1M-gt-6k-recall-at2000", - "reps": 2, - "num_threads": [ - 8 - ], - "runs": [ - { - "search_n": 10, - "search_l": [ - 50 - ], - "recall_k": 10 - } - ] - } - }, - "num_bits": 1, - "standard_deviations": 2, - "use_fp_for_search": true - } - }, - { - "type": "graph-index-build-sq", - "content": { - "build": { - "data_type": "float16", - "data": "SentenceChunk_OAILarge_1M_normalized_1000000.bin", - "distance": "squared_l2", - "max_degree": 32, - "l_build": 100, - "alpha": 1.2, - "backedge_ratio": 1.0, - "num_threads": 8, - "multi_insert":null, - "search_phase": { - "queries": "SentenceChunk_OAILarge_query_normalized_6809.bin", - "groundtruth": "SentenceChunk-1M-gt-6k-recall-at2000", - "reps": 2, - "num_threads": [ - 8 - ], - "runs": [ - { - "search_n": 10, - "search_l": [ - 50 - ], - "recall_k": 10 - } - ] - } - }, - "num_bits": 4, - "standard_deviations": 2, - "use_fp_for_search": true - } - }, - { - "type": "graph-index-build-sq", - "content": { - "build": { - "data_type": "float16", - "data": "SentenceChunk_OAILarge_1M_normalized_1000000.bin", - "distance": "squared_l2", - "max_degree": 32, - "l_build": 100, - "alpha": 1.2, - "backedge_ratio": 1.0, - "num_threads": 8, - "multi_insert":null, - "search_phase": { - "queries": "SentenceChunk_OAILarge_query_normalized_6809.bin", - "groundtruth": "SentenceChunk-1M-gt-6k-recall-at2000", - "reps": 1, - "num_threads": [ - 8 - ], - "runs": [ - { - "search_n": 10, - "search_l": [ - 50 - ], - "recall_k": 10 - } - ] - } - }, - "num_bits": 8, - 
"standard_deviations": 2, - "use_fp_for_search": true - } - } - ] - } \ No newline at end of file From 6b33719c7b082fb6142d0b372c0c227c501fdc8c Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Fri, 8 May 2026 01:51:13 +0530 Subject: [PATCH 7/8] Revert "Remove unused scalar benchmark config file" This reverts commit 96d17b30378159ffdfc04b5afbeb0bf225992de2. --- .../graph_index_scalar_oai_large.json | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json diff --git a/diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json b/diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json new file mode 100644 index 000000000..09752477a --- /dev/null +++ b/diskann-benchmark/perf_test_inputs/graph_index_scalar_oai_large.json @@ -0,0 +1,115 @@ +{ + "search_directories": [ + "/mnt/nvme/s" + ], + "jobs": [ + { + "type": "graph-index-build-sq", + "content": { + "build": { + "data_type": "float16", + "data": "SentenceChunk_OAILarge_1M_normalized_1000000.bin", + "distance": "squared_l2", + "max_degree": 32, + "l_build": 100, + "alpha": 1.2, + "backedge_ratio": 1.0, + "num_threads": 8, + "multi_insert":null, + "search_phase": { + "queries": "SentenceChunk_OAILarge_query_normalized_6809.bin", + "groundtruth": "SentenceChunk-1M-gt-6k-recall-at2000", + "reps": 2, + "num_threads": [ + 8 + ], + "runs": [ + { + "search_n": 10, + "search_l": [ + 50 + ], + "recall_k": 10 + } + ] + } + }, + "num_bits": 1, + "standard_deviations": 2, + "use_fp_for_search": true + } + }, + { + "type": "graph-index-build-sq", + "content": { + "build": { + "data_type": "float16", + "data": "SentenceChunk_OAILarge_1M_normalized_1000000.bin", + "distance": "squared_l2", + "max_degree": 32, + "l_build": 100, + "alpha": 1.2, + "backedge_ratio": 1.0, + "num_threads": 8, + "multi_insert":null, + "search_phase": { + "queries": "SentenceChunk_OAILarge_query_normalized_6809.bin", + "groundtruth": 
"SentenceChunk-1M-gt-6k-recall-at2000", + "reps": 2, + "num_threads": [ + 8 + ], + "runs": [ + { + "search_n": 10, + "search_l": [ + 50 + ], + "recall_k": 10 + } + ] + } + }, + "num_bits": 4, + "standard_deviations": 2, + "use_fp_for_search": true + } + }, + { + "type": "graph-index-build-sq", + "content": { + "build": { + "data_type": "float16", + "data": "SentenceChunk_OAILarge_1M_normalized_1000000.bin", + "distance": "squared_l2", + "max_degree": 32, + "l_build": 100, + "alpha": 1.2, + "backedge_ratio": 1.0, + "num_threads": 8, + "multi_insert":null, + "search_phase": { + "queries": "SentenceChunk_OAILarge_query_normalized_6809.bin", + "groundtruth": "SentenceChunk-1M-gt-6k-recall-at2000", + "reps": 1, + "num_threads": [ + 8 + ], + "runs": [ + { + "search_n": 10, + "search_l": [ + 50 + ], + "recall_k": 10 + } + ] + } + }, + "num_bits": 8, + "standard_deviations": 2, + "use_fp_for_search": true + } + } + ] + } \ No newline at end of file From d06df7ee59ebb69009837dbdbc5bcd8bcaedfc84 Mon Sep 17 00:00:00 2001 From: Suryansh Gupta Date: Tue, 12 May 2026 20:47:04 +0530 Subject: [PATCH 8/8] Fold the new crate to existing diskann-benchmark crate --- Cargo.lock | 17 - Cargo.toml | 1 - diskann-benchmark-multi-vector/Cargo.toml | 30 - diskann-benchmark-multi-vector/README.md | 136 --- diskann-benchmark-multi-vector/src/bin.rs | 96 -- diskann-benchmark-multi-vector/src/lib.rs | 938 ------------------ diskann-benchmark/Cargo.toml | 3 + .../example/multi-vector-test.json | 0 .../example}/multi-vector.json | 0 .../multi-vector-tolerance.json | 0 diskann-benchmark/src/backend/mod.rs | 2 + diskann-benchmark/src/backend/multi_vector.rs | 806 +++++++++++++++ diskann-benchmark/src/inputs/mod.rs | 2 + diskann-benchmark/src/inputs/multi_vector.rs | 190 ++++ diskann-benchmark/src/main.rs | 86 ++ .../src/multi_vector/matrix.rs | 44 + diskann-quantization/src/multi_vector/mod.rs | 4 +- 17 files changed, 1135 insertions(+), 1220 deletions(-) delete mode 100644 
diskann-benchmark-multi-vector/Cargo.toml delete mode 100644 diskann-benchmark-multi-vector/README.md delete mode 100644 diskann-benchmark-multi-vector/src/bin.rs delete mode 100644 diskann-benchmark-multi-vector/src/lib.rs rename diskann-benchmark-multi-vector/examples/test.json => diskann-benchmark/example/multi-vector-test.json (100%) rename {diskann-benchmark-multi-vector/examples => diskann-benchmark/example}/multi-vector.json (100%) rename diskann-benchmark-multi-vector/examples/tolerance.json => diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json (100%) create mode 100644 diskann-benchmark/src/backend/multi_vector.rs create mode 100644 diskann-benchmark/src/inputs/multi_vector.rs diff --git a/Cargo.lock b/Cargo.lock index e179d3320..1713f4b87 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -697,23 +697,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "diskann-benchmark-multi-vector" -version = "0.51.0" -dependencies = [ - "anyhow", - "diskann-benchmark-runner", - "diskann-quantization", - "diskann-utils", - "diskann-vector", - "half", - "rand 0.9.4", - "serde", - "serde_json", - "tempfile", - "thiserror 2.0.17", -] - [[package]] name = "diskann-benchmark-runner" version = "0.51.0" diff --git a/Cargo.toml b/Cargo.toml index cce02b501..6353773c9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,6 @@ members = [ "diskann-benchmark-runner", "diskann-benchmark-core", "diskann-benchmark-simd", - "diskann-benchmark-multi-vector", "diskann-benchmark", "diskann-tools", "vectorset", diff --git a/diskann-benchmark-multi-vector/Cargo.toml b/diskann-benchmark-multi-vector/Cargo.toml deleted file mode 100644 index f8eb937e1..000000000 --- a/diskann-benchmark-multi-vector/Cargo.toml +++ /dev/null @@ -1,30 +0,0 @@ -[package] -name = "diskann-benchmark-multi-vector" -version.workspace = true -description.workspace = true -authors.workspace = true -documentation.workspace = true -license.workspace = true -edition.workspace = true - -[[bin]] -name = 
"benchmark-multi-vector" -path = "src/bin.rs" - -[dependencies] -anyhow.workspace = true -diskann-utils = { workspace = true, default-features = false } -half = { workspace = true, features = ["rand_distr"] } -diskann-benchmark-runner = { workspace = true } -diskann-quantization = { workspace = true } -diskann-vector = { workspace = true } -rand.workspace = true -serde = { workspace = true, features = ["derive"] } -serde_json.workspace = true -thiserror.workspace = true - -[lints] -workspace = true - -[dev-dependencies] -tempfile.workspace = true diff --git a/diskann-benchmark-multi-vector/README.md b/diskann-benchmark-multi-vector/README.md deleted file mode 100644 index 014a393a1..000000000 --- a/diskann-benchmark-multi-vector/README.md +++ /dev/null @@ -1,136 +0,0 @@ -# diskann-benchmark-multi-vector - -Benchmarks and regression detection for the **multi-vector distance -operations** exposed by `diskann-quantization` — `Chamfer` and `MaxSim` — -across `f32` and `f16` element types. - -## Layout - -- `src/lib.rs` — benchmark library: input/tolerance schemas, kernel - dispatch, regression checker. -- `src/bin.rs` — `benchmark-multi-vector` CLI entry point. -- `examples/multi-vector.json` — full benchmark matrix covering both - operations across the registered kernels and a representative range of - shapes. -- `examples/test.json` — minimal smoke configuration consumed by the - integration tests. -- `examples/tolerance.json` — default regression thresholds. 
- -## Registered kernels - -The crate registers four kernels — one per `(element_type, implementation)` -pair: - -| Tag | Element | Implementation | -| -------------------------------- | ------- | -------------------- | -| `multi-vector-op-f32-optimized` | `f32` | `QueryComputer` | -| `multi-vector-op-f16-optimized` | `f16` | `QueryComputer` | -| `multi-vector-op-f32-reference` | `f32` | `Chamfer` / `MaxSim` | -| `multi-vector-op-f16-reference` | `f16` | `Chamfer` / `MaxSim` | - -The **optimized** path constructs a `QueryComputer` once per shape (which -internally selects the best available SIMD kernel for the host) and calls -`chamfer` / `max_sim` inside the timed loop. The **reference** path drives -the `Chamfer` / `MaxSim` fallback used by the `multi_vector` unit tests — -useful both as a numerical ground truth and as a baseline to measure SIMD -speedups against. - -## Time normalization - -Per-measurement latency is normalized to **nanoseconds per inner-product -call**, abbreviated `ns/IP`: - -``` -ns/IP = min_latency_µs * 1000 / (Q * D * loops_per_measurement) -``` - -Two important properties: - -- **Independent of `Q`, `D`, and `loops_per_measurement`.** Reshaping the - benchmark or scaling the loop budget leaves the metric unchanged, so - cache-residency effects and SIMD utilization show up directly. -- **Approximately linear in `Dim`.** Each inner-product call is itself an - O(`Dim`) operation, so `ns/IP` grows with `Dim` — that is why the table - headers read `ns/IP @ Dim`. Compare across rows with the same `Dim`; to - compare across different `Dim`s, divide further by `Dim` to recover ns - per scalar multiply. - -This is the right metric for the two things this crate cares about: -detecting per-shape regressions (the `Dim` factor cancels) and comparing -optimized vs. reference at a fixed shape. 
- -## Usage - -All examples below assume you are inside the crate directory and use a -small shell function for brevity: - -```bash -bench() { cargo run --release -p diskann-benchmark-multi-vector --bin benchmark-multi-vector -- "$@"; } -``` - -### Run benchmarks - -`run` executes every job in the input file and writes per-measurement -latencies plus percentiles to the output file: - -```bash -bench run --input-file examples/multi-vector.json --output-file before.json -``` - -### Regression check workflow - -The check workflow is **two-phase**: validate the tolerance file once, then -compare two recorded result files. - -**Phase 1 — preflight.** No benchmarks are executed. The verifier confirms -that every entry in `tolerance.json` matches at least one job in the input -file, and that every job is matched by exactly one entry. Run it whenever -you edit `tolerance.json`: - -```bash -bench check verify \ - --tolerances examples/tolerance.json \ - --input-file examples/multi-vector.json -``` - -**Phase 2 — comparison.** Record results before and after a code change, -then compare. The command exits non-zero if any run regresses past its -tolerance: - -```bash -# On the baseline commit -bench run --input-file examples/multi-vector.json --output-file before.json - -# On the change commit -bench run --input-file examples/multi-vector.json --output-file after.json - -# Compare -bench check run \ - --tolerances examples/tolerance.json \ - --input-file examples/multi-vector.json \ - --before before.json --after after.json \ - --output-file checks.json -``` - -A run **fails** when its post-change `ns/IP` minimum exceeds the -baseline minimum by more than `min_time_regression` (default `0.05` = -5%). Improvements (negative change) always pass. - -### How tolerances are matched to jobs - -Each entry in `tolerance.json` has the shape `{ input, tolerance }`. 
The -`input` block acts as a **partial template** against the jobs in the input -file: any field present must match; missing fields are wildcards. - -The shipped `tolerance.json` uses an empty `"content": {}`, which matches -every `multi-vector-op` job — so a single 5% threshold applies to all four -kernels. To apply different thresholds per implementation, add more -specific entries, e.g.: - -```json -{ "input": { "type": "multi-vector-op", "content": { "implementation": "reference" } }, - "tolerance": { "type": "multi-vector-tolerance", "content": { "min_time_regression": 0.10 } } } -``` - -`check verify` will reject the file if entries overlap or leave any job -unmatched. diff --git a/diskann-benchmark-multi-vector/src/bin.rs b/diskann-benchmark-multi-vector/src/bin.rs deleted file mode 100644 index d595533e7..000000000 --- a/diskann-benchmark-multi-vector/src/bin.rs +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. - * Licensed under the MIT license. - */ - -use diskann_benchmark_multi_vector::{register, MultiVectorOp}; -use diskann_benchmark_runner::{output, registry, App, Output}; - -pub fn main() -> anyhow::Result<()> { - // Create the pocket bench application. - let app = App::parse(); - main_inner(&app, &mut output::default()) -} - -fn main_inner(app: &App, output: &mut dyn Output) -> anyhow::Result<()> { - // Register inputs and benchmarks. - let mut inputs = registry::Inputs::new(); - inputs.register::()?; - - let mut benchmarks = registry::Benchmarks::new(); - register(&mut benchmarks); - - // Here we go! 
- app.run(&inputs, &benchmarks, output) -} - -/////////// -// Tests // -/////////// - -#[cfg(test)] -mod tests { - use super::*; - - use std::path::{Path, PathBuf}; - - use diskann_benchmark_runner::app::{Check, Commands}; - - fn run_integration_test(input_file: &Path, output_file: &Path) { - let commands = Commands::Run { - input_file: input_file.to_str().unwrap().into(), - output_file: output_file.to_str().unwrap().into(), - dry_run: false, - allow_debug: true, - }; - - let app = App::from_commands(commands); - - let mut output = output::Memory::new(); - main_inner(&app, &mut output).unwrap(); - println!( - "output = {}", - String::from_utf8(output.into_inner()).unwrap() - ); - - assert!(output_file.exists()); - } - - fn run_check_test(input_file: &Path, tolerances: &Path) -> String { - let commands = Commands::Check(Check::Verify { - tolerances: tolerances.to_str().unwrap().into(), - input_file: input_file.to_str().unwrap().into(), - }); - - let app = App::from_commands(commands); - - let mut output = output::Memory::new(); - main_inner(&app, &mut output).unwrap(); - String::from_utf8(output.into_inner()).unwrap() - } - - #[test] - fn integration_test() { - let input_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join("examples") - .join("test.json"); - - let tempdir = tempfile::tempdir().unwrap(); - let output_path = tempdir.path().join("output.json"); - - run_integration_test(&input_path, &output_path); - } - - #[test] - fn check_verify() { - let input_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join("examples") - .join("test.json"); - let tolerance_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join("examples") - .join("tolerance.json"); - - let stdout = run_check_test(&input_path, &tolerance_path); - println!("stdout = {}", stdout); - } -} diff --git a/diskann-benchmark-multi-vector/src/lib.rs b/diskann-benchmark-multi-vector/src/lib.rs deleted file mode 100644 index df08d93dd..000000000 --- a/diskann-benchmark-multi-vector/src/lib.rs +++ 
/dev/null @@ -1,938 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. - * Licensed under the MIT license. - */ - -//! Multi-vector distance benchmarks with regression detection. - -use std::{io::Write, num::NonZeroUsize}; - -use diskann_quantization::multi_vector::{Chamfer, MatRef, MaxSim, QueryComputer, Standard}; -use diskann_vector::distance::InnerProduct; -use diskann_vector::{DistanceFunctionMut, PureDistanceFunction}; -use half::f16; -use rand::{ - distr::{Distribution, StandardUniform}, - rngs::StdRng, - SeedableRng, -}; -use serde::{Deserialize, Serialize}; -use thiserror::Error; - -use diskann_benchmark_runner::{ - benchmark::{PassFail, Regression}, - dispatcher::{Description, DispatchRule, FailureScore, MatchScore}, - utils::{ - datatype::{self, DataType}, - num::{relative_change, NonNegativeFinite}, - percentiles, MicroSeconds, - }, - Any, Benchmark, CheckDeserialization, Checker, Input, -}; - -//////////////// -// Public API // -//////////////// - -/// Register all multi-vector benchmarks with the runner's dispatcher. -pub fn register(dispatcher: &mut diskann_benchmark_runner::registry::Benchmarks) { - register_benchmarks_impl(dispatcher) -} - -/////////// -// Utils // -/////////// - -#[derive(Debug, Clone, Copy)] -struct DisplayWrapper<'a, T: ?Sized>(&'a T); - -impl std::ops::Deref for DisplayWrapper<'_, T> { - type Target = T; - fn deref(&self) -> &T { - self.0 - } -} - -//////////// -// Inputs // -//////////// - -/// The two distance operations exposed by [`QueryComputer`]. -#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] -#[serde(rename_all = "snake_case")] -pub enum Operation { - Chamfer, - MaxSim, -} - -impl std::fmt::Display for Operation { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let st = match self { - Self::Chamfer => "chamfer", - Self::MaxSim => "max_sim", - }; - write!(f, "{}", st) - } -} - -/// Which implementation tier to benchmark. 
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] -#[serde(rename_all = "kebab-case")] -enum Implementation { - Optimized, - Reference, -} - -impl std::fmt::Display for Implementation { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let st = match self { - Self::Optimized => "optimized", - Self::Reference => "reference", - }; - write!(f, "{}", st) - } -} - -/// One benchmark configuration: a single (operation, shape) measurement. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -struct Run { - operation: Operation, - num_query_vectors: NonZeroUsize, - num_doc_vectors: NonZeroUsize, - dim: NonZeroUsize, - loops_per_measurement: NonZeroUsize, - num_measurements: NonZeroUsize, -} - -/// A complete multi-vector benchmark job. -#[derive(Debug, Serialize, Deserialize)] -pub struct MultiVectorOp { - element_type: DataType, - implementation: Implementation, - runs: Vec, -} - -impl CheckDeserialization for MultiVectorOp { - fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> { - Ok(()) - } -} - -macro_rules! write_field { - ($f:ident, $field:tt, $($expr:tt)*) => { - writeln!($f, "{:>18}: {}", $field, $($expr)*) - } -} - -impl MultiVectorOp { - fn summarize_fields(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write_field!(f, "element type", self.element_type)?; - write_field!(f, "implementation", self.implementation)?; - write_field!(f, "number of runs", self.runs.len())?; - Ok(()) - } -} - -impl std::fmt::Display for MultiVectorOp { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - writeln!(f, "Multi-Vector Operation\n")?; - write_field!(f, "tag", Self::tag())?; - self.summarize_fields(f) - } -} - -impl Input for MultiVectorOp { - fn tag() -> &'static str { - "multi-vector-op" - } - - fn try_deserialize( - serialized: &serde_json::Value, - checker: &mut Checker, - ) -> anyhow::Result { - checker.any(Self::deserialize(serialized)?) 
- } - - fn example() -> anyhow::Result { - const NUM_QUERY_VECTORS: NonZeroUsize = NonZeroUsize::new(32).unwrap(); - const NUM_DOC_VECTORS: NonZeroUsize = NonZeroUsize::new(64).unwrap(); - const DIM: NonZeroUsize = NonZeroUsize::new(128).unwrap(); - const LOOPS_PER_MEASUREMENT: NonZeroUsize = NonZeroUsize::new(200).unwrap(); - const NUM_MEASUREMENTS: NonZeroUsize = NonZeroUsize::new(100).unwrap(); - - let runs = vec![ - Run { - operation: Operation::Chamfer, - num_query_vectors: NUM_QUERY_VECTORS, - num_doc_vectors: NUM_DOC_VECTORS, - dim: DIM, - loops_per_measurement: LOOPS_PER_MEASUREMENT, - num_measurements: NUM_MEASUREMENTS, - }, - Run { - operation: Operation::MaxSim, - num_query_vectors: NUM_QUERY_VECTORS, - num_doc_vectors: NUM_DOC_VECTORS, - dim: DIM, - loops_per_measurement: LOOPS_PER_MEASUREMENT, - num_measurements: NUM_MEASUREMENTS, - }, - ]; - - Ok(serde_json::to_value(&Self { - element_type: DataType::Float32, - implementation: Implementation::Optimized, - runs, - })?) - } -} - -////////////////////// -// Regression Check // -////////////////////// - -/// Tolerance thresholds for multi-vector benchmark regression detection. -/// -/// Each field specifies the maximum allowed relative increase in the corresponding metric. -/// For example, a value of `0.05` means a 5% increase is tolerated. -#[derive(Debug, Clone, Copy, Serialize, Deserialize)] -struct MultiVectorTolerance { - min_time_regression: NonNegativeFinite, -} - -impl CheckDeserialization for MultiVectorTolerance { - fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> { - Ok(()) - } -} - -impl Input for MultiVectorTolerance { - fn tag() -> &'static str { - "multi-vector-tolerance" - } - - fn try_deserialize( - serialized: &serde_json::Value, - checker: &mut Checker, - ) -> anyhow::Result { - checker.any(Self::deserialize(serialized)?) 
- } - - fn example() -> anyhow::Result { - const EXAMPLE: NonNegativeFinite = match NonNegativeFinite::new(0.05) { - Ok(v) => v, - Err(_) => panic!("use a non-negative finite please"), - }; - - Ok(serde_json::to_value(MultiVectorTolerance { - min_time_regression: EXAMPLE, - })?) - } -} - -/// Per-run comparison result showing before/after percentile differences. -#[derive(Debug, Serialize)] -struct Comparison { - run: Run, - tolerance: MultiVectorTolerance, - before_min: f64, - after_min: f64, -} - -/// Aggregated result of the regression check across all runs. -#[derive(Debug, Serialize)] -struct CheckResult { - checks: Vec, -} - -impl std::fmt::Display for CheckResult { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let header = [ - "Operation", - "Q", - "D", - "Dim", - "Min Before (ns/IP @ Dim)", - "Min After (ns/IP @ Dim)", - "Change (%)", - "Remark", - ]; - - let mut table = diskann_benchmark_runner::utils::fmt::Table::new(header, self.checks.len()); - - for (i, c) in self.checks.iter().enumerate() { - let mut row = table.row(i); - let change = relative_change(c.before_min, c.after_min); - - row.insert(c.run.operation, 0); - row.insert(c.run.num_query_vectors, 1); - row.insert(c.run.num_doc_vectors, 2); - row.insert(c.run.dim, 3); - row.insert(format!("{:.3}", c.before_min), 4); - row.insert(format!("{:.3}", c.after_min), 5); - match change { - Ok(change) => { - row.insert(format!("{:.3} %", change * 100.0), 6); - if change > c.tolerance.min_time_regression.get() { - row.insert("FAIL", 7); - } - } - Err(err) => { - row.insert("invalid", 6); - row.insert(err, 7); - } - } - } - - table.fmt(f) - } -} - -//////////////////////////// -// Benchmark Registration // -//////////////////////////// - -fn register_benchmarks_impl(dispatcher: &mut diskann_benchmark_runner::registry::Benchmarks) { - macro_rules! 
register { - ($impl:ident, $t:ty, $tag:literal) => { - dispatcher.register_regression($tag, Kernel::<$impl, $t>::new()); - }; - } - - // Optimized (architecture-dispatched QueryComputer). - register!(Optimized, f32, "multi-vector-op-f32-optimized"); - register!(Optimized, f16, "multi-vector-op-f16-optimized"); - - // Reference (Chamfer / MaxSim fallback path). - register!(Reference, f32, "multi-vector-op-f32-reference"); - register!(Reference, f16, "multi-vector-op-f16-reference"); -} - -////////////// -// Dispatch // -////////////// - -/// Dispatch marker for the [`QueryComputer`] implementation. -#[derive(Debug)] -struct Optimized; - -/// Dispatch marker for the [`Chamfer`] / [`MaxSim`] fallback. -#[derive(Debug)] -struct Reference; - -/// A multi-vector benchmark. -struct Kernel { - _type: std::marker::PhantomData<(I, T)>, -} - -impl Kernel { - fn new() -> Self { - Self { - _type: std::marker::PhantomData, - } - } -} - -#[derive(Debug, Error)] -#[error("this kernel handles a different implementation than {0}")] -pub(crate) struct ImplementationMismatch(Implementation); - -macro_rules! 
impl_dispatch_rule { - ($marker:ident, $variant:ident, $description:literal) => { - impl DispatchRule for $marker { - type Error = ImplementationMismatch; - - fn try_match(from: &Implementation) -> Result { - if *from == Implementation::$variant { - Ok(MatchScore(0)) - } else { - Err(FailureScore(1)) - } - } - - fn convert(from: Implementation) -> Result { - if from == Implementation::$variant { - Ok($marker) - } else { - Err(ImplementationMismatch(from)) - } - } - - fn description( - f: &mut std::fmt::Formatter<'_>, - from: Option<&Implementation>, - ) -> std::fmt::Result { - match from { - None => write!(f, $description), - Some(impl_) => { - if Self::try_match(impl_).is_ok() { - write!(f, "matched {}", impl_) - } else { - write!(f, "expected {}, got {}", Implementation::$variant, impl_) - } - } - } - } - } - }; -} - -impl_dispatch_rule!( - Optimized, - Optimized, - "QueryComputer (architecture-dispatched)" -); -impl_dispatch_rule!(Reference, Reference, "Chamfer / MaxSim fallback"); - -impl Benchmark for Kernel -where - datatype::Type: DispatchRule, - I: DispatchRule + 'static, - Kernel: RunBenchmark, - T: 'static, -{ - type Input = MultiVectorOp; - type Output = Vec; - - fn try_match(&self, from: &MultiVectorOp) -> Result { - let mut failscore: Option = None; - if datatype::Type::::try_match(&from.element_type).is_err() { - *failscore.get_or_insert(0) += 10; - } - if let Err(FailureScore(score)) = I::try_match(&from.implementation) { - *failscore.get_or_insert(0) += 2 + score; - } - - match failscore { - None => Ok(MatchScore(0)), - Some(score) => Err(FailureScore(score)), - } - } - - fn run( - &self, - input: &MultiVectorOp, - _: diskann_benchmark_runner::Checkpoint<'_>, - mut output: &mut dyn diskann_benchmark_runner::Output, - ) -> anyhow::Result { - // The dispatcher only invokes `run` after `try_match` has already accepted - // the input, so a failure here would indicate a dispatcher bug. 
- I::convert(input.implementation).expect("try_match accepted the input"); - writeln!(output, "{}", input)?; - let results = self.run_benchmark(input)?; - writeln!(output, "\n\n{}", DisplayWrapper(&*results))?; - Ok(results) - } - - fn description( - &self, - f: &mut std::fmt::Formatter<'_>, - input: Option<&MultiVectorOp>, - ) -> std::fmt::Result { - match input { - None => { - writeln!( - f, - "- Element Type: {}", - Description::>::new() - )?; - writeln!( - f, - "- Implementation: {}", - Description::::new() - )?; - } - Some(input) => { - if let Err(err) = datatype::Type::::try_match_verbose(&input.element_type) { - writeln!(f, "\n - Mismatched element type: {}", err)?; - } - if let Err(err) = I::try_match_verbose(&input.implementation) { - writeln!(f, "\n - Mismatched implementation: {}", err)?; - } - } - } - Ok(()) - } -} - -impl Regression for Kernel -where - datatype::Type: DispatchRule, - I: DispatchRule + 'static, - Kernel: RunBenchmark, - T: 'static, -{ - type Tolerances = MultiVectorTolerance; - type Pass = CheckResult; - type Fail = CheckResult; - - fn check( - &self, - tolerance: &MultiVectorTolerance, - _input: &MultiVectorOp, - before: &Vec, - after: &Vec, - ) -> anyhow::Result> { - anyhow::ensure!( - before.len() == after.len(), - "before has {} runs but after has {}", - before.len(), - after.len(), - ); - - let mut passed = true; - let checks: Vec = std::iter::zip(before.iter(), after.iter()) - .enumerate() - .map(|(i, (b, a))| { - anyhow::ensure!(b.run == a.run, "run {i} mismatched"); - - let computations_per_latency = b.computations_per_latency() as f64; - - let before_min = b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; - let after_min = a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; - - let comparison = Comparison { - run: b.run.clone(), - tolerance: *tolerance, - before_min, - after_min, - }; - - match relative_change(before_min, after_min) { - Ok(change) => { - if change > 
tolerance.min_time_regression.get() { - passed = false; - } - } - Err(_) => passed = false, - }; - - Ok(comparison) - }) - .collect::>>()?; - - let check = CheckResult { checks }; - - if passed { - Ok(PassFail::Pass(check)) - } else { - Ok(PassFail::Fail(check)) - } - } -} - -/////////////// -// Benchmark // -/////////////// - -trait RunBenchmark { - fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error>; -} - -#[derive(Debug, Serialize, Deserialize)] -struct RunResult { - /// The configuration for this run. - run: Run, - /// Per-measurement latencies (over `loops_per_measurement` calls). - latencies: Vec, - /// Latency percentiles. - percentiles: percentiles::Percentiles, -} - -impl RunResult { - fn computations_per_latency(&self) -> usize { - self.run.num_query_vectors.get() - * self.run.num_doc_vectors.get() - * self.run.loops_per_measurement.get() - } -} - -impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if self.is_empty() { - return Ok(()); - } - - // ns/IP is normalized as `min_latency_us * 1000 / (Q * D * loops)` and is - // approximately linear in `dim`. Compare across rows with the same `Dim`; - // divide further by `Dim` to recover ns per scalar multiply. 
- writeln!( - f, - "ns/IP = time per (query, doc) inner-product call (~ linear in Dim)" - )?; - - let header = [ - "Operation", - "Q", - "D", - "Dim", - "Min Time (ns/IP @ Dim)", - "Mean Time (ns/IP @ Dim)", - "Loops", - "Measurements", - ]; - - let mut table = diskann_benchmark_runner::utils::fmt::Table::new(header, self.len()); - - self.iter().enumerate().for_each(|(row, r)| { - let mut row = table.row(row); - - let min_latency = r - .latencies - .iter() - .min() - .copied() - .unwrap_or(MicroSeconds::new(u64::MAX)); - let mean_latency = r.percentiles.mean; - - let computations_per_latency = r.computations_per_latency() as f64; - - // Convert time from micro-seconds to nano-seconds per inner-product call - // (one (query, doc) pair, ~ linear in dim). - let min_time = min_latency.as_f64() / computations_per_latency * 1000.0; - let mean_time = mean_latency / computations_per_latency * 1000.0; - - row.insert(r.run.operation, 0); - row.insert(r.run.num_query_vectors, 1); - row.insert(r.run.num_doc_vectors, 2); - row.insert(r.run.dim, 3); - row.insert(format!("{:.3}", min_time), 4); - row.insert(format!("{:.3}", mean_time), 5); - row.insert(r.run.loops_per_measurement, 6); - row.insert(r.run.num_measurements, 7); - }); - - table.fmt(f) - } -} - -fn run_loops(run: &Run, mut body: F) -> RunResult -where - F: FnMut(), -{ - let mut latencies = Vec::with_capacity(run.num_measurements.get()); - - for _ in 0..run.num_measurements.get() { - let start = std::time::Instant::now(); - for _ in 0..run.loops_per_measurement.get() { - body(); - } - latencies.push(start.elapsed().into()); - } - - let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap(); - RunResult { - run: run.clone(), - latencies, - percentiles, - } -} - -/////////////////// -// Data fixtures // -/////////////////// - -const RNG_SEED: u64 = 0x12345; - -struct Data { - query_data: Box<[T]>, - doc_data: Box<[T]>, -} - -impl Data -where - StandardUniform: Distribution, -{ - fn new(run: &Run) -> Self 
{ - let mut rng = StdRng::seed_from_u64(RNG_SEED); - let query_data: Box<[T]> = (0..run.num_query_vectors.get() * run.dim.get()) - .map(|_| StandardUniform.sample(&mut rng)) - .collect(); - let doc_data: Box<[T]> = (0..run.num_doc_vectors.get() * run.dim.get()) - .map(|_| StandardUniform.sample(&mut rng)) - .collect(); - - Self { - query_data, - doc_data, - } - } - - fn query(&self, run: &Run) -> MatRef<'_, Standard> { - MatRef::new( - Standard::new(run.num_query_vectors.get(), run.dim.get()).unwrap(), - &self.query_data, - ) - .unwrap() - } - - fn doc(&self, run: &Run) -> MatRef<'_, Standard> { - MatRef::new( - Standard::new(run.num_doc_vectors.get(), run.dim.get()).unwrap(), - &self.doc_data, - ) - .unwrap() - } -} - -///////////////////// -// Implementations // -///////////////////// - -fn run_optimized(input: &MultiVectorOp) -> anyhow::Result> -where - T: Copy, - StandardUniform: Distribution, - QueryComputer: NewFromMatRef, -{ - let mut results = Vec::with_capacity(input.runs.len()); - for run in input.runs.iter() { - let data = Data::::new(run); - // `QueryComputer` performs query-side precomputation that is intentionally - // amortized across many `chamfer` / `max_sim` calls; construct it once per - // shape, outside the timed loop. - let computer = as NewFromMatRef>::new_from(data.query(run)); - let doc = data.doc(run); - - let result = match run.operation { - Operation::Chamfer => run_loops(run, || { - let v = computer.chamfer(doc); - std::hint::black_box(v); - }), - Operation::MaxSim => { - let mut scores = vec![0.0f32; run.num_query_vectors.get()]; - run_loops(run, || { - computer.max_sim(doc, &mut scores); - std::hint::black_box(&mut scores); - }) - } - }; - results.push(result); - } - Ok(results) -} - -/// Drive the [`Chamfer`] / [`MaxSim`] fallback path. 
-fn run_reference(input: &MultiVectorOp) -> anyhow::Result> -where - T: Copy, - StandardUniform: Distribution, - InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>, -{ - let mut results = Vec::with_capacity(input.runs.len()); - for run in input.runs.iter() { - let data = Data::::new(run); - let doc = data.doc(run); - // Hoist out of the timed loop to mirror the optimized path's - // per-shape precomputation. - let query: diskann_quantization::multi_vector::distance::QueryMatRef<'_, _> = - data.query(run).into(); - - let result = match run.operation { - Operation::Chamfer => run_loops(run, || { - let v = Chamfer::evaluate(query, doc); - std::hint::black_box(v); - }), - Operation::MaxSim => { - let mut scores = vec![0.0f32; run.num_query_vectors.get()]; - let mut max_sim = MaxSim::new(&mut scores).unwrap(); - run_loops(run, || { - let _ = max_sim.evaluate(query, doc); - std::hint::black_box(max_sim.scores_mut()); - }) - } - }; - results.push(result); - } - Ok(results) -} - -/// Element-type-erasing constructor for [`QueryComputer`]. -trait NewFromMatRef { - fn new_from(query: MatRef<'_, Standard>) -> QueryComputer; -} - -macro_rules! 
impl_kernel_for { - ($t:ty) => { - impl NewFromMatRef<$t> for QueryComputer<$t> { - fn new_from(query: MatRef<'_, Standard<$t>>) -> QueryComputer<$t> { - QueryComputer::<$t>::new(query) - } - } - - impl RunBenchmark for Kernel { - fn run_benchmark( - &self, - input: &MultiVectorOp, - ) -> Result, anyhow::Error> { - run_optimized::<$t>(input) - } - } - - impl RunBenchmark for Kernel { - fn run_benchmark( - &self, - input: &MultiVectorOp, - ) -> Result, anyhow::Error> { - run_reference::<$t>(input) - } - } - }; -} - -impl_kernel_for!(f32); -impl_kernel_for!(f16); - -/////////// -// Tests // -/////////// - -#[cfg(test)] -mod tests { - use super::*; - - use diskann_benchmark_runner::{ - benchmark::{PassFail, Regression}, - utils::percentiles::compute_percentiles, - }; - - fn tiny_run(operation: Operation) -> Run { - Run { - operation, - num_query_vectors: NonZeroUsize::new(2).unwrap(), - num_doc_vectors: NonZeroUsize::new(2).unwrap(), - dim: NonZeroUsize::new(4).unwrap(), - loops_per_measurement: NonZeroUsize::new(1).unwrap(), - num_measurements: NonZeroUsize::new(1).unwrap(), - } - } - - fn tiny_op() -> MultiVectorOp { - MultiVectorOp { - element_type: DataType::Float32, - implementation: Implementation::Optimized, - runs: vec![tiny_run(Operation::Chamfer)], - } - } - - fn tiny_result(operation: Operation, minimum: u64) -> RunResult { - let run = tiny_run(operation); - let minimum = MicroSeconds::new(minimum); - let mut latencies = vec![minimum]; - let percentiles = compute_percentiles(&mut latencies).unwrap(); - RunResult { - run, - latencies, - percentiles, - } - } - - fn tolerance(limit: f64) -> MultiVectorTolerance { - MultiVectorTolerance { - min_time_regression: NonNegativeFinite::new(limit).unwrap(), - } - } - - #[test] - fn check_rejects_mismatched_runs() { - let kernel = Kernel::::new(); - - let err = kernel - .check( - &tolerance(0.0), - &tiny_op(), - &vec![tiny_result(Operation::Chamfer, 100)], - &vec![tiny_result(Operation::MaxSim, 100)], - ) - 
.unwrap_err(); - - assert_eq!(err.to_string(), "run 0 mismatched"); - } - - #[test] - fn check_allows_negative_relative_change() { - let kernel = Kernel::::new(); - - let result = kernel - .check( - &tolerance(0.0), - &tiny_op(), - &vec![tiny_result(Operation::Chamfer, 100)], - &vec![tiny_result(Operation::Chamfer, 95)], - ) - .unwrap(); - - assert!(matches!(result, PassFail::Pass(_))); - } - - #[test] - fn check_passes_on_tolerance_boundary() { - let kernel = Kernel::::new(); - - let result = kernel - .check( - &tolerance(0.05), - &tiny_op(), - &vec![tiny_result(Operation::Chamfer, 100)], - &vec![tiny_result(Operation::Chamfer, 105)], - ) - .unwrap(); - - assert!(matches!(result, PassFail::Pass(_))); - } - - #[test] - fn check_fails_above_tolerance_boundary() { - let kernel = Kernel::::new(); - - let result = kernel - .check( - &tolerance(0.05), - &tiny_op(), - &vec![tiny_result(Operation::Chamfer, 100)], - &vec![tiny_result(Operation::Chamfer, 106)], - ) - .unwrap(); - - assert!(matches!(result, PassFail::Fail(_))); - } - - #[test] - fn check_result_display_includes_failure_details() { - let check = CheckResult { - checks: vec![Comparison { - run: tiny_run(Operation::Chamfer), - tolerance: tolerance(0.05), - before_min: 100.0, - after_min: 106.0, - }], - }; - - let rendered = check.to_string(); - assert!(rendered.contains("Operation"), "rendered = {rendered}"); - assert!(rendered.contains("chamfer"), "rendered = {rendered}"); - assert!(rendered.contains("100.000"), "rendered = {rendered}"); - assert!(rendered.contains("106.000"), "rendered = {rendered}"); - assert!(rendered.contains("6.000 %"), "rendered = {rendered}"); - assert!(rendered.contains("FAIL"), "rendered = {rendered}"); - } - - /// A "before" value of 0 means the measurement was too fast to obtain a - /// reliable signal, so we *could* be letting a regression through. We - /// require at least a non-zero value. 
- #[test] - fn zero_values_rejected() { - let kernel = Kernel::::new(); - - let result = kernel - .check( - &tolerance(0.05), - &tiny_op(), - &vec![tiny_result(Operation::Chamfer, 0)], - &vec![tiny_result(Operation::Chamfer, 0)], - ) - .unwrap(); - - assert!(matches!(result, PassFail::Fail(_))); - } -} diff --git a/diskann-benchmark/Cargo.toml b/diskann-benchmark/Cargo.toml index bebaf4b8e..efd058ffb 100644 --- a/diskann-benchmark/Cargo.toml +++ b/diskann-benchmark/Cargo.toml @@ -63,6 +63,9 @@ scalar-quantization = [] # Enable minmax-quantization based algorithms minmax-quantization = [] +# Enable multi-vector distance benchmarks (Chamfer / MaxSim) +multi-vector = [] + # Enable Disk Index benchmarks disk-index = [ "diskann-disk/perf_test", diff --git a/diskann-benchmark-multi-vector/examples/test.json b/diskann-benchmark/example/multi-vector-test.json similarity index 100% rename from diskann-benchmark-multi-vector/examples/test.json rename to diskann-benchmark/example/multi-vector-test.json diff --git a/diskann-benchmark-multi-vector/examples/multi-vector.json b/diskann-benchmark/example/multi-vector.json similarity index 100% rename from diskann-benchmark-multi-vector/examples/multi-vector.json rename to diskann-benchmark/example/multi-vector.json diff --git a/diskann-benchmark-multi-vector/examples/tolerance.json b/diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json similarity index 100% rename from diskann-benchmark-multi-vector/examples/tolerance.json rename to diskann-benchmark/perf_test_inputs/multi-vector-tolerance.json diff --git a/diskann-benchmark/src/backend/mod.rs b/diskann-benchmark/src/backend/mod.rs index 24fe91d7e..0d1c61345 100644 --- a/diskann-benchmark/src/backend/mod.rs +++ b/diskann-benchmark/src/backend/mod.rs @@ -7,10 +7,12 @@ mod disk_index; mod exhaustive; mod filters; mod index; +mod multi_vector; pub(crate) fn register_benchmarks(registry: &mut diskann_benchmark_runner::registry::Benchmarks) { 
exhaustive::register_benchmarks(registry); disk_index::register_benchmarks(registry); index::register_benchmarks(registry); filters::register_benchmarks(registry); + multi_vector::register_benchmarks(registry); } diff --git a/diskann-benchmark/src/backend/multi_vector.rs b/diskann-benchmark/src/backend/multi_vector.rs new file mode 100644 index 000000000..cfdb77f33 --- /dev/null +++ b/diskann-benchmark/src/backend/multi_vector.rs @@ -0,0 +1,806 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. + */ + +//! Multi-vector distance benchmarks (Chamfer / MaxSim) with regression detection. + +use diskann_benchmark_runner::registry::Benchmarks; + +// Create a stub-module if the "multi-vector" feature is disabled. +crate::utils::stub_impl!("multi-vector", inputs::multi_vector::MultiVectorOp); + +pub(super) fn register_benchmarks(benchmarks: &mut Benchmarks) { + #[cfg(feature = "multi-vector")] + { + use half::f16; + + // Optimized (architecture-dispatched QueryComputer). + benchmarks.register_regression( + "multi-vector-op-f32-optimized", + imp::Kernel::::new(), + ); + benchmarks.register_regression( + "multi-vector-op-f16-optimized", + imp::Kernel::::new(), + ); + + // Reference (Chamfer / MaxSim fallback path). 
+ benchmarks.register_regression( + "multi-vector-op-f32-reference", + imp::Kernel::::new(), + ); + benchmarks.register_regression( + "multi-vector-op-f16-reference", + imp::Kernel::::new(), + ); + } + + // Stub implementation + #[cfg(not(feature = "multi-vector"))] + imp::register("multi-vector-op", benchmarks); +} + +#[cfg(feature = "multi-vector")] +mod imp { + use std::io::Write; + + use diskann_benchmark_runner::{ + benchmark::{PassFail, Regression}, + dispatcher::{DispatchRule, FailureScore, MatchScore}, + utils::{datatype, num::relative_change, percentiles, MicroSeconds}, + Benchmark, + }; + use diskann_quantization::multi_vector::{ + Chamfer, Init, Mat, MatRef, MaxSim, QueryComputer, Standard, + }; + use diskann_vector::distance::InnerProduct; + use diskann_vector::{DistanceFunctionMut, PureDistanceFunction}; + use half::f16; + use rand::{ + distr::{Distribution, StandardUniform}, + rngs::StdRng, + SeedableRng, + }; + use serde::{Deserialize, Serialize}; + + use crate::inputs::multi_vector::{ + Implementation, MultiVectorOp, MultiVectorTolerance, Operation, Run, + }; + + /////////// + // Utils // + /////////// + + #[derive(Debug, Clone, Copy)] + pub(super) struct DisplayWrapper<'a, T: ?Sized>(pub(super) &'a T); + + impl std::ops::Deref for DisplayWrapper<'_, T> { + type Target = T; + fn deref(&self) -> &T { + self.0 + } + } + + ////////////// + // Dispatch // + ////////////// + + /// Dispatch marker for the [`QueryComputer`] implementation. + #[derive(Debug)] + pub(super) struct Optimized; + + /// Dispatch marker for the [`Chamfer`] / [`MaxSim`] fallback. + #[derive(Debug)] + pub(super) struct Reference; + + /// A multi-vector benchmark. 
+ pub(super) struct Kernel { + _type: std::marker::PhantomData<(I, T)>, + } + + impl Kernel { + pub(super) fn new() -> Self { + Self { + _type: std::marker::PhantomData, + } + } + } + + /// Pairs the standard `TryFrom` conversion with the static + /// description info needed for friendly diagnostics in `Benchmark::description`. + pub(super) trait ImplementationMatcher: + TryFrom + 'static + { + /// Human-readable description of which implementation this marker handles. + const DESCRIPTION: &'static str; + /// The implementation variant this marker expects (for mismatch diagnostics). + const EXPECTED: Implementation; + } + + impl TryFrom for Optimized { + type Error = FailureScore; + fn try_from(i: Implementation) -> Result { + match i { + Implementation::Optimized => Ok(Self), + _ => Err(FailureScore(1)), + } + } + } + + impl ImplementationMatcher for Optimized { + const DESCRIPTION: &'static str = "QueryComputer (architecture-dispatched)"; + const EXPECTED: Implementation = Implementation::Optimized; + } + + impl TryFrom for Reference { + type Error = FailureScore; + fn try_from(i: Implementation) -> Result { + match i { + Implementation::Reference => Ok(Self), + _ => Err(FailureScore(1)), + } + } + } + + impl ImplementationMatcher for Reference { + const DESCRIPTION: &'static str = "Chamfer / MaxSim fallback"; + const EXPECTED: Implementation = Implementation::Reference; + } + + impl Benchmark for Kernel + where + datatype::Type: DispatchRule, + I: ImplementationMatcher, + Kernel: RunBenchmark, + T: 'static, + { + type Input = MultiVectorOp; + type Output = Vec; + + fn try_match(&self, from: &MultiVectorOp) -> Result { + let mut failscore: Option = None; + if datatype::Type::::try_match(&from.element_type).is_err() { + *failscore.get_or_insert(0) += 10; + } + if let Err(FailureScore(score)) = I::try_from(from.implementation) { + *failscore.get_or_insert(0) += 2 + score; + } + + match failscore { + None => Ok(MatchScore(0)), + Some(score) => 
Err(FailureScore(score)), + } + } + + fn run( + &self, + input: &MultiVectorOp, + _: diskann_benchmark_runner::Checkpoint<'_>, + mut output: &mut dyn diskann_benchmark_runner::Output, + ) -> anyhow::Result { + // The dispatcher only invokes `run` after `try_match` has already accepted + // the input, so a failure here would indicate a dispatcher bug. + I::try_from(input.implementation).expect("try_match accepted the input"); + writeln!(output, "{}", input)?; + let results = self.run_benchmark(input)?; + writeln!(output, "\n\n{}", DisplayWrapper(&*results))?; + Ok(results) + } + + fn description( + &self, + f: &mut std::fmt::Formatter<'_>, + input: Option<&MultiVectorOp>, + ) -> std::fmt::Result { + match input { + None => { + writeln!( + f, + "- Element Type: {}", + diskann_benchmark_runner::dispatcher::Description::< + datatype::DataType, + datatype::Type, + >::new() + )?; + writeln!(f, "- Implementation: {}", I::DESCRIPTION)?; + } + Some(input) => { + if let Err(err) = datatype::Type::::try_match_verbose(&input.element_type) { + writeln!(f, "\n - Mismatched element type: {}", err)?; + } + if I::try_from(input.implementation).is_err() { + writeln!( + f, + "\n - Mismatched implementation: expected {}, got {}", + I::EXPECTED, + input.implementation + )?; + } + } + } + Ok(()) + } + } + + impl Regression for Kernel + where + datatype::Type: DispatchRule, + I: ImplementationMatcher, + Kernel: RunBenchmark, + T: 'static, + { + type Tolerances = MultiVectorTolerance; + type Pass = CheckResult; + type Fail = CheckResult; + + fn check( + &self, + tolerance: &MultiVectorTolerance, + _input: &MultiVectorOp, + before: &Vec, + after: &Vec, + ) -> anyhow::Result> { + anyhow::ensure!( + before.len() == after.len(), + "before has {} runs but after has {}", + before.len(), + after.len(), + ); + + let mut passed = true; + let checks: Vec = std::iter::zip(before.iter(), after.iter()) + .enumerate() + .map(|(i, (b, a))| { + anyhow::ensure!(b.run == a.run, "run {i} mismatched"); + + 
let computations_per_latency = b.computations_per_latency() as f64; + + let before_min = + b.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; + let after_min = + a.percentiles.minimum.as_f64() * 1000.0 / computations_per_latency; + + let comparison = Comparison { + run: b.run.clone(), + tolerance: *tolerance, + before_min, + after_min, + }; + + match relative_change(before_min, after_min) { + Ok(change) => { + if change > tolerance.min_time_regression.get() { + passed = false; + } + } + Err(_) => passed = false, + }; + + Ok(comparison) + }) + .collect::>>()?; + + let check = CheckResult { checks }; + + if passed { + Ok(PassFail::Pass(check)) + } else { + Ok(PassFail::Fail(check)) + } + } + } + + ////////////////////// + // Regression Check // + ////////////////////// + + /// Per-run comparison result showing before/after percentile differences. + #[derive(Debug, Serialize)] + pub(super) struct Comparison { + run: Run, + tolerance: MultiVectorTolerance, + before_min: f64, + after_min: f64, + } + + /// Aggregated result of the regression check across all runs. 
+ #[derive(Debug, Serialize)] + pub(super) struct CheckResult { + checks: Vec, + } + + impl std::fmt::Display for CheckResult { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let header = [ + "Operation", + "Q", + "D", + "Dim", + "Min Before (ns/IP @ Dim)", + "Min After (ns/IP @ Dim)", + "Change (%)", + "Remark", + ]; + + let mut table = + diskann_benchmark_runner::utils::fmt::Table::new(header, self.checks.len()); + + for (i, c) in self.checks.iter().enumerate() { + let mut row = table.row(i); + let change = relative_change(c.before_min, c.after_min); + + row.insert(c.run.operation, 0); + row.insert(c.run.num_query_vectors, 1); + row.insert(c.run.num_doc_vectors, 2); + row.insert(c.run.dim, 3); + row.insert(format!("{:.3}", c.before_min), 4); + row.insert(format!("{:.3}", c.after_min), 5); + match change { + Ok(change) => { + row.insert(format!("{:.3} %", change * 100.0), 6); + if change > c.tolerance.min_time_regression.get() { + row.insert("FAIL", 7); + } + } + Err(err) => { + row.insert("invalid", 6); + row.insert(err, 7); + } + } + } + + table.fmt(f) + } + } + + /////////////// + // Benchmark // + /////////////// + + pub(super) trait RunBenchmark { + fn run_benchmark(&self, input: &MultiVectorOp) -> Result, anyhow::Error>; + } + + #[derive(Debug, Serialize, Deserialize)] + pub(super) struct RunResult { + /// The configuration for this run. + run: Run, + /// Per-measurement latencies (over `loops_per_measurement` calls). + latencies: Vec, + /// Latency percentiles. 
+ percentiles: percentiles::Percentiles, + } + + impl RunResult { + fn computations_per_latency(&self) -> usize { + self.run.num_query_vectors.get() + * self.run.num_doc_vectors.get() + * self.run.loops_per_measurement.get() + } + } + + impl std::fmt::Display for DisplayWrapper<'_, [RunResult]> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.is_empty() { + return Ok(()); + } + + // ns/IP is normalized as `min_latency_us * 1000 / (Q * D * loops)` and is + // approximately linear in `dim`. Compare across rows with the same `Dim`; + // divide further by `Dim` to recover ns per scalar multiply. + writeln!( + f, + "ns/IP = time per (query, doc) inner-product call (~ linear in Dim)" + )?; + + let header = [ + "Operation", + "Q", + "D", + "Dim", + "Min Time (ns/IP @ Dim)", + "Mean Time (ns/IP @ Dim)", + "Loops", + "Measurements", + ]; + + let mut table = diskann_benchmark_runner::utils::fmt::Table::new(header, self.len()); + + self.iter().enumerate().for_each(|(row, r)| { + let mut row = table.row(row); + + let min_latency = r + .latencies + .iter() + .min() + .copied() + .unwrap_or(MicroSeconds::new(u64::MAX)); + let mean_latency = r.percentiles.mean; + + let computations_per_latency = r.computations_per_latency() as f64; + + // Convert time from micro-seconds to nano-seconds per inner-product call + // (one (query, doc) pair, ~ linear in dim). 
+ let min_time = min_latency.as_f64() / computations_per_latency * 1000.0; + let mean_time = mean_latency / computations_per_latency * 1000.0; + + row.insert(r.run.operation, 0); + row.insert(r.run.num_query_vectors, 1); + row.insert(r.run.num_doc_vectors, 2); + row.insert(r.run.dim, 3); + row.insert(format!("{:.3}", min_time), 4); + row.insert(format!("{:.3}", mean_time), 5); + row.insert(r.run.loops_per_measurement, 6); + row.insert(r.run.num_measurements, 7); + }); + + table.fmt(f) + } + } + + fn run_loops(run: &Run, mut body: F) -> RunResult + where + F: FnMut(), + { + let mut latencies = Vec::with_capacity(run.num_measurements.get()); + + for _ in 0..run.num_measurements.get() { + let start = std::time::Instant::now(); + for _ in 0..run.loops_per_measurement.get() { + body(); + } + latencies.push(start.elapsed().into()); + } + + let percentiles = percentiles::compute_percentiles(&mut latencies).unwrap(); + RunResult { + run: run.clone(), + latencies, + percentiles, + } + } + + /////////////////// + // Data fixtures // + /////////////////// + + const RNG_SEED: u64 = 0x12345; + + struct Data { + queries: Mat>, + docs: Mat>, + } + + impl Data + where + StandardUniform: Distribution, + { + fn new(run: &Run) -> Self { + let mut rng = StdRng::seed_from_u64(RNG_SEED); + let queries = Mat::new( + Standard::new(run.num_query_vectors.get(), run.dim.get()).unwrap(), + Init(|| StandardUniform.sample(&mut rng)), + ) + .unwrap(); + let docs = Mat::new( + Standard::new(run.num_doc_vectors.get(), run.dim.get()).unwrap(), + Init(|| StandardUniform.sample(&mut rng)), + ) + .unwrap(); + Self { queries, docs } + } + } + + ////////////////////// + // Distance kernels // + ////////////////////// + + /// Object-safe abstraction over a per-shape distance executor. 
+ /// + /// The two implementations ([`OptimizedDistance`] and [`ReferenceDistance`]) share the + /// same hot-loop nest in [`run_with_distance`]; dispatching through `&dyn Distance` + /// keeps `run_loops` from being monomorphised over the implementation axis. + trait Distance { + fn chamfer(&self, doc: MatRef<'_, Standard>) -> f32; + fn max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]); + } + + /// Distance executor that drives [`QueryComputer`] (architecture-dispatched SIMD). + struct OptimizedDistance(QueryComputer); + + impl Distance for OptimizedDistance { + fn chamfer(&self, doc: MatRef<'_, Standard>) -> f32 { + self.0.chamfer(doc) + } + fn max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]) { + self.0.max_sim(doc, scores); + } + } + + /// Distance executor that drives the [`Chamfer`] / [`MaxSim`] fallback path. + struct ReferenceDistance<'a, T: Copy>( + diskann_quantization::multi_vector::distance::QueryMatRef<'a, Standard>, + ); + + impl Distance for ReferenceDistance<'_, T> + where + InnerProduct: for<'q, 'd> PureDistanceFunction<&'q [T], &'d [T], f32>, + { + fn chamfer(&self, doc: MatRef<'_, Standard>) -> f32 { + Chamfer::evaluate(self.0, doc) + } + fn max_sim(&self, doc: MatRef<'_, Standard>, scores: &mut [f32]) { + // `MaxSim::new` is a non-empty check + pointer wrap, so constructing it per + // iteration is free — no need to hoist it out of the loop. + let mut max_sim = MaxSim::new(scores).unwrap(); + let _ = max_sim.evaluate(self.0, doc); + } + } + + ///////////////////// + // Implementations // + ///////////////////// + + /// Shared loop nest. The trait-object dispatch happens once per outer iteration of + /// `run_loops`; the work inside each `chamfer` / `max_sim` call is O(Q*D*dim), so the + /// vtable hop is in the noise. 
+ fn run_with_distance( + run: &Run, + doc: MatRef<'_, Standard>, + dist: &dyn Distance, + ) -> RunResult { + match run.operation { + Operation::Chamfer => run_loops(run, || { + let v = dist.chamfer(doc); + std::hint::black_box(v); + }), + Operation::MaxSim => { + let mut scores = vec![0.0f32; run.num_query_vectors.get()]; + run_loops(run, || { + dist.max_sim(doc, &mut scores); + std::hint::black_box(&mut scores); + }) + } + } + } + + fn run_optimized(input: &MultiVectorOp) -> anyhow::Result> + where + T: Copy, + StandardUniform: Distribution, + QueryComputer: NewFromMatRef, + OptimizedDistance: Distance, + { + let mut results = Vec::with_capacity(input.runs.len()); + for run in input.runs.iter() { + let data = Data::::new(run); + // `QueryComputer` performs query-side precomputation that is intentionally + // amortized across many `chamfer` / `max_sim` calls; construct it once per + // shape, outside the timed loop. + let dist = OptimizedDistance( as NewFromMatRef>::new_from( + data.queries.as_view(), + )); + results.push(run_with_distance(run, data.docs.as_view(), &dist)); + } + Ok(results) + } + + /// Drive the [`Chamfer`] / [`MaxSim`] fallback path. + fn run_reference(input: &MultiVectorOp) -> anyhow::Result> + where + T: Copy, + StandardUniform: Distribution, + InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>, + for<'a> ReferenceDistance<'a, T>: Distance, + { + let mut results = Vec::with_capacity(input.runs.len()); + for run in input.runs.iter() { + let data = Data::::new(run); + let dist = ReferenceDistance(data.queries.as_view().into()); + results.push(run_with_distance(run, data.docs.as_view(), &dist)); + } + Ok(results) + } + + /// Element-type-erasing constructor for [`QueryComputer`]. + /// + /// `QueryComputer::::new` is defined as an inherent method on the concrete + /// `QueryComputer` / `QueryComputer` types (not a generic), so we need + /// this shim trait to let generic code (e.g. `run_optimized`) call it. 
+ trait NewFromMatRef { + fn new_from(query: MatRef<'_, Standard>) -> QueryComputer; + } + + impl NewFromMatRef for QueryComputer { + fn new_from(query: MatRef<'_, Standard>) -> QueryComputer { + QueryComputer::::new(query) + } + } + + impl NewFromMatRef for QueryComputer { + fn new_from(query: MatRef<'_, Standard>) -> QueryComputer { + QueryComputer::::new(query) + } + } + + impl RunBenchmark for Kernel + where + T: Copy + 'static, + StandardUniform: Distribution, + QueryComputer: NewFromMatRef, + OptimizedDistance: Distance, + { + fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result> { + run_optimized::(input) + } + } + + impl RunBenchmark for Kernel + where + T: Copy + 'static, + StandardUniform: Distribution, + InnerProduct: for<'a, 'b> PureDistanceFunction<&'a [T], &'b [T], f32>, + for<'a> ReferenceDistance<'a, T>: Distance, + { + fn run_benchmark(&self, input: &MultiVectorOp) -> anyhow::Result> { + run_reference::(input) + } + } + + /////////// + // Tests // + /////////// + + #[cfg(test)] + mod tests { + use std::num::NonZeroUsize; + + use diskann_benchmark_runner::{ + benchmark::{PassFail, Regression}, + utils::{datatype::DataType, num::NonNegativeFinite, percentiles::compute_percentiles}, + }; + + use super::*; + + fn tiny_run(operation: Operation) -> Run { + Run { + operation, + num_query_vectors: NonZeroUsize::new(2).unwrap(), + num_doc_vectors: NonZeroUsize::new(2).unwrap(), + dim: NonZeroUsize::new(4).unwrap(), + loops_per_measurement: NonZeroUsize::new(1).unwrap(), + num_measurements: NonZeroUsize::new(1).unwrap(), + } + } + + fn tiny_op() -> MultiVectorOp { + MultiVectorOp { + element_type: DataType::Float32, + implementation: Implementation::Optimized, + runs: vec![tiny_run(Operation::Chamfer)], + } + } + + fn tiny_result(operation: Operation, minimum: u64) -> RunResult { + let run = tiny_run(operation); + let minimum = MicroSeconds::new(minimum); + let mut latencies = vec![minimum]; + let percentiles = compute_percentiles(&mut 
latencies).unwrap(); + RunResult { + run, + latencies, + percentiles, + } + } + + fn tolerance(limit: f64) -> MultiVectorTolerance { + MultiVectorTolerance { + min_time_regression: NonNegativeFinite::new(limit).unwrap(), + } + } + + #[test] + fn check_rejects_mismatched_runs() { + let kernel = Kernel::::new(); + + let err = kernel + .check( + &tolerance(0.0), + &tiny_op(), + &vec![tiny_result(Operation::Chamfer, 100)], + &vec![tiny_result(Operation::MaxSim, 100)], + ) + .unwrap_err(); + + assert_eq!(err.to_string(), "run 0 mismatched"); + } + + #[test] + fn check_allows_negative_relative_change() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.0), + &tiny_op(), + &vec![tiny_result(Operation::Chamfer, 100)], + &vec![tiny_result(Operation::Chamfer, 95)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Pass(_))); + } + + #[test] + fn check_passes_on_tolerance_boundary() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(Operation::Chamfer, 100)], + &vec![tiny_result(Operation::Chamfer, 105)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Pass(_))); + } + + #[test] + fn check_fails_above_tolerance_boundary() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(Operation::Chamfer, 100)], + &vec![tiny_result(Operation::Chamfer, 106)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Fail(_))); + } + + #[test] + fn check_result_display_includes_failure_details() { + let check = CheckResult { + checks: vec![Comparison { + run: tiny_run(Operation::Chamfer), + tolerance: tolerance(0.05), + before_min: 100.0, + after_min: 106.0, + }], + }; + + let rendered = check.to_string(); + assert!(rendered.contains("Operation"), "rendered = {rendered}"); + assert!(rendered.contains("chamfer"), "rendered = {rendered}"); + assert!(rendered.contains("100.000"), "rendered = {rendered}"); + 
assert!(rendered.contains("106.000"), "rendered = {rendered}"); + assert!(rendered.contains("6.000 %"), "rendered = {rendered}"); + assert!(rendered.contains("FAIL"), "rendered = {rendered}"); + } + + /// A "before" value of 0 means the measurement was too fast to obtain a + /// reliable signal, so we *could* be letting a regression through. We + /// require at least a non-zero value. + #[test] + fn zero_values_rejected() { + let kernel = Kernel::::new(); + + let result = kernel + .check( + &tolerance(0.05), + &tiny_op(), + &vec![tiny_result(Operation::Chamfer, 0)], + &vec![tiny_result(Operation::Chamfer, 0)], + ) + .unwrap(); + + assert!(matches!(result, PassFail::Fail(_))); + } + } +} diff --git a/diskann-benchmark/src/inputs/mod.rs b/diskann-benchmark/src/inputs/mod.rs index 856412e2a..414a0b52e 100644 --- a/diskann-benchmark/src/inputs/mod.rs +++ b/diskann-benchmark/src/inputs/mod.rs @@ -7,6 +7,7 @@ pub(crate) mod disk; pub(crate) mod exhaustive; pub(crate) mod filters; pub(crate) mod graph_index; +pub(crate) mod multi_vector; pub(crate) mod save_and_load; pub(crate) fn register_inputs( @@ -16,6 +17,7 @@ pub(crate) fn register_inputs( exhaustive::register_inputs(registry)?; disk::register_inputs(registry)?; filters::register_inputs(registry)?; + multi_vector::register_inputs(registry)?; Ok(()) } diff --git a/diskann-benchmark/src/inputs/multi_vector.rs b/diskann-benchmark/src/inputs/multi_vector.rs new file mode 100644 index 000000000..8010162d6 --- /dev/null +++ b/diskann-benchmark/src/inputs/multi_vector.rs @@ -0,0 +1,190 @@ +/* + * Copyright (c) Microsoft Corporation. + * Licensed under the MIT license. 
+ */ + +use std::num::NonZeroUsize; + +use diskann_benchmark_runner::{ + utils::{datatype::DataType, num::NonNegativeFinite}, + CheckDeserialization, Checker, +}; +use serde::{Deserialize, Serialize}; + +use crate::inputs::{as_input, Example}; + +////////////// +// Registry // +////////////// + +as_input!(MultiVectorOp); +as_input!(MultiVectorTolerance); + +pub(super) fn register_inputs( + registry: &mut diskann_benchmark_runner::registry::Inputs, +) -> anyhow::Result<()> { + registry.register::()?; + registry.register::()?; + Ok(()) +} + +//////////////// +// Enum types // +//////////////// + +/// The two distance operations exposed by `QueryComputer`. +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub(crate) enum Operation { + Chamfer, + MaxSim, +} + +impl std::fmt::Display for Operation { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let st = match self { + Self::Chamfer => "chamfer", + Self::MaxSim => "max_sim", + }; + write!(f, "{}", st) + } +} + +/// Which implementation tier to benchmark. +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub(crate) enum Implementation { + Optimized, + Reference, +} + +impl std::fmt::Display for Implementation { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let st = match self { + Self::Optimized => "optimized", + Self::Reference => "reference", + }; + write!(f, "{}", st) + } +} + +/// One benchmark configuration: a single (operation, shape) measurement. 
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub(crate) struct Run { + pub(crate) operation: Operation, + pub(crate) num_query_vectors: NonZeroUsize, + pub(crate) num_doc_vectors: NonZeroUsize, + pub(crate) dim: NonZeroUsize, + pub(crate) loops_per_measurement: NonZeroUsize, + pub(crate) num_measurements: NonZeroUsize, +} + +/////////////////////// +// Multi-Vector Op // +/////////////////////// + +/// A complete multi-vector benchmark job. +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct MultiVectorOp { + pub(crate) element_type: DataType, + pub(crate) implementation: Implementation, + pub(crate) runs: Vec, +} + +impl MultiVectorOp { + pub(crate) const fn tag() -> &'static str { + "multi-vector-op" + } +} + +impl CheckDeserialization for MultiVectorOp { + fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> { + Ok(()) + } +} + +impl Example for MultiVectorOp { + fn example() -> Self { + const NUM_QUERY_VECTORS: NonZeroUsize = NonZeroUsize::new(32).unwrap(); + const NUM_DOC_VECTORS: NonZeroUsize = NonZeroUsize::new(64).unwrap(); + const DIM: NonZeroUsize = NonZeroUsize::new(128).unwrap(); + const LOOPS_PER_MEASUREMENT: NonZeroUsize = NonZeroUsize::new(200).unwrap(); + const NUM_MEASUREMENTS: NonZeroUsize = NonZeroUsize::new(100).unwrap(); + + let runs = vec![ + Run { + operation: Operation::Chamfer, + num_query_vectors: NUM_QUERY_VECTORS, + num_doc_vectors: NUM_DOC_VECTORS, + dim: DIM, + loops_per_measurement: LOOPS_PER_MEASUREMENT, + num_measurements: NUM_MEASUREMENTS, + }, + Run { + operation: Operation::MaxSim, + num_query_vectors: NUM_QUERY_VECTORS, + num_doc_vectors: NUM_DOC_VECTORS, + dim: DIM, + loops_per_measurement: LOOPS_PER_MEASUREMENT, + num_measurements: NUM_MEASUREMENTS, + }, + ]; + + Self { + element_type: DataType::Float32, + implementation: Implementation::Optimized, + runs, + } + } +} + +macro_rules! 
write_field { + ($f:ident, $field:tt, $($expr:tt)*) => { + writeln!($f, "{:>18}: {}", $field, $($expr)*) + } +} + +impl std::fmt::Display for MultiVectorOp { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "Multi-Vector Operation\n")?; + write_field!(f, "tag", Self::tag())?; + write_field!(f, "element type", self.element_type)?; + write_field!(f, "implementation", self.implementation)?; + write_field!(f, "number of runs", self.runs.len())?; + Ok(()) + } +} + +///////////////////////////// +// Multi-Vector Tolerance // +///////////////////////////// + +/// Tolerance thresholds for multi-vector benchmark regression detection. +/// +/// Each field specifies the maximum allowed relative increase in the corresponding metric. +/// For example, a value of `0.05` means a 5% increase is tolerated. +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub(crate) struct MultiVectorTolerance { + pub(crate) min_time_regression: NonNegativeFinite, +} + +impl MultiVectorTolerance { + pub(crate) const fn tag() -> &'static str { + "multi-vector-tolerance" + } +} + +impl CheckDeserialization for MultiVectorTolerance { + fn check_deserialization(&mut self, _checker: &mut Checker) -> Result<(), anyhow::Error> { + Ok(()) + } +} + +impl Example for MultiVectorTolerance { + fn example() -> Self { + Self { + min_time_regression: NonNegativeFinite::new(0.05) + .expect("0.05 is a valid non-negative finite"), + } + } +} diff --git a/diskann-benchmark/src/main.rs b/diskann-benchmark/src/main.rs index 424e63bb7..c7276f2e1 100644 --- a/diskann-benchmark/src/main.rs +++ b/diskann-benchmark/src/main.rs @@ -776,6 +776,92 @@ mod tests { assert!(!output_path.exists()); } + /////////////////// + // Multi-Vector // + /////////////////// + + #[test] + fn multi_vector_integration() { + let path = example_directory().join("multi-vector-test.json"); + let tempdir = tempfile::tempdir().unwrap(); + let output_path = tempdir.path().join("output.json"); + 
assert!(!output_path.exists()); + + let modified_input_path = tempdir.path().join("input.json"); + + let mut raw = value_from_file(&path); + prefix_search_directories(&mut raw, &root_directory()); + save_to_file(&modified_input_path, &raw); + + run_multi_vector_integration(&modified_input_path, &output_path) + } + + #[cfg(feature = "multi-vector")] + fn run_multi_vector_integration(input_path: &std::path::Path, output_path: &std::path::Path) { + let command = Commands::Run { + input_file: input_path.to_owned(), + output_file: output_path.to_owned(), + dry_run: false, + allow_debug: true, + }; + + let cli = Cli::from_commands(command, true); + let mut output = Memory::new(); + + cli.run(&mut output).unwrap(); + println!( + "output = {}", + String::from_utf8(output.into_inner()).unwrap() + ); + + // Check that the results file is generated. + assert!(output_path.exists()); + } + + #[cfg(not(feature = "multi-vector"))] + fn run_multi_vector_integration(input_path: &std::path::Path, output_path: &std::path::Path) { + let command = Commands::Run { + input_file: input_path.to_owned(), + output_file: output_path.to_owned(), + dry_run: false, + allow_debug: true, + }; + let cli = Cli::from_commands(command, true); + let mut output = Memory::new(); + + let err = cli.run(&mut output).unwrap_err(); + println!("err = {:?}", err); + + let output = String::from_utf8(output.into_inner()).unwrap(); + assert!(output.contains("\"multi-vector\" feature")); + println!("output = {}", output); + + // The output file should not have been created because we failed the test. 
+ assert!(!output_path.exists()); + } + + #[test] + #[cfg(feature = "multi-vector")] + fn multi_vector_check_verify() { + let input_path = example_directory().join("multi-vector-test.json"); + let tolerance_path = project_directory() + .join("perf_test_inputs") + .join("multi-vector-tolerance.json"); + + let command = Commands::Check(diskann_benchmark_runner::app::Check::Verify { + tolerances: tolerance_path, + input_file: input_path, + }); + + let cli = Cli::from_commands(command, true); + let mut output = Memory::new(); + cli.run(&mut output).unwrap(); + println!( + "output = {}", + String::from_utf8(output.into_inner()).unwrap() + ); + } + #[test] fn quiet_suppresses_check_target_warning() { let cli = Cli::from_commands(Commands::Skeleton, true); diff --git a/diskann-quantization/src/multi_vector/matrix.rs b/diskann-quantization/src/multi_vector/matrix.rs index 70629d44c..bcbafaaa3 100644 --- a/diskann-quantization/src/multi_vector/matrix.rs +++ b/diskann-quantization/src/multi_vector/matrix.rs @@ -244,6 +244,18 @@ pub unsafe trait NewOwned: ReprOwned { #[derive(Debug, Clone, Copy)] pub struct Defaulted; +/// An initializer argument to [`NewOwned`] that invokes the wrapped closure for each +/// element. +/// +/// # Example +/// ``` +/// use diskann_quantization::multi_vector::{Init, Mat, Standard}; +/// let mut n = 0; +/// let mat = Mat::new(Standard::::new(1, 4).unwrap(), Init(|| { n += 1; n })).unwrap(); +/// assert_eq!(mat.as_slice(), &[1, 2, 3, 4]); +/// ``` +pub struct Init(pub F); + /// Create a new [`Mat`] cloned from a view. pub trait NewCloned: ReprOwned { /// Clone the contents behind `v`, returning a new owning [`Mat`]. @@ -514,6 +526,22 @@ where } } +// SAFETY: The implementation uses guarantees from `Box` to ensure that the pointer +// initialized by it is non-null and properly aligned to the underlying type. 
+unsafe impl NewOwned> for Standard +where + T: Copy, + F: FnMut() -> T, +{ + type Error = crate::error::Infallible; + fn new_owned(self, mut init: Init) -> Result, Self::Error> { + let b: Box<[T]> = (0..self.num_elements()).map(|_| (init.0)()).collect(); + + // SAFETY: By construction, `b` has length `self.num_elements()`. + Ok(unsafe { self.box_to_mat(b) }) + } +} + // SAFETY: This checks that the slice has the correct length, which is all that is // required for [`Repr`]. unsafe impl NewRef for Standard @@ -1767,6 +1795,22 @@ mod tests { } } + #[test] + fn test_standard_new_owned_with_init() { + let mut counter: i32 = 0; + let m = Mat::new( + Standard::::new(2, 3).unwrap(), + Init(|| { + let v = counter; + counter += 1; + v + }), + ) + .unwrap(); + + assert_eq!(m.as_slice(), &[0, 1, 2, 3, 4, 5]); + } + #[test] fn matref_new_slice_length_error() { let repr = Standard::::new(3, 4).unwrap(); diff --git a/diskann-quantization/src/multi_vector/mod.rs b/diskann-quantization/src/multi_vector/mod.rs index 3670b1aaf..1d765bacc 100644 --- a/diskann-quantization/src/multi_vector/mod.rs +++ b/diskann-quantization/src/multi_vector/mod.rs @@ -74,6 +74,6 @@ pub(crate) mod matrix; pub use block_transposed::{BlockTransposed, BlockTransposedMut, BlockTransposedRef}; pub use distance::{Chamfer, MaxSim, MaxSimError, QueryComputer, QueryMatRef}; pub use matrix::{ - Defaulted, LayoutError, Mat, MatMut, MatRef, NewCloned, NewMut, NewOwned, NewRef, Overflow, - Repr, ReprMut, ReprOwned, SliceError, Standard, + Defaulted, Init, LayoutError, Mat, MatMut, MatRef, NewCloned, NewMut, NewOwned, NewRef, + Overflow, Repr, ReprMut, ReprOwned, SliceError, Standard, };