Skip to content

Commit 56f0e17

Browse files
committed
feat: add benchmark tool for chunking performance evaluation
1 parent 10dc6ef commit 56f0e17

2 files changed

Lines changed: 135 additions & 1 deletion

File tree

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,4 +76,6 @@ test-data/*
7676
!test-data/realistic-1.0mb.txt
7777
!test-data/realistic-5.0mb.txt
7878

79-
.hypothesis
79+
.hypothesis
80+
81+
!**/src/bin

kiru-core/src/bin/benchmark.rs

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
// kiru-core/src/bin/benchmark.rs
2+
3+
use kiru::{ChunkerBuilder, ChunkerEnum, Source};
4+
use serde::Serialize;
5+
use std::env;
6+
use std::time::Instant;
7+
8+
#[derive(Serialize)]
9+
struct BenchmarkResult {
10+
elapsed_secs: f64,
11+
num_chunks: usize,
12+
total_bytes: usize,
13+
throughput_mb_s: f64,
14+
}
15+
16+
#[derive(Serialize)]
17+
struct BenchmarkError {
18+
error: String,
19+
}
20+
21+
fn main() {
22+
let args: Vec<String> = env::args().collect();
23+
24+
if args.len() < 6 {
25+
let error = BenchmarkError {
26+
error: "Usage: benchmark <strategy> <source_type> <path> <chunk_size> <overlap>"
27+
.to_string(),
28+
};
29+
eprintln!("{}", serde_json::to_string(&error).unwrap());
30+
std::process::exit(1);
31+
}
32+
33+
let strategy = &args[1]; // "bytes" or "chars"
34+
let source_type = &args[2]; // "string" or "file" or "http" or "glob"
35+
let path = &args[3];
36+
let chunk_size: usize = match args[4].parse() {
37+
Ok(v) => v,
38+
Err(e) => {
39+
let error = BenchmarkError {
40+
error: format!("Invalid chunk_size: {}", e),
41+
};
42+
eprintln!("{}", serde_json::to_string(&error).unwrap());
43+
std::process::exit(1);
44+
}
45+
};
46+
let overlap: usize = match args[5].parse() {
47+
Ok(v) => v,
48+
Err(e) => {
49+
let error = BenchmarkError {
50+
error: format!("Invalid overlap: {}", e),
51+
};
52+
eprintln!("{}", serde_json::to_string(&error).unwrap());
53+
std::process::exit(1);
54+
}
55+
};
56+
57+
let result = run_benchmark(strategy, source_type, path, chunk_size, overlap);
58+
59+
match result {
60+
Ok(bench_result) => {
61+
println!("{}", serde_json::to_string(&bench_result).unwrap());
62+
}
63+
Err(e) => {
64+
let error = BenchmarkError {
65+
error: format!("Benchmark failed: {}", e),
66+
};
67+
eprintln!("{}", serde_json::to_string(&error).unwrap());
68+
std::process::exit(1);
69+
}
70+
}
71+
}
72+
73+
fn run_benchmark(
74+
strategy: &str,
75+
source_type: &str,
76+
path: &str,
77+
chunk_size: usize,
78+
overlap: usize,
79+
) -> Result<BenchmarkResult, Box<dyn std::error::Error>> {
80+
// Create the chunker using ChunkerBuilder
81+
let chunker = match strategy {
82+
"bytes" => ChunkerBuilder::by_bytes(ChunkerEnum::Bytes {
83+
chunk_size,
84+
overlap,
85+
}),
86+
"chars" => ChunkerBuilder::by_characters(ChunkerEnum::Characters {
87+
chunk_size,
88+
overlap,
89+
}),
90+
_ => {
91+
return Err(format!("Invalid strategy '{}'. Use 'bytes' or 'chars'", strategy).into());
92+
}
93+
};
94+
95+
// Parse the source based on source_type
96+
let source = match source_type {
97+
"file" => Source::File(path.to_string()),
98+
"http" | "https" => Source::Http(path.to_string()),
99+
"string" => Source::Text(path.to_string()),
100+
_ => {
101+
return Err(format!(
102+
"Invalid source_type '{}'. Use 'file', 'string', 'http', 'text', or 'glob'",
103+
source_type
104+
)
105+
.into());
106+
}
107+
};
108+
109+
// Run the benchmark
110+
let start = Instant::now();
111+
let mut num_chunks = 0;
112+
let mut total_bytes = 0;
113+
114+
let iterator = chunker.on_source(source)?;
115+
116+
for chunk in iterator {
117+
num_chunks += 1;
118+
total_bytes += chunk.len();
119+
std::hint::black_box(chunk.len());
120+
}
121+
122+
let elapsed = start.elapsed();
123+
let elapsed_secs = elapsed.as_secs_f64();
124+
let throughput_mb_s = (total_bytes as f64) / (1024.0 * 1024.0) / elapsed_secs;
125+
126+
Ok(BenchmarkResult {
127+
elapsed_secs,
128+
num_chunks,
129+
total_bytes,
130+
throughput_mb_s,
131+
})
132+
}

0 commit comments

Comments
 (0)