From ea83a2671eef7a1eb5fe1af2bdad5fd6a50fb8db Mon Sep 17 00:00:00 2001 From: Wes Chow Date: Tue, 26 May 2026 17:58:59 -0400 Subject: [PATCH 1/2] Fix handling of corrupt or empty documents. --- processors/src/docx_processor.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/processors/src/docx_processor.rs b/processors/src/docx_processor.rs index 350da0aa..8273ff27 100644 --- a/processors/src/docx_processor.rs +++ b/processors/src/docx_processor.rs @@ -18,7 +18,22 @@ impl DocxProcessor { impl FileProcessor for DocxProcessor { fn process_file(&self, path: impl AsRef) -> anyhow::Result { - let docs = MarkdownDocument::from_file(path); + // `docx_parser::MarkdownDocument::from_file` panics instead of returning a + // `Result` when the file is missing, corrupt, or not a valid DOCX/ZIP archive. + // We catch that panic here and convert it into an anyhow error so callers get a + // clean Err(…) instead of an unwinding panic (requires `panic = "unwind"`). + let path = path.as_ref().to_owned(); + let docs = + std::panic::catch_unwind(move || MarkdownDocument::from_file(&path)).map_err(|e| { + let msg = if let Some(s) = e.downcast_ref::() { + s.clone() + } else if let Some(s) = e.downcast_ref::<&str>() { + s.to_string() + } else { + "unknown panic".to_string() + }; + anyhow::anyhow!("docx_parser panicked while opening file: {}", msg) + })?; let markdown = docs.to_markdown(false); self.markdown_processor.process_document(&markdown) } From a9188c7eeebeba74a2df6e9dbfd7246e4a205313 Mon Sep 17 00:00:00 2001 From: Wes Chow Date: Tue, 26 May 2026 18:00:41 -0400 Subject: [PATCH 2/2] Return None instead of panicking when statistical functions are given bad input.
--- rust/src/chunkers/statistical.rs | 41 ++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/rust/src/chunkers/statistical.rs b/rust/src/chunkers/statistical.rs index b11c3e76..1aee24c9 100644 --- a/rust/src/chunkers/statistical.rs +++ b/rust/src/chunkers/statistical.rs @@ -11,27 +11,33 @@ use text_splitter::{ChunkConfig, TextSplitter}; // use text_splitter::{ChunkConfig, TextSplitter}; use tokenizers::Tokenizer; -fn median(data: &[T]) -> T +fn median(data: &[T]) -> Option where T: Copy + PartialOrd + std::ops::Add + std::ops::Div + From, { - assert!(!data.is_empty(), "median requires at least one data point"); + if data.is_empty() { + return None; + } let mut sorted = data.to_vec(); - sorted.sort_by(|a, b| a.partial_cmp(b).unwrap()); + // Use `unwrap_or` to handle NaN values (treat them as equal) instead of panicking. + sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); let mid = sorted.len() / 2; - if sorted.len() % 2 == 0 { + let result = if sorted.len() % 2 == 0 { (sorted[mid - 1] + sorted[mid]) / T::from(2u8) } else { sorted[mid] - } + }; + Some(result) } -fn std_dev(data: &[f32]) -> f32 { - assert!(data.len() > 1, "standard deviation requires at least two data points"); +fn std_dev(data: &[f32]) -> Option { + if data.len() < 2 { + return None; + } let n = data.len() as f32; let mean = data.iter().sum::() / n; let variance = data.iter().map(|x| (x - mean).powi(2)).sum::() / n; - variance.sqrt() + Some(variance.sqrt()) } pub struct StatisticalChunker { @@ -255,7 +261,14 @@ impl StatisticalChunker { raw_similarities } - fn _find_optimal_threshold(&self, batch_splits: &[&str], similarities: &Vec) -> f32 { + fn _find_optimal_threshold(&self, batch_splits: &[&str], similarities: &[f32]) -> f32 { + // Guard: we need at least 2 similarity scores to compute median + std_dev. + // With 0 scores there are no chunk boundaries to find; return a neutral threshold. 
+ // With 1 score there is no variance to measure; use that single score directly. + if similarities.len() < 2 { + return similarities.first().copied().unwrap_or(0.5); + } + let tokens = self .tokenizer .encode_batch(batch_splits.to_vec(), true) @@ -274,8 +287,10 @@ impl StatisticalChunker { .collect::>(); // analyze the distribution of similarity scores to set initial bounds - let median_score = median(similarities); - let std_dev = std_dev(similarities); + // Both median() and std_dev() return Option; the len() >= 2 guard above + // ensures they always return Some(_) here. + let median_score = median(similarities).unwrap_or(0.5); + let std_dev = std_dev(similarities).unwrap_or(0.0); // set initial bounds based on median and standard deviation let mut low = f32::max(0.0, median_score - std_dev); @@ -300,7 +315,7 @@ impl StatisticalChunker { .map(|(start, end)| cumulative_token_counts[*end] - cumulative_token_counts[*start]) .collect(); - median_tokens = median(&split_token_counts); + median_tokens = median(&split_token_counts).unwrap_or(0); if self.min_split_tokens - self.split_token_tolerance <= median_tokens && median_tokens <= self.max_split_tokens + self.split_token_tolerance @@ -315,7 +330,7 @@ impl StatisticalChunker { } calculated_threshold } - fn _find_split_indices(&self, similarities: &Vec, threshold: f32) -> Vec { + fn _find_split_indices(&self, similarities: &[f32], threshold: f32) -> Vec { let mut split_indices = Vec::new(); for (idx, score) in enumerate(similarities) { if *score < threshold {